@sanity/ailf 0.4.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290) hide show
  1. package/config/features.ts +23 -0
  2. package/config/models.ts +83 -0
  3. package/config/prompts.ts +16 -0
  4. package/config/rubrics.ts +225 -0
  5. package/config/schedules.ts +47 -0
  6. package/config/sinks.ts +37 -0
  7. package/config/sources.ts +21 -0
  8. package/config/thresholds.ts +61 -0
  9. package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
  10. package/dist/_vendor/ailf-core/config-helpers.js +150 -0
  11. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  12. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  13. package/dist/_vendor/ailf-core/examples/index.d.ts +10 -10
  14. package/dist/_vendor/ailf-core/examples/index.js +10 -10
  15. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  16. package/dist/_vendor/ailf-core/index.js +5 -0
  17. package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
  18. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  19. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  20. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  21. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  22. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  23. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  24. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
  25. package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
  26. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
  27. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
  28. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +32 -31
  29. package/dist/_vendor/ailf-core/schemas/pipeline.js +52 -12
  30. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  31. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  32. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  33. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  34. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  35. package/dist/_vendor/ailf-core/services/index.js +2 -1
  36. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  37. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  38. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  39. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  40. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  41. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  42. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  43. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  44. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  46. package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
  47. package/dist/_vendor/ailf-core/types/index.js +8 -1
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
  50. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  51. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  52. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  53. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  54. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  55. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  56. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  57. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  58. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  59. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  60. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  61. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  62. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  63. package/dist/_vendor/ailf-shared/index.js +0 -1
  64. package/dist/adapters/api-client/build-request.js +14 -13
  65. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  66. package/dist/adapters/config-sources/file-config-adapter.js +38 -12
  67. package/dist/adapters/config-sources/index.d.ts +2 -0
  68. package/dist/adapters/config-sources/index.js +1 -0
  69. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  70. package/dist/adapters/config-sources/ts-config-loader.js +133 -0
  71. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  72. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  73. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  74. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  75. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  76. package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
  77. package/dist/adapters/task-sources/index.d.ts +1 -0
  78. package/dist/adapters/task-sources/index.js +1 -0
  79. package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
  80. package/dist/adapters/task-sources/repo-task-source.js +69 -16
  81. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  82. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  83. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  84. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  85. package/dist/cli.js +0 -2
  86. package/dist/commands/baseline.js +4 -1
  87. package/dist/commands/calculate-scores.js +1 -1
  88. package/dist/commands/coverage-audit.js +7 -1
  89. package/dist/commands/explain-handler.js +25 -23
  90. package/dist/commands/fetch-docs.js +3 -2
  91. package/dist/commands/generate-configs.js +1 -1
  92. package/dist/commands/interactive.js +11 -7
  93. package/dist/commands/pipeline-action.d.ts +2 -0
  94. package/dist/commands/pipeline-action.js +16 -6
  95. package/dist/commands/pipeline.d.ts +1 -0
  96. package/dist/commands/pipeline.js +4 -2
  97. package/dist/commands/pr-comment.js +1 -1
  98. package/dist/commands/publish.js +2 -2
  99. package/dist/commands/readiness-report.js +13 -6
  100. package/dist/composition-root.d.ts +1 -1
  101. package/dist/composition-root.js +67 -4
  102. package/dist/orchestration/build-app-context.js +1 -0
  103. package/dist/orchestration/build-step-sequence.js +24 -6
  104. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  105. package/dist/orchestration/steps/fetch-docs-step.js +6 -4
  106. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  107. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  108. package/dist/orchestration/steps/generate-configs-step.js +245 -51
  109. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  110. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  111. package/dist/orchestration/steps/readiness-step.js +5 -6
  112. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  113. package/dist/orchestration/steps/run-eval-step.js +8 -7
  114. package/dist/pipeline/cache.d.ts +1 -1
  115. package/dist/pipeline/cache.js +36 -8
  116. package/dist/pipeline/calculate-scores.d.ts +5 -7
  117. package/dist/pipeline/calculate-scores.js +74 -153
  118. package/dist/pipeline/checks.js +2 -2
  119. package/dist/pipeline/compare.js +8 -8
  120. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  121. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  122. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  123. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  124. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  125. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  126. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  127. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  128. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  129. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
  130. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  131. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  132. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  133. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  134. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  135. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
  136. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  137. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  138. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  139. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  140. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  141. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  142. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  143. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  144. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  145. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  146. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  147. package/dist/pipeline/compiler/config-loader.js +111 -0
  148. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  149. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  150. package/dist/pipeline/compiler/hash.d.ts +11 -0
  151. package/dist/pipeline/compiler/hash.js +18 -0
  152. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  153. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  154. package/dist/pipeline/compiler/index.d.ts +29 -0
  155. package/dist/pipeline/compiler/index.js +45 -0
  156. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  157. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  158. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  159. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  160. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  161. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  162. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  163. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  164. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  165. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  166. package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
  167. package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
  168. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  169. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  170. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  171. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  172. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  173. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
  174. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
  175. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
  176. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  177. package/dist/pipeline/compiler/presets/index.js +8 -0
  178. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
  179. package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
  180. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  181. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  182. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  183. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  184. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  185. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  186. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  187. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  188. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  189. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  190. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  191. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  192. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  193. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  194. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  195. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  196. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  197. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  198. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  199. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  200. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  201. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  202. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  203. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  204. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  205. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  206. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  207. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  208. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  209. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  210. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  211. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  212. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  213. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  214. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  215. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  216. package/dist/pipeline/coverage-audit.d.ts +15 -5
  217. package/dist/pipeline/coverage-audit.js +41 -22
  218. package/dist/pipeline/eval-constants.d.ts +16 -6
  219. package/dist/pipeline/eval-constants.js +25 -4
  220. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  221. package/dist/pipeline/eval-fingerprint.js +8 -9
  222. package/dist/pipeline/expand-tasks.d.ts +23 -14
  223. package/dist/pipeline/expand-tasks.js +37 -31
  224. package/dist/pipeline/gap-analysis.d.ts +1 -1
  225. package/dist/pipeline/gap-analysis.js +2 -2
  226. package/dist/pipeline/generate-configs.d.ts +22 -4
  227. package/dist/pipeline/generate-configs.js +53 -24
  228. package/dist/pipeline/grader-api.d.ts +3 -3
  229. package/dist/pipeline/grader-api.js +5 -12
  230. package/dist/pipeline/grader-compare-runner.js +20 -27
  231. package/dist/pipeline/grader-comparison.d.ts +4 -8
  232. package/dist/pipeline/grader-comparison.js +11 -17
  233. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  234. package/dist/pipeline/grader-consistency-runner.js +18 -21
  235. package/dist/pipeline/grader-consistency.d.ts +6 -10
  236. package/dist/pipeline/grader-consistency.js +13 -32
  237. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  238. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  239. package/dist/pipeline/grader-sensitivity.js +10 -10
  240. package/dist/pipeline/grader-validate-runner.js +7 -5
  241. package/dist/pipeline/grader-validation.d.ts +2 -6
  242. package/dist/pipeline/grader-validation.js +14 -22
  243. package/dist/pipeline/map-request-to-config.js +6 -1
  244. package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
  245. package/dist/pipeline/mirror-repo-tasks.js +16 -15
  246. package/dist/pipeline/normalize-mode.d.ts +49 -0
  247. package/dist/pipeline/normalize-mode.js +64 -0
  248. package/dist/pipeline/plan.d.ts +5 -2
  249. package/dist/pipeline/plan.js +134 -78
  250. package/dist/pipeline/pr-comment.js +2 -0
  251. package/dist/pipeline/profile-resolution.d.ts +47 -0
  252. package/dist/pipeline/profile-resolution.js +91 -0
  253. package/dist/pipeline/provenance.d.ts +2 -2
  254. package/dist/pipeline/provenance.js +12 -17
  255. package/dist/pipeline/release-report.js +4 -4
  256. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  257. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  258. package/dist/pipeline/rubric-loader.d.ts +20 -0
  259. package/dist/pipeline/rubric-loader.js +37 -0
  260. package/dist/pipeline/validate.d.ts +4 -4
  261. package/dist/pipeline/validate.js +64 -53
  262. package/dist/schedules/loader.js +18 -8
  263. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  264. package/dist/scripts/migrate-task-mode.js +85 -0
  265. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  266. package/dist/scripts/validate-task-sources.d.ts +1 -1
  267. package/dist/scripts/validate-task-sources.js +15 -15
  268. package/dist/sinks/loader.js +5 -7
  269. package/dist/sources.d.ts +7 -7
  270. package/dist/sources.js +22 -24
  271. package/dist/webhook/dispatch.js +2 -1
  272. package/package.json +6 -3
  273. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  274. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  275. package/tasks/literacy/frameworks.task.ts +128 -0
  276. package/tasks/literacy/functions.task.ts +69 -0
  277. package/tasks/literacy/groq.task.ts +258 -0
  278. package/tasks/literacy/nextjs-live.task.ts +75 -0
  279. package/tasks/literacy/studio-setup.task.ts +131 -0
  280. package/tasks/literacy/visual-editing.task.ts +146 -0
  281. package/config/features.yaml +0 -116
  282. package/config/models.yaml +0 -116
  283. package/config/prompts.yaml +0 -75
  284. package/config/rubrics.yaml +0 -62
  285. package/config/schedules.yaml +0 -43
  286. package/config/sinks.yaml +0 -54
  287. package/config/sources.yaml +0 -51
  288. package/config/thresholds.yaml +0 -49
  289. package/dist/agent-observer/test-imports.d.ts +0 -7
  290. package/dist/agent-observer/test-imports.js +0 -185
@@ -0,0 +1,146 @@
1
+ /**
2
+ * Cost tracking — model pricing, pre-run estimation, and post-run actuals.
3
+ *
4
+ * Uses a pricing table (YAML config or TS `definePricingTable()`) to compute
5
+ * USD cost from token usage. Supports budget controls with warn/stop thresholds.
6
+ *
7
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
8
+ */
9
+ // ---------------------------------------------------------------------------
10
+ // Pricing table
11
+ // ---------------------------------------------------------------------------
12
/**
 * Builds a single pricing entry. All three rates are USD per 1M tokens.
 */
const usdPerMillionTokens = (input, output, cachedInput) => ({ input, output, cachedInput });
/**
 * Built-in pricing table (updated periodically), keyed by provider-qualified
 * model ID. Consulted by `lookupPricing` alongside any caller-supplied table.
 *
 * NOTE(review): these figures are a snapshot — verify against the providers'
 * current price lists before relying on them for budget enforcement.
 */
const DEFAULT_PRICING = {
    "anthropic:messages:claude-opus-4-6": usdPerMillionTokens(15.0, 75.0, 1.5),
    "anthropic:messages:claude-sonnet-4-6": usdPerMillionTokens(3.0, 15.0, 0.3),
    "openai:chat:gpt-4.1": usdPerMillionTokens(2.0, 8.0, 0.5),
    "openai:chat:gpt-4.1-mini": usdPerMillionTokens(0.4, 1.6, 0.1),
    "openai:chat:gpt-4o": usdPerMillionTokens(2.5, 10.0, 1.25),
    "openai:chat:gpt-5": usdPerMillionTokens(5.0, 15.0, 1.0),
};
45
+ // ---------------------------------------------------------------------------
46
+ // Public API
47
+ // ---------------------------------------------------------------------------
48
/**
 * Compute actual cost from token usage and model pricing.
 *
 * The prompt is split into a cached portion and an uncached remainder:
 * - uncached prompt tokens are billed at `pricing.input`
 * - cached prompt tokens are billed at `pricing.cachedInput`, or at the full
 *   `pricing.input` rate when no cached rate is configured
 * - completion tokens are billed at `pricing.output`
 *
 * @param usage - Token counts from provider response (`promptTokens`,
 *   `completionTokens`, and an optional `toolTokens` count)
 * @param pricing - Per-model pricing (USD per 1M tokens)
 * @returns Cost in USD
 */
export function computeCost(usage, pricing) {
    const TOKENS_PER_PRICING_UNIT = 1_000_000;
    // NOTE(review): the cached-prompt count is read from `usage.toolTokens`.
    // The field name suggests tool-call tokens rather than cache hits —
    // confirm against the usage type definition.
    const cached = usage.toolTokens ?? 0;
    // Clamp at zero: if the cached count exceeds `promptTokens` (e.g. a
    // provider that reports cached tokens separately from prompt tokens),
    // a negative remainder would be subtracted from the bill.
    const uncachedPrompt = Math.max(0, usage.promptTokens - cached);
    // A missing (undefined or null) cached rate means "no discount".
    const cachedRate = pricing.cachedInput ?? pricing.input;
    const inputCost = (uncachedPrompt * pricing.input) / TOKENS_PER_PRICING_UNIT;
    const cachedCost = (cached * cachedRate) / TOKENS_PER_PRICING_UNIT;
    const outputCost = (usage.completionTokens * pricing.output) / TOKENS_PER_PRICING_UNIT;
    return inputCost + cachedCost + outputCost;
}
65
/**
 * Returns the entry stored under exactly `modelId`, or `undefined`.
 * Only own properties count, so IDs such as "constructor" or "toString"
 * never resolve to something inherited from `Object.prototype`.
 */
function findExactPricing(table, modelId) {
    if (!table || !Object.prototype.hasOwnProperty.call(table, modelId)) {
        return undefined;
    }
    return table[modelId] || undefined;
}
/**
 * Returns the entry whose key is the longest prefix of `modelId`, or
 * `undefined` when no key is a prefix.
 */
function findLongestPrefixPricing(table, modelId) {
    if (!table) {
        return undefined;
    }
    let bestKey;
    let bestPricing;
    for (const [key, pricing] of Object.entries(table)) {
        if (!modelId.startsWith(key)) {
            continue;
        }
        if (bestKey === undefined || key.length > bestKey.length) {
            bestKey = key;
            bestPricing = pricing;
        }
    }
    return bestPricing;
}
/**
 * Look up pricing for a model ID.
 *
 * Resolution order:
 * 1. exact match in `customPricing`
 * 2. exact match in the built-in defaults
 * 3. prefix match in `customPricing`
 * 4. prefix match in the built-in defaults
 *
 * Prefix matching lets dated/versioned IDs resolve to their base model
 * (e.g., "openai:chat:gpt-4o-2024-11-20" matches "openai:chat:gpt-4o").
 * When several keys are prefixes of the ID, the longest one wins, so
 * "openai:chat:gpt-4.1-mini-2025-04-14" resolves to "openai:chat:gpt-4.1-mini"
 * rather than to "openai:chat:gpt-4.1".
 *
 * @param modelId - Provider-qualified model ID
 * @param customPricing - Optional caller-supplied pricing table
 * @returns The matching pricing entry, or `undefined` when nothing matches
 */
export function lookupPricing(modelId, customPricing) {
    return (findExactPricing(customPricing, modelId) ??
        findExactPricing(DEFAULT_PRICING, modelId) ??
        findLongestPrefixPricing(customPricing, modelId) ??
        findLongestPrefixPricing(DEFAULT_PRICING, modelId));
}
92
/**
 * Estimate cost for a pipeline run before execution.
 *
 * Every task is assumed to consume a fixed average number of prompt and
 * completion tokens; that usage is priced once per model via `lookupPricing`
 * and `computeCost`. Models with no known pricing contribute 0 USD.
 *
 * @param taskCount - Number of tasks in the run
 * @param modelIds - Model IDs the run will be executed against
 * @param budget - Optional budget config; only `budget.perRun` is consulted
 * @param customPricing - Optional pricing table forwarded to `lookupPricing`
 * @returns Total and per-model estimates plus warn/stop threshold flags
 */
export function estimateRunCost(taskCount, modelIds, budget, customPricing) {
    // Rough token estimates per task (empirical averages)
    const promptPerTask = 2000;
    const completionPerTask = 1500;
    const perModel = modelIds.map((modelId) => {
        const rates = lookupPricing(modelId, customPricing);
        const estimatedUSD = rates
            ? computeCost({
                promptTokens: promptPerTask * taskCount,
                completionTokens: completionPerTask * taskCount,
                totalTokens: (promptPerTask + completionPerTask) * taskCount,
            }, rates)
            : 0;
        return { modelId, estimatedUSD };
    });
    let totalUSD = 0;
    perModel.forEach((entry) => {
        totalUSD += entry.estimatedUSD;
    });
    const runLimits = budget?.perRun;
    const exceedsWarning = Boolean(runLimits) && totalUSD >= runLimits.warn;
    const exceedsStop = Boolean(runLimits) && totalUSD >= runLimits.stop;
    return { totalUSD, perModel, exceedsWarning, exceedsStop };
}
121
/**
 * Check if current spend exceeds budget thresholds.
 *
 * The stop limit is tested before the warn threshold, so a spend that has
 * reached both is reported as a stop.
 *
 * @param currentUSD - Spend so far, in USD
 * @param budget - Budget config; `budget[level]` may hold `warn` and `stop` limits
 * @param level - Which budget level to check (a key of `budget`)
 * @returns `proceed: false` once the stop limit is reached; otherwise
 *   `proceed: true`, with `warning` and `limitUSD` set when the warn
 *   threshold is reached. No limits for `level` means always proceed.
 */
export function checkBudget(currentUSD, budget, level) {
    const thresholds = budget[level];
    if (thresholds) {
        // Formatted lazily: only the stop/warn branches need the string.
        const spent = () => currentUSD.toFixed(4);
        if (currentUSD >= thresholds.stop) {
            const warning = `Budget exceeded: $${spent()} >= $${thresholds.stop} (${level} stop limit)`;
            return { proceed: false, warning, currentUSD, limitUSD: thresholds.stop };
        }
        if (currentUSD >= thresholds.warn) {
            const warning = `Budget warning: $${spent()} >= $${thresholds.warn} (${level} warn threshold)`;
            return { proceed: true, warning, currentUSD, limitUSD: thresholds.warn };
        }
    }
    return { proceed: true, currentUSD };
}
@@ -0,0 +1,14 @@
1
+ /**
2
+ * Telemetry — observability infrastructure for evaluation traces.
3
+ *
4
+ * Captures tool calls, token usage, cost, and timing for every evaluation.
5
+ * Full traces go to blob storage; sanitized summaries to Content Lake.
6
+ *
7
+ * @see docs/exec-plans/architecture-overhaul/phase-6-observability.md
8
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
9
+ */
10
+ export { collectTrace, mergeTraces, type ProviderResponse, type RawToolCall, type TraceCollectorOptions, } from "./trace-collector.js";
11
+ export { classifyToolCall, classifyToolCalls } from "./tool-classifier.js";
12
+ export { checkBudget, computeCost, estimateRunCost, lookupPricing, type ActualCost, type BudgetCheckResult, type BudgetConfig, type CostEstimate, type ModelPricing, } from "./cost-tracker.js";
13
+ export { extractTraceSummary, LocalTraceStore, type TraceSummary, type TraceStore, type TraceStoreResult, } from "./trace-store.js";
14
+ export { createRedactionConfig, DEFAULT_REDACTION_RULES, redactTrace, type RedactionConfig, type RedactionResult, type RedactionRule, } from "./redactor.js";
@@ -0,0 +1,19 @@
1
+ /**
2
+ * Telemetry — observability infrastructure for evaluation traces.
3
+ *
4
+ * Captures tool calls, token usage, cost, and timing for every evaluation.
5
+ * Full traces go to blob storage; sanitized summaries to Content Lake.
6
+ *
7
+ * @see docs/exec-plans/architecture-overhaul/phase-6-observability.md
8
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
9
+ */
10
+ // Trace collection
11
+ export { collectTrace, mergeTraces, } from "./trace-collector.js";
12
+ // Tool call classification
13
+ export { classifyToolCall, classifyToolCalls } from "./tool-classifier.js";
14
+ // Cost tracking
15
+ export { checkBudget, computeCost, estimateRunCost, lookupPricing, } from "./cost-tracker.js";
16
+ // Trace storage
17
+ export { extractTraceSummary, LocalTraceStore, } from "./trace-store.js";
18
+ // Redaction
19
+ export { createRedactionConfig, DEFAULT_REDACTION_RULES, redactTrace, } from "./redactor.js";
@@ -0,0 +1,58 @@
1
+ /**
2
+ * Redaction pipeline — strips sensitive data from traces before storage.
3
+ *
4
+ * Applied before ANY storage (both blob and Content Lake). Configurable
5
+ * patterns handle Bearer tokens, API keys, Sanity tokens, and other
6
+ * common secret formats.
7
+ *
8
+ * Principles:
9
+ * 1. Redact before store — sensitive data never reaches storage
10
+ * 2. Configurable patterns — teams can add project-specific rules
11
+ * 3. Truncation for cost — large outputs truncated to max bytes
12
+ * 4. No PII by default — tasks shouldn't contain PII, this is a safety net
13
+ *
14
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
15
+ */
16
+ import type { EvalTrace } from "../../../_vendor/ailf-core/index.d.ts";
17
+ /** A single redaction rule: a named regex plus the text substituted for each match. */
18
+ export interface RedactionRule {
19
+ /** Rule name (for logging); presumably the identifier reported in `RedactionResult.rulesApplied` — confirm in redactor.js */
20
+ name: string;
21
+ /** Regex pattern to match (the built-in rules visible in redactor.js carry the global `g` flag) */
22
+ pattern: RegExp;
23
+ /** Replacement string substituted for each match (use $1, $2 for capture groups) */
24
+ replacement: string;
25
+ }
26
+ /** Redaction configuration accepted by `redactTrace`: substitution rules, fields to drop, and an output size cap. */
27
+ export interface RedactionConfig {
28
+ /** Regex-based substitution rules (see `RedactionRule`) */
29
+ rules: RedactionRule[];
30
+ /** Names of fields to omit entirely from stored traces */
31
+ omitFields: string[];
32
+ /** Maximum tool call output size in bytes; per the module header, larger outputs are truncated */
33
+ maxOutputBytes: number;
34
+ }
35
+ /** Result of redaction, as returned by `redactTrace` */
36
+ export interface RedactionResult {
37
+ /** Redacted trace (documented on `redactTrace` as a new trace, not the original object) */
38
+ trace: EvalTrace;
39
+ /** Number of redactions applied */
40
+ redactionCount: number;
41
+ /** Which rules fired (presumably `RedactionRule.name` values — confirm in redactor.js) */
42
+ rulesApplied: string[];
43
+ }
44
+ /** Built-in redaction rules for common secret patterns, including Bearer tokens, Sanity/OpenAI/Anthropic keys, generic key=value secrets, and Slack and GitHub tokens */
45
+ export declare const DEFAULT_REDACTION_RULES: RedactionRule[];
46
+ /**
47
+ * Create a redaction config, starting from the defaults.
48
+ *
49
+ * @param overrides - Optional custom rules or settings to merge (any subset of `RedactionConfig` fields). NOTE(review): whether `rules` overrides replace or extend the built-in rules is not visible from this declaration — confirm in redactor.js
50
+ */
51
+ export declare function createRedactionConfig(overrides?: Partial<RedactionConfig>): RedactionConfig;
52
+ /**
53
+ * Apply redaction to an evaluation trace.
54
+ *
55
+ * Processes tool call inputs and outputs, event data, and search terms. `config` is optional (presumably defaulting to `createRedactionConfig()` — confirm in redactor.js).
56
+ * Returns a new trace (does not mutate the original) together with the redaction count and the rules that fired.
57
+ */
58
+ export declare function redactTrace(trace: EvalTrace, config?: RedactionConfig): RedactionResult;
@@ -0,0 +1,222 @@
1
+ /**
2
+ * Redaction pipeline — strips sensitive data from traces before storage.
3
+ *
4
+ * Applied before ANY storage (both blob and Content Lake). Configurable
5
+ * patterns handle Bearer tokens, API keys, Sanity tokens, and other
6
+ * common secret formats.
7
+ *
8
+ * Principles:
9
+ * 1. Redact before store — sensitive data never reaches storage
10
+ * 2. Configurable patterns — teams can add project-specific rules
11
+ * 3. Truncation for cost — large outputs truncated to max bytes
12
+ * 4. No PII by default — tasks shouldn't contain PII, this is a safety net
13
+ *
14
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
15
+ */
16
+ // ---------------------------------------------------------------------------
17
+ // Default rules
18
+ // ---------------------------------------------------------------------------
19
/**
 * Built-in redaction rules for common secret patterns.
 *
 * Rules are applied in array order and every rule operates on the output of
 * the previous one, so a specific pattern must be listed before any broader
 * pattern that would otherwise consume the same text.
 */
export const DEFAULT_REDACTION_RULES = [
    {
        name: "bearer_tokens",
        pattern: /Bearer\s+[A-Za-z0-9._~+/=-]{10,}/g,
        replacement: "Bearer [REDACTED]",
    },
    {
        name: "sanity_tokens",
        pattern: /sk[A-Za-z0-9]{30,}/g,
        replacement: "[REDACTED_SANITY_TOKEN]",
    },
    {
        // Must come before "openai_keys": that pattern's character class
        // contains "-", so it also matches "sk-ant-…" and would leave nothing
        // for this rule to match.
        name: "anthropic_keys",
        pattern: /sk-ant-[A-Za-z0-9_-]{20,}/g,
        replacement: "[REDACTED_ANTHROPIC_KEY]",
    },
    {
        name: "openai_keys",
        pattern: /sk-[A-Za-z0-9_-]{20,}/g,
        replacement: "[REDACTED_OPENAI_KEY]",
    },
    {
        // Plain-text `key: value` / `key=value` pairs.
        name: "api_key_values",
        pattern: /((?:api[_-]?key|token|secret|password|authorization)\s*[:=]\s*)(["']?)(?!\[REDACTED)[^\s"']{8,}\2/gi,
        replacement: "$1$2[REDACTED]$2",
    },
    {
        // Serialized-JSON counterpart of "api_key_values". In JSON the key is
        // followed by its closing quote (`"password":"…"`), which the rule
        // above cannot match. Only the contents of the string value are
        // replaced (escape sequences are consumed whole), so the surrounding
        // JSON stays parseable.
        name: "json_secret_fields",
        pattern: /("[\w-]*(?:api[_-]?key|token|secret|password|authorization)"\s*:\s*")(?!\[REDACTED)(?:[^"\\]|\\.){8,}(")/gi,
        replacement: "$1[REDACTED]$2",
    },
    {
        name: "slack_tokens",
        pattern: /xoxb-[A-Za-z0-9-]{20,}/g,
        replacement: "[REDACTED_SLACK_TOKEN]",
    },
    {
        name: "github_tokens",
        pattern: /gh[ps]_[A-Za-z0-9]{30,}/g,
        replacement: "[REDACTED_GITHUB_TOKEN]",
    },
    {
        name: "base64_credentials",
        pattern: /Basic\s+[A-Za-z0-9+/=]{20,}/g,
        replacement: "Basic [REDACTED]",
    },
];
62
/**
 * Default fields to omit entirely.
 *
 * Field omission deletes keys by exact name, while HTTP header names are
 * case-insensitive and are frequently normalized to lower case by HTTP
 * clients, so both spellings are listed.
 */
const DEFAULT_OMIT_FIELDS = [
    "toolCalls[*].input.headers.Authorization",
    "toolCalls[*].input.headers.Cookie",
    "toolCalls[*].input.headers.Set-Cookie",
    "toolCalls[*].input.headers.authorization",
    "toolCalls[*].input.headers.cookie",
    "toolCalls[*].input.headers.set-cookie",
];
/** Default cap applied to the serialized tool call output (10 KiB). */
const DEFAULT_MAX_OUTPUT_BYTES = 10_240;
69
+ // ---------------------------------------------------------------------------
70
+ // Public API
71
+ // ---------------------------------------------------------------------------
72
/**
 * Create a redaction config based on the built-in defaults.
 *
 * Custom `rules` and `omitFields` are appended after the defaults;
 * `maxOutputBytes` replaces the default when provided. The returned arrays
 * are always fresh copies, so mutating the returned config cannot alter the
 * shared module-level defaults (the rule objects themselves are shared).
 *
 * @param overrides - Custom rules or settings to merge
 * @returns A complete redaction config
 */
export function createRedactionConfig(overrides) {
    return {
        rules: [...DEFAULT_REDACTION_RULES, ...(overrides?.rules ?? [])],
        omitFields: [...DEFAULT_OMIT_FIELDS, ...(overrides?.omitFields ?? [])],
        maxOutputBytes: overrides?.maxOutputBytes ?? DEFAULT_MAX_OUTPUT_BYTES,
    };
}
88
/**
 * Apply redaction to an evaluation trace.
 *
 * Processes tool call inputs and outputs, event data, and search terms.
 * Returns a new trace (does not mutate the original).
 *
 * Missing `toolCalls` / `events` / `searchTerms` arrays and events without
 * `data` are left as they are instead of raising. If redaction leaves event
 * data that no longer parses as JSON, the redacted text itself is stored.
 *
 * @param trace - Trace to redact; it is copied via a JSON round-trip
 * @param config - Redaction config; defaults to `createRedactionConfig()`
 * @returns The redacted copy, the total match count, and the names of the rules that fired
 */
export function redactTrace(trace, config) {
    const cfg = config ?? createRedactionConfig();
    let redactionCount = 0;
    const rulesApplied = new Set();
    // Fold one redaction outcome into the running totals.
    const record = (count, rules) => {
        redactionCount += count;
        for (const rule of rules)
            rulesApplied.add(rule);
    };
    // Deep clone to avoid mutation
    const redacted = JSON.parse(JSON.stringify(trace));
    // Redact tool calls
    if (Array.isArray(redacted.toolCalls)) {
        redacted.toolCalls = redacted.toolCalls.map((call) => {
            const result = redactToolCall(call, cfg);
            record(result.count, result.rules);
            return result.call;
        });
    }
    // Redact events
    if (Array.isArray(redacted.events)) {
        redacted.events = redacted.events.map((event) => {
            if (event === null || typeof event !== "object")
                return event;
            const dataStr = JSON.stringify(event.data);
            // JSON.stringify yields undefined (not a string) when there is no data.
            if (typeof dataStr !== "string")
                return event;
            const { text, count, rules } = applyRules(dataStr, cfg.rules);
            record(count, rules);
            // A replacement can in rare cases break the JSON syntax; keep the
            // redacted text rather than throwing away the whole trace.
            return { ...event, data: parseJsonSafe(text) };
        });
    }
    // Redact search terms (may contain embedded secrets)
    if (Array.isArray(redacted.searchTerms)) {
        redacted.searchTerms = redacted.searchTerms.map((term) => {
            if (typeof term !== "string")
                return term;
            const { text, count, rules } = applyRules(term, cfg.rules);
            record(count, rules);
            return text;
        });
    }
    return {
        trace: redacted,
        redactionCount,
        rulesApplied: [...rulesApplied],
    };
}
131
+ // ---------------------------------------------------------------------------
132
+ // Tool call redaction
133
+ // ---------------------------------------------------------------------------
134
/**
 * Redact a single tool call.
 *
 * The input and output are serialized to JSON, run through the regex rules,
 * and parsed back. Configured omit-fields are then removed from the input.
 *
 * The output is redacted BEFORE it is truncated: truncating first can cut a
 * secret in half, leaving a fragment too short for its pattern to match.
 * A missing `input` or `output` is left untouched instead of raising.
 *
 * @param call - Tool call record carrying `input` and `output`
 * @param config - Redaction config (`rules`, `omitFields`, `maxOutputBytes`)
 * @returns The redacted copy of the call, the match count, and the names of the rules that fired
 */
function redactToolCall(call, config) {
    let count = 0;
    const rules = [];
    const redactedCall = { ...call };
    // Redact input
    const inputStr = JSON.stringify(call.input);
    // JSON.stringify yields undefined (not a string) for a missing input.
    if (typeof inputStr === "string") {
        const inputResult = applyRules(inputStr, config.rules);
        count += inputResult.count;
        rules.push(...inputResult.rules);
        // Fall back to the redacted text if a replacement broke the JSON syntax.
        const parsedInput = parseJsonSafe(inputResult.text);
        // Omit specific fields from input (only meaningful for objects)
        redactedCall.input =
            parsedInput !== null && typeof parsedInput === "object"
                ? omitFields(parsedInput, config.omitFields, "input")
                : parsedInput;
    }
    // Redact output
    const outputStr = JSON.stringify(call.output);
    if (typeof outputStr === "string") {
        const outputResult = applyRules(outputStr, config.rules);
        count += outputResult.count;
        rules.push(...outputResult.rules);
        let outputText = outputResult.text;
        // Truncate output if too large (measured on the serialized string length)
        if (outputText.length > config.maxOutputBytes) {
            outputText = outputText.slice(0, config.maxOutputBytes) + "... [truncated]";
        }
        // Truncated text is usually no longer valid JSON and is stored as a string.
        redactedCall.output = parseJsonSafe(outputText);
    }
    return {
        call: redactedCall,
        count,
        rules,
    };
}
164
+ // ---------------------------------------------------------------------------
165
+ // Rule application
166
+ // ---------------------------------------------------------------------------
167
/**
 * Run every redaction rule over `text`, in order.
 *
 * Each rule is applied globally even when its pattern lacks the `g` flag:
 * with a non-global regex, `String.prototype.replace` would redact only the
 * first occurrence and `match` would return capture groups instead of the
 * list of occurrences, giving a wrong count.
 *
 * @param text - Text to redact; a non-string value is returned unchanged
 * @param rules - Rules to apply (`name`, `pattern`, `replacement`)
 * @returns The redacted text, the total number of matches, and the names of
 *   the rules that matched (in application order)
 */
function applyRules(text, rules) {
    if (typeof text !== "string") {
        return { text, count: 0, rules: [] };
    }
    let result = text;
    let count = 0;
    const appliedRules = [];
    for (const rule of rules) {
        // Use a global copy when the caller's pattern is not global; the
        // caller's RegExp object itself is never re-flagged.
        const pattern = rule.pattern.global
            ? rule.pattern
            : new RegExp(rule.pattern.source, rule.pattern.flags + "g");
        // Global regexes are stateful; start every scan from the beginning.
        pattern.lastIndex = 0;
        const matches = result.match(pattern);
        if (matches && matches.length > 0) {
            count += matches.length;
            appliedRules.push(rule.name);
            pattern.lastIndex = 0;
            result = result.replace(pattern, rule.replacement);
        }
    }
    return { text: result, count, rules: appliedRules };
}
185
+ // ---------------------------------------------------------------------------
186
+ // Field omission
187
+ // ---------------------------------------------------------------------------
188
/**
 * Remove configured fields from `obj` in place.
 *
 * Each pattern is a dot-separated path (simple handling, not full JSONPath).
 * Only patterns that contain `context` as one of their segments apply; the
 * segments after it form the path that is deleted from `obj`. For example,
 * with context "input", "toolCalls[*].input.headers.Authorization" deletes
 * `obj.headers.Authorization`.
 *
 * @param obj - Object to strip fields from (mutated and returned)
 * @param patterns - Field path patterns
 * @param context - Path segment that `obj` corresponds to
 * @returns The same `obj` reference
 */
function omitFields(obj, patterns, context) {
    for (const fieldPath of patterns) {
        const segments = fieldPath.split(".");
        const anchor = segments.indexOf(context);
        // No segment equal to the context: this pattern targets something else.
        if (anchor === -1) {
            continue;
        }
        deleteNestedField(obj, segments.slice(anchor + 1));
    }
    return obj;
}
203
/**
 * Delete the property at `path` from `obj`, in place.
 *
 * Does nothing when the path is empty or when any step along it (including
 * `obj` itself) is not an object, so a `null` or primitive value never
 * raises.
 *
 * @param obj - Root value to delete from
 * @param path - Property names leading to the field to delete
 */
function deleteNestedField(obj, path) {
    if (path.length === 0)
        return;
    let node = obj;
    // Walk down to the parent of the field to delete.
    for (let depth = 0; depth < path.length - 1; depth++) {
        if (node === null || typeof node !== "object")
            return;
        node = node[path[depth]];
    }
    if (node !== null && typeof node === "object") {
        delete node[path[path.length - 1]];
    }
}
215
/**
 * Parse `text` as JSON, falling back to the raw text when it is not valid
 * JSON (for example after truncation).
 *
 * @param text - Candidate JSON text
 * @returns The parsed value, or `text` itself if parsing fails
 */
function parseJsonSafe(text) {
    let value;
    try {
        value = JSON.parse(text);
    }
    catch {
        // Not valid JSON: hand the input back as-is.
        value = text;
    }
    return value;
}
@@ -0,0 +1,32 @@
1
+ /**
2
+ * Tool call classification — maps raw provider tool names to categories.
3
+ *
4
+ * Raw tool names from providers are noisy and inconsistent (`WebSearch` vs
5
+ * `web_search` vs `Browser.search`). This module normalizes every tool call
6
+ * into one of six standard categories for cross-model comparison.
7
+ *
8
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
9
+ */
10
+ import type { ToolCallCategory } from "../../../_vendor/ailf-core/index.d.ts";
11
+ /**
12
+ * Classify a tool call by its raw name.
13
+ *
14
+ * Resolution order:
15
+ * 1. Exact match in custom overrides (if provided)
16
+ * 2. Exact match in default tool categories
17
+ * 3. Heuristic pattern matching on the name (case-insensitive keyword patterns, first match wins)
18
+ * 4. Falls back to "execute" (safest default for unknown tools)
19
+ *
20
+ * @param name - Raw tool name from the provider
21
+ * @param customMappings - Optional custom tool → category overrides
22
+ * @returns The classified category
23
+ */
24
+ export declare function classifyToolCall(name: string, customMappings?: Record<string, ToolCallCategory>): ToolCallCategory;
25
+ /**
26
+ * Classify multiple tool calls, returning the category for each (`categories[i]` corresponds to `names[i]`).
27
+ * Also tracks unrecognized names (found in neither the custom mappings nor the default table) for the caller to log warnings.
28
+ */
29
+ export declare function classifyToolCalls(names: string[], customMappings?: Record<string, ToolCallCategory>): {
30
+ categories: ToolCallCategory[];
31
+ unrecognized: string[];
32
+ };
@@ -0,0 +1,120 @@
1
+ /**
2
+ * Tool call classification — maps raw provider tool names to categories.
3
+ *
4
+ * Raw tool names from providers are noisy and inconsistent (`WebSearch` vs
5
+ * `web_search` vs `Browser.search`). This module normalizes every tool call
6
+ * into one of six standard categories for cross-model comparison.
7
+ *
8
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
9
+ */
10
+ // ---------------------------------------------------------------------------
11
+ // Default tool name → category mapping
12
+ // ---------------------------------------------------------------------------
13
// Exact-match lookup table from raw tool name to category.
// Built on a null-prototype object: lookups are keyed by provider-supplied
// names, and on an ordinary object names such as "constructor" or "toString"
// would resolve to inherited members instead of `undefined`.
const DEFAULT_TOOL_CATEGORIES = Object.assign(Object.create(null), {
    // Search tools
    Grep: "search",
    WebSearch: "search",
    grep: "search",
    search: "search",
    semantic_search: "search",
    web_search: "search",
    // Read tools
    Glob: "read",
    Read: "read",
    WebFetch: "read",
    cat: "read",
    curl: "read",
    file_read: "read",
    read_file: "read",
    web_fetch: "read",
    // Write tools
    Edit: "write",
    FileEdit: "write",
    Write: "write",
    file_write: "write",
    patch: "write",
    write_file: "write",
    // Execute tools
    Bash: "execute",
    RunCode: "execute",
    bash: "execute",
    exec: "execute",
    python: "execute",
    run_code: "execute",
    shell: "execute",
    // Navigate tools
    "Browser.navigate": "navigate",
    FollowLink: "navigate",
    browse: "navigate",
    follow_link: "navigate",
    navigate: "navigate",
    open_url: "navigate",
    // Communicate tools
    AskUser: "communicate",
    TodoRead: "communicate",
    TodoWrite: "communicate",
    ask_user: "communicate",
    submit_response: "communicate",
});
59
+ // ---------------------------------------------------------------------------
60
+ // Heuristic patterns (fallback when name not in lookup table)
61
+ // ---------------------------------------------------------------------------
62
// Ordered fallback heuristics as [pattern, category] pairs. A name is tested
// against each case-insensitive pattern in turn, so earlier entries take
// precedence when a name contains keywords from several categories.
const HEURISTIC_PATTERNS = [
    { category: "search", pattern: /search|find|query|lookup|grep/i },
    { category: "read", pattern: /read|fetch|get|load|cat|view/i },
    { category: "write", pattern: /write|create|edit|update|patch|save|put|post/i },
    { category: "execute", pattern: /exec|run|bash|shell|python|code|command/i },
    { category: "navigate", pattern: /navigate|browse|open|follow|link|url/i },
    { category: "communicate", pattern: /ask|user|chat|message|submit|todo|response/i },
].map(({ pattern, category }) => [pattern, category]);
70
+ // ---------------------------------------------------------------------------
71
+ // Public API
72
+ // ---------------------------------------------------------------------------
73
/**
 * Classify a tool call by its raw name.
 *
 * Resolution order:
 * 1. Exact match in custom overrides (if provided)
 * 2. Exact match in default tool categories
 * 3. Heuristic pattern matching on the name
 * 4. Falls back to "execute" (safest default for unknown tools)
 *
 * Table lookups consider own properties only, so a tool named e.g.
 * "constructor" or "toString" is never resolved to an inherited
 * Object.prototype member.
 *
 * @param name - Raw tool name from the provider
 * @param customMappings - Optional custom tool → category overrides
 * @returns The classified category
 */
export function classifyToolCall(name, customMappings) {
    const hasOwnEntry = (table, key) => table != null && Object.prototype.hasOwnProperty.call(table, key);
    // 1. Custom overrides (an empty/falsy override falls through)
    if (hasOwnEntry(customMappings, name) && customMappings[name]) {
        return customMappings[name];
    }
    // 2. Default lookup
    if (hasOwnEntry(DEFAULT_TOOL_CATEGORIES, name) && DEFAULT_TOOL_CATEGORIES[name]) {
        return DEFAULT_TOOL_CATEGORIES[name];
    }
    // 3. Heuristic matching (first matching pattern wins)
    for (const [pattern, category] of HEURISTIC_PATTERNS) {
        if (pattern.test(name)) {
            return category;
        }
    }
    // 4. Unknown → execute (safest default)
    return "execute";
}
104
/**
 * Classify multiple tool calls, returning the category for each.
 * Also tracks unrecognized names for the caller to log warnings.
 *
 * A name counts as recognized only when it is an own entry of the custom
 * mappings or of the default table; names that merely collide with inherited
 * Object.prototype members (e.g. "constructor") are reported as unrecognized.
 *
 * @param names - Raw tool names, in call order
 * @param customMappings - Optional custom tool → category overrides
 * @returns `categories` (index-aligned with `names`) and `unrecognized`
 *   (names that needed the heuristic or default fallback; duplicates kept)
 */
export function classifyToolCalls(names, customMappings) {
    const hasOwnEntry = (table, key) => table != null && Object.prototype.hasOwnProperty.call(table, key) && Boolean(table[key]);
    const categories = [];
    const unrecognized = [];
    for (const name of names) {
        categories.push(classifyToolCall(name, customMappings));
        // Track names that required heuristic or default fallback
        const recognized = hasOwnEntry(DEFAULT_TOOL_CATEGORIES, name) || hasOwnEntry(customMappings, name);
        if (!recognized) {
            unrecognized.push(name);
        }
    }
    return { categories, unrecognized };
}
@@ -0,0 +1,75 @@
1
+ /**
2
+ * TraceCollector — extracts structured trace data from provider responses.
3
+ *
4
+ * Parses tool calls, token usage, and timing data from Promptfoo result
5
+ * objects and normalizes them into the canonical `EvalTrace` shape.
6
+ *
7
+ * Works via inline extraction — parsing provider response metadata
8
+ * directly, without requiring additional infrastructure.
9
+ *
10
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
11
+ * @see packages/core/src/types/trace.ts — EvalTrace types
12
+ */
13
+ import type { EvalTrace, ToolCallCategory } from "../../../_vendor/ailf-core/index.d.ts";
14
+ /** Raw provider response shape (subset of Promptfoo's result object); every field is optional */
15
+ export interface ProviderResponse {
16
+ /** Raw text output */
17
+ output?: string;
18
+ /** Token usage (varies by provider); any individual counter may be absent */
19
+ tokenUsage?: {
20
+ completion?: number;
21
+ prompt?: number;
22
+ total?: number;
23
+ cached?: number;
24
+ };
25
+ /** Provider-specific metadata (e.g., Claude's toolCalls); any other keys are typed as `unknown` */
26
+ metadata?: {
27
+ toolCalls?: RawToolCall[];
28
+ [key: string]: unknown;
29
+ };
30
+ /** Response latency in milliseconds */
31
+ latencyMs?: number;
32
+ }
33
+ /** Raw tool call from a provider (pre-normalization); every field is optional */
34
+ export interface RawToolCall {
35
+ name?: string;
36
+ input?: Record<string, unknown>;
37
+ output?: unknown;
38
+ error?: string;
39
+ durationMs?: number;
40
+ /** Alternative field names used by some providers; `arguments` is a string here (presumably serialized JSON — confirm against the collector) */
41
+ function?: {
42
+ name?: string;
43
+ arguments?: string;
44
+ };
45
+ type?: string;
46
+ }
47
+ /** Options for trace collection (accepted by `collectTrace` and, as `parentOptions`, by `mergeTraces`) */
48
+ export interface TraceCollectorOptions {
49
+ /** Run ID to associate with this trace */
50
+ runId: string;
51
+ /** Task ID that produced this test case */
52
+ taskId: string;
53
+ /** Test case index within the task */
54
+ testCaseIndex: number;
55
+ /** Model under evaluation */
56
+ modelId: string;
57
+ /** Custom tool → category mappings (optional; keyed by raw tool name) */
58
+ toolCategories?: Record<string, ToolCallCategory>;
59
+ /** Maximum output size per tool call (bytes); optional — NOTE(review): the default is not visible in this declaration, confirm in the implementation */
60
+ maxOutputBytes?: number;
61
+ }
62
+ /**
63
+ * Collect a trace from a single provider response.
64
+ *
65
+ * Extracts tool calls, token usage, timing, and builds the
66
+ * chronological event log. `options` supplies the run, task, test case, and model identifiers for the trace.
67
+ */
68
+ export declare function collectTrace(response: ProviderResponse, options: TraceCollectorOptions): EvalTrace;
69
+ /**
70
+ * Merge multiple per-turn traces into a single test case trace.
71
+ *
72
+ * Each turn produces its own trace. This function combines them into
73
+ * a parent trace with per-turn spans. NOTE(review): behavior for an empty `turns` array is not visible from this declaration — confirm.
74
+ */
75
+ export declare function mergeTraces(turns: EvalTrace[], parentOptions: TraceCollectorOptions): EvalTrace;