@sanity/ailf 0.5.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377) hide show
  1. package/README.md +0 -1
  2. package/config/features.ts +23 -0
  3. package/config/models.ts +95 -0
  4. package/config/prompts.ts +16 -0
  5. package/config/rubrics.ts +225 -0
  6. package/config/schedules.ts +47 -0
  7. package/config/sinks.ts +37 -0
  8. package/config/sources.ts +21 -0
  9. package/config/thresholds.ts +61 -0
  10. package/dist/_vendor/ailf-core/config-helpers.d.ts +171 -0
  11. package/dist/_vendor/ailf-core/config-helpers.js +170 -0
  12. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  13. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  14. package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
  15. package/dist/_vendor/ailf-core/examples/index.js +25 -0
  16. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  17. package/dist/_vendor/ailf-core/index.js +5 -0
  18. package/dist/_vendor/ailf-core/ports/context.d.ts +17 -2
  19. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  20. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  21. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  22. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  23. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  24. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  25. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +8 -2
  26. package/dist/_vendor/ailf-core/schemas/eval-config.js +17 -2
  27. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +9 -3
  28. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +8 -1
  29. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -31
  30. package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -9
  31. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  32. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  33. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  34. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  35. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  36. package/dist/_vendor/ailf-core/services/index.js +2 -1
  37. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  38. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  39. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  40. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  41. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  42. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  43. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  44. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +332 -0
  46. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  47. package/dist/_vendor/ailf-core/types/index.d.ts +45 -83
  48. package/dist/_vendor/ailf-core/types/index.js +8 -1
  49. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +257 -0
  50. package/dist/_vendor/ailf-core/types/plugin-registry.js +185 -0
  51. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  52. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  53. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  54. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  55. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  56. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  57. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  58. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  59. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  60. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  61. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  62. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  63. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  64. package/dist/_vendor/ailf-shared/index.js +0 -1
  65. package/dist/adapters/api-client/build-request.js +14 -13
  66. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  67. package/dist/adapters/config-sources/file-config-adapter.js +39 -12
  68. package/dist/adapters/config-sources/index.d.ts +2 -0
  69. package/dist/adapters/config-sources/index.js +1 -0
  70. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  71. package/dist/adapters/config-sources/ts-config-loader.js +141 -0
  72. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  73. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  74. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  75. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  76. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  77. package/dist/adapters/task-sources/content-lake-task-source.js +35 -39
  78. package/dist/adapters/task-sources/index.d.ts +3 -2
  79. package/dist/adapters/task-sources/index.js +3 -2
  80. package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
  81. package/dist/adapters/task-sources/repo-schemas.js +227 -19
  82. package/dist/adapters/task-sources/repo-task-source.d.ts +16 -12
  83. package/dist/adapters/task-sources/repo-task-source.js +92 -80
  84. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  85. package/dist/adapters/task-sources/repo-validation.js +126 -5
  86. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  87. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  88. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  89. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  90. package/dist/cli.js +0 -2
  91. package/dist/commands/baseline.js +4 -1
  92. package/dist/commands/calculate-scores.js +1 -1
  93. package/dist/commands/coverage-audit.js +9 -1
  94. package/dist/commands/explain-handler.js +25 -23
  95. package/dist/commands/fetch-docs.js +3 -2
  96. package/dist/commands/generate-configs.js +1 -1
  97. package/dist/commands/init.d.ts +6 -4
  98. package/dist/commands/init.js +302 -23
  99. package/dist/commands/interactive.js +11 -7
  100. package/dist/commands/pipeline-action.d.ts +2 -0
  101. package/dist/commands/pipeline-action.js +16 -6
  102. package/dist/commands/pipeline.d.ts +1 -0
  103. package/dist/commands/pipeline.js +4 -2
  104. package/dist/commands/pr-comment.js +1 -1
  105. package/dist/commands/publish.js +2 -2
  106. package/dist/commands/readiness-report.js +13 -6
  107. package/dist/commands/validate-tasks.d.ts +2 -2
  108. package/dist/commands/validate-tasks.js +26 -15
  109. package/dist/composition-root.d.ts +13 -1
  110. package/dist/composition-root.js +99 -4
  111. package/dist/index.d.ts +41 -0
  112. package/dist/index.js +48 -0
  113. package/dist/orchestration/build-app-context.js +1 -0
  114. package/dist/orchestration/build-step-sequence.js +28 -8
  115. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  116. package/dist/orchestration/steps/fetch-docs-step.js +8 -7
  117. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  118. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  119. package/dist/orchestration/steps/generate-configs-step.js +261 -51
  120. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  121. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  122. package/dist/orchestration/steps/readiness-step.js +5 -6
  123. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  124. package/dist/orchestration/steps/run-eval-step.js +8 -7
  125. package/dist/pipeline/cache.d.ts +1 -1
  126. package/dist/pipeline/cache.js +36 -8
  127. package/dist/pipeline/calculate-scores.d.ts +2 -4
  128. package/dist/pipeline/calculate-scores.js +43 -113
  129. package/dist/pipeline/checks.js +2 -2
  130. package/dist/pipeline/compare.js +8 -8
  131. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  132. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  133. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  134. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  135. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  136. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  137. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  138. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  139. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  140. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +392 -0
  141. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  142. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  143. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  144. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  145. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  146. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +404 -0
  147. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  148. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  149. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  150. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  151. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  152. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  153. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  154. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  155. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  156. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  157. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  158. package/dist/pipeline/compiler/config-loader.js +111 -0
  159. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  160. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  161. package/dist/pipeline/compiler/hash.d.ts +11 -0
  162. package/dist/pipeline/compiler/hash.js +18 -0
  163. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  164. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  165. package/dist/pipeline/compiler/index.d.ts +29 -0
  166. package/dist/pipeline/compiler/index.js +45 -0
  167. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  168. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  169. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  170. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  171. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  172. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  173. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  174. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  175. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
  176. package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
  177. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  178. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  179. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  180. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  181. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  182. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  183. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  184. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  185. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  186. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  187. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  188. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  189. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  190. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  191. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  192. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  193. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  194. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  195. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  196. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  197. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
  198. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  199. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  200. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  201. package/dist/pipeline/compiler/mode-handlers/index.d.ts +15 -0
  202. package/dist/pipeline/compiler/mode-handlers/index.js +19 -0
  203. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  204. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  205. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  206. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  207. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  208. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  209. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  210. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  211. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  212. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  213. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  214. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  215. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  216. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  217. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  218. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  219. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  220. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  221. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  222. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  223. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  224. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  225. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  226. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  227. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
  228. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
  229. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  230. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  231. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  232. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  233. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  234. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  235. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  236. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
  237. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  238. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  239. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  240. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  241. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  242. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
  243. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  244. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
  245. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  246. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  247. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
  248. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
  249. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
  250. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  251. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  252. package/dist/pipeline/compiler/preset-loader.js +99 -0
  253. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  254. package/dist/pipeline/compiler/presets/index.js +8 -0
  255. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +42 -0
  256. package/dist/pipeline/compiler/presets/sanity-literacy.js +208 -0
  257. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  258. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  259. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  260. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  261. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  262. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  263. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  264. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  265. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  266. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  267. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  268. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  269. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  270. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  271. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  272. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  273. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  274. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  275. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  276. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  277. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  278. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  279. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  280. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  281. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  282. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  283. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  284. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  285. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  286. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  287. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  288. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  289. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  290. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  291. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  292. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  293. package/dist/pipeline/coverage-audit.d.ts +15 -5
  294. package/dist/pipeline/coverage-audit.js +41 -22
  295. package/dist/pipeline/eval-constants.d.ts +16 -6
  296. package/dist/pipeline/eval-constants.js +25 -4
  297. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  298. package/dist/pipeline/eval-fingerprint.js +8 -9
  299. package/dist/pipeline/expand-tasks.d.ts +19 -10
  300. package/dist/pipeline/expand-tasks.js +34 -28
  301. package/dist/pipeline/gap-analysis.d.ts +1 -1
  302. package/dist/pipeline/gap-analysis.js +2 -2
  303. package/dist/pipeline/generate-configs.d.ts +22 -4
  304. package/dist/pipeline/generate-configs.js +53 -24
  305. package/dist/pipeline/grader-api.d.ts +3 -3
  306. package/dist/pipeline/grader-api.js +5 -12
  307. package/dist/pipeline/grader-compare-runner.js +20 -27
  308. package/dist/pipeline/grader-comparison.d.ts +4 -8
  309. package/dist/pipeline/grader-comparison.js +11 -17
  310. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  311. package/dist/pipeline/grader-consistency-runner.js +16 -20
  312. package/dist/pipeline/grader-consistency.d.ts +6 -10
  313. package/dist/pipeline/grader-consistency.js +13 -32
  314. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  315. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  316. package/dist/pipeline/grader-sensitivity.js +10 -10
  317. package/dist/pipeline/grader-validate-runner.js +7 -5
  318. package/dist/pipeline/grader-validation.d.ts +2 -6
  319. package/dist/pipeline/grader-validation.js +14 -22
  320. package/dist/pipeline/map-request-to-config.js +7 -1
  321. package/dist/pipeline/mirror-repo-tasks.d.ts +13 -13
  322. package/dist/pipeline/mirror-repo-tasks.js +22 -21
  323. package/dist/pipeline/normalize-mode.d.ts +49 -0
  324. package/dist/pipeline/normalize-mode.js +64 -0
  325. package/dist/pipeline/plan.d.ts +5 -2
  326. package/dist/pipeline/plan.js +134 -78
  327. package/dist/pipeline/pr-comment.js +2 -0
  328. package/dist/pipeline/profile-resolution.d.ts +22 -14
  329. package/dist/pipeline/profile-resolution.js +41 -19
  330. package/dist/pipeline/provenance.d.ts +2 -2
  331. package/dist/pipeline/provenance.js +12 -17
  332. package/dist/pipeline/release-report.js +4 -4
  333. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  334. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  335. package/dist/pipeline/rubric-loader.d.ts +20 -0
  336. package/dist/pipeline/rubric-loader.js +37 -0
  337. package/dist/pipeline/validate.d.ts +4 -4
  338. package/dist/pipeline/validate.js +64 -53
  339. package/dist/schedules/loader.js +18 -8
  340. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  341. package/dist/scripts/migrate-task-mode.js +85 -0
  342. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  343. package/dist/scripts/validate-task-sources.d.ts +1 -1
  344. package/dist/scripts/validate-task-sources.js +15 -15
  345. package/dist/sinks/loader.js +5 -7
  346. package/dist/sources.d.ts +7 -7
  347. package/dist/sources.js +22 -24
  348. package/dist/webhook/dispatch.js +2 -1
  349. package/package.json +15 -4
  350. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  351. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  352. package/tasks/literacy/frameworks.task.ts +128 -0
  353. package/tasks/literacy/functions.task.ts +69 -0
  354. package/tasks/literacy/groq.task.ts +258 -0
  355. package/tasks/literacy/nextjs-live.task.ts +75 -0
  356. package/tasks/literacy/studio-setup.task.ts +131 -0
  357. package/tasks/literacy/visual-editing.task.ts +146 -0
  358. package/config/features.yaml +0 -116
  359. package/config/models.yaml +0 -116
  360. package/config/prompts.yaml +0 -75
  361. package/config/rubrics.yaml +0 -81
  362. package/config/schedules.yaml +0 -43
  363. package/config/sinks.yaml +0 -54
  364. package/config/sources.yaml +0 -51
  365. package/config/thresholds.yaml +0 -49
  366. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  367. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  368. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  369. package/dist/_vendor/ailf-tasks/index.js +0 -16
  370. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  371. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  372. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  373. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  374. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  375. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  376. package/dist/agent-observer/test-imports.d.ts +0 -7
  377. package/dist/agent-observer/test-imports.js +0 -185
@@ -0,0 +1,222 @@
1
+ /**
2
+ * Redaction pipeline — strips sensitive data from traces before storage.
3
+ *
4
+ * Applied before ANY storage (both blob and Content Lake). Configurable
5
+ * patterns handle Bearer tokens, API keys, Sanity tokens, and other
6
+ * common secret formats.
7
+ *
8
+ * Principles:
9
+ * 1. Redact before store — sensitive data never reaches storage
10
+ * 2. Configurable patterns — teams can add project-specific rules
11
+ * 3. Truncation for cost — large outputs truncated to max bytes
12
+ * 4. No PII by default — tasks shouldn't contain PII, this is a safety net
13
+ *
14
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
15
+ */
16
+ // ---------------------------------------------------------------------------
17
+ // Default rules
18
+ // ---------------------------------------------------------------------------
19
+ /** Built-in redaction rules for common secret patterns */
20
+ export const DEFAULT_REDACTION_RULES = [
21
+ {
22
+ name: "bearer_tokens",
23
+ pattern: /Bearer\s+[A-Za-z0-9._~+/=-]{10,}/g,
24
+ replacement: "Bearer [REDACTED]",
25
+ },
26
+ {
27
+ name: "sanity_tokens",
28
+ pattern: /sk[A-Za-z0-9]{30,}/g,
29
+ replacement: "[REDACTED_SANITY_TOKEN]",
30
+ },
31
+ {
32
+ name: "openai_keys",
33
+ pattern: /sk-[A-Za-z0-9_-]{20,}/g,
34
+ replacement: "[REDACTED_OPENAI_KEY]",
35
+ },
36
+ {
37
+ name: "api_key_values",
38
+ pattern: /((?:api[_-]?key|token|secret|password|authorization)\s*[:=]\s*)(["']?)(?!\[REDACTED)[^\s"']{8,}\2/gi,
39
+ replacement: "$1$2[REDACTED]$2",
40
+ },
41
+ {
42
+ name: "slack_tokens",
43
+ pattern: /xoxb-[A-Za-z0-9-]{20,}/g,
44
+ replacement: "[REDACTED_SLACK_TOKEN]",
45
+ },
46
+ {
47
+ name: "github_tokens",
48
+ pattern: /gh[ps]_[A-Za-z0-9]{30,}/g,
49
+ replacement: "[REDACTED_GITHUB_TOKEN]",
50
+ },
51
+ {
52
+ name: "anthropic_keys",
53
+ pattern: /sk-ant-[A-Za-z0-9_-]{20,}/g,
54
+ replacement: "[REDACTED_ANTHROPIC_KEY]",
55
+ },
56
+ {
57
+ name: "base64_credentials",
58
+ pattern: /Basic\s+[A-Za-z0-9+/=]{20,}/g,
59
+ replacement: "Basic [REDACTED]",
60
+ },
61
+ ];
62
+ /** Default fields to omit entirely */
63
+ const DEFAULT_OMIT_FIELDS = [
64
+ "toolCalls[*].input.headers.Authorization",
65
+ "toolCalls[*].input.headers.Cookie",
66
+ "toolCalls[*].input.headers.Set-Cookie",
67
+ ];
68
+ const DEFAULT_MAX_OUTPUT_BYTES = 10_240;
69
+ // ---------------------------------------------------------------------------
70
+ // Public API
71
+ // ---------------------------------------------------------------------------
72
+ /**
73
+ * Create a default redaction config.
74
+ *
75
+ * @param overrides - Custom rules or settings to merge
76
+ */
77
+ export function createRedactionConfig(overrides) {
78
+ return {
79
+ rules: overrides?.rules
80
+ ? [...DEFAULT_REDACTION_RULES, ...overrides.rules]
81
+ : DEFAULT_REDACTION_RULES,
82
+ omitFields: overrides?.omitFields
83
+ ? [...DEFAULT_OMIT_FIELDS, ...overrides.omitFields]
84
+ : DEFAULT_OMIT_FIELDS,
85
+ maxOutputBytes: overrides?.maxOutputBytes ?? DEFAULT_MAX_OUTPUT_BYTES,
86
+ };
87
+ }
88
+ /**
89
+ * Apply redaction to an evaluation trace.
90
+ *
91
+ * Processes tool call inputs and outputs, event data, and search terms.
92
+ * Returns a new trace (does not mutate the original).
93
+ */
94
+ export function redactTrace(trace, config) {
95
+ const cfg = config ?? createRedactionConfig();
96
+ let redactionCount = 0;
97
+ const rulesApplied = new Set();
98
+ // Deep clone to avoid mutation
99
+ const redacted = JSON.parse(JSON.stringify(trace));
100
+ // Redact tool calls
101
+ redacted.toolCalls = redacted.toolCalls.map((call) => {
102
+ const result = redactToolCall(call, cfg);
103
+ redactionCount += result.count;
104
+ for (const rule of result.rules)
105
+ rulesApplied.add(rule);
106
+ return result.call;
107
+ });
108
+ // Redact events
109
+ redacted.events = redacted.events.map((event) => {
110
+ const dataStr = JSON.stringify(event.data);
111
+ const { text, count, rules } = applyRules(dataStr, cfg.rules);
112
+ redactionCount += count;
113
+ for (const rule of rules)
114
+ rulesApplied.add(rule);
115
+ return { ...event, data: JSON.parse(text) };
116
+ });
117
+ // Redact search terms (may contain embedded secrets)
118
+ redacted.searchTerms = redacted.searchTerms.map((term) => {
119
+ const { text, count, rules } = applyRules(term, cfg.rules);
120
+ redactionCount += count;
121
+ for (const rule of rules)
122
+ rulesApplied.add(rule);
123
+ return text;
124
+ });
125
+ return {
126
+ trace: redacted,
127
+ redactionCount,
128
+ rulesApplied: [...rulesApplied],
129
+ };
130
+ }
131
+ // ---------------------------------------------------------------------------
132
+ // Tool call redaction
133
+ // ---------------------------------------------------------------------------
134
+ function redactToolCall(call, config) {
135
+ let count = 0;
136
+ const rules = [];
137
+ // Redact input
138
+ const inputStr = JSON.stringify(call.input);
139
+ const inputResult = applyRules(inputStr, config.rules);
140
+ count += inputResult.count;
141
+ rules.push(...inputResult.rules);
142
+ // Redact output
143
+ let outputStr = JSON.stringify(call.output);
144
+ // Truncate output if too large
145
+ if (outputStr.length > config.maxOutputBytes) {
146
+ outputStr = outputStr.slice(0, config.maxOutputBytes) + "... [truncated]";
147
+ }
148
+ const outputResult = applyRules(outputStr, config.rules);
149
+ count += outputResult.count;
150
+ rules.push(...outputResult.rules);
151
+ // Omit specific fields from input
152
+ let parsedInput = JSON.parse(inputResult.text);
153
+ parsedInput = omitFields(parsedInput, config.omitFields, "input");
154
+ return {
155
+ call: {
156
+ ...call,
157
+ input: parsedInput,
158
+ output: parseJsonSafe(outputResult.text),
159
+ },
160
+ count,
161
+ rules,
162
+ };
163
+ }
164
+ // ---------------------------------------------------------------------------
165
+ // Rule application
166
+ // ---------------------------------------------------------------------------
167
+ function applyRules(text, rules) {
168
+ let result = text;
169
+ let count = 0;
170
+ const appliedRules = [];
171
+ for (const rule of rules) {
172
+ // Reset lastIndex before match() — global regexes are stateful
173
+ rule.pattern.lastIndex = 0;
174
+ const matches = result.match(rule.pattern);
175
+ if (matches && matches.length > 0) {
176
+ count += matches.length;
177
+ appliedRules.push(rule.name);
178
+ // Reset again before replace() — match() may leave lastIndex dirty
179
+ rule.pattern.lastIndex = 0;
180
+ result = result.replace(rule.pattern, rule.replacement);
181
+ }
182
+ }
183
+ return { text: result, count, rules: appliedRules };
184
+ }
185
+ // ---------------------------------------------------------------------------
186
+ // Field omission
187
+ // ---------------------------------------------------------------------------
188
+ function omitFields(obj, patterns, context) {
189
+ for (const pattern of patterns) {
190
+ // Simple field path handling (not full JSONPath)
191
+ // Handles: "toolCalls[*].input.headers.Authorization" when context is "input"
192
+ if (pattern.includes(context)) {
193
+ const parts = pattern.split(".");
194
+ const fieldIndex = parts.indexOf(context);
195
+ if (fieldIndex >= 0) {
196
+ const remainingPath = parts.slice(fieldIndex + 1);
197
+ deleteNestedField(obj, remainingPath);
198
+ }
199
+ }
200
+ }
201
+ return obj;
202
+ }
203
+ function deleteNestedField(obj, path) {
204
+ if (path.length === 0)
205
+ return;
206
+ if (path.length === 1) {
207
+ delete obj[path[0]];
208
+ return;
209
+ }
210
+ const child = obj[path[0]];
211
+ if (child && typeof child === "object") {
212
+ deleteNestedField(child, path.slice(1));
213
+ }
214
+ }
/**
 * Best-effort JSON decode.
 *
 * @param text - Candidate JSON text
 * @returns The decoded value, or `text` itself when it is not valid JSON
 */
function parseJsonSafe(text) {
    let decoded = text;
    try {
        decoded = JSON.parse(text);
    }
    catch {
        // Not valid JSON: hand the raw text back unchanged.
    }
    return decoded;
}
@@ -0,0 +1,32 @@
1
+ /**
2
+ * Tool call classification — maps raw provider tool names to categories.
3
+ *
4
+ * Raw tool names from providers are noisy and inconsistent (`WebSearch` vs
5
+ * `web_search` vs `Browser.search`). This module normalizes every tool call
6
+ * into one of six standard categories for cross-model comparison.
7
+ *
8
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
9
+ */
10
+ import type { ToolCallCategory } from "../../../_vendor/ailf-core/index.d.ts";
11
+ /**
12
+ * Classify a tool call by its raw name.
13
+ *
14
+ * Resolution order (the first step that produces a category wins):
15
+ * 1. Exact, case-sensitive match in custom overrides (if provided)
16
+ * 2. Exact, case-sensitive match in default tool categories
17
+ * 3. Heuristic pattern matching on the name (case-insensitive, first matching pattern wins)
18
+ * 4. Falls back to "execute" (safest default for unknown tools)
19
+ *
20
+ * @param name - Raw tool name from the provider
21
+ * @param customMappings - Optional custom tool → category overrides
22
+ * @returns The classified category; "execute" when no step above matches
23
+ */
24
+ export declare function classifyToolCall(name: string, customMappings?: Record<string, ToolCallCategory>): ToolCallCategory;
25
+ /**
26
+ * Classify multiple tool calls, returning the category for each.
27
+ * Also tracks unrecognized names for the caller to log warnings.
+ *
+ * @param names - Raw tool names, classified in order
+ * @param customMappings - Optional custom tool → category overrides
+ * @returns `categories`, with one entry per input name in the same order, and
+ *   `unrecognized`, the names found in neither the custom overrides nor the
+ *   default table (they were classified by heuristic or by the fallback)
28
+ */
29
+ export declare function classifyToolCalls(names: string[], customMappings?: Record<string, ToolCallCategory>): {
30
+ categories: ToolCallCategory[];
31
+ unrecognized: string[];
32
+ };
@@ -0,0 +1,120 @@
1
+ /**
2
+ * Tool call classification — maps raw provider tool names to categories.
3
+ *
4
+ * Raw tool names from providers are noisy and inconsistent (`WebSearch` vs
5
+ * `web_search` vs `Browser.search`). This module normalizes every tool call
6
+ * into one of six standard categories for cross-model comparison.
7
+ *
8
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
9
+ */
10
+ // ---------------------------------------------------------------------------
11
+ // Default tool name → category mapping
12
+ // ---------------------------------------------------------------------------
13
+ const DEFAULT_TOOL_CATEGORIES = {
14
+ // Search tools
15
+ Grep: "search",
16
+ WebSearch: "search",
17
+ grep: "search",
18
+ search: "search",
19
+ semantic_search: "search",
20
+ web_search: "search",
21
+ // Read tools
22
+ Glob: "read",
23
+ Read: "read",
24
+ WebFetch: "read",
25
+ cat: "read",
26
+ curl: "read",
27
+ file_read: "read",
28
+ read_file: "read",
29
+ web_fetch: "read",
30
+ // Write tools
31
+ Edit: "write",
32
+ FileEdit: "write",
33
+ Write: "write",
34
+ file_write: "write",
35
+ patch: "write",
36
+ write_file: "write",
37
+ // Execute tools
38
+ Bash: "execute",
39
+ RunCode: "execute",
40
+ bash: "execute",
41
+ exec: "execute",
42
+ python: "execute",
43
+ run_code: "execute",
44
+ shell: "execute",
45
+ // Navigate tools
46
+ "Browser.navigate": "navigate",
47
+ FollowLink: "navigate",
48
+ browse: "navigate",
49
+ follow_link: "navigate",
50
+ navigate: "navigate",
51
+ open_url: "navigate",
52
+ // Communicate tools
53
+ AskUser: "communicate",
54
+ TodoRead: "communicate",
55
+ TodoWrite: "communicate",
56
+ ask_user: "communicate",
57
+ submit_response: "communicate",
58
+ };
59
+ // ---------------------------------------------------------------------------
60
+ // Heuristic patterns (fallback when name not in lookup table)
61
+ // ---------------------------------------------------------------------------
62
+ const HEURISTIC_PATTERNS = [
63
+ [/search|find|query|lookup|grep/i, "search"],
64
+ [/read|fetch|get|load|cat|view/i, "read"],
65
+ [/write|create|edit|update|patch|save|put|post/i, "write"],
66
+ [/exec|run|bash|shell|python|code|command/i, "execute"],
67
+ [/navigate|browse|open|follow|link|url/i, "navigate"],
68
+ [/ask|user|chat|message|submit|todo|response/i, "communicate"],
69
+ ];
70
+ // ---------------------------------------------------------------------------
71
+ // Public API
72
+ // ---------------------------------------------------------------------------
73
/**
 * Classify a tool call by its raw name.
 *
 * Resolution order:
 * 1. Exact match in custom overrides (if provided)
 * 2. Exact match in default tool categories
 * 3. Heuristic pattern matching on the name (first matching pattern wins)
 * 4. Falls back to "execute" (safest default for unknown tools)
 *
 * Table lookups consider own properties only, so a tool whose name collides
 * with an `Object.prototype` member ("constructor", "toString", ...) is not
 * mistaken for a table entry and continues to the heuristic step.
 *
 * @param name - Raw tool name from the provider
 * @param customMappings - Optional custom tool → category overrides
 * @returns The classified category
 */
export function classifyToolCall(name, customMappings) {
    const owns = Object.prototype.hasOwnProperty;
    // 1. Custom overrides
    if (customMappings != null && owns.call(customMappings, name) && customMappings[name]) {
        return customMappings[name];
    }
    // 2. Default lookup
    if (owns.call(DEFAULT_TOOL_CATEGORIES, name) && DEFAULT_TOOL_CATEGORIES[name]) {
        return DEFAULT_TOOL_CATEGORIES[name];
    }
    // 3. Heuristic matching
    for (const [pattern, category] of HEURISTIC_PATTERNS) {
        if (pattern.test(name)) {
            return category;
        }
    }
    // 4. Unknown → execute (safest default)
    return "execute";
}
104
/**
 * Classify multiple tool calls, returning the category for each.
 * Also tracks unrecognized names for the caller to log warnings.
 *
 * A name counts as recognized only when it is an own key of the custom
 * overrides or of the default table; inherited `Object.prototype` members
 * ("constructor", "toString", ...) do not count.
 *
 * @param names - Raw tool names, classified in order
 * @param customMappings - Optional custom tool → category overrides
 * @returns `categories`, with one entry per input name in the same order, and
 *   `unrecognized`, the names that needed the heuristic or default fallback
 */
export function classifyToolCalls(names, customMappings) {
    const owns = Object.prototype.hasOwnProperty;
    const categories = [];
    const unrecognized = [];
    for (const name of names) {
        categories.push(classifyToolCall(name, customMappings));
        // Track names that required heuristic or default fallback
        const inDefaults = owns.call(DEFAULT_TOOL_CATEGORIES, name) && Boolean(DEFAULT_TOOL_CATEGORIES[name]);
        const inCustom = customMappings != null &&
            owns.call(customMappings, name) &&
            Boolean(customMappings[name]);
        if (!inDefaults && !inCustom) {
            unrecognized.push(name);
        }
    }
    return { categories, unrecognized };
}
@@ -0,0 +1,75 @@
1
+ /**
2
+ * TraceCollector — extracts structured trace data from provider responses.
3
+ *
4
+ * Parses tool calls, token usage, and timing data from Promptfoo result
5
+ * objects and normalizes them into the canonical `EvalTrace` shape.
6
+ *
7
+ * Works via inline extraction — parsing provider response metadata
8
+ * directly, without requiring additional infrastructure.
9
+ *
10
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
11
+ * @see packages/core/src/types/trace.ts — EvalTrace types
12
+ */
13
+ import type { EvalTrace, ToolCallCategory } from "../../../_vendor/ailf-core/index.d.ts";
14
+ /** Raw provider response shape (subset of Promptfoo's result object); every field is optional */
15
+ export interface ProviderResponse {
16
+ /** Raw text output */
17
+ output?: string;
18
+ /** Token usage counters (varies by provider, so each counter is optional) */
19
+ tokenUsage?: {
20
+ completion?: number;
21
+ prompt?: number;
22
+ total?: number;
23
+ cached?: number;
24
+ };
25
+ /** Provider-specific metadata (e.g., Claude's toolCalls); only `toolCalls` is typed, any other key is `unknown` */
26
+ metadata?: {
27
+ toolCalls?: RawToolCall[];
28
+ [key: string]: unknown;
29
+ };
30
+ /** Response latency in milliseconds */
31
+ latencyMs?: number;
32
+ }
33
+ /** Raw tool call from a provider (pre-normalization); every field is optional because provider shapes differ */
34
+ export interface RawToolCall {
35
+ name?: string;
36
+ input?: Record<string, unknown>;
37
+ output?: unknown;
38
+ error?: string;
39
+ durationMs?: number;
40
+ /** Alternative field names used by some providers: the name and arguments nested under `function`, with `arguments` as a string (presumably serialized JSON; confirm against the collector implementation) */
41
+ function?: {
42
+ name?: string;
43
+ arguments?: string;
44
+ };
45
+ type?: string;
46
+ }
47
+ /** Options for trace collection: four required identifiers plus two optional tuning fields */
48
+ export interface TraceCollectorOptions {
49
+ /** Run ID to associate with this trace */
50
+ runId: string;
51
+ /** Task ID that produced this test case */
52
+ taskId: string;
53
+ /** Test case index within the task */
54
+ testCaseIndex: number;
55
+ /** Model under evaluation */
56
+ modelId: string;
57
+ /** Custom tool → category mappings (optional; same `Record<string, ToolCallCategory>` shape that `classifyToolCall` accepts as overrides) */
58
+ toolCategories?: Record<string, ToolCallCategory>;
59
+ /** Maximum output size per tool call (bytes); optional. NOTE(review): the default and what happens to larger outputs are not visible here; confirm in the implementation */
60
+ maxOutputBytes?: number;
61
+ }
62
+ /**
63
+ * Collect a trace from a single provider response.
64
+ *
65
+ * Extracts tool calls, token usage, timing, and builds the
66
+ * chronological event log.
+ *
+ * @param response - Raw provider response to extract from
+ * @param options - Identifiers (run, task, test case index, model) and optional collection settings
+ * @returns The trace in the canonical `EvalTrace` shape
67
+ */
68
+ export declare function collectTrace(response: ProviderResponse, options: TraceCollectorOptions): EvalTrace;
69
+ /**
70
+ * Merge multiple per-turn traces into a single test case trace.
71
+ *
72
+ * Each turn produces its own trace. This function combines them into
73
+ * a parent trace with per-turn spans.
+ *
+ * @param turns - Per-turn traces to combine (presumably in turn order; confirm against the implementation)
+ * @param parentOptions - Identifiers and settings for the combined parent trace
+ * @returns A single `EvalTrace` covering all turns
74
+ */
75
+ export declare function mergeTraces(turns: EvalTrace[], parentOptions: TraceCollectorOptions): EvalTrace;