@sanity/ailf 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (444) hide show
  1. package/canonical/grader-references/README.md +2 -2
  2. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  3. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  4. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  5. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  6. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  7. package/config/features.ts +1 -1
  8. package/config/models.ts +28 -23
  9. package/config/sources.ts +1 -1
  10. package/config/thresholds.ts +1 -1
  11. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  13. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  17. package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
  18. package/dist/_vendor/ailf-core/config-helpers.js +29 -0
  19. package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
  20. package/dist/_vendor/ailf-core/examples/index.js +208 -114
  21. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  22. package/dist/_vendor/ailf-core/index.js +1 -0
  23. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  25. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  27. package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
  28. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  29. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  30. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  31. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  32. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  33. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
  34. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
  35. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  36. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  37. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  38. package/dist/_vendor/ailf-core/services/index.js +1 -1
  39. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  40. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
  41. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  42. package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
  43. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
  44. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  45. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  46. package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
  47. package/dist/_vendor/ailf-tasks/cli.js +61 -0
  48. package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
  49. package/dist/_vendor/ailf-tasks/index.js +16 -0
  50. package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
  51. package/dist/_vendor/ailf-tasks/parser.js +73 -0
  52. package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
  53. package/dist/_vendor/ailf-tasks/schemas.js +180 -0
  54. package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
  55. package/dist/_vendor/ailf-tasks/validation.js +162 -0
  56. package/dist/adapters/api-client/remediation.js +2 -2
  57. package/dist/adapters/config-sources/file-config-adapter.js +6 -1
  58. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  59. package/dist/adapters/index.d.ts +0 -1
  60. package/dist/adapters/index.js +0 -1
  61. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  62. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  63. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  64. package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
  65. package/dist/adapters/task-sources/index.d.ts +1 -2
  66. package/dist/adapters/task-sources/index.js +1 -2
  67. package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
  68. package/dist/adapters/task-sources/repo-schemas.js +2 -2
  69. package/dist/adapters/task-sources/repo-task-source.js +1 -1
  70. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  71. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
  73. package/dist/adapters/task-sources/task-file-loader.js +20 -6
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/explain-handler.d.ts +1 -1
  95. package/dist/commands/explain-handler.js +37 -8
  96. package/dist/commands/fetch-docs.js +1 -0
  97. package/dist/commands/generate-configs.d.ts +3 -3
  98. package/dist/commands/generate-configs.js +20 -8
  99. package/dist/commands/init.d.ts +2 -3
  100. package/dist/commands/init.js +56 -170
  101. package/dist/commands/pipeline-action.d.ts +7 -1
  102. package/dist/commands/pipeline-action.js +43 -19
  103. package/dist/commands/pipeline.d.ts +6 -1
  104. package/dist/commands/pipeline.js +7 -2
  105. package/dist/commands/pr-comment.js +1 -0
  106. package/dist/commands/publish.js +1 -0
  107. package/dist/commands/shared/help.js +2 -2
  108. package/dist/commands/update-quality-scores.d.ts +5 -0
  109. package/dist/commands/update-quality-scores.js +20 -0
  110. package/dist/composition-root.d.ts +2 -3
  111. package/dist/composition-root.js +27 -14
  112. package/dist/config/features.ts +23 -0
  113. package/dist/config/models.ts +100 -0
  114. package/dist/config/prompts.ts +16 -0
  115. package/dist/config/rubrics.ts +225 -0
  116. package/dist/config/schedules.ts +47 -0
  117. package/dist/config/sinks.ts +37 -0
  118. package/dist/config/sources.ts +21 -0
  119. package/dist/config/thresholds.ts +61 -0
  120. package/dist/lib/agent-behavior-report.d.ts +8 -0
  121. package/dist/lib/agent-behavior-report.js +185 -0
  122. package/dist/lib/baseline.d.ts +19 -0
  123. package/dist/lib/baseline.js +153 -0
  124. package/dist/lib/calculate-scores.d.ts +23 -0
  125. package/dist/lib/calculate-scores.js +42 -0
  126. package/dist/lib/compare.d.ts +18 -0
  127. package/dist/lib/compare.js +170 -0
  128. package/dist/lib/coverage-audit.d.ts +4 -0
  129. package/dist/lib/coverage-audit.js +42 -0
  130. package/dist/lib/discovery-report.d.ts +13 -0
  131. package/dist/lib/discovery-report.js +57 -0
  132. package/dist/lib/fetch-docs.d.ts +30 -0
  133. package/dist/lib/fetch-docs.js +171 -0
  134. package/dist/lib/generate-configs.d.ts +25 -0
  135. package/dist/lib/generate-configs.js +42 -0
  136. package/dist/lib/grader-api.d.ts +21 -0
  137. package/dist/lib/grader-api.js +34 -0
  138. package/dist/lib/grader-compare.d.ts +19 -0
  139. package/dist/lib/grader-compare.js +91 -0
  140. package/dist/lib/grader-consistency.d.ts +27 -0
  141. package/dist/lib/grader-consistency.js +79 -0
  142. package/dist/lib/grader-sensitivity.d.ts +19 -0
  143. package/dist/lib/grader-sensitivity.js +75 -0
  144. package/dist/lib/grader-validate.d.ts +19 -0
  145. package/dist/lib/grader-validate.js +78 -0
  146. package/dist/lib/measure-retrieval.d.ts +14 -0
  147. package/dist/lib/measure-retrieval.js +71 -0
  148. package/dist/lib/pr-comment.d.ts +16 -0
  149. package/dist/lib/pr-comment.js +28 -0
  150. package/dist/lib/readiness-report.d.ts +13 -0
  151. package/dist/lib/readiness-report.js +108 -0
  152. package/dist/lib/webhook-server.d.ts +11 -0
  153. package/dist/lib/webhook-server.js +24 -0
  154. package/dist/lib/weekly-digest.d.ts +24 -0
  155. package/dist/lib/weekly-digest.js +148 -0
  156. package/dist/orchestration/build-app-context.js +13 -0
  157. package/dist/orchestration/cache-context.d.ts +23 -0
  158. package/dist/orchestration/cache-context.js +43 -0
  159. package/dist/orchestration/env-bridge.d.ts +21 -0
  160. package/dist/orchestration/env-bridge.js +66 -0
  161. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  162. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  163. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  164. package/dist/orchestration/step-runner.js +5 -1
  165. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  166. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  167. package/dist/orchestration/steps/callback-step.js +10 -1
  168. package/dist/orchestration/steps/compare-step.js +6 -3
  169. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  170. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  171. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  172. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  173. package/dist/orchestration/steps/fetch-docs-step.js +30 -16
  174. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  175. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  176. package/dist/orchestration/steps/generate-configs-step.js +50 -15
  177. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  178. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  179. package/dist/orchestration/steps/publish-report-step.js +19 -0
  180. package/dist/orchestration/steps/readiness-step.js +8 -3
  181. package/dist/orchestration/steps/report-step.js +17 -4
  182. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  183. package/dist/orchestration/steps/run-eval-step.js +52 -32
  184. package/dist/pipeline/agent-behavior-report.js +6 -0
  185. package/dist/pipeline/attribution.d.ts +1 -1
  186. package/dist/pipeline/attribution.js +1 -1
  187. package/dist/pipeline/cache.js +29 -15
  188. package/dist/pipeline/calculate-scores.d.ts +2 -0
  189. package/dist/pipeline/calculate-scores.js +70 -33
  190. package/dist/pipeline/checks.d.ts +8 -3
  191. package/dist/pipeline/checks.js +23 -3
  192. package/dist/pipeline/chronic-failures.d.ts +55 -0
  193. package/dist/pipeline/chronic-failures.js +110 -0
  194. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
  195. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  196. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  197. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  198. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  199. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  200. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  201. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  202. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  203. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  204. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  205. package/dist/pipeline/compiler/config-loader.js +42 -2
  206. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  207. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  208. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  209. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  210. package/dist/pipeline/compiler/index.d.ts +2 -5
  211. package/dist/pipeline/compiler/index.js +2 -5
  212. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  213. package/dist/pipeline/compiler/literacy-bridge.js +1 -1
  214. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
  215. package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
  216. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
  217. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
  218. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
  219. package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
  220. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
  221. package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
  222. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
  223. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
  224. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  225. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  226. package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
  227. package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
  228. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
  229. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
  230. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  231. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  232. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
  233. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
  234. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  235. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  236. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  237. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
  238. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
  239. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
  240. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
  241. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
  242. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
  243. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
  244. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
  245. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
  246. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
  247. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
  248. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
  249. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
  250. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  251. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  252. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
  253. package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
  254. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  255. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  256. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  257. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  258. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  259. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  260. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  261. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  262. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  263. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  264. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  265. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  266. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  267. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  268. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  269. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  270. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  271. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  272. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  273. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  274. package/dist/pipeline/compiler/task-bridge.js +92 -0
  275. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  276. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  277. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  278. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  279. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  280. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  281. package/dist/pipeline/coverage-audit.d.ts +1 -1
  282. package/dist/pipeline/coverage-audit.js +1 -1
  283. package/dist/pipeline/degradations.d.ts +1 -1
  284. package/dist/pipeline/degradations.js +1 -1
  285. package/dist/pipeline/failure-modes.d.ts +1 -1
  286. package/dist/pipeline/failure-modes.js +13 -1
  287. package/dist/pipeline/gap-analysis.d.ts +1 -1
  288. package/dist/pipeline/gap-analysis.js +3 -1
  289. package/dist/pipeline/generate-configs.d.ts +2 -2
  290. package/dist/pipeline/generate-configs.js +15 -8
  291. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  292. package/dist/pipeline/grader-compare-runner.js +7 -1
  293. package/dist/pipeline/grader-comparison.d.ts +1 -1
  294. package/dist/pipeline/grader-comparison.js +1 -1
  295. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  296. package/dist/pipeline/grader-consistency-runner.js +7 -1
  297. package/dist/pipeline/grader-consistency.d.ts +1 -1
  298. package/dist/pipeline/grader-consistency.js +1 -1
  299. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  300. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  301. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  302. package/dist/pipeline/grader-sensitivity.js +1 -1
  303. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  304. package/dist/pipeline/grader-validate-runner.js +2 -2
  305. package/dist/pipeline/grader-validation.d.ts +1 -1
  306. package/dist/pipeline/grader-validation.js +1 -1
  307. package/dist/pipeline/map-request-to-config.js +15 -2
  308. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  309. package/dist/pipeline/mirror-repo-tasks.js +1 -1
  310. package/dist/pipeline/plan-format.d.ts +1 -1
  311. package/dist/pipeline/plan-format.js +1 -1
  312. package/dist/pipeline/plan.d.ts +1 -1
  313. package/dist/pipeline/plan.js +67 -29
  314. package/dist/pipeline/probe.d.ts +1 -1
  315. package/dist/pipeline/probe.js +1 -1
  316. package/dist/pipeline/readiness-report.d.ts +2 -2
  317. package/dist/pipeline/readiness-report.js +2 -2
  318. package/dist/pipeline/release-classification.d.ts +1 -1
  319. package/dist/pipeline/release-classification.js +1 -1
  320. package/dist/pipeline/release-report.d.ts +1 -1
  321. package/dist/pipeline/release-report.js +1 -1
  322. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  323. package/dist/pipeline/repo-eval-comment.js +1 -1
  324. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  325. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  326. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  327. package/dist/pipeline/resolve-mappings.js +44 -44
  328. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  329. package/dist/pipeline/retrieval-metrics.js +28 -20
  330. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  331. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  332. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  333. package/dist/pipeline/steps/compare-step.js +90 -0
  334. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  335. package/dist/pipeline/steps/eval-step.js +347 -0
  336. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  337. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  338. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  339. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  340. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  341. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  342. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  343. package/dist/pipeline/steps/publish-report-step.js +243 -0
  344. package/dist/pipeline/steps/report-step.d.ts +13 -0
  345. package/dist/pipeline/steps/report-step.js +56 -0
  346. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  347. package/dist/pipeline/steps/update-scores-step.js +42 -0
  348. package/dist/pipeline/targeted-loo.d.ts +1 -1
  349. package/dist/pipeline/targeted-loo.js +1 -1
  350. package/dist/pipeline/thresholds.d.ts +1 -1
  351. package/dist/pipeline/thresholds.js +1 -1
  352. package/dist/pipeline/validate.js +13 -0
  353. package/dist/report-store.d.ts +17 -0
  354. package/dist/report-store.js +24 -0
  355. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  356. package/dist/scripts/agent-behavior-report.js +315 -0
  357. package/dist/scripts/baseline.d.ts +43 -0
  358. package/dist/scripts/baseline.js +267 -0
  359. package/dist/scripts/calculate-scores.d.ts +166 -0
  360. package/dist/scripts/calculate-scores.js +1296 -0
  361. package/dist/scripts/compare.d.ts +22 -0
  362. package/dist/scripts/compare.js +334 -0
  363. package/dist/scripts/coverage-audit.d.ts +44 -0
  364. package/dist/scripts/coverage-audit.js +209 -0
  365. package/dist/scripts/debug-eval.d.ts +19 -0
  366. package/dist/scripts/debug-eval.js +73 -0
  367. package/dist/scripts/discovery-report.d.ts +58 -0
  368. package/dist/scripts/discovery-report.js +250 -0
  369. package/dist/scripts/fetch-docs.d.ts +35 -0
  370. package/dist/scripts/fetch-docs.js +472 -0
  371. package/dist/scripts/generate-configs.d.ts +66 -0
  372. package/dist/scripts/generate-configs.js +459 -0
  373. package/dist/scripts/grader-api.d.ts +27 -0
  374. package/dist/scripts/grader-api.js +206 -0
  375. package/dist/scripts/grader-compare.d.ts +22 -0
  376. package/dist/scripts/grader-compare.js +368 -0
  377. package/dist/scripts/grader-consistency.d.ts +20 -0
  378. package/dist/scripts/grader-consistency.js +313 -0
  379. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  380. package/dist/scripts/grader-sensitivity.js +354 -0
  381. package/dist/scripts/grader-validate.d.ts +19 -0
  382. package/dist/scripts/grader-validate.js +267 -0
  383. package/dist/scripts/measure-retrieval.d.ts +10 -0
  384. package/dist/scripts/measure-retrieval.js +145 -0
  385. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  386. package/dist/scripts/migrate-task-mode.js +1 -1
  387. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  388. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  389. package/dist/scripts/pipeline.d.ts +76 -0
  390. package/dist/scripts/pipeline.js +1031 -0
  391. package/dist/scripts/pr-comment.d.ts +10 -0
  392. package/dist/scripts/pr-comment.js +510 -0
  393. package/dist/scripts/readiness-report.d.ts +88 -0
  394. package/dist/scripts/readiness-report.js +342 -0
  395. package/dist/scripts/update-quality-scores.d.ts +15 -0
  396. package/dist/scripts/update-quality-scores.js +184 -0
  397. package/dist/scripts/validate-task-sources.d.ts +1 -1
  398. package/dist/scripts/validate-task-sources.js +1 -1
  399. package/dist/scripts/validate.d.ts +13 -0
  400. package/dist/scripts/validate.js +79 -0
  401. package/dist/scripts/webhook-server.d.ts +26 -0
  402. package/dist/scripts/webhook-server.js +147 -0
  403. package/dist/scripts/weekly-digest.d.ts +24 -0
  404. package/dist/scripts/weekly-digest.js +144 -0
  405. package/dist/sinks/format-slack.d.ts +64 -0
  406. package/dist/sinks/format-slack.js +306 -0
  407. package/dist/sinks/slack-sink.d.ts +27 -0
  408. package/dist/sinks/slack-sink.js +78 -0
  409. package/dist/sinks/types.d.ts +1 -1
  410. package/dist/sinks/types.js +1 -1
  411. package/dist/sinks/webhook-sink.d.ts +19 -0
  412. package/dist/sinks/webhook-sink.js +50 -0
  413. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  414. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  415. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  416. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  417. package/dist/tasks/literacy/functions.task.ts +70 -0
  418. package/dist/tasks/literacy/groq.task.ts +259 -0
  419. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  420. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  421. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  422. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  423. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  424. package/package.json +25 -25
  425. package/tasks/.expanded.agentic.yaml +280 -0
  426. package/tasks/.expanded.yaml +565 -0
  427. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  428. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  429. package/tasks/literacy/content-lake.task.ts +181 -0
  430. package/tasks/literacy/frameworks.task.ts +1 -0
  431. package/tasks/literacy/functions.task.ts +1 -0
  432. package/tasks/literacy/groq.task.ts +1 -0
  433. package/tasks/literacy/image-handling.task.ts +95 -0
  434. package/tasks/literacy/nextjs-live.task.ts +2 -1
  435. package/tasks/literacy/portable-text.task.ts +169 -0
  436. package/tasks/literacy/studio-setup.task.ts +5 -2
  437. package/tasks/literacy/visual-editing.task.ts +1 -0
  438. package/LICENSE +0 -21
  439. package/tasks/frameworks.yaml +0 -98
  440. package/tasks/functions.yaml +0 -51
  441. package/tasks/groq.yaml +0 -216
  442. package/tasks/nextjs-live.yaml +0 -62
  443. package/tasks/studio-setup.yaml +0 -111
  444. package/tasks/visual-editing.yaml +0 -120
@@ -0,0 +1,509 @@
1
+ /**
2
+ * tool-loop-openai.test.ts — Tests for the OpenAI MCP tool loop.
3
+ *
4
+ * Tests both API variants (Chat Completions and Responses) with mocked
5
+ * fetch to verify tool calling, error handling, token tracking, and
6
+ * round exhaustion.
7
+ *
8
+ * Run: npx tsx --test src/pipeline/compiler/__tests__/tool-loop-openai.test.ts
9
+ */
10
+ import assert from "node:assert/strict";
11
+ import { afterEach, beforeEach, describe, it } from "node:test";
12
+ import { runOpenAIToolLoop } from "../mode-handlers/mcp-tool-provider/tool-loop-openai.js";
13
+ // ---------------------------------------------------------------------------
14
+ // Test fixtures
15
+ // ---------------------------------------------------------------------------
16
/**
 * Tool definitions used as the default `tools` value by `baseConfig`.
 *
 * - `query_documents`: takes `query`, `projectId`, `dataset`; only `query` is required.
 * - `get_schema`: takes an optional `projectId`.
 */
const TEST_TOOLS = [
  {
    name: "query_documents",
    description: "Query Sanity documents using GROQ",
    inputSchema: {
      type: "object",
      properties: {
        query: { type: "string" },
        projectId: { type: "string" },
        dataset: { type: "string" },
      },
      required: ["query"],
    },
  },
  {
    name: "get_schema",
    description: "Get the Sanity schema for a project",
    inputSchema: {
      type: "object",
      properties: {
        projectId: { type: "string" },
      },
    },
  },
];
39
/**
 * Build a stub `callTool` function that serves canned tool output.
 *
 * @param {Record<string, unknown>} [results] - Canned output keyed by tool name.
 * @returns {(name: string, _args?: unknown) => Promise<{ content: unknown }>}
 *   Async stub resolving to `{ content }`: the canned value when `results`
 *   has its own non-nullish entry for `name`, otherwise `Result from <name>`.
 */
function makeCallTool(results = {}) {
  return async (name, _args) => {
    // Own-property + nullish check rather than truthiness: an intentionally
    // empty canned result (e.g. "") must be returned as-is, and inherited
    // keys such as "constructor" or "toString" must not count as canned output.
    if (Object.hasOwn(results, name) && results[name] != null) {
      return { content: results[name] };
    }
    return { content: `Result from ${name}` };
  };
}
47
/**
 * Build a stub `callTool` function that always fails.
 *
 * @param {string} errorMessage - Message for the `Error` the stub rejects with.
 * @returns {() => Promise<never>} Function whose returned promise rejects
 *   with a fresh `Error(errorMessage)` on every call.
 */
function makeThrowingCallTool(errorMessage) {
  return () => Promise.reject(new Error(errorMessage));
}
52
/**
 * Produce a tool-loop config with test defaults, optionally overridden.
 *
 * @param {object} [overrides] - Properties that replace or extend the defaults.
 * @returns {object} A new config object; keys in `overrides` win over defaults.
 */
function baseConfig(overrides) {
  const defaults = {
    prompt: "Query all documents from project abc123",
    tools: TEST_TOOLS,
    // A fresh stub per config, so configs do not share a callTool instance.
    callTool: makeCallTool(),
    maxToolRounds: 5,
    model: "gpt-5.2",
    temperature: 0.2,
    maxTokens: 4096,
    apiKey: "test-api-key",
  };
  return { ...defaults, ...overrides };
}
65
+ // ---------------------------------------------------------------------------
66
+ // Chat Completions API response builders
67
+ // ---------------------------------------------------------------------------
68
/**
 * Build a Chat Completions-style response body.
 *
 * @param {object} opts
 * @param {string} [opts.content] - Assistant text; `null` in the message when omitted.
 * @param {Array<{id: string, name: string, arguments: string}>} [opts.toolCalls]
 *   Tool calls to attach to the message as `tool_calls` entries of type "function".
 * @param {string} [opts.finishReason] - Defaults to "tool_calls" when
 *   `toolCalls` is given, otherwise "stop".
 * @param {number} [opts.promptTokens] - Defaults to 100.
 * @param {number} [opts.completionTokens] - Defaults to 50.
 * @returns {object} Body with a single entry in `choices` and a `usage` section.
 */
function chatResponse(opts) {
  const { content, toolCalls, finishReason, promptTokens, completionTokens } = opts;
  const promptCount = promptTokens ?? 100;
  const completionCount = completionTokens ?? 50;

  const message = { role: "assistant", content: content ?? null };
  if (toolCalls) {
    message.tool_calls = toolCalls.map(({ id, name, arguments: args }) => ({
      id,
      type: "function",
      function: { name, arguments: args },
    }));
  }

  const impliedFinish = toolCalls ? "tool_calls" : "stop";
  const choice = { message, finish_reason: finishReason ?? impliedFinish };
  const usage = {
    prompt_tokens: promptCount,
    completion_tokens: completionCount,
    total_tokens: promptCount + completionCount,
  };
  return { choices: [choice], usage };
}
94
/**
 * Build a Chat Completions-style error body.
 *
 * @param {string} message - Error text to embed.
 * @returns {{ error: { message: string } }}
 */
function chatErrorResponse(message) {
  const error = { message };
  return { error };
}
97
+ // ---------------------------------------------------------------------------
98
+ // Responses API response builders
99
+ // ---------------------------------------------------------------------------
100
/**
 * Build a Responses API-style response body.
 *
 * @param {object} opts
 * @param {Array<{callId: string, name: string, arguments: string}>} [opts.functionCalls]
 *   Emitted first in `output` as `function_call` items with id `fc_<callId>`.
 * @param {string} [opts.text] - When not `undefined`, appended as an assistant
 *   `message` item holding a single `output_text` part.
 * @param {string} [opts.id] - Response id; defaults to "resp_001".
 * @param {number} [opts.inputTokens] - Defaults to 100.
 * @param {number} [opts.outputTokens] - Defaults to 50.
 * @returns {object} Body with `id`, `status: "completed"`, `output`, and `usage`.
 */
function responsesResponse(opts) {
  const callItems = opts.functionCalls
    ? [...opts.functionCalls].map((call) => ({
        type: "function_call",
        id: `fc_${call.callId}`,
        call_id: call.callId,
        name: call.name,
        arguments: call.arguments,
      }))
    : [];

  // An empty string is still a message; only `undefined` suppresses it.
  const messageItems =
    opts.text !== undefined
      ? [
          {
            type: "message",
            id: "msg_001",
            role: "assistant",
            content: [{ type: "output_text", text: opts.text }],
          },
        ]
      : [];

  const inputCount = opts.inputTokens ?? 100;
  const outputCount = opts.outputTokens ?? 50;

  return {
    id: opts.id ?? "resp_001",
    status: "completed",
    output: [...callItems, ...messageItems],
    usage: {
      input_tokens: inputCount,
      output_tokens: outputCount,
      total_tokens: inputCount + outputCount,
    },
  };
}
132
/**
 * Build a Responses API-style error body.
 *
 * @param {string} message - Error text to embed.
 * @returns {{ error: { message: string } }}
 */
function responsesErrorResponse(message) {
  const error = { message };
  return { error };
}
135
+ // ---------------------------------------------------------------------------
136
+ // Mock fetch helper
137
+ // ---------------------------------------------------------------------------
138
/** Holds the real global `fetch` so the test hooks can restore it. */
let originalFetch;
/** Log of `{ url, body }` for every request made through a mocked fetch. */
let fetchCalls;
/**
 * Replace `globalThis.fetch` with a stub that serves canned JSON bodies.
 * Every stubbed response reports `ok: true` / `status: 200`; API-level errors
 * are expressed inside the JSON body instead.
 *
 * @param {unknown[]} responses - Bodies served in order, one per request.
 *   Once they run out, the last entry is served again for each further request.
 */
function mockFetch(responses) {
  fetchCalls = [];
  let served = 0;
  globalThis.fetch = async (url, init) => {
    const rawBody = init?.body;
    const parsedBody = rawBody ? JSON.parse(String(rawBody)) : undefined;
    fetchCalls.push({ url: url.toString(), body: parsedBody });

    const payload = responses[served] ?? responses[responses.length - 1];
    served += 1;

    return {
      json: async () => payload,
      ok: true,
      status: 200,
    };
  };
}
156
/**
 * Replace `globalThis.fetch` with a stub that always answers with an HTTP
 * error whose body is raw text rather than JSON. Resets `fetchCalls`.
 *
 * @param {number} status - HTTP status code for the `Response`.
 * @param {string} body - Raw response body.
 */
function mockHttpError(status, body) {
  fetchCalls = [];
  globalThis.fetch = async (url, init) => {
    let reqBody;
    if (init?.body) {
      reqBody = JSON.parse(String(init.body));
    }
    const record = { url: url.toString(), body: reqBody };
    fetchCalls.push(record);
    return new Response(body, { status, statusText: "Error" });
  };
}
165
+ // ---------------------------------------------------------------------------
166
+ // Tests: Chat Completions API
167
+ // ---------------------------------------------------------------------------
168
// Suite: runOpenAIToolLoop with apiVariant "chat".
// Each test installs a fetch stub (mockFetch / mockHttpError); the hooks below
// save the real globalThis.fetch before every test and restore it afterwards.
// Covers: direct answers, single- and multi-round tool calls, tool errors
// recorded in toolCallLog, round exhaustion, API-level and HTTP errors, token
// accumulation, the max_tokens vs max_completion_tokens switch, and the
// tool_choice value sent on the final round.
+ describe("runOpenAIToolLoop — Chat Completions API", () => {
169
+ beforeEach(() => {
170
+ originalFetch = globalThis.fetch;
171
+ });
172
+ afterEach(() => {
173
+ globalThis.fetch = originalFetch;
174
+ });
175
+ it("returns text when model answers without tool calls", async () => {
176
+ mockFetch([
177
+ chatResponse({
178
+ content: "There are 42 documents.",
179
+ finishReason: "stop",
180
+ }),
181
+ ]);
182
+ const result = await runOpenAIToolLoop(baseConfig({ apiVariant: "chat" }));
183
+ assert.equal(result.output, "There are 42 documents.");
184
+ assert.equal(result.toolCallLog.length, 0);
185
+ assert.equal(result.toolRounds, 0);
186
+ assert.equal(result.exhaustedRounds, undefined);
187
+ });
188
+ it("executes a single tool call and returns final answer", async () => {
189
+ mockFetch([
190
+ // Round 0: model calls query_documents
191
+ chatResponse({
192
+ toolCalls: [
193
+ {
194
+ id: "call_1",
195
+ name: "query_documents",
196
+ arguments: '{"query":"*[_type==\\"post\\"]"}',
197
+ },
198
+ ],
199
+ }),
200
+ // Round 1: model synthesizes answer
201
+ chatResponse({ content: "Found 10 posts.", finishReason: "stop" }),
202
+ ]);
203
+ const result = await runOpenAIToolLoop(baseConfig({ apiVariant: "chat" }));
204
+ assert.equal(result.output, "Found 10 posts.");
205
+ assert.equal(result.toolCallLog.length, 1);
206
+ assert.equal(result.toolCallLog[0].name, "query_documents");
207
+ assert.equal(result.toolRounds, 1);
208
+ });
209
+ it("executes multi-turn tool calls", async () => {
210
+ mockFetch([
211
+ // Round 0: model calls get_schema
212
+ chatResponse({
213
+ toolCalls: [
214
+ {
215
+ id: "call_1",
216
+ name: "get_schema",
217
+ arguments: '{"projectId":"abc123"}',
218
+ },
219
+ ],
220
+ }),
221
+ // Round 1: model calls query_documents
222
+ chatResponse({
223
+ toolCalls: [
224
+ { id: "call_2", name: "query_documents", arguments: '{"query":"*"}' },
225
+ ],
226
+ }),
227
+ // Round 2: model synthesizes
228
+ chatResponse({
229
+ content: "Schema has 5 types, 100 documents.",
230
+ finishReason: "stop",
231
+ }),
232
+ ]);
233
+ const result = await runOpenAIToolLoop(baseConfig({ apiVariant: "chat" }));
234
+ assert.equal(result.toolCallLog.length, 2);
235
+ assert.equal(result.toolCallLog[0].name, "get_schema");
236
+ assert.equal(result.toolCallLog[1].name, "query_documents");
237
+ assert.equal(result.toolRounds, 2);
238
+ });
239
+ it("captures tool execution errors in toolCallLog", async () => {
240
+ mockFetch([
241
+ chatResponse({
242
+ toolCalls: [
243
+ { id: "call_1", name: "query_documents", arguments: '{"query":"*"}' },
244
+ ],
245
+ }),
246
+ chatResponse({
247
+ content: "Tool failed, but I'll answer.",
248
+ finishReason: "stop",
249
+ }),
250
+ ]);
251
+ const result = await runOpenAIToolLoop(baseConfig({
252
+ apiVariant: "chat",
253
+ callTool: makeThrowingCallTool("Connection refused"),
254
+ }));
255
+ assert.equal(result.toolCallLog.length, 1);
256
+ assert.equal(result.toolCallLog[0].output, "Error: Connection refused");
257
+ assert.equal(result.output, "Tool failed, but I'll answer.");
258
+ });
259
+ it("handles exhausted rounds", async () => {
260
+ // Model calls tools in rounds 0 and 1; with maxToolRounds=2 the loop runs rounds 0,1,2 and round 2 is the last
261
+ mockFetch([
262
+ chatResponse({
263
+ toolCalls: [{ id: "call_1", name: "get_schema", arguments: "{}" }],
264
+ }),
265
+ chatResponse({
266
+ toolCalls: [{ id: "call_2", name: "get_schema", arguments: "{}" }],
267
+ }),
268
+ // Last round: tool_choice "none" forces text, but model returns nothing useful
269
+ chatResponse({ content: null, finishReason: "stop" }),
270
+ ]);
271
+ const result = await runOpenAIToolLoop(baseConfig({ apiVariant: "chat", maxToolRounds: 2 }));
272
+ // Round 2 (the last) gets tool_choice: "none", model stops
273
+ assert.equal(result.toolCallLog.length, 2);
274
+ assert.equal(result.toolRounds, 2);
275
+ // The model returned content: null with finishReason: stop on the last round
276
+ assert.equal(result.output, "");
277
+ });
278
+ it("throws on API-level error in JSON body", async () => {
279
+ mockFetch([chatErrorResponse("Rate limit exceeded")]);
280
+ await assert.rejects(() => runOpenAIToolLoop(baseConfig({ apiVariant: "chat" })), { message: "Rate limit exceeded" });
281
+ });
282
+ it("throws on HTTP error with non-JSON body", async () => {
283
+ mockHttpError(502, "<html>Bad Gateway</html>");
284
+ await assert.rejects(() => runOpenAIToolLoop(baseConfig({ apiVariant: "chat" })), (err) => err.message.includes("HTTP 502") && err.message.includes("Bad Gateway"));
285
+ });
286
+ it("accumulates token usage across rounds", async () => {
287
+ mockFetch([
288
+ chatResponse({
289
+ toolCalls: [{ id: "call_1", name: "get_schema", arguments: "{}" }],
290
+ promptTokens: 200,
291
+ completionTokens: 50,
292
+ }),
293
+ chatResponse({
294
+ content: "Done.",
295
+ finishReason: "stop",
296
+ promptTokens: 300,
297
+ completionTokens: 80,
298
+ }),
299
+ ]);
300
+ const result = await runOpenAIToolLoop(baseConfig({ apiVariant: "chat" }));
301
+ assert.equal(result.tokenUsage.prompt, 500); // 200 + 300
302
+ assert.equal(result.tokenUsage.completion, 130); // 50 + 80
303
+ });
304
+ it("sends max_completion_tokens for GPT-5.x models", async () => {
305
+ mockFetch([chatResponse({ content: "Answer.", finishReason: "stop" })]);
306
+ await runOpenAIToolLoop(baseConfig({ apiVariant: "chat", model: "gpt-5.2" }));
307
+ assert.equal(fetchCalls.length, 1);
308
+ const body = fetchCalls[0].body;
309
+ assert.equal(body.max_completion_tokens, 4096);
310
+ assert.equal(body.max_tokens, undefined);
311
+ });
312
+ it("sends max_tokens for older models", async () => {
313
+ mockFetch([chatResponse({ content: "Answer.", finishReason: "stop" })]);
314
+ await runOpenAIToolLoop(baseConfig({ apiVariant: "chat", model: "gpt-4o" }));
315
+ assert.equal(fetchCalls.length, 1);
316
+ const body = fetchCalls[0].body;
317
+ assert.equal(body.max_tokens, 4096);
318
+ assert.equal(body.max_completion_tokens, undefined);
319
+ });
320
+ it("sends tool_choice 'none' on last round", async () => {
321
+ mockFetch([
322
+ chatResponse({
323
+ toolCalls: [{ id: "call_1", name: "get_schema", arguments: "{}" }],
324
+ }),
325
+ chatResponse({ content: "Final.", finishReason: "stop" }),
326
+ ]);
327
+ await runOpenAIToolLoop(baseConfig({ apiVariant: "chat", maxToolRounds: 1 }));
328
+ // Round 0: auto, Round 1 (last): none
329
+ assert.equal(fetchCalls.length, 2);
330
+ assert.equal(fetchCalls[0].body.tool_choice, "auto");
331
+ assert.equal(fetchCalls[1].body.tool_choice, "none");
332
+ });
333
+ });
334
+ // ---------------------------------------------------------------------------
335
+ // Tests: Responses API
336
+ // ---------------------------------------------------------------------------
337
// Suite: runOpenAIToolLoop with apiVariant "responses".
// Each test installs a fetch stub (mockFetch / mockHttpError); the hooks below
// save the real globalThis.fetch before every test and restore it afterwards.
// Covers: direct answers, a single tool call, chaining through
// previous_response_id, tool errors recorded in toolCallLog, round exhaustion,
// API-level and HTTP errors, token accumulation, max_output_tokens, the
// endpoint URL, and reasoning_effort vs temperature in the request body.
+ describe("runOpenAIToolLoop — Responses API", () => {
338
+ beforeEach(() => {
339
+ originalFetch = globalThis.fetch;
340
+ });
341
+ afterEach(() => {
342
+ globalThis.fetch = originalFetch;
343
+ });
344
+ it("returns text when model answers without tool calls", async () => {
345
+ mockFetch([responsesResponse({ text: "42 documents found." })]);
346
+ const result = await runOpenAIToolLoop(baseConfig({ apiVariant: "responses" }));
347
+ assert.equal(result.output, "42 documents found.");
348
+ assert.equal(result.toolCallLog.length, 0);
349
+ assert.equal(result.toolRounds, 0);
350
+ });
351
+ it("executes a single tool call and returns final answer", async () => {
352
+ mockFetch([
353
+ // Round 0: model calls query_documents
354
+ responsesResponse({
355
+ id: "resp_001",
356
+ functionCalls: [
357
+ {
358
+ callId: "call_1",
359
+ name: "query_documents",
360
+ arguments: '{"query":"*"}',
361
+ },
362
+ ],
363
+ }),
364
+ // Round 1: model synthesizes
365
+ responsesResponse({ id: "resp_002", text: "Found 10 posts." }),
366
+ ]);
367
+ const result = await runOpenAIToolLoop(baseConfig({ apiVariant: "responses" }));
368
+ assert.equal(result.output, "Found 10 posts.");
369
+ assert.equal(result.toolCallLog.length, 1);
370
+ assert.equal(result.toolCallLog[0].name, "query_documents");
371
+ assert.equal(result.toolRounds, 1);
372
+ });
373
+ it("chains via previous_response_id", async () => {
374
+ mockFetch([
375
+ responsesResponse({
376
+ id: "resp_001",
377
+ functionCalls: [
378
+ { callId: "call_1", name: "get_schema", arguments: "{}" },
379
+ ],
380
+ }),
381
+ responsesResponse({ id: "resp_002", text: "Schema loaded." }),
382
+ ]);
383
+ await runOpenAIToolLoop(baseConfig({ apiVariant: "responses" }));
384
+ // Second request should chain via previous_response_id
385
+ assert.equal(fetchCalls.length, 2);
386
+ const secondBody = fetchCalls[1].body;
387
+ assert.equal(secondBody.previous_response_id, "resp_001");
388
+ });
389
+ it("captures tool execution errors in toolCallLog", async () => {
390
+ mockFetch([
391
+ responsesResponse({
392
+ functionCalls: [
393
+ {
394
+ callId: "call_1",
395
+ name: "query_documents",
396
+ arguments: '{"query":"*"}',
397
+ },
398
+ ],
399
+ }),
400
+ responsesResponse({ text: "Handled the error." }),
401
+ ]);
402
+ const result = await runOpenAIToolLoop(baseConfig({
403
+ apiVariant: "responses",
404
+ callTool: makeThrowingCallTool("Server unavailable"),
405
+ }));
406
+ assert.equal(result.toolCallLog.length, 1);
407
+ assert.equal(result.toolCallLog[0].output, "Error: Server unavailable");
408
+ assert.equal(result.output, "Handled the error.");
409
+ });
410
+ it("handles exhausted rounds", async () => {
411
+ mockFetch([
412
+ responsesResponse({
413
+ id: "resp_001",
414
+ functionCalls: [
415
+ { callId: "call_1", name: "get_schema", arguments: "{}" },
416
+ ],
417
+ }),
418
+ responsesResponse({
419
+ id: "resp_002",
420
+ functionCalls: [
421
+ { callId: "call_2", name: "get_schema", arguments: "{}" },
422
+ ],
423
+ }),
424
+ // Last round (expected to be sent with tool_choice: "none"; not asserted here):
425
+ // the mocked model returns no function calls and only an empty text message
426
+ responsesResponse({ id: "resp_003", text: "" }),
427
+ ]);
428
+ const result = await runOpenAIToolLoop(baseConfig({ apiVariant: "responses", maxToolRounds: 2 }));
429
+ assert.equal(result.toolCallLog.length, 2);
430
+ assert.equal(result.toolRounds, 2);
431
+ });
432
+ it("throws on API-level error in JSON body", async () => {
433
+ mockFetch([responsesErrorResponse("Invalid model")]);
434
+ await assert.rejects(() => runOpenAIToolLoop(baseConfig({ apiVariant: "responses" })), { message: "Invalid model" });
435
+ });
436
+ it("throws on HTTP error with non-JSON body", async () => {
437
+ mockHttpError(503, "Service Unavailable");
438
+ await assert.rejects(() => runOpenAIToolLoop(baseConfig({ apiVariant: "responses" })), (err) => err.message.includes("HTTP 503") &&
439
+ err.message.includes("Service Unavailable"));
440
+ });
441
+ it("accumulates token usage across rounds", async () => {
442
+ mockFetch([
443
+ responsesResponse({
444
+ functionCalls: [
445
+ { callId: "call_1", name: "get_schema", arguments: "{}" },
446
+ ],
447
+ inputTokens: 150,
448
+ outputTokens: 40,
449
+ }),
450
+ responsesResponse({
451
+ text: "Done.",
452
+ inputTokens: 250,
453
+ outputTokens: 60,
454
+ }),
455
+ ]);
456
+ const result = await runOpenAIToolLoop(baseConfig({ apiVariant: "responses" }));
457
+ assert.equal(result.tokenUsage.prompt, 400); // 150 + 250
458
+ assert.equal(result.tokenUsage.completion, 100); // 40 + 60
459
+ });
460
+ it("sends max_output_tokens (not max_tokens)", async () => {
461
+ mockFetch([responsesResponse({ text: "Answer." })]);
462
+ await runOpenAIToolLoop(baseConfig({ apiVariant: "responses" }));
463
+ const body = fetchCalls[0].body;
464
+ assert.equal(body.max_output_tokens, 4096);
465
+ assert.equal(body.max_tokens, undefined);
466
+ assert.equal(body.max_completion_tokens, undefined);
467
+ });
468
+ it("uses correct endpoint URL", async () => {
469
+ mockFetch([responsesResponse({ text: "Hi." })]);
470
+ await runOpenAIToolLoop(baseConfig({ apiVariant: "responses" }));
471
+ assert.ok(fetchCalls[0].url.includes("/v1/responses"));
472
+ });
473
+ it("passes reasoning_effort and omits temperature", async () => {
474
+ mockFetch([responsesResponse({ text: "Thought carefully." })]);
475
+ await runOpenAIToolLoop(baseConfig({
476
+ apiVariant: "responses",
477
+ providerConfig: { reasoning_effort: "medium" },
478
+ }));
479
+ const body = fetchCalls[0].body;
480
+ assert.deepEqual(body.reasoning, { effort: "medium" });
481
+ assert.equal(body.temperature, undefined);
482
+ });
483
+ it("includes temperature when reasoning_effort is not set", async () => {
484
+ mockFetch([responsesResponse({ text: "Answer." })]);
485
+ await runOpenAIToolLoop(baseConfig({ apiVariant: "responses" }));
486
+ const body = fetchCalls[0].body;
487
+ assert.equal(body.temperature, 0.2);
488
+ assert.equal(body.reasoning, undefined);
489
+ });
490
+ });
491
+ // ---------------------------------------------------------------------------
492
+ // Tests: Default routing (no apiVariant)
493
+ // ---------------------------------------------------------------------------
494
// Suite: runOpenAIToolLoop called with a config that has no apiVariant.
// The single test checks that the request URL contains /v1/chat/completions,
// i.e. the Chat Completions path is the default. The hooks below save and
// restore the real globalThis.fetch around the test.
+ describe("runOpenAIToolLoop — default routing", () => {
495
+ beforeEach(() => {
496
+ originalFetch = globalThis.fetch;
497
+ });
498
+ afterEach(() => {
499
+ globalThis.fetch = originalFetch;
500
+ });
501
+ it("defaults to Chat Completions when apiVariant is undefined", async () => {
502
+ mockFetch([
503
+ chatResponse({ content: "Default path.", finishReason: "stop" }),
504
+ ]);
505
+ const result = await runOpenAIToolLoop(baseConfig());
506
+ assert.equal(result.output, "Default path.");
507
+ assert.ok(fetchCalls[0].url.includes("/v1/chat/completions"));
508
+ });
509
+ });
@@ -12,7 +12,7 @@
12
12
  * and normalizes weight fields.
13
13
  *
14
14
  * @see docs/design-docs/architecture-overhaul/scoring-rubrics-assertions.md
15
- * @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
15
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
16
16
  */
17
17
  import type { GeneralizedAssertionDefinition } from "../../_vendor/ailf-core/index.d.ts";
18
18
  import type { EvalMode } from "../../_vendor/ailf-shared/index.d.ts";
@@ -12,7 +12,7 @@
12
12
  * and normalizes weight fields.
13
13
  *
14
14
  * @see docs/design-docs/architecture-overhaul/scoring-rubrics-assertions.md
15
- * @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
15
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
16
16
  */
17
17
  // ---------------------------------------------------------------------------
18
18
  // Known assertion types and their mode compatibility
@@ -1,15 +1,10 @@
1
1
  /**
2
2
  * compiler-to-yaml.ts — Serialize compiled Promptfoo config to YAML files.
3
3
  *
4
- * This is the bridge between the new compiler pipeline (in-memory
5
- * CompiledPromptfooConfig) and the existing RunEvalStep which reads
4
+ * This is the bridge between the compiler pipeline (in-memory
5
+ * CompiledPromptfooConfig) and the RunEvalStep which reads
6
6
  * YAML config files from disk.
7
7
  *
8
- * The output YAML files are identical in structure to what the legacy
9
- * generate-configs.ts produces, so RunEvalStep, CalculateScoresStep,
10
- * and all downstream steps work without modification.
11
- *
12
- * @see packages/eval/src/pipeline/generate-configs.ts — legacy path
13
8
  * @see packages/eval/src/orchestration/steps/run-eval-step.ts — consumer
14
9
  */
15
10
  import type { Logger, ModeCompileResult } from "../../_vendor/ailf-core/index.d.ts";
@@ -1,15 +1,10 @@
1
1
  /**
2
2
  * compiler-to-yaml.ts — Serialize compiled Promptfoo config to YAML files.
3
3
  *
4
- * This is the bridge between the new compiler pipeline (in-memory
5
- * CompiledPromptfooConfig) and the existing RunEvalStep which reads
4
+ * This is the bridge between the compiler pipeline (in-memory
5
+ * CompiledPromptfooConfig) and the RunEvalStep which reads
6
6
  * YAML config files from disk.
7
7
  *
8
- * The output YAML files are identical in structure to what the legacy
9
- * generate-configs.ts produces, so RunEvalStep, CalculateScoresStep,
10
- * and all downstream steps work without modification.
11
- *
12
- * @see packages/eval/src/pipeline/generate-configs.ts — legacy path
13
8
  * @see packages/eval/src/orchestration/steps/run-eval-step.ts — consumer
14
9
  */
15
10
  import { existsSync, mkdirSync, writeFileSync } from "fs";
@@ -54,3 +54,17 @@ export declare class ConfigNotFoundError extends Error {
54
54
  readonly searchDir: string;
55
55
  constructor(configName: string, searchDir: string);
56
56
  }
57
+ /**
58
+ * Resolve a package subdirectory, preferring the vendored copy in dist/
59
+ * when @sanity/ailf-core isn't directly resolvable (i.e., installed outside
60
+ * the monorepo via npx or npm).
61
+ *
62
+ * The build step (bundle-workspace-deps.ts) copies config/ and tasks/ into
63
+ * dist/ with @sanity/ailf-core imports rewritten to the _vendor/ path.
64
+ * This function detects that situation and returns the dist/ path instead.
65
+ *
66
+ * @param rootDir - Package root directory (packages/eval)
67
+ * @param subdir - Subdirectory to resolve (e.g., "config", "tasks/literacy")
68
+ * @returns The resolved subdirectory path — either source or dist/ vendored
69
+ */
70
+ export declare function resolveVendoredSubdir(rootDir: string, subdir: string): string;
@@ -36,7 +36,8 @@ import { resolve } from "path";
36
36
  */
37
37
  export function loadConfigFile(name, rootDir, options) {
38
38
  const subdir = options?.subdir ?? "config";
39
- const basePath = resolve(rootDir, subdir, name);
39
+ const baseDir = resolveVendoredSubdir(rootDir, subdir);
40
+ const basePath = resolve(baseDir, name);
40
41
  // Priority chain: .ts > .js > .yaml > .yml > .json
41
42
  const candidates = [
42
43
  { ext: ".ts", format: "ts" },
@@ -60,7 +61,7 @@ export function loadConfigFile(name, rootDir, options) {
60
61
  }
61
62
  }
62
63
  // Always throw ConfigNotFoundError so tryLoadConfigFile can catch it
63
- throw new ConfigNotFoundError(name, resolve(rootDir, subdir));
64
+ throw new ConfigNotFoundError(name, baseDir);
64
65
  }
65
66
  /**
66
67
  * Try to load a config file, returning null if not found.
@@ -88,6 +89,45 @@ export class ConfigNotFoundError extends Error {
88
89
  }
89
90
  }
90
91
  // ---------------------------------------------------------------------------
92
+ // Vendored config detection
93
+ // ---------------------------------------------------------------------------
94
/**
 * Choose the directory that `subdir` should be loaded from.
 *
 * When `shouldUseVendoredConfigs(rootDir)` reports that the vendored copies
 * should be used and `<rootDir>/dist/<subdir>` exists on disk, that dist/
 * path is returned. In every other case the plain `<rootDir>/<subdir>` path
 * is returned (it is not checked for existence).
 *
 * @param rootDir - Package root directory
 * @param subdir - Subdirectory to resolve (e.g., "config", "tasks/literacy")
 * @returns Absolute path of either the dist/ copy or the source directory
 */
export function resolveVendoredSubdir(rootDir, subdir) {
  const preferVendored = shouldUseVendoredConfigs(rootDir);
  const vendoredDir = preferVendored
    ? resolve(rootDir, "dist", subdir)
    : undefined;
  if (vendoredDir !== undefined && existsSync(vendoredDir)) {
    return vendoredDir;
  }
  return resolve(rootDir, subdir);
}
115
/**
 * Memo of the vendored-config decision, keyed by `rootDir`.
 *
 * The decision depends on what is resolvable from a given root, so it is
 * remembered per root rather than once per process; otherwise the first root
 * queried would dictate the answer for every later root.
 */
const _vendoredCache = new Map();
/**
 * Report whether the vendored configs should be used for `rootDir`.
 *
 * Tries to resolve `@sanity/ailf-core` relative to `<rootDir>/package.json`.
 * If resolution succeeds the vendored copies are not needed (`false`); if it
 * throws for any reason they are (`true`). The result is cached per root.
 *
 * @param rootDir - Package root directory to resolve from
 * @returns `true` when `@sanity/ailf-core` cannot be resolved from `rootDir`
 */
function shouldUseVendoredConfigs(rootDir) {
  const cached = _vendoredCache.get(rootDir);
  if (cached !== undefined) {
    return cached;
  }
  let useVendored;
  try {
    // createRequire only needs a path to anchor resolution; the file itself
    // does not have to exist.
    const rootRequire = createRequire(resolve(rootDir, "package.json"));
    rootRequire.resolve("@sanity/ailf-core");
    useVendored = false;
  } catch {
    // Deliberate best-effort: any resolution failure means "use dist/".
    useVendored = true;
  }
  _vendoredCache.set(rootDir, useVendored);
  return useVendored;
}
130
+ // ---------------------------------------------------------------------------
91
131
  // Format-specific loaders
92
132
  // ---------------------------------------------------------------------------
93
133
  function loadTsFile(filePath, format) {
@@ -15,7 +15,7 @@
15
15
  * as described in the fixtures-artifacts design doc.
16
16
  *
17
17
  * @see docs/design-docs/architecture-overhaul/fixtures-artifacts.md
18
- * @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
18
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
19
19
  */
20
20
  import type { GeneralizedTaskDefinition, ResolvedFixture, VariableEnvelope } from "../../_vendor/ailf-core/index.d.ts";
21
21
  /** Options for fixture resolution */
@@ -15,7 +15,7 @@
15
15
  * as described in the fixtures-artifacts design doc.
16
16
  *
17
17
  * @see docs/design-docs/architecture-overhaul/fixtures-artifacts.md
18
- * @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
18
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
19
19
  */
20
20
  import { existsSync, readFileSync } from "fs";
21
21
  import { resolve } from "path";
@@ -17,7 +17,7 @@
17
17
  * ]
18
18
  * ```
19
19
  *
20
- * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
20
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
21
21
  */
22
22
  /**
23
23
  * Strip specified fields from an object using dot-notation paths.