@sanity/ailf 2.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +28 -23
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
- package/dist/_vendor/ailf-core/config-helpers.js +29 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
- package/dist/_vendor/ailf-core/examples/index.js +208 -114
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
- package/dist/_vendor/ailf-tasks/cli.js +61 -0
- package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
- package/dist/_vendor/ailf-tasks/index.js +16 -0
- package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
- package/dist/_vendor/ailf-tasks/parser.js +73 -0
- package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
- package/dist/_vendor/ailf-tasks/schemas.js +180 -0
- package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
- package/dist/_vendor/ailf-tasks/validation.js +162 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +6 -1
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
- package/dist/adapters/task-sources/index.d.ts +1 -2
- package/dist/adapters/task-sources/index.js +1 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
- package/dist/adapters/task-sources/repo-schemas.js +2 -2
- package/dist/adapters/task-sources/repo-task-source.js +1 -1
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
- package/dist/adapters/task-sources/task-file-loader.js +20 -6
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +2 -3
- package/dist/commands/init.js +56 -170
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/composition-root.d.ts +2 -3
- package/dist/composition-root.js +27 -14
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +30 -16
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +50 -15
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +51 -31
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
- package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
- package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
- package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +15 -8
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +15 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/pipeline/mirror-repo-tasks.js +1 -1
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +67 -29
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +24 -24
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -0,0 +1,509 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* tool-loop-openai.test.ts — Tests for the OpenAI MCP tool loop.
|
|
3
|
+
*
|
|
4
|
+
* Tests both API variants (Chat Completions and Responses) with mocked
|
|
5
|
+
* fetch to verify tool calling, error handling, token tracking, and
|
|
6
|
+
* round exhaustion.
|
|
7
|
+
*
|
|
8
|
+
* Run: npx tsx --test src/pipeline/compiler/__tests__/tool-loop-openai.test.ts
|
|
9
|
+
*/
|
|
10
|
+
import assert from "node:assert/strict";
|
|
11
|
+
import { afterEach, beforeEach, describe, it } from "node:test";
|
|
12
|
+
import { runOpenAIToolLoop } from "../mode-handlers/mcp-tool-provider/tool-loop-openai.js";
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
// Test fixtures
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
/**
 * Tool definitions handed to the tool loop by `baseConfig`.
 * Each entry carries a name, a description, and a JSON-Schema-style
 * `inputSchema` object.
 */
const TEST_TOOLS = [
  // GROQ query tool: `query` is the only required argument.
  {
    name: "query_documents",
    description: "Query Sanity documents using GROQ",
    inputSchema: {
      type: "object",
      properties: {
        query: { type: "string" },
        projectId: { type: "string" },
        dataset: { type: "string" },
      },
      required: ["query"],
    },
  },
  // Schema lookup tool: no required arguments.
  {
    name: "get_schema",
    description: "Get the Sanity schema for a project",
    inputSchema: {
      type: "object",
      properties: {
        projectId: { type: "string" },
      },
    },
  },
];
|
|
39
|
+
/**
 * Build a stub `callTool` implementation for the tool loop under test.
 *
 * @param {Record<string, unknown>} [results] - Canned content keyed by tool
 *   name. Tools without an entry get a generic `Result from <name>` string.
 * @returns {(name: string, _args?: unknown) => Promise<{ content: unknown }>}
 *   Async stub that resolves to `{ content }` for the requested tool.
 */
function makeCallTool(results = {}) {
  return async (name, _args) => {
    // Own-property lookup instead of a truthiness test: a canned result of
    // "" / 0 / false must be returned as-is, and tool names that collide with
    // Object.prototype members ("constructor", "toString", ...) must not
    // resolve to inherited functions.
    const canned = Object.hasOwn(results, name) ? results[name] : undefined;
    // null / undefined entries still fall back to the generic string.
    return { content: canned ?? `Result from ${name}` };
  };
}
|
|
47
|
+
/**
 * Build a stub `callTool` implementation whose every invocation fails.
 *
 * @param {string} errorMessage - Message for the `Error` the stub rejects with.
 * @returns {() => Promise<never>} Stub that always returns a rejected promise.
 */
function makeThrowingCallTool(errorMessage) {
  // A fresh Error per call, surfaced as a rejection rather than a sync throw.
  return () => Promise.reject(new Error(errorMessage));
}
|
|
52
|
+
/**
 * Build a tool-loop config with test defaults, letting callers override any
 * field (for example `apiVariant` or `callTool`).
 *
 * @param {object} [overrides] - Fields that replace or extend the defaults.
 * @returns {object} A new config object; override keys win over defaults.
 */
function baseConfig(overrides) {
  const defaults = {
    prompt: "Query all documents from project abc123",
    tools: TEST_TOOLS,
    callTool: makeCallTool(),
    maxToolRounds: 5,
    model: "gpt-5.2",
    temperature: 0.2,
    maxTokens: 4096,
    apiKey: "test-api-key",
  };
  // Spreading undefined is a no-op, so calling with no overrides is fine.
  return { ...defaults, ...overrides };
}
|
|
65
|
+
// ---------------------------------------------------------------------------
|
|
66
|
+
// Chat Completions API response builders
|
|
67
|
+
// ---------------------------------------------------------------------------
|
|
68
|
+
/**
 * Build a Chat Completions-style response body for the mocked fetch.
 *
 * @param {object} [opts] - Response options; omit for a plain "stop" reply.
 * @param {string|null} [opts.content] - Assistant text (defaults to `null`).
 * @param {Array<{id: string, name: string, arguments: string}>} [opts.toolCalls]
 *   Tool calls to attach to the assistant message as `tool_calls`.
 * @param {string} [opts.finishReason] - Overrides the derived finish reason
 *   ("tool_calls" when `toolCalls` is given, otherwise "stop").
 * @param {number} [opts.promptTokens] - Prompt token count (default 100).
 * @param {number} [opts.completionTokens] - Completion token count (default 50).
 * @returns {object} Object with a single-entry `choices` array and `usage`.
 */
function chatResponse(opts = {}) {
  // Resolve the token defaults once so total_tokens cannot drift from its parts.
  const promptTokens = opts.promptTokens ?? 100;
  const completionTokens = opts.completionTokens ?? 50;

  const message = {
    role: "assistant",
    content: opts.content ?? null,
  };
  if (opts.toolCalls) {
    message.tool_calls = opts.toolCalls.map((tc) => ({
      id: tc.id,
      type: "function",
      function: { name: tc.name, arguments: tc.arguments },
    }));
  }

  return {
    choices: [
      {
        message,
        finish_reason: opts.finishReason ?? (opts.toolCalls ? "tool_calls" : "stop"),
      },
    ],
    usage: {
      prompt_tokens: promptTokens,
      completion_tokens: completionTokens,
      total_tokens: promptTokens + completionTokens,
    },
  };
}
|
|
94
|
+
/**
 * Build a Chat Completions-style error body: `{ error: { message } }`.
 *
 * @param {string} message - Error text to embed.
 * @returns {{ error: { message: string } }} The error payload.
 */
function chatErrorResponse(message) {
  const error = { message };
  return { error };
}
|
|
97
|
+
// ---------------------------------------------------------------------------
|
|
98
|
+
// Responses API response builders
|
|
99
|
+
// ---------------------------------------------------------------------------
|
|
100
|
+
/**
 * Build a Responses API-style response body for the mocked fetch.
 *
 * Output items are emitted in a fixed order: every function call first, then
 * (when `opts.text` is provided, including the empty string) one assistant
 * message.
 *
 * @param {object} [opts] - Response options; omit for an empty `output`.
 * @param {Iterable<{callId: string, name: string, arguments: string}>} [opts.functionCalls]
 *   Function calls to emit as `function_call` output items.
 * @param {string} [opts.text] - Assistant text for the trailing message item.
 * @param {string} [opts.id] - Response id (default "resp_001").
 * @param {number} [opts.inputTokens] - Input token count (default 100).
 * @param {number} [opts.outputTokens] - Output token count (default 50).
 * @returns {object} Object with `id`, `status: "completed"`, `output`, `usage`.
 */
function responsesResponse(opts = {}) {
  // Resolve the token defaults once so total_tokens cannot drift from its parts.
  const inputTokens = opts.inputTokens ?? 100;
  const outputTokens = opts.outputTokens ?? 50;

  const output = opts.functionCalls
    ? Array.from(opts.functionCalls, (fc) => ({
        type: "function_call",
        id: `fc_${fc.callId}`,
        call_id: fc.callId,
        name: fc.name,
        arguments: fc.arguments,
      }))
    : [];

  // Explicit undefined check: an empty-string text still yields a message item.
  if (opts.text !== undefined) {
    output.push({
      type: "message",
      id: "msg_001",
      role: "assistant",
      content: [{ type: "output_text", text: opts.text }],
    });
  }

  return {
    id: opts.id ?? "resp_001",
    status: "completed",
    output,
    usage: {
      input_tokens: inputTokens,
      output_tokens: outputTokens,
      total_tokens: inputTokens + outputTokens,
    },
  };
}
|
|
132
|
+
/**
 * Build a Responses API-style error body: `{ error: { message } }`.
 *
 * @param {string} message - Error text to embed.
 * @returns {{ error: { message: string } }} The error payload.
 */
function responsesErrorResponse(message) {
  const error = { message };
  return { error };
}
|
|
135
|
+
// ---------------------------------------------------------------------------
|
|
136
|
+
// Mock fetch helper
|
|
137
|
+
// ---------------------------------------------------------------------------
|
|
138
|
+
// The real globalThis.fetch, saved before each test and restored afterwards.
let originalFetch;
// Log of { url, body } records, reset whenever a mock fetch is installed.
let fetchCalls;
|
|
140
|
+
/**
 * Replace `globalThis.fetch` with a stub that serves canned JSON payloads.
 *
 * Every call is recorded in the module-level `fetchCalls` as `{ url, body }`,
 * with `body` being the JSON-parsed request body (or `undefined` when the
 * request has none). Payloads are served in order; once the list runs out the
 * last payload is repeated. Every response reports `ok: true` / `status: 200`
 * because API-level errors are carried in the body.
 *
 * @param {Array<object>} responses - Payloads returned by successive `json()` calls.
 */
function mockFetch(responses) {
  fetchCalls = [];
  let served = 0;

  globalThis.fetch = async (url, init) => {
    const rawBody = init?.body;
    const parsedBody = rawBody ? JSON.parse(String(rawBody)) : undefined;
    fetchCalls.push({ url: url.toString(), body: parsedBody });

    // Past the end of the list, keep replaying the final payload.
    const payload = responses[served] ?? responses[responses.length - 1];
    served += 1;

    return {
      ok: true,
      status: 200,
      json: async () => payload,
    };
  };
}
|
|
156
|
+
/**
 * Replace `globalThis.fetch` with a stub that always answers with an HTTP
 * error carrying a non-JSON body.
 *
 * Every call is recorded in the module-level `fetchCalls` as `{ url, body }`,
 * with `body` being the JSON-parsed request body (or `undefined` when the
 * request has none).
 *
 * @param {number} status - HTTP status code for the returned `Response`.
 * @param {string} body - Raw response body text.
 */
function mockHttpError(status, body) {
  fetchCalls = [];

  globalThis.fetch = async (url, init) => {
    const rawBody = init?.body;
    const parsedRequest = rawBody ? JSON.parse(String(rawBody)) : undefined;
    fetchCalls.push({ url: url.toString(), body: parsedRequest });

    // A real Response object, so status-dependent properties behave as in production.
    const responseInit = { status, statusText: "Error" };
    return new Response(body, responseInit);
  };
}
|
|
165
|
+
// ---------------------------------------------------------------------------
|
|
166
|
+
// Tests: Chat Completions API
|
|
167
|
+
// ---------------------------------------------------------------------------
|
|
168
|
+
describe("runOpenAIToolLoop — Chat Completions API", () => {
    /** Run the tool loop against the "chat" variant, merging in any config overrides. */
    const runChat = (overrides = {}) => runOpenAIToolLoop(baseConfig({ apiVariant: "chat", ...overrides }));
    /** Canned round in which the model requests exactly one tool call. */
    const toolCallRound = (id, name, args, usage = {}) => chatResponse({
        toolCalls: [{ id, name, arguments: args }],
        ...usage,
    });

    // Each test swaps globalThis.fetch for a stub; keep the real one around and restore it.
    beforeEach(() => {
        originalFetch = globalThis.fetch;
    });
    afterEach(() => {
        globalThis.fetch = originalFetch;
    });

    it("returns text when model answers without tool calls", async () => {
        const answer = chatResponse({
            content: "There are 42 documents.",
            finishReason: "stop",
        });
        mockFetch([answer]);
        const outcome = await runChat();
        assert.equal(outcome.output, "There are 42 documents.");
        assert.equal(outcome.toolCallLog.length, 0);
        assert.equal(outcome.toolRounds, 0);
        assert.equal(outcome.exhaustedRounds, undefined);
    });

    it("executes a single tool call and returns final answer", async () => {
        mockFetch([
            // First reply: the model asks for query_documents.
            toolCallRound("call_1", "query_documents", '{"query":"*[_type==\\"post\\"]"}'),
            // Second reply: the model produces its final text.
            chatResponse({ content: "Found 10 posts.", finishReason: "stop" }),
        ]);
        const outcome = await runChat();
        assert.equal(outcome.output, "Found 10 posts.");
        assert.equal(outcome.toolCallLog.length, 1);
        assert.equal(outcome.toolCallLog[0].name, "query_documents");
        assert.equal(outcome.toolRounds, 1);
    });

    it("executes multi-turn tool calls", async () => {
        mockFetch([
            // First reply: get_schema.
            toolCallRound("call_1", "get_schema", '{"projectId":"abc123"}'),
            // Second reply: query_documents.
            toolCallRound("call_2", "query_documents", '{"query":"*"}'),
            // Third reply: final text.
            chatResponse({
                content: "Schema has 5 types, 100 documents.",
                finishReason: "stop",
            }),
        ]);
        const outcome = await runChat();
        assert.equal(outcome.toolCallLog.length, 2);
        assert.equal(outcome.toolCallLog[0].name, "get_schema");
        assert.equal(outcome.toolCallLog[1].name, "query_documents");
        assert.equal(outcome.toolRounds, 2);
    });

    it("captures tool execution errors in toolCallLog", async () => {
        mockFetch([
            toolCallRound("call_1", "query_documents", '{"query":"*"}'),
            chatResponse({
                content: "Tool failed, but I'll answer.",
                finishReason: "stop",
            }),
        ]);
        const outcome = await runChat({
            callTool: makeThrowingCallTool("Connection refused"),
        });
        assert.equal(outcome.toolCallLog.length, 1);
        assert.equal(outcome.toolCallLog[0].output, "Error: Connection refused");
        assert.equal(outcome.output, "Tool failed, but I'll answer.");
    });

    it("handles exhausted rounds", async () => {
        // The model asks for tools in the first two rounds (maxToolRounds=2 means rounds 0,1,2).
        mockFetch([
            toolCallRound("call_1", "get_schema", "{}"),
            toolCallRound("call_2", "get_schema", "{}"),
            // Final round: tool_choice "none" forces text, but the model returns nothing useful.
            chatResponse({ content: null, finishReason: "stop" }),
        ]);
        const outcome = await runChat({ maxToolRounds: 2 });
        // Round 2 (the last) is sent with tool_choice "none", so the model stops.
        assert.equal(outcome.toolCallLog.length, 2);
        assert.equal(outcome.toolRounds, 2);
        // content: null with finishReason "stop" on the last round yields an empty output.
        assert.equal(outcome.output, "");
    });

    it("throws on API-level error in JSON body", async () => {
        mockFetch([chatErrorResponse("Rate limit exceeded")]);
        const expected = { message: "Rate limit exceeded" };
        await assert.rejects(() => runChat(), expected);
    });

    it("throws on HTTP error with non-JSON body", async () => {
        mockHttpError(502, "<html>Bad Gateway</html>");
        const mentionsStatusAndBody = (err) => err.message.includes("HTTP 502") && err.message.includes("Bad Gateway");
        await assert.rejects(() => runChat(), mentionsStatusAndBody);
    });

    it("accumulates token usage across rounds", async () => {
        mockFetch([
            toolCallRound("call_1", "get_schema", "{}", {
                promptTokens: 200,
                completionTokens: 50,
            }),
            chatResponse({
                content: "Done.",
                finishReason: "stop",
                promptTokens: 300,
                completionTokens: 80,
            }),
        ]);
        const { tokenUsage } = await runChat();
        assert.equal(tokenUsage.prompt, 500); // 200 + 300
        assert.equal(tokenUsage.completion, 130); // 50 + 80
    });

    it("sends max_completion_tokens for GPT-5.x models", async () => {
        mockFetch([chatResponse({ content: "Answer.", finishReason: "stop" })]);
        await runChat({ model: "gpt-5.2" });
        assert.equal(fetchCalls.length, 1);
        const payload = fetchCalls[0].body;
        assert.equal(payload.max_completion_tokens, 4096);
        assert.equal(payload.max_tokens, undefined);
    });

    it("sends max_tokens for older models", async () => {
        mockFetch([chatResponse({ content: "Answer.", finishReason: "stop" })]);
        await runChat({ model: "gpt-4o" });
        assert.equal(fetchCalls.length, 1);
        const payload = fetchCalls[0].body;
        assert.equal(payload.max_tokens, 4096);
        assert.equal(payload.max_completion_tokens, undefined);
    });

    it("sends tool_choice 'none' on last round", async () => {
        mockFetch([
            toolCallRound("call_1", "get_schema", "{}"),
            chatResponse({ content: "Final.", finishReason: "stop" }),
        ]);
        await runChat({ maxToolRounds: 1 });
        // Expect "auto" on round 0 and "none" on round 1, which is the last.
        assert.equal(fetchCalls.length, 2);
        const [firstRequest, lastRequest] = [fetchCalls[0], fetchCalls[1]];
        assert.equal(firstRequest.body.tool_choice, "auto");
        assert.equal(lastRequest.body.tool_choice, "none");
    });
});
|
|
334
|
+
// ---------------------------------------------------------------------------
|
|
335
|
+
// Tests: Responses API
|
|
336
|
+
// ---------------------------------------------------------------------------
|
|
337
|
+
describe("runOpenAIToolLoop — Responses API", () => {
    /** Run the tool loop against the "responses" variant, merging in any config overrides. */
    const runResponses = (overrides = {}) => runOpenAIToolLoop(baseConfig({ apiVariant: "responses", ...overrides }));
    /** A function-call entry asking for get_schema with empty arguments. */
    const schemaCall = (callId) => ({ callId, name: "get_schema", arguments: "{}" });
    /** A function-call entry asking for query_documents with a match-all query. */
    const queryCall = (callId) => ({ callId, name: "query_documents", arguments: '{"query":"*"}' });

    // Each test swaps globalThis.fetch for a stub; keep the real one around and restore it.
    beforeEach(() => {
        originalFetch = globalThis.fetch;
    });
    afterEach(() => {
        globalThis.fetch = originalFetch;
    });

    it("returns text when model answers without tool calls", async () => {
        mockFetch([responsesResponse({ text: "42 documents found." })]);
        const outcome = await runResponses();
        assert.equal(outcome.output, "42 documents found.");
        assert.equal(outcome.toolCallLog.length, 0);
        assert.equal(outcome.toolRounds, 0);
    });

    it("executes a single tool call and returns final answer", async () => {
        mockFetch([
            // First reply: the model asks for query_documents.
            responsesResponse({ id: "resp_001", functionCalls: [queryCall("call_1")] }),
            // Second reply: the model produces its final text.
            responsesResponse({ id: "resp_002", text: "Found 10 posts." }),
        ]);
        const outcome = await runResponses();
        assert.equal(outcome.output, "Found 10 posts.");
        assert.equal(outcome.toolCallLog.length, 1);
        assert.equal(outcome.toolCallLog[0].name, "query_documents");
        assert.equal(outcome.toolRounds, 1);
    });

    it("chains via previous_response_id", async () => {
        mockFetch([
            responsesResponse({ id: "resp_001", functionCalls: [schemaCall("call_1")] }),
            responsesResponse({ id: "resp_002", text: "Schema loaded." }),
        ]);
        await runResponses();
        // The follow-up request must reference the first response's id.
        assert.equal(fetchCalls.length, 2);
        const followUp = fetchCalls[1].body;
        assert.equal(followUp.previous_response_id, "resp_001");
    });

    it("captures tool execution errors in toolCallLog", async () => {
        mockFetch([
            responsesResponse({ functionCalls: [queryCall("call_1")] }),
            responsesResponse({ text: "Handled the error." }),
        ]);
        const outcome = await runResponses({
            callTool: makeThrowingCallTool("Server unavailable"),
        });
        assert.equal(outcome.toolCallLog.length, 1);
        assert.equal(outcome.toolCallLog[0].output, "Error: Server unavailable");
        assert.equal(outcome.output, "Handled the error.");
    });

    it("handles exhausted rounds", async () => {
        mockFetch([
            responsesResponse({ id: "resp_001", functionCalls: [schemaCall("call_1")] }),
            responsesResponse({ id: "resp_002", functionCalls: [schemaCall("call_2")] }),
            // Final round (tool_choice "none"): the model must answer with text,
            // and here that text is empty.
            responsesResponse({ id: "resp_003", text: "" }),
        ]);
        const outcome = await runResponses({ maxToolRounds: 2 });
        assert.equal(outcome.toolCallLog.length, 2);
        assert.equal(outcome.toolRounds, 2);
    });

    it("throws on API-level error in JSON body", async () => {
        mockFetch([responsesErrorResponse("Invalid model")]);
        const expected = { message: "Invalid model" };
        await assert.rejects(() => runResponses(), expected);
    });

    it("throws on HTTP error with non-JSON body", async () => {
        mockHttpError(503, "Service Unavailable");
        const mentionsStatusAndBody = (err) => err.message.includes("HTTP 503") && err.message.includes("Service Unavailable");
        await assert.rejects(() => runResponses(), mentionsStatusAndBody);
    });

    it("accumulates token usage across rounds", async () => {
        mockFetch([
            responsesResponse({
                functionCalls: [schemaCall("call_1")],
                inputTokens: 150,
                outputTokens: 40,
            }),
            responsesResponse({
                text: "Done.",
                inputTokens: 250,
                outputTokens: 60,
            }),
        ]);
        const { tokenUsage } = await runResponses();
        assert.equal(tokenUsage.prompt, 400); // 150 + 250
        assert.equal(tokenUsage.completion, 100); // 40 + 60
    });

    it("sends max_output_tokens (not max_tokens)", async () => {
        mockFetch([responsesResponse({ text: "Answer." })]);
        await runResponses();
        const payload = fetchCalls[0].body;
        assert.equal(payload.max_output_tokens, 4096);
        assert.equal(payload.max_tokens, undefined);
        assert.equal(payload.max_completion_tokens, undefined);
    });

    it("uses correct endpoint URL", async () => {
        mockFetch([responsesResponse({ text: "Hi." })]);
        await runResponses();
        const requestedUrl = fetchCalls[0].url;
        assert.ok(requestedUrl.includes("/v1/responses"));
    });

    it("passes reasoning_effort and omits temperature", async () => {
        mockFetch([responsesResponse({ text: "Thought carefully." })]);
        await runResponses({
            providerConfig: { reasoning_effort: "medium" },
        });
        const payload = fetchCalls[0].body;
        assert.deepEqual(payload.reasoning, { effort: "medium" });
        assert.equal(payload.temperature, undefined);
    });

    it("includes temperature when reasoning_effort is not set", async () => {
        mockFetch([responsesResponse({ text: "Answer." })]);
        await runResponses();
        const payload = fetchCalls[0].body;
        assert.equal(payload.temperature, 0.2);
        assert.equal(payload.reasoning, undefined);
    });
});
|
|
491
|
+
// ---------------------------------------------------------------------------
|
|
492
|
+
// Tests: Default routing (no apiVariant)
|
|
493
|
+
// ---------------------------------------------------------------------------
|
|
494
|
+
describe("runOpenAIToolLoop — default routing", () => {
    // Each test swaps globalThis.fetch for a stub; keep the real one around and restore it.
    beforeEach(() => {
        originalFetch = globalThis.fetch;
    });
    afterEach(() => {
        globalThis.fetch = originalFetch;
    });

    it("defaults to Chat Completions when apiVariant is undefined", async () => {
        const reply = chatResponse({ content: "Default path.", finishReason: "stop" });
        mockFetch([reply]);
        // No apiVariant is supplied here, so the loop has to pick its default endpoint.
        const outcome = await runOpenAIToolLoop(baseConfig());
        assert.equal(outcome.output, "Default path.");
        const requestedUrl = fetchCalls[0].url;
        assert.ok(requestedUrl.includes("/v1/chat/completions"));
    });
});
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
* and normalizes weight fields.
|
|
13
13
|
*
|
|
14
14
|
* @see docs/design-docs/architecture-overhaul/scoring-rubrics-assertions.md
|
|
15
|
-
* @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
15
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
16
16
|
*/
|
|
17
17
|
import type { GeneralizedAssertionDefinition } from "../../_vendor/ailf-core/index.d.ts";
|
|
18
18
|
import type { EvalMode } from "../../_vendor/ailf-shared/index.d.ts";
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
* and normalizes weight fields.
|
|
13
13
|
*
|
|
14
14
|
* @see docs/design-docs/architecture-overhaul/scoring-rubrics-assertions.md
|
|
15
|
-
* @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
15
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
16
16
|
*/
|
|
17
17
|
// ---------------------------------------------------------------------------
|
|
18
18
|
// Known assertion types and their mode compatibility
|
|
@@ -1,15 +1,10 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* compiler-to-yaml.ts — Serialize compiled Promptfoo config to YAML files.
|
|
3
3
|
*
|
|
4
|
-
* This is the bridge between the
|
|
5
|
-
* CompiledPromptfooConfig) and the
|
|
4
|
+
* This is the bridge between the compiler pipeline (in-memory
|
|
5
|
+
* CompiledPromptfooConfig) and the RunEvalStep which reads
|
|
6
6
|
* YAML config files from disk.
|
|
7
7
|
*
|
|
8
|
-
* The output YAML files are identical in structure to what the legacy
|
|
9
|
-
* generate-configs.ts produces, so RunEvalStep, CalculateScoresStep,
|
|
10
|
-
* and all downstream steps work without modification.
|
|
11
|
-
*
|
|
12
|
-
* @see packages/eval/src/pipeline/generate-configs.ts — legacy path
|
|
13
8
|
* @see packages/eval/src/orchestration/steps/run-eval-step.ts — consumer
|
|
14
9
|
*/
|
|
15
10
|
import type { Logger, ModeCompileResult } from "../../_vendor/ailf-core/index.d.ts";
|
|
@@ -1,15 +1,10 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* compiler-to-yaml.ts — Serialize compiled Promptfoo config to YAML files.
|
|
3
3
|
*
|
|
4
|
-
* This is the bridge between the
|
|
5
|
-
* CompiledPromptfooConfig) and the
|
|
4
|
+
* This is the bridge between the compiler pipeline (in-memory
|
|
5
|
+
* CompiledPromptfooConfig) and the RunEvalStep which reads
|
|
6
6
|
* YAML config files from disk.
|
|
7
7
|
*
|
|
8
|
-
* The output YAML files are identical in structure to what the legacy
|
|
9
|
-
* generate-configs.ts produces, so RunEvalStep, CalculateScoresStep,
|
|
10
|
-
* and all downstream steps work without modification.
|
|
11
|
-
*
|
|
12
|
-
* @see packages/eval/src/pipeline/generate-configs.ts — legacy path
|
|
13
8
|
* @see packages/eval/src/orchestration/steps/run-eval-step.ts — consumer
|
|
14
9
|
*/
|
|
15
10
|
import { existsSync, mkdirSync, writeFileSync } from "fs";
|
|
@@ -54,3 +54,17 @@ export declare class ConfigNotFoundError extends Error {
|
|
|
54
54
|
readonly searchDir: string;
|
|
55
55
|
constructor(configName: string, searchDir: string);
|
|
56
56
|
}
|
|
57
|
+
/**
 * Pick the directory a package subdirectory should be loaded from.
 *
 * When `@sanity/ailf-core` cannot be resolved from the package root (the
 * package was installed outside the monorepo, e.g. via npx or npm) and a
 * copy of the subdirectory exists under `dist/`, that vendored copy is
 * returned. In every other case the source subdirectory is returned.
 *
 * The vendored copies are produced by the build step
 * (bundle-workspace-deps.ts), which copies config/ and tasks/ into dist/
 * and rewrites `@sanity/ailf-core` imports to the `_vendor/` path.
 *
 * @param rootDir - Package root directory (packages/eval)
 * @param subdir - Subdirectory to resolve (e.g., "config", "tasks/literacy")
 * @returns Absolute path of either `<rootDir>/dist/<subdir>` or `<rootDir>/<subdir>`
 */
export declare function resolveVendoredSubdir(rootDir: string, subdir: string): string;
|
|
@@ -36,7 +36,8 @@ import { resolve } from "path";
|
|
|
36
36
|
*/
|
|
37
37
|
export function loadConfigFile(name, rootDir, options) {
|
|
38
38
|
const subdir = options?.subdir ?? "config";
|
|
39
|
-
const
|
|
39
|
+
const baseDir = resolveVendoredSubdir(rootDir, subdir);
|
|
40
|
+
const basePath = resolve(baseDir, name);
|
|
40
41
|
// Priority chain: .ts > .js > .yaml > .yml > .json
|
|
41
42
|
const candidates = [
|
|
42
43
|
{ ext: ".ts", format: "ts" },
|
|
@@ -60,7 +61,7 @@ export function loadConfigFile(name, rootDir, options) {
|
|
|
60
61
|
}
|
|
61
62
|
}
|
|
62
63
|
// Always throw ConfigNotFoundError so tryLoadConfigFile can catch it
|
|
63
|
-
throw new ConfigNotFoundError(name,
|
|
64
|
+
throw new ConfigNotFoundError(name, baseDir);
|
|
64
65
|
}
|
|
65
66
|
/**
|
|
66
67
|
* Try to load a config file, returning null if not found.
|
|
@@ -88,6 +89,45 @@ export class ConfigNotFoundError extends Error {
|
|
|
88
89
|
}
|
|
89
90
|
}
|
|
90
91
|
// ---------------------------------------------------------------------------
|
|
92
|
+
// Vendored config detection
|
|
93
|
+
// ---------------------------------------------------------------------------
|
|
94
|
+
/**
 * Pick the directory a package subdirectory should be loaded from.
 *
 * When `shouldUseVendoredConfigs(rootDir)` reports that the vendored build
 * should be used and `<rootDir>/dist/<subdir>` exists on disk, that vendored
 * directory is returned. Otherwise the source directory `<rootDir>/<subdir>`
 * is returned.
 *
 * The vendored copies are produced by the build step
 * (bundle-workspace-deps.ts), which copies config/ and tasks/ into dist/
 * and rewrites `@sanity/ailf-core` imports to the `_vendor/` path.
 *
 * @param rootDir - Package root directory (packages/eval)
 * @param subdir - Subdirectory to resolve (e.g., "config", "tasks/literacy")
 * @returns The resolved subdirectory path — either source or dist/ vendored
 */
export function resolveVendoredSubdir(rootDir, subdir) {
    const sourceDir = resolve(rootDir, subdir);
    if (!shouldUseVendoredConfigs(rootDir)) {
        return sourceDir;
    }
    // Only prefer dist/ when the build actually emitted this subdirectory.
    const distDir = resolve(rootDir, "dist", subdir);
    return existsSync(distDir) ? distDir : sourceDir;
}
|
|
115
|
+
/**
 * Memo of the vendored-config decision, keyed by resolved package root.
 * Keying by root keeps the answer for one root from being reused for another.
 */
const _vendoredCache = new Map();
/**
 * Decide whether configs should be loaded from the vendored dist/ copies.
 *
 * Probes whether `@sanity/ailf-core` can be resolved relative to
 * `<rootDir>/package.json`. If it resolves, the source configs are usable and
 * this returns `false`; if resolution throws, it returns `true`.
 *
 * The result is memoized per root directory, so the module-resolution probe
 * runs at most once for each distinct root.
 *
 * @param rootDir - Package root directory to probe from.
 * @returns `true` when `@sanity/ailf-core` is not resolvable from `rootDir`.
 */
function shouldUseVendoredConfigs(rootDir) {
    const cacheKey = resolve(rootDir);
    const cached = _vendoredCache.get(cacheKey);
    if (cached !== undefined) {
        return cached;
    }
    let useVendored;
    // Check if @sanity/ailf-core resolves (i.e., we're in the monorepo)
    try {
        const require = createRequire(resolve(rootDir, "package.json"));
        require.resolve("@sanity/ailf-core");
        useVendored = false;
    }
    catch {
        useVendored = true;
    }
    _vendoredCache.set(cacheKey, useVendored);
    return useVendored;
}
|
|
130
|
+
// ---------------------------------------------------------------------------
|
|
91
131
|
// Format-specific loaders
|
|
92
132
|
// ---------------------------------------------------------------------------
|
|
93
133
|
function loadTsFile(filePath, format) {
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
* as described in the fixtures-artifacts design doc.
|
|
16
16
|
*
|
|
17
17
|
* @see docs/design-docs/architecture-overhaul/fixtures-artifacts.md
|
|
18
|
-
* @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
18
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
19
19
|
*/
|
|
20
20
|
import type { GeneralizedTaskDefinition, ResolvedFixture, VariableEnvelope } from "../../_vendor/ailf-core/index.d.ts";
|
|
21
21
|
/** Options for fixture resolution */
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
* as described in the fixtures-artifacts design doc.
|
|
16
16
|
*
|
|
17
17
|
* @see docs/design-docs/architecture-overhaul/fixtures-artifacts.md
|
|
18
|
-
* @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
18
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
19
19
|
*/
|
|
20
20
|
import { existsSync, readFileSync } from "fs";
|
|
21
21
|
import { resolve } from "path";
|