@sanity/ailf 1.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +29 -12
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
- package/dist/_vendor/ailf-core/config-helpers.js +51 -2
- package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
- package/dist/_vendor/ailf-core/examples/index.js +213 -94
- package/dist/_vendor/ailf-core/index.d.ts +3 -2
- package/dist/_vendor/ailf-core/index.js +2 -1
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
- package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +7 -1
- package/dist/adapters/config-sources/ts-config-loader.js +21 -13
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
- package/dist/adapters/task-sources/index.d.ts +3 -4
- package/dist/adapters/task-sources/index.js +3 -4
- package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
- package/dist/adapters/task-sources/repo-schemas.js +228 -20
- package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
- package/dist/adapters/task-sources/repo-task-source.js +81 -122
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
- package/dist/adapters/task-sources/task-file-loader.js +21 -7
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/coverage-audit.js +3 -1
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +5 -4
- package/dist/commands/init.js +190 -25
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +15 -4
- package/dist/composition-root.js +100 -55
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/build-step-sequence.js +4 -2
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +32 -19
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +77 -26
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +51 -31
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
- package/dist/pipeline/compiler/literacy-bridge.js +2 -2
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
- package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
- package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/expand-tasks.d.ts +2 -2
- package/dist/pipeline/expand-tasks.js +2 -2
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +16 -9
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +16 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
- package/dist/pipeline/mirror-repo-tasks.js +10 -10
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +68 -30
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +32 -24
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenAI multi-turn tool execution loop.
|
|
3
|
+
*
|
|
4
|
+
* Supports two OpenAI API surfaces:
|
|
5
|
+
* - **Chat Completions** (`/v1/chat/completions`) — used by `openai:chat:*` models
|
|
6
|
+
* - **Responses** (`/v1/responses`) — used by `openai:responses:*` models (GPT-5.x)
|
|
7
|
+
*
|
|
8
|
+
* Both follow the same loop pattern: send prompt → model calls tools → execute
|
|
9
|
+
* via MCP → feed results back → repeat until final text or maxToolRounds.
|
|
10
|
+
*/
|
|
11
|
+
/**
 * Translate MCP tool descriptors into Chat Completions function-tool entries.
 *
 * Every entry nests the tool's name, description and parameter schema under a
 * `function` key. A falsy description is replaced by `MCP tool: <name>`, and a
 * falsy `inputSchema` is replaced by an empty object schema.
 *
 * @param tools Array of tool descriptors; `name`, `description` and
 *   `inputSchema` are the only properties read.
 * @returns One `{ type: "function", function: { ... } }` entry per tool, in
 *   input order.
 */
function toOpenAIChatTools(tools) {
    return tools.map((tool) => {
        const { name, description, inputSchema } = tool;
        // Build the nested payload first so the wrapper below stays trivial.
        const fn = {
            name,
            description: description || `MCP tool: ${name}`,
            parameters: inputSchema || { type: "object", properties: {} },
        };
        return { type: "function", function: fn };
    });
}
|
|
22
|
+
/**
 * Translate MCP tool descriptors into Responses API function-tool entries.
 *
 * Unlike the Chat Completions shape, the name, description and parameter
 * schema sit directly on the entry next to `type`. A falsy description is
 * replaced by `MCP tool: <name>`, and a falsy `inputSchema` is replaced by an
 * empty object schema.
 *
 * @param tools Array of tool descriptors; `name`, `description` and
 *   `inputSchema` are the only properties read.
 * @returns One flat `{ type: "function", name, description, parameters }`
 *   entry per tool, in input order.
 */
function toOpenAIResponsesTools(tools) {
    return tools.map((tool) => {
        const { name, description, inputSchema } = tool;
        const entry = {
            type: "function",
            name,
            description: description || `MCP tool: ${name}`,
            parameters: inputSchema || { type: "object", properties: {} },
        };
        return entry;
    });
}
|
|
31
|
+
/**
 * System message text telling the model that MCP-provided tools are available
 * and should be used to complete the task.
 */
const SYSTEM_PROMPT = [
    "You are an AI assistant with access to tools provided by an MCP server. ",
    "Use the available tools to complete the task. Call tools with correct parameters, ",
    "interpret responses, and provide a complete answer.",
].join("");
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
// Chat Completions tool loop
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
async function runChatCompletionsLoop(config) {
|
|
38
|
+
const { prompt, tools, callTool, maxToolRounds, model, temperature, maxTokens, apiKey, providerConfig, } = config;
|
|
39
|
+
const openaiTools = toOpenAIChatTools(tools);
|
|
40
|
+
// GPT-5.x and o-series use max_completion_tokens instead of max_tokens.
|
|
41
|
+
// NOTE: This model-name heuristic must be updated when OpenAI ships new
|
|
42
|
+
// model families. Prefer setting max_completion_tokens in providerConfig
|
|
43
|
+
// for new models. Matches the pattern in agentic-provider.ts.
|
|
44
|
+
const useMaxCompletionTokens = providerConfig?.max_output_tokens != null ||
|
|
45
|
+
providerConfig?.max_completion_tokens != null ||
|
|
46
|
+
model.startsWith("gpt-5") ||
|
|
47
|
+
model.startsWith("o3") ||
|
|
48
|
+
model.startsWith("o4");
|
|
49
|
+
const tokenLimitParam = useMaxCompletionTokens
|
|
50
|
+
? { max_completion_tokens: maxTokens }
|
|
51
|
+
: { max_tokens: maxTokens };
|
|
52
|
+
const messages = [
|
|
53
|
+
{ content: SYSTEM_PROMPT, role: "system" },
|
|
54
|
+
{ content: prompt, role: "user" },
|
|
55
|
+
];
|
|
56
|
+
let promptTokens = 0;
|
|
57
|
+
let completionTokens = 0;
|
|
58
|
+
let totalTokens = 0;
|
|
59
|
+
const startTime = Date.now();
|
|
60
|
+
const toolCallLog = [];
|
|
61
|
+
for (let round = 0; round <= maxToolRounds; round++) {
|
|
62
|
+
const isLastRound = round === maxToolRounds;
|
|
63
|
+
const response = await fetch("https://api.openai.com/v1/chat/completions", {
|
|
64
|
+
body: JSON.stringify({
|
|
65
|
+
...tokenLimitParam,
|
|
66
|
+
messages,
|
|
67
|
+
model,
|
|
68
|
+
temperature,
|
|
69
|
+
tool_choice: isLastRound ? "none" : "auto",
|
|
70
|
+
tools: openaiTools,
|
|
71
|
+
}),
|
|
72
|
+
headers: {
|
|
73
|
+
Authorization: `Bearer ${apiKey}`,
|
|
74
|
+
"Content-Type": "application/json",
|
|
75
|
+
},
|
|
76
|
+
method: "POST",
|
|
77
|
+
});
|
|
78
|
+
if (!response.ok) {
|
|
79
|
+
const text = await response.text().catch(() => "");
|
|
80
|
+
throw new Error(`OpenAI Chat Completions API returned HTTP ${response.status}: ${text.slice(0, 200)}`);
|
|
81
|
+
}
|
|
82
|
+
const data = (await response.json());
|
|
83
|
+
if (data.error) {
|
|
84
|
+
throw new Error(data.error.message ?? "Unknown OpenAI Chat Completions error");
|
|
85
|
+
}
|
|
86
|
+
promptTokens += data.usage?.prompt_tokens ?? 0;
|
|
87
|
+
completionTokens += data.usage?.completion_tokens ?? 0;
|
|
88
|
+
totalTokens += data.usage?.total_tokens ?? 0;
|
|
89
|
+
const assistantMessage = data.choices?.[0]?.message;
|
|
90
|
+
const finishReason = data.choices?.[0]?.finish_reason;
|
|
91
|
+
if (!assistantMessage) {
|
|
92
|
+
return {
|
|
93
|
+
output: "",
|
|
94
|
+
toolCallLog,
|
|
95
|
+
tokenUsage: {
|
|
96
|
+
prompt: promptTokens,
|
|
97
|
+
completion: completionTokens,
|
|
98
|
+
total: totalTokens,
|
|
99
|
+
},
|
|
100
|
+
toolRounds: round,
|
|
101
|
+
latencyMs: Date.now() - startTime,
|
|
102
|
+
};
|
|
103
|
+
}
|
|
104
|
+
messages.push(assistantMessage);
|
|
105
|
+
// If the model didn't call tools, it's done
|
|
106
|
+
if (finishReason !== "tool_calls" || !assistantMessage.tool_calls?.length) {
|
|
107
|
+
return {
|
|
108
|
+
output: assistantMessage.content ?? "",
|
|
109
|
+
toolCallLog,
|
|
110
|
+
tokenUsage: {
|
|
111
|
+
prompt: promptTokens,
|
|
112
|
+
completion: completionTokens,
|
|
113
|
+
total: totalTokens,
|
|
114
|
+
},
|
|
115
|
+
toolRounds: round,
|
|
116
|
+
latencyMs: Date.now() - startTime,
|
|
117
|
+
};
|
|
118
|
+
}
|
|
119
|
+
// Execute each tool call via MCP
|
|
120
|
+
for (const toolCall of assistantMessage.tool_calls) {
|
|
121
|
+
const toolName = toolCall.function.name;
|
|
122
|
+
let toolInput;
|
|
123
|
+
try {
|
|
124
|
+
toolInput = JSON.parse(toolCall.function.arguments);
|
|
125
|
+
}
|
|
126
|
+
catch {
|
|
127
|
+
toolInput = { _raw: toolCall.function.arguments };
|
|
128
|
+
}
|
|
129
|
+
try {
|
|
130
|
+
const result = await callTool(toolName, toolInput);
|
|
131
|
+
const content = result.error
|
|
132
|
+
? JSON.stringify({ error: result.error })
|
|
133
|
+
: result.content;
|
|
134
|
+
toolCallLog.push({ name: toolName, input: toolInput, output: content });
|
|
135
|
+
messages.push({
|
|
136
|
+
content,
|
|
137
|
+
role: "tool",
|
|
138
|
+
tool_call_id: toolCall.id,
|
|
139
|
+
});
|
|
140
|
+
}
|
|
141
|
+
catch (err) {
|
|
142
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
143
|
+
toolCallLog.push({
|
|
144
|
+
name: toolName,
|
|
145
|
+
input: toolInput,
|
|
146
|
+
output: `Error: ${errMsg}`,
|
|
147
|
+
});
|
|
148
|
+
messages.push({
|
|
149
|
+
content: JSON.stringify({ error: errMsg }),
|
|
150
|
+
role: "tool",
|
|
151
|
+
tool_call_id: toolCall.id,
|
|
152
|
+
});
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
// Exhausted rounds — extract last assistant text
|
|
157
|
+
const lastAssistant = messages
|
|
158
|
+
.filter((m) => m.role === "assistant" && m.content)
|
|
159
|
+
.pop();
|
|
160
|
+
return {
|
|
161
|
+
output: lastAssistant?.content ?? "[Exhausted tool rounds without final answer]",
|
|
162
|
+
toolCallLog,
|
|
163
|
+
tokenUsage: {
|
|
164
|
+
prompt: promptTokens,
|
|
165
|
+
completion: completionTokens,
|
|
166
|
+
total: totalTokens,
|
|
167
|
+
},
|
|
168
|
+
toolRounds: maxToolRounds,
|
|
169
|
+
exhaustedRounds: true,
|
|
170
|
+
latencyMs: Date.now() - startTime,
|
|
171
|
+
};
|
|
172
|
+
}
|
|
173
|
+
// ---------------------------------------------------------------------------
|
|
174
|
+
// Responses API tool loop
|
|
175
|
+
// ---------------------------------------------------------------------------
|
|
176
|
+
async function runResponsesLoop(config) {
|
|
177
|
+
const { prompt, tools, callTool, maxToolRounds, model, temperature, maxTokens, apiKey, providerConfig, } = config;
|
|
178
|
+
const openaiTools = toOpenAIResponsesTools(tools);
|
|
179
|
+
// Input for the current round — replaced each iteration with tool outputs
|
|
180
|
+
let input = [
|
|
181
|
+
{ type: "message", role: "system", content: SYSTEM_PROMPT },
|
|
182
|
+
{ type: "message", role: "user", content: prompt },
|
|
183
|
+
];
|
|
184
|
+
let inputTokens = 0;
|
|
185
|
+
let outputTokens = 0;
|
|
186
|
+
let totalTokens = 0;
|
|
187
|
+
const startTime = Date.now();
|
|
188
|
+
const toolCallLog = [];
|
|
189
|
+
// Track the previous response ID for conversation chaining
|
|
190
|
+
let previousResponseId;
|
|
191
|
+
// Pass through reasoning_effort if configured
|
|
192
|
+
const reasoningEffort = providerConfig?.reasoning_effort;
|
|
193
|
+
for (let round = 0; round <= maxToolRounds; round++) {
|
|
194
|
+
const isLastRound = round === maxToolRounds;
|
|
195
|
+
const body = {
|
|
196
|
+
model,
|
|
197
|
+
max_output_tokens: maxTokens,
|
|
198
|
+
tools: openaiTools,
|
|
199
|
+
tool_choice: isLastRound ? "none" : "auto",
|
|
200
|
+
};
|
|
201
|
+
// Reasoning models (GPT-5.x, o-series) use reasoning.effort instead of
|
|
202
|
+
// temperature — the Responses API rejects temperature when reasoning is set.
|
|
203
|
+
if (reasoningEffort) {
|
|
204
|
+
body.reasoning = { effort: reasoningEffort };
|
|
205
|
+
}
|
|
206
|
+
else {
|
|
207
|
+
body.temperature = temperature;
|
|
208
|
+
}
|
|
209
|
+
body.input = input;
|
|
210
|
+
if (previousResponseId) {
|
|
211
|
+
body.previous_response_id = previousResponseId;
|
|
212
|
+
}
|
|
213
|
+
const response = await fetch("https://api.openai.com/v1/responses", {
|
|
214
|
+
body: JSON.stringify(body),
|
|
215
|
+
headers: {
|
|
216
|
+
Authorization: `Bearer ${apiKey}`,
|
|
217
|
+
"Content-Type": "application/json",
|
|
218
|
+
},
|
|
219
|
+
method: "POST",
|
|
220
|
+
});
|
|
221
|
+
if (!response.ok) {
|
|
222
|
+
const text = await response.text().catch(() => "");
|
|
223
|
+
throw new Error(`OpenAI Responses API returned HTTP ${response.status}: ${text.slice(0, 200)}`);
|
|
224
|
+
}
|
|
225
|
+
const data = (await response.json());
|
|
226
|
+
if (data.error) {
|
|
227
|
+
throw new Error(data.error.message ?? "Unknown OpenAI Responses API error");
|
|
228
|
+
}
|
|
229
|
+
inputTokens += data.usage?.input_tokens ?? 0;
|
|
230
|
+
outputTokens += data.usage?.output_tokens ?? 0;
|
|
231
|
+
totalTokens += data.usage?.total_tokens ?? 0;
|
|
232
|
+
previousResponseId = data.id;
|
|
233
|
+
// Extract function calls and message outputs
|
|
234
|
+
const functionCalls = data.output.filter((item) => item.type === "function_call");
|
|
235
|
+
const messageItems = data.output.filter((item) => item.type === "message");
|
|
236
|
+
// If no function calls, model is done — extract text
|
|
237
|
+
if (functionCalls.length === 0) {
|
|
238
|
+
const output = messageItems
|
|
239
|
+
.flatMap((m) => m.content)
|
|
240
|
+
.filter((c) => c.type === "output_text")
|
|
241
|
+
.map((c) => c.text)
|
|
242
|
+
.join("\n") || "";
|
|
243
|
+
return {
|
|
244
|
+
output,
|
|
245
|
+
toolCallLog,
|
|
246
|
+
tokenUsage: {
|
|
247
|
+
prompt: inputTokens,
|
|
248
|
+
completion: outputTokens,
|
|
249
|
+
total: totalTokens,
|
|
250
|
+
},
|
|
251
|
+
toolRounds: round,
|
|
252
|
+
latencyMs: Date.now() - startTime,
|
|
253
|
+
};
|
|
254
|
+
}
|
|
255
|
+
// Execute each function call via MCP and build tool output items for next request
|
|
256
|
+
const toolOutputItems = [];
|
|
257
|
+
for (const fc of functionCalls) {
|
|
258
|
+
const toolName = fc.name;
|
|
259
|
+
let toolInput;
|
|
260
|
+
try {
|
|
261
|
+
toolInput = JSON.parse(fc.arguments);
|
|
262
|
+
}
|
|
263
|
+
catch {
|
|
264
|
+
toolInput = { _raw: fc.arguments };
|
|
265
|
+
}
|
|
266
|
+
try {
|
|
267
|
+
const result = await callTool(toolName, toolInput);
|
|
268
|
+
const content = result.error
|
|
269
|
+
? JSON.stringify({ error: result.error })
|
|
270
|
+
: result.content;
|
|
271
|
+
toolCallLog.push({ name: toolName, input: toolInput, output: content });
|
|
272
|
+
toolOutputItems.push({
|
|
273
|
+
type: "function_call_output",
|
|
274
|
+
call_id: fc.call_id,
|
|
275
|
+
output: content,
|
|
276
|
+
});
|
|
277
|
+
}
|
|
278
|
+
catch (err) {
|
|
279
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
280
|
+
toolCallLog.push({
|
|
281
|
+
name: toolName,
|
|
282
|
+
input: toolInput,
|
|
283
|
+
output: `Error: ${errMsg}`,
|
|
284
|
+
});
|
|
285
|
+
toolOutputItems.push({
|
|
286
|
+
type: "function_call_output",
|
|
287
|
+
call_id: fc.call_id,
|
|
288
|
+
output: JSON.stringify({ error: errMsg }),
|
|
289
|
+
});
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
// For the next round, send tool outputs as input (chained via previous_response_id)
|
|
293
|
+
input = toolOutputItems;
|
|
294
|
+
}
|
|
295
|
+
// Exhausted rounds — no final text available
|
|
296
|
+
return {
|
|
297
|
+
output: "[Exhausted tool rounds without final answer]",
|
|
298
|
+
toolCallLog,
|
|
299
|
+
tokenUsage: {
|
|
300
|
+
prompt: inputTokens,
|
|
301
|
+
completion: outputTokens,
|
|
302
|
+
total: totalTokens,
|
|
303
|
+
},
|
|
304
|
+
toolRounds: maxToolRounds,
|
|
305
|
+
exhaustedRounds: true,
|
|
306
|
+
latencyMs: Date.now() - startTime,
|
|
307
|
+
};
|
|
308
|
+
}
|
|
309
|
+
// ---------------------------------------------------------------------------
|
|
310
|
+
// Public entry point
|
|
311
|
+
// ---------------------------------------------------------------------------
|
|
312
|
+
/**
|
|
313
|
+
* Run a multi-turn tool loop using the OpenAI API.
|
|
314
|
+
*
|
|
315
|
+
* Routes to Chat Completions or Responses API based on `config.apiVariant`:
|
|
316
|
+
* - `"responses"` → Responses API (`/v1/responses`)
|
|
317
|
+
* - `"chat"` or undefined → Chat Completions API (`/v1/chat/completions`)
|
|
318
|
+
*/
|
|
319
|
+
export async function runOpenAIToolLoop(config) {
|
|
320
|
+
return config.apiVariant === "responses"
|
|
321
|
+
? runResponsesLoop(config)
|
|
322
|
+
: runChatCompletionsLoop(config);
|
|
323
|
+
}
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared types for the MCP tool provider.
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Optional per-call context that can accompany a prompt.
 * Both members may be omitted.
 */
export interface CallApiContextParams {
    /** Named values; each entry is either a string or an object. */
    vars?: Record<string, object | string>;
    /** Prompt descriptor: the required `raw` text plus an optional `label`. */
    prompt?: {
        label?: string;
        raw: string;
    };
}
|
|
11
|
+
/** Options bag for a provider; both members are optional. */
export interface ProviderOptions {
    /** Identifier for the provider instance. */
    id?: string;
    /** Free-form configuration values keyed by name. */
    config?: Record<string, unknown>;
}
|
|
15
|
+
/** Shape of a provider response; every member is optional. */
export interface ProviderResponse {
    /** Response payload, either text or a structured object. */
    output?: object | string;
    /** Error description, when one is reported. */
    error?: string;
    /** Token counters for the call. */
    tokenUsage?: {
        prompt?: number;
        completion?: number;
        total?: number;
        cached?: number;
    };
    /** Numeric cost figure — presumably a monetary amount; unit not visible here. */
    cost?: number;
    /** Cache flag — presumably true when the response came from a cache; TODO confirm. */
    cached?: boolean;
    /** Additional free-form data keyed by name. */
    metadata?: Record<string, unknown>;
}
|
|
28
|
+
/** Descriptor of a single tool offered by an MCP server. */
export interface MCPTool {
    /** Tool name; the only required member. */
    name: string;
    /** Schema object describing the tool's input, when provided. */
    inputSchema?: Record<string, unknown>;
    /** Human-readable description, when provided. */
    description?: string;
}
|
|
33
|
+
/** Client handle for an MCP server: list tools, invoke one, and clean up. */
export interface MCPClient {
    /**
     * Invoke a tool by name with the given arguments.
     * Resolves to the tool's `content` string and an optional `error` string.
     */
    callTool: (name: string, args: Record<string, unknown>) => Promise<{
        content: string;
        error?: string;
    }>;
    /** Return every tool the client knows about (synchronous). */
    getAllTools: () => MCPTool[];
    /** Asynchronous teardown hook. */
    cleanup: () => Promise<void>;
}
|
|
41
|
+
/** One record in a tool-call log: which tool ran, with what input, and what came back. */
export interface ToolCallEntry {
    /** Name of the tool that was called. */
    name: string;
    /** Output recorded for the call, as a string. */
    output: string;
    /** Arguments passed to the tool; the shape is not constrained. */
    input: unknown;
}
|
|
46
|
+
/** Everything an LLM backend's tool loop needs; handed over by the provider. */
export interface ToolLoopConfig {
    /** Prompt text supplied by the user. */
    prompt: string;
    /** LLM model name, in the provider's own naming (e.g. "claude-opus-4-6"). */
    model: string;
    /** Credential for the LLM provider's API. */
    apiKey: string;
    /** MCP tools on offer, already filtered by capabilities. */
    tools: MCPTool[];
    /** Calls an MCP tool by name; resolves to its `content` and an optional `error`. */
    callTool: (name: string, args: Record<string, unknown>) => Promise<{
        content: string;
        error?: string;
    }>;
    /** Upper bound on tool-call rounds before a final answer is forced. */
    maxToolRounds: number;
    /** Sampling temperature. */
    temperature: number;
    /** Upper bound on output tokens. */
    maxTokens: number;
    /**
     * Which OpenAI endpoint to call.
     * - `"chat"` → `/v1/chat/completions` (the default)
     * - `"responses"` → `/v1/responses`
     *
     * Taken from the model ID: `openai:responses:gpt-5.4` → `"responses"`
     */
    apiVariant?: "chat" | "responses";
    /** Extra model-specific settings passed through from the provider. */
    providerConfig?: Record<string, unknown>;
}
|
|
78
|
+
/** What an LLM backend's tool loop hands back when it finishes. */
export interface ToolLoopResult {
    /** Final text produced by the LLM. */
    output: string;
    /** Count of tool rounds completed. */
    toolRounds: number;
    /** Set when the loop used up `maxToolRounds`. */
    exhaustedRounds?: boolean;
    /** Every tool call made while the loop ran. */
    toolCallLog: ToolCallEntry[];
    /** Token usage statistics. */
    tokenUsage: {
        prompt: number;
        completion: number;
        total: number;
    };
    /** Wall-clock duration in milliseconds. */
    latencyMs: number;
}
|
|
97
|
+
/**
 * Signature shared by every LLM backend capable of running a multi-turn
 * tool loop.
 *
 * Each backend (Anthropic, OpenAI) provides a function of this type; the
 * provider picks one according to the model ID prefix.
 */
export type RunToolLoop = (
    config: ToolLoopConfig,
) => Promise<ToolLoopResult>;
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* mcp-tool-provider.ts — Custom Promptfoo provider for MCP tool-use evaluation.
|
|
3
|
+
*
|
|
4
|
+
* Implements a multi-turn tool execution loop: the LLM receives a prompt,
|
|
5
|
+
* discovers MCP tools, calls them, gets results, and continues until it
|
|
6
|
+
* produces a final text answer or exhausts maxToolRounds.
|
|
7
|
+
*
|
|
8
|
+
* Promptfoo's built-in Anthropic/OpenAI providers with config.mcp only do
|
|
9
|
+
* single-turn tool calls. This provider fills that gap by managing the
|
|
10
|
+
* full conversation loop, similar to the agentic-provider.ts pattern.
|
|
11
|
+
*
|
|
12
|
+
* Promptfoo config usage:
|
|
13
|
+
*
|
|
14
|
+
* providers:
|
|
15
|
+
* - id: file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js
|
|
16
|
+
* label: "Claude Opus 4.6 + MCP"
|
|
17
|
+
* config:
|
|
18
|
+
* model: anthropic:messages:claude-opus-4-6
|
|
19
|
+
* maxToolRounds: 5
|
|
20
|
+
* temperature: 0.2
|
|
21
|
+
* max_tokens: 4096
|
|
22
|
+
* mcpServer:
|
|
23
|
+
* url: https://mcp.sanity.io
|
|
24
|
+
* auth: { type: bearer, token: "{{env.SANITY_API_TOKEN}}" }
|
|
25
|
+
* name: mcp-live-query-documents
|
|
26
|
+
* mcpTools: [query_documents, get_schema]
|
|
27
|
+
*/
|
|
28
|
+
/**
 * Optional per-call context accepted by `callApi` alongside the prompt.
 * Both members may be omitted.
 */
interface CallApiContextParams {
    /** Named values; each entry is either a string or an object. */
    vars?: Record<string, object | string>;
    /** Prompt descriptor: the required `raw` text plus an optional `label`. */
    prompt?: {
        label?: string;
        raw: string;
    };
}
|
|
35
|
+
/** Options accepted by the provider constructor; both members are optional. */
interface ProviderOptions {
    /** Identifier for the provider instance. */
    id?: string;
    /** Free-form configuration values keyed by name. */
    config?: Record<string, unknown>;
}
|
|
39
|
+
/** Value resolved by `callApi`; every member is optional. */
interface ProviderResponse {
    /** Response payload, either text or a structured object. */
    output?: object | string;
    /** Error description, when one is reported. */
    error?: string;
    /** Token counters for the call. */
    tokenUsage?: {
        prompt?: number;
        completion?: number;
        total?: number;
        cached?: number;
    };
    /** Numeric cost figure — presumably a monetary amount; unit not visible here. */
    cost?: number;
    /** Cache flag — presumably true when the response came from a cache; TODO confirm. */
    cached?: boolean;
    /** Additional free-form data keyed by name. */
    metadata?: Record<string, unknown>;
}
|
|
52
|
+
/**
 * Provider class for MCP tool-use evaluation (default export of this module).
 *
 * Only the public surface is described here: the `config` bag, `id()` and
 * `callApi()`. The private members are declared without types and are not
 * part of the contract.
 */
export default class MCPToolProvider {
    /** Free-form configuration values held by the instance. */
    config: Record<string, unknown>;
    /** @param options - Optional provider options (`id`, `config`). */
    constructor(options?: ProviderOptions);
    /** Return the provider's identifier string. */
    id(): string;
    /**
     * Handle one prompt and resolve to a provider response.
     *
     * @param prompt - Prompt text.
     * @param _context - Optional call context.
     */
    callApi(prompt: string, _context?: CallApiContextParams): Promise<ProviderResponse>;
    private providerId;
    private runAnthropicLoop;
    private connectMCP;
    /**
     * Resolve {{env.VAR}} templates in config values.
     */
    private resolveEnvTemplates;
}
|
|
65
|
+
export {};
|