@sanity/ailf 1.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +29 -12
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
- package/dist/_vendor/ailf-core/config-helpers.js +51 -2
- package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
- package/dist/_vendor/ailf-core/examples/index.js +213 -94
- package/dist/_vendor/ailf-core/index.d.ts +3 -2
- package/dist/_vendor/ailf-core/index.js +2 -1
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
- package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +7 -1
- package/dist/adapters/config-sources/ts-config-loader.js +21 -13
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
- package/dist/adapters/task-sources/index.d.ts +3 -4
- package/dist/adapters/task-sources/index.js +3 -4
- package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
- package/dist/adapters/task-sources/repo-schemas.js +228 -20
- package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
- package/dist/adapters/task-sources/repo-task-source.js +81 -122
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
- package/dist/adapters/task-sources/task-file-loader.js +21 -7
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/coverage-audit.js +3 -1
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +5 -4
- package/dist/commands/init.js +190 -25
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +15 -4
- package/dist/composition-root.js +100 -55
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/build-step-sequence.js +4 -2
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +32 -19
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +77 -26
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +51 -31
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
- package/dist/pipeline/compiler/literacy-bridge.js +2 -2
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
- package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
- package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/expand-tasks.d.ts +2 -2
- package/dist/pipeline/expand-tasks.js +2 -2
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +16 -9
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +16 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
- package/dist/pipeline/mirror-repo-tasks.js +10 -10
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +68 -30
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +32 -24
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCPToolProvider — Custom Promptfoo provider for MCP tool-use evaluation.
|
|
3
|
+
*
|
|
4
|
+
* Orchestrates the MCP evaluation flow:
|
|
5
|
+
* 1. Connects to the MCP server and discovers available tools
|
|
6
|
+
* 2. Selects the appropriate LLM backend based on model ID prefix
|
|
7
|
+
* 3. Delegates the multi-turn tool loop to the backend
|
|
8
|
+
* 4. Formats the result for Promptfoo (including tool call summary)
|
|
9
|
+
*
|
|
10
|
+
* Promptfoo config usage:
|
|
11
|
+
*
|
|
12
|
+
* providers:
|
|
13
|
+
* - id: file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js
|
|
14
|
+
* label: "Claude Opus 4.6 + MCP"
|
|
15
|
+
* config:
|
|
16
|
+
* model: anthropic:messages:claude-opus-4-6
|
|
17
|
+
* maxToolRounds: 5
|
|
18
|
+
* temperature: 0.2
|
|
19
|
+
* max_tokens: 4096
|
|
20
|
+
* mcpServer:
|
|
21
|
+
* url: https://mcp.sanity.io
|
|
22
|
+
* auth: { type: bearer, token: "{{env.SANITY_API_TOKEN}}" }
|
|
23
|
+
* name: mcp-live-query-documents
|
|
24
|
+
* mcpTools: [query_documents, get_schema]
|
|
25
|
+
*/
|
|
26
|
+
import type { CallApiContextParams, ProviderOptions, ProviderResponse } from "./types.js";
|
|
27
|
+
/**
 * Type declaration for the default-exported MCP tool provider class.
 *
 * This is the declaration-file view only: it lists the members callers can
 * see (a config record, a constructor, `id()` and `callApi()`), not their
 * implementation.
 */
export default class MCPToolProvider {
|
|
28
|
+
/**
 * Open, string-keyed configuration record with untyped values.
 * NOTE(review): presumably populated from the provider `config:` section of
 * the Promptfoo YAML (model, maxToolRounds, mcpServer, ...) — confirm against
 * the constructor implementation.
 */
config: Record<string, unknown>;
|
|
29
|
+
/**
 * Private member; the declaration carries no type annotation for it.
 * NOTE(review): presumably the value returned by `id()` — confirm.
 */
private providerId;
|
|
30
|
+
/**
 * @param options - Optional provider options; the instance can be
 *   constructed without arguments.
 */
constructor(options?: ProviderOptions);
|
|
31
|
+
/**
 * @returns A string identifier for this provider.
 */
id(): string;
|
|
32
|
+
/**
 * Asynchronous entry point that takes a prompt and resolves to a provider
 * response.
 *
 * @param prompt - The prompt text to evaluate.
 * @param _context - Optional call context. NOTE(review): the leading
 *   underscore suggests the implementation does not read it — confirm.
 * @returns A promise resolving to a `ProviderResponse`.
 */
callApi(prompt: string, _context?: CallApiContextParams): Promise<ProviderResponse>;
|
|
33
|
+
}
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCPToolProvider — Custom Promptfoo provider for MCP tool-use evaluation.
|
|
3
|
+
*
|
|
4
|
+
* Orchestrates the MCP evaluation flow:
|
|
5
|
+
* 1. Connects to the MCP server and discovers available tools
|
|
6
|
+
* 2. Selects the appropriate LLM backend based on model ID prefix
|
|
7
|
+
* 3. Delegates the multi-turn tool loop to the backend
|
|
8
|
+
* 4. Formats the result for Promptfoo (including tool call summary)
|
|
9
|
+
*
|
|
10
|
+
* Promptfoo config usage:
|
|
11
|
+
*
|
|
12
|
+
* providers:
|
|
13
|
+
* - id: file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js
|
|
14
|
+
* label: "Claude Opus 4.6 + MCP"
|
|
15
|
+
* config:
|
|
16
|
+
* model: anthropic:messages:claude-opus-4-6
|
|
17
|
+
* maxToolRounds: 5
|
|
18
|
+
* temperature: 0.2
|
|
19
|
+
* max_tokens: 4096
|
|
20
|
+
* mcpServer:
|
|
21
|
+
* url: https://mcp.sanity.io
|
|
22
|
+
* auth: { type: bearer, token: "{{env.SANITY_API_TOKEN}}" }
|
|
23
|
+
* name: mcp-live-query-documents
|
|
24
|
+
* mcpTools: [query_documents, get_schema]
|
|
25
|
+
*/
|
|
26
|
+
import { fileURLToPath } from "node:url";
import { config as loadDotenv } from "dotenv";
import { connectMCP } from "./mcp-connection.js";
import { runAnthropicToolLoop } from "./tool-loop-anthropic.js";
import { runOpenAIToolLoop } from "./tool-loop-openai.js";
|
|
30
|
+
// Load the .env file located five directories above this module before any
// provider code reads process.env. `override: true` lets values from that file
// replace variables that are already set in the environment.
//
// The file URL is converted with fileURLToPath() instead of reading
// URL#pathname: pathname stays percent-encoded (a space becomes "%20") and
// keeps a leading slash before Windows drive letters, so the .env file would
// not be found for checkouts in such locations.
loadDotenv({
    override: true,
    path: fileURLToPath(new URL("../../../../../.env", import.meta.url)),
});
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
// Backend registry — maps model ID prefixes to tool loop implementations
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
const BACKENDS = {
    anthropic: runAnthropicToolLoop,
    openai: runOpenAIToolLoop,
};
/**
 * Resolve the LLM backend from a model ID.
 *
 * Model IDs follow the pattern `provider:type:model-name` (e.g.,
 * `anthropic:messages:claude-opus-4-6`). The first segment determines
 * which backend handles the tool loop. For OpenAI, the second segment
 * determines the API variant (`chat` → Chat Completions, `responses` →
 * Responses API).
 *
 * @param modelId - Model identifier string.
 * @returns An object with the `backend` tool-loop function, the `modelName`
 *   to send to the API (everything after the second colon, or the last
 *   segment when the ID has fewer than three segments) and, for three-segment
 *   OpenAI IDs with a recognised second segment, the `apiVariant`.
 * @throws Error when `modelId` is not a string or its prefix has no
 *   registered backend.
 */
function resolveBackend(modelId) {
    if (typeof modelId !== "string") {
        throw new Error(`Model ID must be a string, received ${typeof modelId}`);
    }
    const parts = modelId.split(":");
    const prefix = parts[0];
    // Own-property lookup only: a plain `BACKENDS[prefix]` would also resolve
    // inherited keys such as "constructor" or "toString" and hand back a
    // function that is not a tool-loop backend.
    const backend = Object.prototype.hasOwnProperty.call(BACKENDS, prefix)
        ? BACKENDS[prefix]
        : undefined;
    if (!backend) {
        const supported = Object.keys(BACKENDS).join(", ");
        throw new Error(`No backend for model "${modelId}". Supported prefixes: ${supported}`);
    }
    // Extract the model name for the API (e.g., "claude-opus-4-6" from "anthropic:messages:claude-opus-4-6").
    // Segments beyond the third are re-joined so model names containing ":" survive.
    const modelName = parts.length > 2 ? parts.slice(2).join(":") : parts[parts.length - 1];
    // For OpenAI, extract the API variant from the second segment
    let apiVariant;
    if (prefix === "openai" && parts.length > 2) {
        const variant = parts[1];
        if (variant === "responses" || variant === "chat") {
            apiVariant = variant;
        }
    }
    return { backend, modelName, apiVariant };
}
|
|
70
|
+
// ---------------------------------------------------------------------------
|
|
71
|
+
// Helpers
|
|
72
|
+
// ---------------------------------------------------------------------------
|
|
73
|
+
/**
 * Append a machine-readable tool call summary for assertion detection.
 *
 * The summary is an HTML comment of the form
 * `<!-- MCP_TOOLS_CALLED: ["tool_a","tool_b"] -->`, separated from the text
 * by a blank line. Tool names appear in call order, duplicates included.
 *
 * @param text - Final model output. A null/undefined value is treated as an
 *   empty string when a summary is appended, so the literal word "undefined"
 *   never ends up in the output.
 * @param log - Tool call entries, each carrying a `name`. When it is missing,
 *   not an array, or empty, `text` is returned unchanged.
 * @returns The text with the summary appended, or `text` itself when no tool
 *   was called.
 */
function appendToolSummary(text, log) {
    if (!Array.isArray(log) || log.length === 0) {
        return text;
    }
    const names = JSON.stringify(log.map((tc) => tc.name));
    return `${text ?? ""}\n\n<!-- MCP_TOOLS_CALLED: ${names} -->`;
}
|
|
80
|
+
/**
 * Look up the API key to use for a model prefix.
 *
 * An explicit `config.apiKey` always wins and is returned as a string.
 * Otherwise the key comes from the environment: `ANTHROPIC_API_KEY` for the
 * "anthropic" prefix and `OPENAI_API_KEY` for the "openai" prefix.
 *
 * @param prefix - First segment of the model ID.
 * @param config - Provider configuration object.
 * @returns The API key, or undefined when the prefix is unknown or the
 *   corresponding environment variable is not set.
 */
function resolveApiKey(prefix, config) {
    const explicitKey = config.apiKey;
    if (explicitKey) {
        return String(explicitKey);
    }
    switch (prefix) {
        case "anthropic":
            return process.env["ANTHROPIC_API_KEY"];
        case "openai":
            return process.env["OPENAI_API_KEY"];
        default:
            return undefined;
    }
}
|
|
91
|
+
// ---------------------------------------------------------------------------
|
|
92
|
+
// Provider class
|
|
93
|
+
// ---------------------------------------------------------------------------
|
|
94
|
+
/**
 * Custom Promptfoo provider that connects to an MCP server, exposes its tools
 * to an LLM backend and returns the model's final answer.
 *
 * Every failure this class detects is reported as `{ error, output: undefined }`
 * rather than as a rejected promise.
 */
export default class MCPToolProvider {
    /** Provider configuration taken from `options.config` (empty object when omitted). */
    config;
    /** Identifier returned by `id()`. */
    providerId;
    /**
     * @param options - Provider options; `config` and `id` are read from it.
     */
    constructor(options = {}) {
        this.config = options.config || {};
        this.providerId = options.id || "mcp-tool-provider";
    }
    /** @returns The provider identifier ("mcp-tool-provider" unless overridden). */
    id() {
        return this.providerId;
    }
    /**
     * Run the MCP tool-use flow for one prompt.
     *
     * @param prompt - Prompt text forwarded to the backend tool loop.
     * @param _context - Promptfoo call context; unused.
     * @returns On success: the model output with the tool summary appended,
     *   token usage, and metadata (`toolRounds`, `toolCallLog`,
     *   `exhaustedRounds`, `latencyMs`). On failure: `{ error, output: undefined }`.
     */
    async callApi(prompt, _context) {
        const mcpServerConfig = this.config.mcpServer;
        if (!mcpServerConfig) {
            return { error: "mcpServer config is required", output: undefined };
        }
        // Resolve model and backend
        const modelId = this.config.model || "anthropic:messages:claude-opus-4-6";
        let backend;
        let modelName;
        let apiVariant;
        try {
            const resolved = resolveBackend(modelId);
            backend = resolved.backend;
            modelName = resolved.modelName;
            apiVariant = resolved.apiVariant;
        }
        catch (err) {
            return {
                error: err instanceof Error ? err.message : String(err),
                output: undefined,
            };
        }
        // Resolve API key
        const prefix = modelId.split(":")[0];
        const apiKey = resolveApiKey(prefix, this.config);
        if (!apiKey) {
            return {
                error: `API key not found for ${prefix}. Set ${prefix.toUpperCase()}_API_KEY in env or config.apiKey.`,
                output: undefined,
            };
        }
        // Connect to MCP server
        let mcpClient;
        try {
            mcpClient = await connectMCP(mcpServerConfig);
        }
        catch (err) {
            return {
                error: `Failed to connect to MCP server: ${err instanceof Error ? err.message : String(err)}`,
                output: undefined,
            };
        }
        try {
            // Filter tools by the optional mcpTools allow-list.
            const allTools = mcpClient.getAllTools();
            const toolFilter = this.config.mcpTools;
            let tools = allTools;
            if (toolFilter) {
                // A single name is accepted as shorthand for a one-element list.
                // Calling .includes() on the raw string would match substrings
                // of it instead of whole tool names.
                const allowed = typeof toolFilter === "string" ? [toolFilter] : toolFilter;
                if (!Array.isArray(allowed)) {
                    return {
                        error: "mcpTools must be a tool name or an array of tool names.",
                        output: undefined,
                    };
                }
                tools = allTools.filter((t) => allowed.includes(t.name));
            }
            if (tools.length === 0) {
                return {
                    error: "No MCP tools available after filtering. Check mcpTools config and server capabilities.",
                    output: undefined,
                };
            }
            // Run the tool loop
            const result = await backend({
                prompt,
                tools,
                callTool: mcpClient.callTool,
                // `??` keeps an explicit 0; `||` would silently turn it into 5.
                maxToolRounds: this.config.maxToolRounds ?? 5,
                model: modelName,
                temperature: this.config.temperature ?? 0.2,
                // First configured token limit wins; 0 is not a usable limit,
                // so falsy values fall through to the next candidate.
                maxTokens: this.config.max_output_tokens ||
                    this.config.max_completion_tokens ||
                    this.config.max_tokens ||
                    4096,
                apiKey,
                apiVariant,
                providerConfig: this.config,
            });
            return {
                cost: 0,
                metadata: {
                    toolRounds: result.toolRounds,
                    toolCallLog: result.toolCallLog,
                    exhaustedRounds: result.exhaustedRounds,
                    latencyMs: result.latencyMs,
                },
                output: appendToolSummary(result.output, result.toolCallLog),
                tokenUsage: result.tokenUsage,
            };
        }
        catch (err) {
            // Backend/API failures are reported the same way as every other
            // failure in this method instead of escaping as a rejection.
            return {
                error: `MCP tool loop failed: ${err instanceof Error ? err.message : String(err)}`,
                output: undefined,
            };
        }
        finally {
            // Best-effort shutdown; a cleanup failure must not mask the result.
            await mcpClient.cleanup().catch(() => { });
        }
    }
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP server connection and tool discovery.
|
|
3
|
+
*
|
|
4
|
+
* Handles connecting to an MCP server via streamable-http or stdio transport,
|
|
5
|
+
* discovering available tools, and resolving {{env.VAR}} templates in config.
|
|
6
|
+
*/
|
|
7
|
+
import type { MCPClient } from "./types.js";
|
|
8
|
+
/**
|
|
9
|
+
* Connect to an MCP server and return a client for tool discovery and execution.
|
|
10
|
+
*
|
|
11
|
+
* Supports two transport types:
|
|
12
|
+
* - `url` → streamable-http (remote MCP servers like mcp.sanity.io)
|
|
13
|
+
* - `command` → stdio (local MCP server processes)
|
|
14
|
+
*/
|
|
15
|
+
export declare function connectMCP(serverConfig: Record<string, unknown>): Promise<MCPClient>;
|
|
16
|
+
/**
|
|
17
|
+
* Resolve `{{env.VAR}}` templates in config values, recursively.
|
|
18
|
+
*/
|
|
19
|
+
export declare function resolveEnvTemplates(config: Record<string, unknown>): Record<string, unknown>;
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP server connection and tool discovery.
|
|
3
|
+
*
|
|
4
|
+
* Handles connecting to an MCP server via streamable-http or stdio transport,
|
|
5
|
+
* discovering available tools, and resolving {{env.VAR}} templates in config.
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Connect to an MCP server and return a client for tool discovery and execution.
 *
 * Supports two transport types:
 * - `url` → streamable-http (remote MCP servers like mcp.sanity.io)
 * - `command` → stdio (local MCP server processes)
 *
 * `{{env.VAR}}` templates in the config are resolved before use. For the HTTP
 * transport, `auth: { type: "bearer", token }` produces an `Authorization`
 * header, and entries in `headers` are applied afterwards so they override it.
 *
 * @param serverConfig - MCP server configuration (`command`, or `url` with
 *   optional `auth` / `headers`). `command` takes precedence when both are set.
 * @returns A client exposing `getAllTools()` (tools discovered at connect
 *   time), `callTool(name, args)` (resolves to `{ content, error }`) and
 *   `cleanup()` (closes the transport, ignoring close errors).
 * @throws Error when neither `command` nor `url` is configured, or when
 *   connecting or listing tools fails. The transport is closed before the
 *   error is rethrown.
 */
export async function connectMCP(serverConfig) {
    const { Client } = await import("@modelcontextprotocol/sdk/client/index.js");
    const client = new Client({
        name: "ailf-mcp-eval",
        version: "1.0.0",
    });
    const resolvedConfig = resolveEnvTemplates(serverConfig);
    let transport;
    if (resolvedConfig.command) {
        const { StdioClientTransport } = await import("@modelcontextprotocol/sdk/client/stdio.js");
        // The command line is split on whitespace (quoting is not interpreted).
        // Trimming first keeps leading whitespace from producing an empty
        // executable name.
        const parts = String(resolvedConfig.command).trim().split(/\s+/);
        transport = new StdioClientTransport({
            command: parts[0],
            args: parts.slice(1),
            env: process.env,
        });
    }
    else if (resolvedConfig.url) {
        const { StreamableHTTPClientTransport } = await import("@modelcontextprotocol/sdk/client/streamableHttp.js");
        const headers = {};
        // Auth-derived headers (structured auth config)
        const auth = resolvedConfig.auth;
        if (auth?.type === "bearer" && auth.token) {
            headers["Authorization"] = `Bearer ${auth.token}`;
        }
        // Explicit headers override auth-derived ones
        const customHeaders = resolvedConfig.headers;
        if (customHeaders) {
            Object.assign(headers, customHeaders);
        }
        transport = new StreamableHTTPClientTransport(new URL(String(resolvedConfig.url)), { requestInit: { headers } });
    }
    else {
        throw new Error("MCP server config must have either 'command' (stdio) or 'url' (http)");
    }
    // Best-effort close shared by the failure path below and by cleanup().
    const closeTransport = async () => {
        try {
            await transport.close();
        }
        catch {
            // Ignored: there is nothing useful to do when closing fails.
        }
    };
    // Connect and discover tools. If either step fails the caller never gets
    // a client to clean up, so the transport (and any spawned stdio process)
    // has to be released here.
    let allTools;
    try {
        await client.connect(transport);
        const { tools: toolsList } = await client.listTools();
        allTools = toolsList.map((t) => ({
            name: t.name,
            description: t.description,
            inputSchema: t.inputSchema,
        }));
    }
    catch (err) {
        await closeTransport();
        throw err;
    }
    return {
        getAllTools: () => allTools,
        callTool: async (name, args) => {
            const result = await client.callTool({ name, arguments: args });
            let content = "";
            if (result?.content) {
                if (Array.isArray(result.content)) {
                    // Parts with text contribute that text; any other part is
                    // serialised as JSON.
                    content = result.content
                        .map((c) => c.text || JSON.stringify(c))
                        .join("\n");
                }
                else {
                    content = String(result.content);
                }
            }
            // When the server flags the call as failed, the flattened content
            // doubles as the error message.
            return { content, error: result?.isError ? content : undefined };
        },
        cleanup: async () => {
            await closeTransport();
        },
    };
}
|
|
82
|
+
/**
 * Resolve `{{env.VAR}}` templates in config values, recursively.
 *
 * Strings are scanned for `{{env.NAME}}` placeholders (whitespace inside the
 * braces is tolerated, e.g. `{{ env.NAME }}`), and each placeholder is
 * replaced by the value of that environment variable, or by an empty string
 * when the variable is unset or empty. Nested objects and arrays are
 * processed recursively; all other values are copied through unchanged.
 * The input is not mutated.
 *
 * @param config - Configuration object to resolve.
 * @returns A new object with the same keys and resolved values.
 */
export function resolveEnvTemplates(config) {
    const resolveValue = (value) => {
        if (typeof value === "string") {
            return value.replace(/\{\{\s*env\.(\w+)\s*\}\}/g, (_, varName) => {
                return process.env[varName] || "";
            });
        }
        if (Array.isArray(value)) {
            // Lists (e.g. command arguments or header values) can hold
            // templates as well.
            return value.map(resolveValue);
        }
        if (value && typeof value === "object") {
            return resolveEnvTemplates(value);
        }
        return value;
    };
    const resolved = {};
    for (const [key, value] of Object.entries(config)) {
        resolved[key] = resolveValue(value);
    }
    return resolved;
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Anthropic multi-turn tool execution loop.
|
|
3
|
+
*
|
|
4
|
+
* Sends a prompt to the Anthropic Messages API with MCP tools attached.
|
|
5
|
+
* When the model calls a tool, executes it via the MCP client, feeds
|
|
6
|
+
* the result back, and continues until the model produces a final text
|
|
7
|
+
* response or maxToolRounds is exhausted.
|
|
8
|
+
*/
|
|
9
|
+
import type { ToolLoopConfig, ToolLoopResult } from "./types.js";
|
|
10
|
+
/**
|
|
11
|
+
* Run a multi-turn tool loop using the Anthropic Messages API.
|
|
12
|
+
*
|
|
13
|
+
* The loop:
|
|
14
|
+
* 1. Sends the prompt with available tools to Claude
|
|
15
|
+
* 2. If Claude calls tools → executes them via MCP, sends results back
|
|
16
|
+
* 3. Repeats until Claude produces a text-only response or maxToolRounds is hit
|
|
17
|
+
* 4. On the last round, omits tools to force a synthesis response
|
|
18
|
+
*/
|
|
19
|
+
export declare function runAnthropicToolLoop(config: ToolLoopConfig): Promise<ToolLoopResult>;
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Anthropic multi-turn tool execution loop.
|
|
3
|
+
*
|
|
4
|
+
* Sends a prompt to the Anthropic Messages API with MCP tools attached.
|
|
5
|
+
* When the model calls a tool, executes it via the MCP client, feeds
|
|
6
|
+
* the result back, and continues until the model produces a final text
|
|
7
|
+
* response or maxToolRounds is exhausted.
|
|
8
|
+
*/
|
|
9
|
+
// ---------------------------------------------------------------------------
|
|
10
|
+
// Tool loop implementation
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
/**
|
|
13
|
+
* Run a multi-turn tool loop using the Anthropic Messages API.
|
|
14
|
+
*
|
|
15
|
+
* The loop:
|
|
16
|
+
* 1. Sends the prompt with available tools to Claude
|
|
17
|
+
* 2. If Claude calls tools → executes them via MCP, sends results back
|
|
18
|
+
* 3. Repeats until Claude produces a text-only response or maxToolRounds is hit
|
|
19
|
+
* 4. On the last round, omits tools to force a synthesis response
|
|
20
|
+
*/
|
|
21
|
+
export async function runAnthropicToolLoop(config) {
|
|
22
|
+
const { prompt, tools, callTool, maxToolRounds, model, temperature, maxTokens, apiKey, } = config;
|
|
23
|
+
const anthropicTools = tools.map((t) => ({
|
|
24
|
+
name: t.name,
|
|
25
|
+
description: t.description || `MCP tool: ${t.name}`,
|
|
26
|
+
input_schema: t.inputSchema || { type: "object", properties: {} },
|
|
27
|
+
}));
|
|
28
|
+
const systemPrompt = "You are an AI assistant with access to tools provided by an MCP server. " +
|
|
29
|
+
"Use the available tools to complete the task. Call tools with correct parameters, " +
|
|
30
|
+
"interpret responses, and provide a complete answer.";
|
|
31
|
+
const messages = [{ content: prompt, role: "user" }];
|
|
32
|
+
let inputTokens = 0;
|
|
33
|
+
let outputTokens = 0;
|
|
34
|
+
const startTime = Date.now();
|
|
35
|
+
const toolCallLog = [];
|
|
36
|
+
for (let round = 0; round <= maxToolRounds; round++) {
|
|
37
|
+
const isLastRound = round === maxToolRounds;
|
|
38
|
+
// On the last round, omit tools to force a text-only response.
|
|
39
|
+
// Anthropic doesn't support tool_choice: "none" — the way to disable
|
|
40
|
+
// tools is to simply not include them in the request.
|
|
41
|
+
if (isLastRound) {
|
|
42
|
+
const lastMsg = messages[messages.length - 1];
|
|
43
|
+
const synthesisText = "You've used the tools available. Based on the information gathered, " +
|
|
44
|
+
"provide your complete, final answer now.";
|
|
45
|
+
if (lastMsg?.role === "user" && Array.isArray(lastMsg.content)) {
|
|
46
|
+
;
|
|
47
|
+
lastMsg.content.push({
|
|
48
|
+
type: "text",
|
|
49
|
+
text: synthesisText,
|
|
50
|
+
});
|
|
51
|
+
}
|
|
52
|
+
else {
|
|
53
|
+
messages.push({ content: synthesisText, role: "user" });
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
const body = {
|
|
57
|
+
max_tokens: maxTokens,
|
|
58
|
+
messages,
|
|
59
|
+
model,
|
|
60
|
+
system: systemPrompt,
|
|
61
|
+
temperature,
|
|
62
|
+
};
|
|
63
|
+
if (!isLastRound) {
|
|
64
|
+
body.tools = anthropicTools;
|
|
65
|
+
}
|
|
66
|
+
const response = await fetch("https://api.anthropic.com/v1/messages", {
|
|
67
|
+
body: JSON.stringify(body),
|
|
68
|
+
headers: {
|
|
69
|
+
"anthropic-version": "2023-06-01",
|
|
70
|
+
"Content-Type": "application/json",
|
|
71
|
+
"x-api-key": apiKey,
|
|
72
|
+
},
|
|
73
|
+
method: "POST",
|
|
74
|
+
});
|
|
75
|
+
const data = (await response.json());
|
|
76
|
+
if (data.error) {
|
|
77
|
+
throw new Error(data.error.message ??
|
|
78
|
+
`Anthropic API error: ${JSON.stringify(data.error)}`);
|
|
79
|
+
}
|
|
80
|
+
inputTokens += data.usage?.input_tokens ?? 0;
|
|
81
|
+
outputTokens += data.usage?.output_tokens ?? 0;
|
|
82
|
+
if (!data.content?.length) {
|
|
83
|
+
return {
|
|
84
|
+
output: "",
|
|
85
|
+
toolCallLog,
|
|
86
|
+
tokenUsage: {
|
|
87
|
+
prompt: inputTokens,
|
|
88
|
+
completion: outputTokens,
|
|
89
|
+
total: inputTokens + outputTokens,
|
|
90
|
+
},
|
|
91
|
+
toolRounds: round,
|
|
92
|
+
latencyMs: Date.now() - startTime,
|
|
93
|
+
};
|
|
94
|
+
}
|
|
95
|
+
// Add assistant response to history
|
|
96
|
+
messages.push({ content: data.content, role: "assistant" });
|
|
97
|
+
// Check if model wants to use tools
|
|
98
|
+
const toolUseBlocks = data.content.filter((b) => b.type === "tool_use");
|
|
99
|
+
if (data.stop_reason !== "tool_use" || toolUseBlocks.length === 0) {
|
|
100
|
+
// Model is done — extract text
|
|
101
|
+
const textBlocks = data.content.filter((b) => b.type === "text");
|
|
102
|
+
const output = textBlocks.map((b) => b.text || "").join("\n") || "";
|
|
103
|
+
return {
|
|
104
|
+
output,
|
|
105
|
+
toolCallLog,
|
|
106
|
+
tokenUsage: {
|
|
107
|
+
prompt: inputTokens,
|
|
108
|
+
completion: outputTokens,
|
|
109
|
+
total: inputTokens + outputTokens,
|
|
110
|
+
},
|
|
111
|
+
toolRounds: round,
|
|
112
|
+
latencyMs: Date.now() - startTime,
|
|
113
|
+
};
|
|
114
|
+
}
|
|
115
|
+
// Execute each tool call via MCP
|
|
116
|
+
const toolResults = [];
|
|
117
|
+
for (const toolUse of toolUseBlocks) {
|
|
118
|
+
const toolName = toolUse.name;
|
|
119
|
+
const toolInput = (toolUse.input || {});
|
|
120
|
+
try {
|
|
121
|
+
const result = await callTool(toolName, toolInput);
|
|
122
|
+
const content = result.error
|
|
123
|
+
? JSON.stringify({ error: result.error })
|
|
124
|
+
: result.content;
|
|
125
|
+
toolCallLog.push({ name: toolName, input: toolInput, output: content });
|
|
126
|
+
toolResults.push({
|
|
127
|
+
content,
|
|
128
|
+
tool_use_id: toolUse.id,
|
|
129
|
+
type: "tool_result",
|
|
130
|
+
});
|
|
131
|
+
}
|
|
132
|
+
catch (err) {
|
|
133
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
134
|
+
toolCallLog.push({
|
|
135
|
+
name: toolName,
|
|
136
|
+
input: toolInput,
|
|
137
|
+
output: `Error: ${errMsg}`,
|
|
138
|
+
});
|
|
139
|
+
toolResults.push({
|
|
140
|
+
content: JSON.stringify({ error: errMsg }),
|
|
141
|
+
tool_use_id: toolUse.id,
|
|
142
|
+
type: "tool_result",
|
|
143
|
+
});
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
// Add tool results to conversation
|
|
147
|
+
messages.push({ content: toolResults, role: "user" });
|
|
148
|
+
}
|
|
149
|
+
// Exhausted rounds — extract last text
|
|
150
|
+
const lastAssistant = [...messages]
|
|
151
|
+
.reverse()
|
|
152
|
+
.find((m) => m.role === "assistant");
|
|
153
|
+
let lastText = "";
|
|
154
|
+
if (lastAssistant && Array.isArray(lastAssistant.content)) {
|
|
155
|
+
lastText = lastAssistant.content
|
|
156
|
+
.filter((b) => b.type === "text")
|
|
157
|
+
.map((b) => b.text || "")
|
|
158
|
+
.join("\n");
|
|
159
|
+
}
|
|
160
|
+
return {
|
|
161
|
+
output: lastText || "[Exhausted tool rounds without final answer]",
|
|
162
|
+
toolCallLog,
|
|
163
|
+
tokenUsage: {
|
|
164
|
+
prompt: inputTokens,
|
|
165
|
+
completion: outputTokens,
|
|
166
|
+
total: inputTokens + outputTokens,
|
|
167
|
+
},
|
|
168
|
+
toolRounds: maxToolRounds,
|
|
169
|
+
exhaustedRounds: true,
|
|
170
|
+
latencyMs: Date.now() - startTime,
|
|
171
|
+
};
|
|
172
|
+
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenAI multi-turn tool execution loop.
|
|
3
|
+
*
|
|
4
|
+
* Supports two OpenAI API surfaces:
|
|
5
|
+
* - **Chat Completions** (`/v1/chat/completions`) — used by `openai:chat:*` models
|
|
6
|
+
* - **Responses** (`/v1/responses`) — used by `openai:responses:*` models (GPT-5.x)
|
|
7
|
+
*
|
|
8
|
+
* Both follow the same loop pattern: send prompt → model calls tools → execute
|
|
9
|
+
* via MCP → feed results back → repeat until final text or maxToolRounds.
|
|
10
|
+
*/
|
|
11
|
+
import type { ToolLoopConfig, ToolLoopResult } from "./types.js";
|
|
12
|
+
/**
|
|
13
|
+
* Run a multi-turn tool loop using the OpenAI API.
|
|
14
|
+
*
|
|
15
|
+
* Routes to Chat Completions or Responses API based on `config.apiVariant`:
|
|
16
|
+
* - `"responses"` → Responses API (`/v1/responses`)
|
|
17
|
+
* - `"chat"` or undefined → Chat Completions API (`/v1/chat/completions`)
|
|
18
|
+
*/
|
|
19
|
+
export declare function runOpenAIToolLoop(config: ToolLoopConfig): Promise<ToolLoopResult>;
|