@sanity/ailf 1.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +29 -12
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
- package/dist/_vendor/ailf-core/config-helpers.js +51 -2
- package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
- package/dist/_vendor/ailf-core/examples/index.js +213 -94
- package/dist/_vendor/ailf-core/index.d.ts +3 -2
- package/dist/_vendor/ailf-core/index.js +2 -1
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
- package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +7 -1
- package/dist/adapters/config-sources/ts-config-loader.js +21 -13
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
- package/dist/adapters/task-sources/index.d.ts +3 -4
- package/dist/adapters/task-sources/index.js +3 -4
- package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
- package/dist/adapters/task-sources/repo-schemas.js +228 -20
- package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
- package/dist/adapters/task-sources/repo-task-source.js +81 -122
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
- package/dist/adapters/task-sources/task-file-loader.js +21 -7
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/coverage-audit.js +3 -1
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +5 -4
- package/dist/commands/init.js +190 -25
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +15 -4
- package/dist/composition-root.js +100 -55
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/build-step-sequence.js +4 -2
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +32 -19
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +77 -26
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +51 -31
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
- package/dist/pipeline/compiler/literacy-bridge.js +2 -2
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
- package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
- package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/expand-tasks.d.ts +2 -2
- package/dist/pipeline/expand-tasks.js +2 -2
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +16 -9
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +16 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
- package/dist/pipeline/mirror-repo-tasks.js +10 -10
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +68 -30
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +32 -24
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP server task compilation — core compiler logic.
|
|
3
|
+
*
|
|
4
|
+
* Produces Promptfoo configuration from MCP server task definitions:
|
|
5
|
+
* 1. A provider config pointing to the MCP server
|
|
6
|
+
* 2. Test cases with tool-call assertions
|
|
7
|
+
* 3. Appropriate prompts for the evaluation
|
|
8
|
+
*/
|
|
9
|
+
import { buildMCPAssertions } from "./assertions.js";
|
|
10
|
+
import { buildMCPProvider } from "./provider-config.js";
|
|
11
|
+
import { validateMCPTask } from "./validation.js";
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
// Public API
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
/**
 * Turn one MCP server task definition into the parts of a Promptfoo config.
 *
 * Problems reported by `validateMCPTask` never abort compilation: each one is
 * rendered as a warning string and the remaining build steps still run.
 *
 * @param {object} task - MCP task definition; `task.id` is used to label
 *   validation warnings and the whole object is handed to each builder.
 * @param {object} [options] - Optional settings. `options.models` is forwarded
 *   to the provider builder (an empty list when absent); the full object is
 *   forwarded to test-case assembly.
 * @returns {{providers: *, tests: object[], prompts: object[], warnings: string[]}}
 *   The provider config, test cases, prompts, and every warning collected
 *   along the way.
 */
export function compileMCPTask(task, options) {
  // Validation findings are downgraded to warnings rather than thrown.
  const warnings = validateMCPTask(task).map(
    (issue) => `MCP task "${task.id}": ${issue.field} — ${issue.message}`,
  );

  // The builders append to `warnings`, so the call order below fixes the
  // order of the collected messages: providers first, then test cases.
  const modelList = options?.models ?? [];
  const providers = buildMCPProvider(task, modelList, warnings);
  const prompts = buildMCPPrompts(task);
  const tests = buildMCPTestCases(task, options, warnings);

  return { providers, tests, prompts, warnings };
}
|
|
40
|
+
// ---------------------------------------------------------------------------
|
|
41
|
+
// Prompt assembly
|
|
42
|
+
// ---------------------------------------------------------------------------
|
|
43
|
+
/**
 * Build the prompt list for an MCP task. MCP mode always yields exactly one
 * prompt entry.
 *
 * The prompt text is the first non-nullish value among `task.prompt.text`,
 * `task.prompt.vars.task`, `task.description`, and finally a generic string
 * derived from `task.title`.
 *
 * @param {object} task - MCP task definition.
 * @returns {Array<{id: string, label: string, raw: string}>} Single-element
 *   prompt array.
 */
function buildMCPPrompts(task) {
  const promptConfig = task.prompt;

  // Walk the fallback chain one step at a time; only null/undefined fall through.
  let rawText = promptConfig?.text;
  rawText = rawText ?? promptConfig?.vars?.task;
  rawText = rawText ?? task.description;
  rawText = rawText ?? `Test MCP server: ${task.title}`;

  const entry = {
    id: "mcp-test",
    label: `MCP: ${task.title}`,
    raw: String(rawText),
  };
  return [entry];
}
|
|
57
|
+
// ---------------------------------------------------------------------------
|
|
58
|
+
// Test case assembly
|
|
59
|
+
// ---------------------------------------------------------------------------
|
|
60
|
+
/**
 * Build the Promptfoo test cases for an MCP task.
 *
 * Always emits one primary test case. When the task defines at least one
 * multi-turn turn, a second "[multi-turn]" case is emitted that carries the
 * turns in the `__multiTurn` variable. Both cases share the same compiled
 * assertions; the `assert` key is omitted when there are none.
 *
 * @param task - MCP server task definition
 * @param options - Optional compile options (only `graderProvider` is read)
 * @param warnings - Mutable list; assertion warnings are appended to it
 * @returns Array of one or two test cases
 */
function buildMCPTestCases(task, options, warnings) {
    // Compile task assertions (if any) into Promptfoo assertions
    const compiled = [];
    if (task.assertions) {
        const outcome = buildMCPAssertions(task.assertions, {
            capabilities: task.capabilities ?? [],
            graderProvider: options?.graderProvider,
            taskId: task.id,
        });
        compiled.push(...outcome.assertions);
        warnings.push(...outcome.warnings);
    }
    const assertPart = compiled.length > 0 ? { assert: compiled } : {};

    // Default `task` variable first, so explicit prompt vars can override it
    const defaultTaskVar = task.prompt?.vars?.task ?? task.description ?? `Test: ${task.title}`;
    const vars = { task: defaultTaskVar, ...(task.prompt?.vars ?? {}) };

    const tests = [
        {
            description: `${task.id} — ${task.title}`,
            vars,
            ...assertPart,
        },
    ];

    const turns = task.multiTurn?.turns;
    if (turns && turns.length > 0) {
        tests.push({
            description: `${task.id} — ${task.title} [multi-turn]`,
            vars: { ...vars, __multiTurn: task.multiTurn.turns },
            ...assertPart,
        });
    }
    return tests;
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP Server mode handler — directory barrel.
|
|
3
|
+
*
|
|
4
|
+
* MCPServerModeHandler — compilation rules for `mcp-server` evaluation mode.
|
|
5
|
+
*
|
|
6
|
+
* This is the first non-literacy mode handler, proving the compiler
|
|
7
|
+
* architecture works end-to-end. It translates MCP server task definitions
|
|
8
|
+
* into Promptfoo configuration with:
|
|
9
|
+
*
|
|
10
|
+
* - An MCP provider that wraps the server under test
|
|
11
|
+
* - Tool-call assertions compiled to Promptfoo `javascript` assertions
|
|
12
|
+
* - Server lifecycle management via Promptfoo provider hooks
|
|
13
|
+
* - Multi-turn conversation support via Promptfoo's `steps` syntax
|
|
14
|
+
*
|
|
15
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
|
|
16
|
+
* @see packages/core/src/types/eval-mode-config.ts — MCPServerModeConfig
|
|
17
|
+
* @see packages/core/src/types/generalized-task.ts — MCPServerTaskDefinition
|
|
18
|
+
*/
|
|
19
|
+
import type { ModeHandler } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
20
|
+
/** ModeHandler-conformant export for the mcp-server evaluation mode. */
|
|
21
|
+
export declare const handler: ModeHandler;
|
|
22
|
+
export type { MCPAssertionContext, MCPCompileOptions, MCPCompileResult, MCPValidationError, } from "./types.js";
|
|
23
|
+
export { buildMCPAssertions } from "./assertions.js";
|
|
24
|
+
export { compileMCPTask } from "./compiler.js";
|
|
25
|
+
export { validateMCPTask } from "./validation.js";
|
|
26
|
+
export { MCP_PROMPT_TEMPLATES } from "./prompts.js";
|
|
27
|
+
export { DEFAULT_MAX_TOOL_ROUNDS, MCP_PROVIDER_PATH, } from "./provider-config.js";
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP Server mode handler — directory barrel.
|
|
3
|
+
*
|
|
4
|
+
* MCPServerModeHandler — compilation rules for `mcp-server` evaluation mode.
|
|
5
|
+
*
|
|
6
|
+
* This is the first non-literacy mode handler, proving the compiler
|
|
7
|
+
* architecture works end-to-end. It translates MCP server task definitions
|
|
8
|
+
* into Promptfoo configuration with:
|
|
9
|
+
*
|
|
10
|
+
* - An MCP provider that wraps the server under test
|
|
11
|
+
* - Tool-call assertions compiled to Promptfoo `javascript` assertions
|
|
12
|
+
* - Server lifecycle management via Promptfoo provider hooks
|
|
13
|
+
* - Multi-turn conversation support via Promptfoo's `steps` syntax
|
|
14
|
+
*
|
|
15
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
|
|
16
|
+
* @see packages/core/src/types/eval-mode-config.ts — MCPServerModeConfig
|
|
17
|
+
* @see packages/core/src/types/generalized-task.ts — MCPServerTaskDefinition
|
|
18
|
+
*/
|
|
19
|
+
import { compileMCPTask } from "./compiler.js";
|
|
20
|
+
import { MCP_PROMPT_TEMPLATES } from "./prompts.js";
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// ModeHandler adapter
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
/**
 * ModeHandler adapter for the `mcp-server` evaluation mode.
 *
 * - `getPrompts()` returns the handler-owned prompt templates.
 * - `compileTask(task, ctx)` rejects tasks whose `mode` is not
 *   `"mcp-server"` and otherwise delegates to `compileMCPTask`, forwarding
 *   `ctx.graderProvider` and `ctx.models`.
 */
export const handler = {
    getPrompts: () => MCP_PROMPT_TEMPLATES,
    compileTask(task, ctx) {
        const isMcpServerTask = "mode" in task && task.mode === "mcp-server";
        if (!isMcpServerTask) {
            throw new Error(`MCP server handler received task with mode "${task.mode ?? "undefined"}" — expected "mcp-server"`);
        }
        const { graderProvider, models } = ctx;
        // Return only the four ModeHandler result fields
        const { providers, tests, prompts, warnings } = compileMCPTask(task, { graderProvider, models });
        return { providers, tests, prompts, warnings };
    },
};
|
|
45
|
+
// Assertions
|
|
46
|
+
export { buildMCPAssertions } from "./assertions.js";
|
|
47
|
+
// Compilation
|
|
48
|
+
export { compileMCPTask } from "./compiler.js";
|
|
49
|
+
// Validation
|
|
50
|
+
export { validateMCPTask } from "./validation.js";
|
|
51
|
+
// Prompts
|
|
52
|
+
export { MCP_PROMPT_TEMPLATES } from "./prompts.js";
|
|
53
|
+
// Provider config
|
|
54
|
+
export { DEFAULT_MAX_TOOL_ROUNDS, MCP_PROVIDER_PATH, } from "./provider-config.js";
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Canonical MCP server prompt templates.
|
|
3
|
+
*
|
|
4
|
+
* Handler-owned prompts for MCP server evaluations. Instructs the model to
|
|
5
|
+
* interact with MCP tools rather than writing standalone code.
|
|
6
|
+
*/
|
|
7
|
+
import type { PromptTemplate } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
8
|
+
export declare const MCP_PROMPT_TEMPLATES: Record<string, PromptTemplate>;
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Canonical MCP server prompt templates.
|
|
3
|
+
*
|
|
4
|
+
* Handler-owned prompts for MCP server evaluations. Instructs the model to
|
|
5
|
+
* interact with MCP tools rather than writing standalone code.
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Prompt templates owned by the MCP server mode handler, keyed by template id.
 * The single `mcp-server` template takes one variable, `task`.
 */
export const MCP_PROMPT_TEMPLATES = {
    "mcp-server": {
        id: "mcp-server",
        label: "MCP Server Tool Use",
        // Lines are joined with "\n"; the trailing empty entry keeps the
        // template's final newline.
        template: [
            "You are an AI assistant with access to an MCP (Model Context Protocol) server that provides tools for interacting with a Sanity content backend.",
            "",
            "## Task",
            "{{task}}",
            "",
            "## Instructions",
            "",
            "1. Use the available MCP tools to complete the task",
            "2. Call tools with the correct parameters as described in their schemas",
            "3. Interpret tool responses and use the results to accomplish the goal",
            "4. If a tool returns an error, explain the issue clearly",
            "5. Prefer using specific tools over broad queries when possible",
            "",
            "Complete the task using the MCP tools provided:",
            "",
        ].join("\n"),
        variables: ["task"],
    },
};
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP server provider assembly — builds Promptfoo provider configs.
|
|
3
|
+
*/
|
|
4
|
+
import type { MCPServerTaskDefinition, ModeProviderEntry } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
5
|
+
import type { PromptfooProvider } from "../../promptfoo-compiler.js";
|
|
6
|
+
/** Default max tool rounds for MCP multi-turn execution */
|
|
7
|
+
export declare const DEFAULT_MAX_TOOL_ROUNDS = 5;
|
|
8
|
+
/** Provider path relative to eval package dist */
|
|
9
|
+
export declare const MCP_PROVIDER_PATH = "file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js";
|
|
10
|
+
/**
|
|
11
|
+
* Build custom MCP tool provider configs — one per model.
|
|
12
|
+
*
|
|
13
|
+
* Each provider uses the custom mcp-tool-provider.ts which implements a
|
|
14
|
+
* multi-turn tool execution loop. The LLM receives a prompt, discovers
|
|
15
|
+
* MCP tools, calls them, gets results, and continues until it produces
|
|
16
|
+
* a final text answer or exhausts maxToolRounds.
|
|
17
|
+
*
|
|
18
|
+
* Config shape passed to the custom provider:
|
|
19
|
+
* { model, mcpServer: { url, auth, name }, mcpTools, maxToolRounds, temperature, ... }
|
|
20
|
+
*/
|
|
21
|
+
export declare function buildMCPProvider(task: MCPServerTaskDefinition, models: ModeProviderEntry[], warnings: string[]): PromptfooProvider[];
|
|
22
|
+
/**
|
|
23
|
+
* Build the MCP server connection config for the custom provider.
|
|
24
|
+
*
|
|
25
|
+
* Shape: { url?, command?, name?, auth? }
|
|
26
|
+
* The custom mcp-tool-provider.ts uses this to connect to the MCP server.
|
|
27
|
+
*/
|
|
28
|
+
export declare function buildMCPServerConfig(task: MCPServerTaskDefinition, warnings: string[]): Record<string, unknown>;
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP server provider assembly — builds Promptfoo provider configs.
|
|
3
|
+
*/
|
|
4
|
+
// ---------------------------------------------------------------------------
|
|
5
|
+
// Constants
|
|
6
|
+
// ---------------------------------------------------------------------------
|
|
7
|
+
/** Number of tool rounds used when the task does not set `maxToolRounds` */
export const DEFAULT_MAX_TOOL_ROUNDS = 5;
/** Location of the custom MCP tool provider, relative to the eval package dist */
export const MCP_PROVIDER_PATH = "file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js";
// ---------------------------------------------------------------------------
// Provider assembly
// ---------------------------------------------------------------------------
/**
 * Build one custom MCP tool provider entry per model.
 *
 * Model selection, in priority order:
 *   1. `task.models` (task-level override; the model id doubles as label)
 *   2. `models` (registry entries passed in by the caller)
 *   3. A single built-in fallback model, with a warning appended
 *
 * Every entry has `id` set to MCP_PROVIDER_PATH and a `config` of the form
 * `{ model, mcpServer, mcpTools?, maxToolRounds, ...modelConfig }`. Keys of
 * a registry model's own config are spread last and therefore win.
 *
 * @param task - MCP server task definition
 * @param models - Registry model entries (`{ id, label, config? }`)
 * @param warnings - Mutable list; warnings are appended to it
 * @returns Array of provider entries `{ id, label, config }`
 */
export function buildMCPProvider(task, models, warnings) {
    // Server config is built first so its warnings precede any model warning
    const mcpServer = buildMCPServerConfig(task, warnings);
    const toolFilter = task.capabilities ? { mcpTools: task.capabilities } : {};
    const maxToolRounds = task.maxToolRounds ?? DEFAULT_MAX_TOOL_ROUNDS;

    const toProvider = (modelId, label, modelConfig) => ({
        id: MCP_PROVIDER_PATH,
        label: `${label} + MCP`,
        config: {
            model: modelId,
            mcpServer,
            ...toolFilter,
            maxToolRounds,
            // Spreading null/undefined adds nothing
            ...modelConfig,
        },
    });

    const taskModels = task.models;
    if (taskModels && taskModels.length > 0) {
        return taskModels.map((modelId) => toProvider(modelId, modelId));
    }
    if (models.length > 0) {
        return models.map(({ id, label, config }) => toProvider(id, label, config));
    }
    // Nothing configured anywhere: warn and fall back to a default model
    warnings.push(`MCP task "${task.id}": no models available. Add "mcp-server" to a model's modes array in config/models.ts, or set models on the task.`);
    return [toProvider("anthropic:messages:claude-sonnet-4-20250514", "Claude Sonnet 4")];
}
|
|
59
|
+
/**
 * Build the MCP server connection config for the custom provider.
 *
 * Shape: { name, command?, url?, headers?, auth? }
 * The custom mcp-tool-provider.ts uses this to connect to the MCP server.
 *
 * Rules:
 *   - Without `task.serverConfig` a placeholder `{ name }` is returned and a
 *     warning is appended.
 *   - `transport === "stdio"` copies `command`; any other transport copies
 *     `url`.
 *   - `headers` are copied when present.
 *   - `auth` is copied when present. Otherwise, the first `env` key matching
 *     /token|auth|key/i is turned into bearer auth using the
 *     `{{env.VAR}}` template syntax. The env value may be a bare variable
 *     name or a `$env(VAR)` reference.
 *
 * @param task - MCP server task definition
 * @param warnings - Mutable list; warnings are appended to it
 * @returns Plain object describing how to reach the MCP server
 */
export function buildMCPServerConfig(task, warnings) {
    const config = task.serverConfig;
    if (!config) {
        warnings.push(`MCP task "${task.id}": no serverConfig — using placeholder. ` +
            "Set serverConfig.command or serverConfig.url to point to your MCP server.");
        return { name: task.id };
    }
    const serverConfig = { name: task.id };
    if (config.transport === "stdio") {
        serverConfig.command = config.command;
    }
    else {
        serverConfig.url = config.url;
    }
    // Explicit headers for HTTP transports
    if (config.headers) {
        serverConfig.headers = config.headers;
    }
    // Auth config: explicit auth wins over the env-derived fallback
    if (config.auth) {
        serverConfig.auth = config.auth;
    }
    else if (config.env) {
        const tokenKey = Object.keys(config.env).find((k) => /token|auth|key/i.test(k));
        if (tokenKey) {
            const val = config.env[tokenKey];
            // A non-string value (number, null, ...) cannot name an env var.
            // Treat it as an empty name so it takes the warning path below
            // instead of throwing on `.startsWith`.
            let envVar = typeof val === "string" ? val : "";
            if (envVar.startsWith("$env(") && envVar.endsWith(")")) {
                // Strip the "$env(" prefix and the ")" suffix
                envVar = envVar.slice(5, -1);
            }
            // The pattern also rejects the empty string
            if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(envVar)) {
                warnings.push(`MCP task: env var name "${envVar}" from "${val}" is not a valid ` +
                    "identifier — skipping auth config");
            }
            else {
                serverConfig.auth = {
                    type: "bearer",
                    token: `{{env.${envVar}}}`,
                };
            }
        }
    }
    return serverConfig;
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared types for the MCP server mode handler.
|
|
3
|
+
*/
|
|
4
|
+
import type { ModeProviderEntry } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
5
|
+
import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
|
|
6
|
+
/** Options for compiling an MCP server task */
|
|
7
|
+
export interface MCPCompileOptions {
|
|
8
|
+
/** Grader provider for LLM-graded assertions */
|
|
9
|
+
graderProvider?: string;
|
|
10
|
+
/** Model providers to evaluate with (from registry, filtered by mcp-server mode) */
|
|
11
|
+
models?: ModeProviderEntry[];
|
|
12
|
+
}
|
|
13
|
+
/** Result of compiling a single MCP task */
|
|
14
|
+
export interface MCPCompileResult {
|
|
15
|
+
/** Promptfoo provider config for the MCP server */
|
|
16
|
+
providers: PromptfooProvider[];
|
|
17
|
+
/** Compiled test cases */
|
|
18
|
+
tests: PromptfooTestCase[];
|
|
19
|
+
/** Prompts for MCP evaluation */
|
|
20
|
+
prompts: PromptfooPrompt[];
|
|
21
|
+
/** Warnings generated during compilation */
|
|
22
|
+
warnings: string[];
|
|
23
|
+
}
|
|
24
|
+
/** Validation errors for MCP task definitions */
|
|
25
|
+
export interface MCPValidationError {
|
|
26
|
+
field: string;
|
|
27
|
+
message: string;
|
|
28
|
+
}
|
|
29
|
+
/** Context for building MCP assertions */
|
|
30
|
+
export interface MCPAssertionContext {
|
|
31
|
+
/** Task ID (for error messages) */
|
|
32
|
+
taskId: string;
|
|
33
|
+
/** Expected server capabilities */
|
|
34
|
+
capabilities: string[];
|
|
35
|
+
/** Grader provider for LLM-graded assertions */
|
|
36
|
+
graderProvider?: string;
|
|
37
|
+
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Validation for MCP server task definitions.
|
|
3
|
+
*/
|
|
4
|
+
import type { MCPServerTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
5
|
+
import type { MCPValidationError } from "./types.js";
|
|
6
|
+
/**
|
|
7
|
+
* Validate that an MCP task definition has all required fields.
|
|
8
|
+
*/
|
|
9
|
+
export declare function validateMCPTask(task: MCPServerTaskDefinition): MCPValidationError[];
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Validation for MCP server task definitions.
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Check an MCP task definition for missing required fields.
 *
 * Reported problems:
 *   - missing `id` or `title`
 *   - `serverConfig` with stdio transport but no `command`
 *   - `serverConfig` with sse / streamable-http transport but no `url`
 *   - a `tool-called` assertion without a truthy `value`
 *
 * @param task - MCP server task definition
 * @returns Array of `{ field, message }` entries; empty when nothing is wrong
 */
export function validateMCPTask(task) {
    const errors = [];
    const report = (field, message) => {
        errors.push({ field, message });
    };

    if (!task.id) {
        report("id", "Task ID is required");
    }
    if (!task.title) {
        report("title", "Task title is required");
    }

    const server = task.serverConfig;
    if (server) {
        const { transport, command, url } = server;
        switch (transport) {
            case "stdio":
                if (!command) {
                    report("serverConfig.command", "Server command is required for stdio transport (e.g., 'node dist/server.js')");
                }
                break;
            case "sse":
            case "streamable-http":
                if (!url) {
                    report("serverConfig.url", `Server URL is required for ${transport} transport`);
                }
                break;
            default:
                // Other or missing transports are not checked here
                break;
        }
    }

    // Assertions should reference MCP-compatible types
    for (const assertion of task.assertions || []) {
        if (assertion.type !== "tool-called") {
            continue;
        }
        if (!("value" in assertion) || !assertion.value) {
            report("assertions", 'tool-called assertion requires a "value" specifying the tool name');
        }
    }
    return errors;
}
|
|
@@ -26,13 +26,15 @@
|
|
|
26
26
|
* @see packages/core/src/types/eval-mode-config.ts — MCPServerModeConfig
|
|
27
27
|
* @see packages/core/src/types/generalized-task.ts — MCPServerTaskDefinition
|
|
28
28
|
*/
|
|
29
|
-
import type { MCPServerTaskDefinition, ModeHandler, PromptTemplate } from "../../../_vendor/ailf-core/index.d.ts";
|
|
29
|
+
import type { MCPServerTaskDefinition, ModeHandler, ModeProviderEntry, PromptTemplate } from "../../../_vendor/ailf-core/index.d.ts";
|
|
30
30
|
import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../promptfoo-compiler.js";
|
|
31
31
|
export declare const MCP_PROMPT_TEMPLATES: Record<string, PromptTemplate>;
|
|
32
32
|
/** Options for compiling an MCP server task */
|
|
33
33
|
export interface MCPCompileOptions {
|
|
34
34
|
/** Grader provider for LLM-graded assertions */
|
|
35
35
|
graderProvider?: string;
|
|
36
|
+
/** Model providers to evaluate with (from registry, filtered by mcp-server mode) */
|
|
37
|
+
models?: ModeProviderEntry[];
|
|
36
38
|
}
|
|
37
39
|
/** Result of compiling a single MCP task */
|
|
38
40
|
export interface MCPCompileResult {
|
|
@@ -114,8 +114,8 @@ export function compileMCPTask(task, options) {
|
|
|
114
114
|
warnings.push(`MCP task "${task.id}": ${err.field} — ${err.message}`);
|
|
115
115
|
}
|
|
116
116
|
}
|
|
117
|
-
// Build provider
|
|
118
|
-
const providers = buildMCPProvider(task, warnings);
|
|
117
|
+
// Build providers (one LLM provider per model, each with MCP config)
|
|
118
|
+
const providers = buildMCPProvider(task, options?.models ?? [], warnings);
|
|
119
119
|
// Build prompts
|
|
120
120
|
const prompts = buildMCPPrompts(task);
|
|
121
121
|
// Build test cases
|
|
@@ -125,103 +125,100 @@ export function compileMCPTask(task, options) {
|
|
|
125
125
|
// ---------------------------------------------------------------------------
|
|
126
126
|
// Provider assembly
|
|
127
127
|
// ---------------------------------------------------------------------------
|
|
128
|
+
/** Default max tool rounds for MCP multi-turn execution */
|
|
129
|
+
const DEFAULT_MAX_TOOL_ROUNDS = 5;
|
|
130
|
+
/** Provider path relative to eval package dist */
|
|
131
|
+
const MCP_PROVIDER_PATH = "file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js";
|
|
128
132
|
/**
|
|
129
|
-
* Build
|
|
133
|
+
* Build custom MCP tool provider configs — one per model.
|
|
130
134
|
*
|
|
131
|
-
*
|
|
132
|
-
*
|
|
135
|
+
* Each provider uses the custom mcp-tool-provider.ts which implements a
|
|
136
|
+
* multi-turn tool execution loop. The LLM receives a prompt, discovers
|
|
137
|
+
* MCP tools, calls them, gets results, and continues until it produces
|
|
138
|
+
* a final text answer or exhausts maxToolRounds.
|
|
133
139
|
*
|
|
134
|
-
*
|
|
135
|
-
* {
|
|
136
|
-
* tools?, exclude_tools?, timeout?, debug? }
|
|
140
|
+
* Config shape passed to the custom provider:
|
|
141
|
+
* { model, mcpServer: { url, auth, name }, mcpTools, maxToolRounds, temperature, ... }
|
|
137
142
|
*/
|
|
138
|
-
function buildMCPProvider(task, warnings) {
|
|
143
|
+
function buildMCPProvider(task, models, warnings) {
|
|
144
|
+
// Build the MCP server config
|
|
145
|
+
const mcpServer = buildMCPServerConfig(task, warnings);
|
|
146
|
+
const mcpTools = task.capabilities ?? undefined;
|
|
147
|
+
const maxToolRounds = task.maxToolRounds ?? DEFAULT_MAX_TOOL_ROUNDS;
|
|
148
|
+
// Helper to build a provider entry for a given model
|
|
149
|
+
function makeProvider(modelId, label, modelConfig) {
|
|
150
|
+
return {
|
|
151
|
+
id: MCP_PROVIDER_PATH,
|
|
152
|
+
label: `${label} + MCP`,
|
|
153
|
+
config: {
|
|
154
|
+
model: modelId,
|
|
155
|
+
mcpServer,
|
|
156
|
+
...(mcpTools ? { mcpTools } : {}),
|
|
157
|
+
maxToolRounds,
|
|
158
|
+
...(modelConfig ?? {}),
|
|
159
|
+
},
|
|
160
|
+
};
|
|
161
|
+
}
|
|
162
|
+
// Task-level model override takes precedence over registry models
|
|
163
|
+
const taskModels = task.models;
|
|
164
|
+
if (taskModels && taskModels.length > 0) {
|
|
165
|
+
return taskModels.map((modelId) => makeProvider(modelId, modelId));
|
|
166
|
+
}
|
|
167
|
+
// Use registry models (already filtered to mcp-server mode)
|
|
168
|
+
if (models.length === 0) {
|
|
169
|
+
warnings.push(`MCP task "${task.id}": no models available. Add "mcp-server" to a ` +
|
|
170
|
+
"model's modes array in config/models.ts, or set models on the task.");
|
|
171
|
+
return [
|
|
172
|
+
makeProvider("anthropic:messages:claude-sonnet-4-20250514", "Claude Sonnet 4"),
|
|
173
|
+
];
|
|
174
|
+
}
|
|
175
|
+
return models.map((model) => makeProvider(model.id, model.label, model.config));
|
|
176
|
+
}
|
|
177
|
+
/**
|
|
178
|
+
* Build the MCP server connection config for the custom provider.
|
|
179
|
+
*
|
|
180
|
+
* Shape: { url?, command?, name?, auth? }
|
|
181
|
+
* The custom mcp-tool-provider.ts uses this to connect to the MCP server.
|
|
182
|
+
*/
|
|
183
|
+
function buildMCPServerConfig(task, warnings) {
|
|
139
184
|
const config = task.serverConfig;
|
|
140
185
|
if (!config) {
|
|
141
|
-
warnings.push(`MCP task "${task.id}": no serverConfig — using placeholder
|
|
186
|
+
warnings.push(`MCP task "${task.id}": no serverConfig — using placeholder. ` +
|
|
142
187
|
"Set serverConfig.command or serverConfig.url to point to your MCP server.");
|
|
143
|
-
return
|
|
144
|
-
{
|
|
145
|
-
id: "mcp",
|
|
146
|
-
label: `MCP Server: ${task.title}`,
|
|
147
|
-
config: { enabled: true, server: { name: task.id } },
|
|
148
|
-
},
|
|
149
|
-
];
|
|
188
|
+
return { name: task.id };
|
|
150
189
|
}
|
|
151
|
-
|
|
152
|
-
const server = { name: task.id };
|
|
190
|
+
const serverConfig = { name: task.id };
|
|
153
191
|
if (config.transport === "stdio") {
|
|
154
|
-
|
|
155
|
-
const parts = config.command?.split(/\s+/) ?? [];
|
|
156
|
-
server.command = parts[0] ?? "node";
|
|
157
|
-
if (parts.length > 1) {
|
|
158
|
-
server.args = parts.slice(1);
|
|
159
|
-
}
|
|
192
|
+
serverConfig.command = config.command;
|
|
160
193
|
}
|
|
161
194
|
else {
|
|
162
|
-
|
|
163
|
-
server.url = config.url;
|
|
195
|
+
serverConfig.url = config.url;
|
|
164
196
|
}
|
|
165
|
-
// Auth config
|
|
197
|
+
// Auth config
|
|
166
198
|
if (config.auth) {
|
|
167
|
-
|
|
199
|
+
serverConfig.auth = config.auth;
|
|
168
200
|
}
|
|
169
201
|
else if (config.env) {
|
|
170
|
-
// Backward compat: if env has a token-like variable, convert to
|
|
171
|
-
// bearer auth using Promptfoo's {{env.VAR}} template syntax
|
|
172
202
|
const tokenKey = Object.keys(config.env).find((k) => /token|auth|key/i.test(k));
|
|
173
203
|
if (tokenKey) {
|
|
174
204
|
const val = config.env[tokenKey];
|
|
175
|
-
// Convert $env(VAR) syntax to Promptfoo's {{env.VAR}} syntax
|
|
176
205
|
let envVar = val;
|
|
177
206
|
if (val.startsWith("$env(") && val.endsWith(")")) {
|
|
178
|
-
envVar = val.slice(5, -1);
|
|
207
|
+
envVar = val.slice(5, -1);
|
|
179
208
|
}
|
|
180
|
-
// Validate extracted env var name is non-empty and valid
|
|
181
209
|
if (!envVar || !/^[A-Za-z_][A-Za-z0-9_]*$/.test(envVar)) {
|
|
182
210
|
warnings.push(`MCP task: env var name "${envVar}" from "${val}" is not a valid ` +
|
|
183
211
|
"identifier — skipping auth config");
|
|
184
212
|
}
|
|
185
213
|
else {
|
|
186
|
-
|
|
214
|
+
serverConfig.auth = {
|
|
187
215
|
type: "bearer",
|
|
188
216
|
token: `{{env.${envVar}}}`,
|
|
189
217
|
};
|
|
190
218
|
}
|
|
191
219
|
}
|
|
192
220
|
}
|
|
193
|
-
|
|
194
|
-
if (config.env) {
|
|
195
|
-
const headers = {};
|
|
196
|
-
for (const [key, val] of Object.entries(config.env)) {
|
|
197
|
-
if (/header[_.]?/i.test(key)) {
|
|
198
|
-
headers[key.replace(/^header[_.]?/i, "")] = val;
|
|
199
|
-
}
|
|
200
|
-
}
|
|
201
|
-
if (Object.keys(headers).length > 0) {
|
|
202
|
-
server.headers = headers;
|
|
203
|
-
}
|
|
204
|
-
}
|
|
205
|
-
// Build top-level provider config
|
|
206
|
-
const providerConfig = {
|
|
207
|
-
enabled: true,
|
|
208
|
-
server,
|
|
209
|
-
};
|
|
210
|
-
// Tool filtering — map AILF capabilities to Promptfoo tools
|
|
211
|
-
if (task.capabilities && task.capabilities.length > 0) {
|
|
212
|
-
providerConfig.tools = task.capabilities;
|
|
213
|
-
}
|
|
214
|
-
// Timeout
|
|
215
|
-
if (config.startupTimeoutMs) {
|
|
216
|
-
providerConfig.timeout = config.startupTimeoutMs;
|
|
217
|
-
}
|
|
218
|
-
return [
|
|
219
|
-
{
|
|
220
|
-
id: "mcp",
|
|
221
|
-
label: `MCP Server: ${task.title}`,
|
|
222
|
-
config: providerConfig,
|
|
223
|
-
},
|
|
224
|
-
];
|
|
221
|
+
return serverConfig;
|
|
225
222
|
}
|
|
226
223
|
// ---------------------------------------------------------------------------
|
|
227
224
|
// Prompt assembly
|
|
@@ -298,6 +295,7 @@ export const handler = {
|
|
|
298
295
|
}
|
|
299
296
|
const result = compileMCPTask(task, {
|
|
300
297
|
graderProvider: ctx.graderProvider,
|
|
298
|
+
models: ctx.models,
|
|
301
299
|
});
|
|
302
300
|
return {
|
|
303
301
|
providers: result.providers,
|