@sanity/ailf 0.5.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/config/features.ts +23 -0
- package/config/models.ts +95 -0
- package/config/prompts.ts +16 -0
- package/config/rubrics.ts +225 -0
- package/config/schedules.ts +47 -0
- package/config/sinks.ts +37 -0
- package/config/sources.ts +21 -0
- package/config/thresholds.ts +61 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +171 -0
- package/dist/_vendor/ailf-core/config-helpers.js +170 -0
- package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
- package/dist/_vendor/ailf-core/env-helper.js +45 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/examples/index.js +25 -0
- package/dist/_vendor/ailf-core/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/index.js +5 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +17 -2
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
- package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +8 -2
- package/dist/_vendor/ailf-core/schemas/eval-config.js +17 -2
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +9 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +8 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -31
- package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -9
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
- package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/services/index.js +2 -1
- package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
- package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
- package/dist/_vendor/ailf-core/services/scoring.js +25 -15
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +332 -0
- package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +45 -83
- package/dist/_vendor/ailf-core/types/index.js +8 -1
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +257 -0
- package/dist/_vendor/ailf-core/types/plugin-registry.js +185 -0
- package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
- package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
- package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
- package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
- package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
- package/dist/_vendor/ailf-core/types/trace.js +18 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
- package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
- package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
- package/dist/_vendor/ailf-shared/index.d.ts +0 -1
- package/dist/_vendor/ailf-shared/index.js +0 -1
- package/dist/adapters/api-client/build-request.js +14 -13
- package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
- package/dist/adapters/config-sources/file-config-adapter.js +39 -12
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
- package/dist/adapters/config-sources/ts-config-loader.js +141 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
- package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +35 -39
- package/dist/adapters/task-sources/index.d.ts +3 -2
- package/dist/adapters/task-sources/index.js +3 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
- package/dist/adapters/task-sources/repo-schemas.js +227 -19
- package/dist/adapters/task-sources/repo-task-source.d.ts +16 -12
- package/dist/adapters/task-sources/repo-task-source.js +92 -80
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
- package/dist/adapters/task-sources/task-file-loader.js +83 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
- package/dist/adapters/task-sources/yaml-task-source.js +19 -16
- package/dist/cli.js +0 -2
- package/dist/commands/baseline.js +4 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/coverage-audit.js +9 -1
- package/dist/commands/explain-handler.js +25 -23
- package/dist/commands/fetch-docs.js +3 -2
- package/dist/commands/generate-configs.js +1 -1
- package/dist/commands/init.d.ts +6 -4
- package/dist/commands/init.js +302 -23
- package/dist/commands/interactive.js +11 -7
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +16 -6
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +4 -2
- package/dist/commands/pr-comment.js +1 -1
- package/dist/commands/publish.js +2 -2
- package/dist/commands/readiness-report.js +13 -6
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +13 -1
- package/dist/composition-root.js +99 -4
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +28 -8
- package/dist/orchestration/steps/calculate-scores-step.js +24 -11
- package/dist/orchestration/steps/fetch-docs-step.js +8 -7
- package/dist/orchestration/steps/gap-analysis-step.js +8 -7
- package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
- package/dist/orchestration/steps/generate-configs-step.js +261 -51
- package/dist/orchestration/steps/grader-consistency-step.js +7 -4
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/readiness-step.js +5 -6
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
- package/dist/orchestration/steps/run-eval-step.js +8 -7
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/cache.js +36 -8
- package/dist/pipeline/calculate-scores.d.ts +2 -4
- package/dist/pipeline/calculate-scores.js +43 -113
- package/dist/pipeline/checks.js +2 -2
- package/dist/pipeline/compare.js +8 -8
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +392 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +404 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
- package/dist/pipeline/compiler/assertion-mapper.js +175 -0
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
- package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
- package/dist/pipeline/compiler/config-loader.d.ts +56 -0
- package/dist/pipeline/compiler/config-loader.js +111 -0
- package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
- package/dist/pipeline/compiler/fixture-resolver.js +113 -0
- package/dist/pipeline/compiler/hash.d.ts +11 -0
- package/dist/pipeline/compiler/hash.js +18 -0
- package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
- package/dist/pipeline/compiler/ignore-fields.js +113 -0
- package/dist/pipeline/compiler/index.d.ts +29 -0
- package/dist/pipeline/compiler/index.js +45 -0
- package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
- package/dist/pipeline/compiler/literacy-bridge.js +172 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +15 -0
- package/dist/pipeline/compiler/mode-handlers/index.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/index.d.ts +9 -0
- package/dist/pipeline/compiler/presets/index.js +8 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +42 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.js +208 -0
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
- package/dist/pipeline/compiler/provider-assembler.js +137 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
- package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
- package/dist/pipeline/compiler/sandbox/index.js +11 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
- package/dist/pipeline/compiler/scoring-bridge.js +114 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
- package/dist/pipeline/compiler/task-graph-builder.js +291 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
- package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
- package/dist/pipeline/compiler/telemetry/index.js +19 -0
- package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
- package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
- package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
- package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
- package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
- package/dist/pipeline/compiler/variable-resolver.js +115 -0
- package/dist/pipeline/coverage-audit.d.ts +15 -5
- package/dist/pipeline/coverage-audit.js +41 -22
- package/dist/pipeline/eval-constants.d.ts +16 -6
- package/dist/pipeline/eval-constants.js +25 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -9
- package/dist/pipeline/expand-tasks.d.ts +19 -10
- package/dist/pipeline/expand-tasks.js +34 -28
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +2 -2
- package/dist/pipeline/generate-configs.d.ts +22 -4
- package/dist/pipeline/generate-configs.js +53 -24
- package/dist/pipeline/grader-api.d.ts +3 -3
- package/dist/pipeline/grader-api.js +5 -12
- package/dist/pipeline/grader-compare-runner.js +20 -27
- package/dist/pipeline/grader-comparison.d.ts +4 -8
- package/dist/pipeline/grader-comparison.js +11 -17
- package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
- package/dist/pipeline/grader-consistency-runner.js +16 -20
- package/dist/pipeline/grader-consistency.d.ts +6 -10
- package/dist/pipeline/grader-consistency.js +13 -32
- package/dist/pipeline/grader-sensitivity-runner.js +7 -5
- package/dist/pipeline/grader-sensitivity.d.ts +2 -6
- package/dist/pipeline/grader-sensitivity.js +10 -10
- package/dist/pipeline/grader-validate-runner.js +7 -5
- package/dist/pipeline/grader-validation.d.ts +2 -6
- package/dist/pipeline/grader-validation.js +14 -22
- package/dist/pipeline/map-request-to-config.js +7 -1
- package/dist/pipeline/mirror-repo-tasks.d.ts +13 -13
- package/dist/pipeline/mirror-repo-tasks.js +22 -21
- package/dist/pipeline/normalize-mode.d.ts +49 -0
- package/dist/pipeline/normalize-mode.js +64 -0
- package/dist/pipeline/plan.d.ts +5 -2
- package/dist/pipeline/plan.js +134 -78
- package/dist/pipeline/pr-comment.js +2 -0
- package/dist/pipeline/profile-resolution.d.ts +22 -14
- package/dist/pipeline/profile-resolution.js +41 -19
- package/dist/pipeline/provenance.d.ts +2 -2
- package/dist/pipeline/provenance.js +12 -17
- package/dist/pipeline/release-report.js +4 -4
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/rubric-loader.d.ts +20 -0
- package/dist/pipeline/rubric-loader.js +37 -0
- package/dist/pipeline/validate.d.ts +4 -4
- package/dist/pipeline/validate.js +64 -53
- package/dist/schedules/loader.js +18 -8
- package/dist/scripts/migrate-task-mode.d.ts +24 -0
- package/dist/scripts/migrate-task-mode.js +85 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +15 -15
- package/dist/sinks/loader.js +5 -7
- package/dist/sources.d.ts +7 -7
- package/dist/sources.js +22 -24
- package/dist/webhook/dispatch.js +2 -1
- package/package.json +15 -4
- package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
- package/tasks/literacy/frameworks.task.ts +128 -0
- package/tasks/literacy/functions.task.ts +69 -0
- package/tasks/literacy/groq.task.ts +258 -0
- package/tasks/literacy/nextjs-live.task.ts +75 -0
- package/tasks/literacy/studio-setup.task.ts +131 -0
- package/tasks/literacy/visual-editing.task.ts +146 -0
- package/config/features.yaml +0 -116
- package/config/models.yaml +0 -116
- package/config/prompts.yaml +0 -75
- package/config/rubrics.yaml +0 -81
- package/config/schedules.yaml +0 -43
- package/config/sinks.yaml +0 -54
- package/config/sources.yaml +0 -51
- package/config/thresholds.yaml +0 -49
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP-specific assertion types — ergonomic assertions for MCP server testing.
|
|
3
|
+
*
|
|
4
|
+
* Each assertion type compiles down to a Promptfoo `javascript` assertion
|
|
5
|
+
* with the appropriate validation logic. The developer writes:
|
|
6
|
+
*
|
|
7
|
+
* ```typescript
|
|
8
|
+
* assertions: [
|
|
9
|
+
* { type: "tool-called", value: "getDocument" },
|
|
10
|
+
* { type: "tool-input-matches", value: { documentId: "doc-123" } },
|
|
11
|
+
* { type: "tool-output-matches", value: { title: "Hello" } },
|
|
12
|
+
* { type: "error-returned", value: { code: -32602 } },
|
|
13
|
+
* ]
|
|
14
|
+
* ```
|
|
15
|
+
*
|
|
16
|
+
* The compiler transforms these into Promptfoo-compatible `javascript`
|
|
17
|
+
* assertions that inspect the tool call trace in the evaluation output.
|
|
18
|
+
*
|
|
19
|
+
* @see docs/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
|
|
20
|
+
*/
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// Public API
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
/**
|
|
25
|
+
* Build MCP-specific assertions from task assertion definitions.
|
|
26
|
+
*
|
|
27
|
+
* Handles both MCP-specific types (tool-called, tool-input-matches, etc.)
|
|
28
|
+
* and standard assertion types (contains, llm-rubric, etc.) which are
|
|
29
|
+
* passed through unchanged.
|
|
30
|
+
*/
|
|
31
|
+
export function buildMCPAssertions(assertions, context) {
|
|
32
|
+
const result = [];
|
|
33
|
+
const warnings = [];
|
|
34
|
+
for (const assertion of assertions) {
|
|
35
|
+
const mapped = mapMCPAssertion(assertion, context, warnings);
|
|
36
|
+
if (mapped) {
|
|
37
|
+
result.push(mapped);
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
return { assertions: result, warnings };
|
|
41
|
+
}
|
|
42
|
+
// ---------------------------------------------------------------------------
|
|
43
|
+
// Assertion mapping
|
|
44
|
+
// ---------------------------------------------------------------------------
|
|
45
|
+
function mapMCPAssertion(assertion, context, warnings) {
|
|
46
|
+
switch (assertion.type) {
|
|
47
|
+
case "tool-called":
|
|
48
|
+
return buildToolCalledAssertion(assertion, context);
|
|
49
|
+
case "tool-input-matches":
|
|
50
|
+
return buildToolInputMatchesAssertion(assertion, context);
|
|
51
|
+
case "tool-output-matches":
|
|
52
|
+
return buildToolOutputMatchesAssertion(assertion, context);
|
|
53
|
+
case "error-returned":
|
|
54
|
+
return buildErrorReturnedAssertion(assertion, context);
|
|
55
|
+
case "capability-available":
|
|
56
|
+
return buildCapabilityAssertion(assertion, context);
|
|
57
|
+
// Standard assertions — pass through
|
|
58
|
+
case "contains":
|
|
59
|
+
case "equals":
|
|
60
|
+
case "regex":
|
|
61
|
+
case "is-json":
|
|
62
|
+
case "llm-rubric":
|
|
63
|
+
case "javascript":
|
|
64
|
+
case "python":
|
|
65
|
+
return {
|
|
66
|
+
type: assertion.type,
|
|
67
|
+
...("value" in assertion ? { value: assertion.value } : {}),
|
|
68
|
+
...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
|
|
69
|
+
...(assertion.type === "llm-rubric" && context.graderProvider
|
|
70
|
+
? { provider: context.graderProvider }
|
|
71
|
+
: {}),
|
|
72
|
+
};
|
|
73
|
+
default:
|
|
74
|
+
warnings.push(`MCP task "${context.taskId}": unknown assertion type "${assertion.type}" — passed through`);
|
|
75
|
+
return {
|
|
76
|
+
type: assertion.type,
|
|
77
|
+
...("value" in assertion ? { value: assertion.value } : {}),
|
|
78
|
+
};
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
// ---------------------------------------------------------------------------
|
|
82
|
+
// tool-called — asserts the model called a specific tool by name
|
|
83
|
+
// ---------------------------------------------------------------------------
|
|
84
|
+
/**
 * Build the `javascript` assertion for a `tool-called` task assertion.
 *
 * The generated assertion body looks for evidence that the named tool was
 * called, trying three sources in order:
 *   1. `context.vars.__toolCalls` — a structured array of calls, used when
 *      it is a non-empty array.
 *   2. A `<!-- MCP_TOOLS_CALLED: [...] -->` summary comment in the output.
 *   3. `tool_use` / `function` JSON fragments found in the output text.
 *
 * @param assertion - Task assertion; `value` is the expected tool name and
 *   `weight` (when defined) is copied onto the result.
 * @param _context - Unused.
 * @returns `{ type: "javascript", value, weight? }` where `value` is
 *   whatever `buildJsAssertion` (defined elsewhere in this file) returns for
 *   the generated body.
 */
function buildToolCalledAssertion(assertion, _context) {
    const toolName = String(assertion.value ?? "");
    return {
        type: "javascript",
        value: buildJsAssertion(`tool-called: ${toolName}`, `
      var toolName = ${JSON.stringify(toolName)};

      // An empty name would make the text heuristic below match any output
      // (String.prototype.includes('') is always true), so fail explicitly.
      if (!toolName) {
        return {
          pass: false,
          score: 0,
          reason: 'tool-called assertion has no tool name (missing "value")',
        };
      }

      // Strategy 1: structured tool calls from Promptfoo
      var toolCalls = context.vars.__toolCalls || [];
      if (Array.isArray(toolCalls) && toolCalls.length > 0) {
        var calledStructured = toolCalls.some(function(tc) { return tc != null && tc.name === toolName; });
        return {
          pass: calledStructured,
          score: calledStructured ? 1 : 0,
          reason: calledStructured
            ? 'Tool "' + toolName + '" was called (via __toolCalls)'
            : 'Expected tool "' + toolName + '" but found: ' + toolCalls.map(function(tc) { return tc != null ? tc.name : tc; }).join(', '),
        };
      }

      // Strategy 2: MCP_TOOLS_CALLED summary appended by custom mcp-tool-provider
      var outputStr = typeof output === 'string' ? output : JSON.stringify(output || '');
      var summaryMatch = outputStr.match(/<!-- MCP_TOOLS_CALLED: (\\[.*?\\]) -->/);
      if (summaryMatch) {
        try {
          var calledTools = JSON.parse(summaryMatch[1]);
          var calledSummary = calledTools.includes(toolName);
          var count = calledTools.filter(function(n) { return n === toolName; }).length;
          return {
            pass: calledSummary,
            score: calledSummary ? 1 : 0,
            reason: calledSummary
              ? 'Tool "' + toolName + '" was called ' + count + ' time(s)'
              : 'Expected tool "' + toolName + '" but found: ' + calledTools.join(', '),
          };
        } catch (e) { /* malformed summary: fall through to Strategy 3 */ }
      }

      // Strategy 3: parse output for tool_use blocks (built-in provider fallback)
      var toolUsePattern = /"type"\\s*:\\s*"tool_use"[^}]*"name"\\s*:\\s*"([^"]+)"/g;
      var foundTools = [];
      var match;
      while ((match = toolUsePattern.exec(outputStr)) !== null) {
        foundTools.push(match[1]);
      }
      var fnCallPattern = /"function"\\s*:\\s*\\{[^}]*"name"\\s*:\\s*"([^"]+)"/g;
      while ((match = fnCallPattern.exec(outputStr)) !== null) {
        foundTools.push(match[1]);
      }
      // Last resort: neither pattern matched, but both the tool name and a
      // tool_use marker appear somewhere in the text.
      if (foundTools.length === 0 && outputStr.includes(toolName) && outputStr.includes('tool_use')) {
        foundTools.push(toolName);
      }

      var called = foundTools.includes(toolName);
      return {
        pass: called,
        score: called ? 1 : 0,
        reason: called
          ? 'Tool "' + toolName + '" was called (detected in output)'
          : 'Expected tool "' + toolName + '" to be called. ' +
            (foundTools.length > 0
              ? 'Tools found in output: ' + foundTools.join(', ')
              : 'No tool calls detected in output'),
      };`),
        ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
    };
}
|
|
156
|
+
// ---------------------------------------------------------------------------
|
|
157
|
+
// tool-input-matches — asserts tool call inputs match a schema/value
|
|
158
|
+
// ---------------------------------------------------------------------------
|
|
159
|
+
/**
 * Build the `javascript` assertion for a `tool-input-matches` task assertion.
 *
 * The generated assertion body reads `context.vars.__toolCalls`, optionally
 * narrows it to calls of one tool, and passes when at least one call's input
 * contains every expected key with an equal value (compared via
 * `JSON.stringify`, so nested objects must also agree on key order).
 *
 * @param assertion - Task assertion; `value` is an object of expected input
 *   fields, `toolName` (or `tool`) optionally restricts which calls are
 *   checked, and `weight` (when defined) is copied onto the result.
 * @param _context - Unused.
 * @returns `{ type: "javascript", value, weight? }` where `value` is
 *   whatever `buildJsAssertion` (defined elsewhere in this file) returns for
 *   the generated body.
 */
function buildToolInputMatchesAssertion(assertion, _context) {
    const expected = assertion.value;
    const toolName = assertion.toolName ?? assertion.tool;
    return {
        type: "javascript",
        value: buildJsAssertion(`tool-input-matches${toolName ? `: ${toolName}` : ""}`, `
      const rawCalls = context.vars.__toolCalls || [];
      // Ignore a non-array value and null entries instead of throwing on them.
      const toolCalls = Array.isArray(rawCalls) ? rawCalls.filter(tc => tc != null) : [];
      const expected = ${JSON.stringify(expected ?? null)};
      const toolFilter = ${JSON.stringify(toolName ?? null)};

      // A missing or non-object "value" would make Object.entries throw below;
      // report it as a failed assertion with an actionable reason instead.
      if (expected === null || typeof expected !== 'object') {
        return {
          pass: false,
          score: 0,
          reason: 'tool-input-matches requires an object "value" of expected input fields',
        };
      }

      const targetCalls = toolFilter
        ? toolCalls.filter(tc => tc.name === toolFilter)
        : toolCalls;

      if (targetCalls.length === 0) {
        return {
          pass: false,
          score: 0,
          reason: toolFilter
            ? 'No calls to tool "' + toolFilter + '" found'
            : 'No tool calls found',
        };
      }

      // Some providers deliver call arguments as a JSON string rather than an
      // object; decode those so field comparison works on the parsed value.
      const readInput = tc => {
        const raw = tc.input || tc.arguments || {};
        if (typeof raw !== 'string') return raw;
        try {
          const parsed = JSON.parse(raw);
          return parsed !== null && typeof parsed === 'object' ? parsed : {};
        } catch (e) {
          return {};
        }
      };

      // Check if any call's input matches the expected value
      const match = targetCalls.some(tc => {
        const input = readInput(tc);
        return Object.entries(expected).every(([k, v]) =>
          JSON.stringify(input[k]) === JSON.stringify(v)
        );
      });

      return {
        pass: match,
        score: match ? 1 : 0,
        reason: match
          ? 'Tool input matches expected values'
          : 'Tool input does not match. Expected: ' + JSON.stringify(expected) +
            ', Got: ' + JSON.stringify(targetCalls.map(tc => tc.input || tc.arguments)),
      };`),
        ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
    };
}
|
|
202
|
+
// ---------------------------------------------------------------------------
|
|
203
|
+
// tool-output-matches — asserts tool outputs match expected shape/values
|
|
204
|
+
// ---------------------------------------------------------------------------
|
|
205
|
+
/**
 * Compile a `tool-output-matches` assertion into a Promptfoo `javascript` assertion.
 *
 * The generated script reads the recorded calls from `context.vars.__toolCalls`,
 * optionally narrows them to the tool named by `assertion.toolName` (or
 * `assertion.tool`), and passes when at least one call's `output` (or `result`)
 * holds every key of `assertion.value` with a JSON-equal value.
 *
 * @param assertion - Assertion definition; reads `value`, `toolName`/`tool` and `weight`.
 * @param _context - Unused by this builder.
 * @returns Promptfoo assertion object (`type`, `value`, and `weight` when provided).
 */
function buildToolOutputMatchesAssertion(assertion, _context) {
    // A missing/null `value` would be serialized as `undefined`/`null` and make
    // `Object.entries(expected)` throw inside the generated script, so it is
    // treated as "no constraints" (same guard as the error-returned builder).
    const expected = assertion.value ?? {};
    const toolName = assertion.toolName ?? assertion.tool;
    return {
        type: "javascript",
        value: buildJsAssertion(`tool-output-matches${toolName ? `: ${toolName}` : ""}`, `
    const toolCalls = context.vars.__toolCalls || [];
    const expected = ${JSON.stringify(expected)};
    const toolFilter = ${JSON.stringify(toolName ?? null)};

    const targetCalls = toolFilter
      ? toolCalls.filter(tc => tc.name === toolFilter)
      : toolCalls;

    if (targetCalls.length === 0) {
      return {
        pass: false,
        score: 0,
        reason: toolFilter
          ? 'No calls to tool "' + toolFilter + '" found'
          : 'No tool calls found',
      };
    }

    const match = targetCalls.some(tc => {
      const output = tc.output || tc.result || {};
      return Object.entries(expected).every(([k, v]) =>
        JSON.stringify(output[k]) === JSON.stringify(v)
      );
    });

    return {
      pass: match,
      score: match ? 1 : 0,
      reason: match
        ? 'Tool output matches expected values'
        : 'Tool output does not match. Expected: ' + JSON.stringify(expected),
    };`),
        ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
    };
}
|
|
246
|
+
// ---------------------------------------------------------------------------
|
|
247
|
+
// error-returned — asserts the server returned a specific error
|
|
248
|
+
// ---------------------------------------------------------------------------
|
|
249
|
+
/**
 * Compile an `error-returned` assertion into a Promptfoo `javascript` assertion.
 *
 * The generated script looks at every entry of `context.vars.__toolCalls` that
 * has a truthy `error` and passes when at least one of those errors satisfies
 * the expectation: `expected.code` (strict equality, when given) and
 * `expected.message` (substring of a string `error.message`, when given).
 * When nothing matches, the mismatch reasons are reported for the first
 * errored call.
 *
 * @param assertion - Assertion definition; reads `value` (`{ code?, message? }`) and `weight`.
 * @param _context - Unused by this builder.
 * @returns Promptfoo assertion object (`type`, `value`, and `weight` when provided).
 */
function buildErrorReturnedAssertion(assertion, _context) {
    const expected = assertion.value;
    return {
        type: "javascript",
        value: buildJsAssertion("error-returned", `
    const toolCalls = context.vars.__toolCalls || [];
    const expected = ${JSON.stringify(expected ?? {})};

    const errorCalls = toolCalls.filter(tc => tc.error);
    if (errorCalls.length === 0) {
      return {
        pass: false,
        score: 0,
        reason: 'Expected an error response but no errors were returned',
      };
    }

    // Lists how one error deviates from the expectation; empty means it matches.
    const mismatches = (error) => {
      const found = [];
      if (expected.code !== undefined && error.code !== expected.code) {
        found.push('Expected error code ' + expected.code + ', got ' + error.code);
      }
      if (expected.message !== undefined) {
        const msgMatch = typeof error.message === 'string' &&
          error.message.includes(expected.message);
        if (!msgMatch) {
          found.push('Expected error message containing "' + expected.message +
            '", got "' + (error.message || '') + '"');
        }
      }
      return found;
    };

    // Any errored call may satisfy the assertion, not only the first one.
    const perCall = errorCalls.map(tc => mismatches(tc.error));
    const pass = perCall.some(found => found.length === 0);
    const reasons = pass ? ['Error matches expected pattern'] : perCall[0];

    return {
      pass,
      score: pass ? 1 : 0,
      reason: reasons.join('; '),
    };`),
        ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
    };
}
|
|
297
|
+
// ---------------------------------------------------------------------------
|
|
298
|
+
// capability-available — asserts the server advertises a capability
|
|
299
|
+
// ---------------------------------------------------------------------------
|
|
300
|
+
/**
 * Compile a `capability-available` assertion into a Promptfoo `javascript` assertion.
 *
 * The generated script checks whether the capability named by `assertion.value`
 * is present in `context.vars.__serverCapabilities`. That value may be a list
 * of capability names or an object keyed by capability name; anything else is
 * treated as "no capabilities" instead of throwing.
 *
 * @param assertion - Assertion definition; reads `value` (stringified) and `weight`.
 * @param _context - Unused by this builder.
 * @returns Promptfoo assertion object (`type`, `value`, and `weight` when provided).
 */
function buildCapabilityAssertion(assertion, _context) {
    const capability = String(assertion.value ?? "");
    return {
        type: "javascript",
        value: buildJsAssertion(`capability-available: ${capability}`, `
    const rawCapabilities = context.vars.__serverCapabilities;
    // Normalize to a list of names so .includes/.join below cannot throw
    const capabilities = Array.isArray(rawCapabilities)
      ? rawCapabilities
      : (rawCapabilities && typeof rawCapabilities === 'object'
        ? Object.keys(rawCapabilities)
        : []);
    const expected = ${JSON.stringify(capability)};
    const available = capabilities.includes(expected);

    return {
      pass: available,
      score: available ? 1 : 0,
      reason: available
        ? 'Server advertises capability "' + expected + '"'
        : 'Server does not advertise capability "' + expected + '". ' +
          'Available: ' + (capabilities.join(', ') || 'none'),
    };`),
        ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
    };
}
|
|
320
|
+
// ---------------------------------------------------------------------------
|
|
321
|
+
// Helpers
|
|
322
|
+
// ---------------------------------------------------------------------------
|
|
323
|
+
/**
 * Build a Promptfoo-compatible JavaScript assertion string.
 *
 * The result is a one-line `// MCP assertion: <label>` header comment followed
 * by the trimmed assertion body. The body is expected to reference `output`
 * and `context` and to `return` its result at the top level.
 *
 * @param label - Human-readable assertion label placed in the header comment.
 * @param body - JavaScript statements forming the assertion body.
 * @returns The assertion source text.
 */
function buildJsAssertion(label, body) {
    // No IIFE wrapper — Promptfoo wraps the assertion in its own function via
    // new Function('output', 'context', ...). The body must use `return` at
    // the top level for the result to reach Promptfoo's validator.
    //
    // The label is interpolated into a `//` comment, so any line terminator in
    // it would end the comment and turn the rest of the label into executable
    // code. Collapse line terminators to keep the header on a single line.
    const safeLabel = String(label).replace(/[\r\n\u2028\u2029]+/g, " ");
    return `// MCP assertion: ${safeLabel}\n${body.trim()}`;
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP server task compilation — core compiler logic.
|
|
3
|
+
*
|
|
4
|
+
* Produces Promptfoo configuration from MCP server task definitions:
|
|
5
|
+
* 1. A provider config pointing to the MCP server
|
|
6
|
+
* 2. Test cases with tool-call assertions
|
|
7
|
+
* 3. Appropriate prompts for the evaluation
|
|
8
|
+
*/
|
|
9
|
+
import type { MCPServerTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
10
|
+
import type { MCPCompileOptions, MCPCompileResult } from "./types.js";
|
|
11
|
+
/**
|
|
12
|
+
* Compile an MCP server task definition into Promptfoo configuration.
|
|
13
|
+
*
|
|
14
|
+
* This is the core of the MCP mode handler. It produces:
|
|
15
|
+
* 1. A provider config pointing to the MCP server
|
|
16
|
+
* 2. Test cases with tool-call assertions
|
|
17
|
+
* 3. Appropriate prompts for the evaluation
|
|
18
|
+
*/
|
|
19
|
+
export declare function compileMCPTask(task: MCPServerTaskDefinition, options?: MCPCompileOptions): MCPCompileResult;
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP server task compilation — core compiler logic.
|
|
3
|
+
*
|
|
4
|
+
* Produces Promptfoo configuration from MCP server task definitions:
|
|
5
|
+
* 1. A provider config pointing to the MCP server
|
|
6
|
+
* 2. Test cases with tool-call assertions
|
|
7
|
+
* 3. Appropriate prompts for the evaluation
|
|
8
|
+
*/
|
|
9
|
+
import { buildMCPAssertions } from "./assertions.js";
|
|
10
|
+
import { buildMCPProvider } from "./provider-config.js";
|
|
11
|
+
import { validateMCPTask } from "./validation.js";
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
// Public API
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
/**
 * Compile an MCP server task definition into Promptfoo configuration.
 *
 * Steps, in order:
 *   1. Validate the task; each validation problem is recorded as a warning
 *      (compilation continues regardless).
 *   2. Build the providers from `options.models` (empty list when absent).
 *   3. Build the prompts.
 *   4. Build the test cases.
 *
 * @param task - MCP server task definition.
 * @param options - Optional compile options (`models`, `graderProvider`).
 * @returns `{ providers, tests, prompts, warnings }`.
 */
export function compileMCPTask(task, options) {
    const warnings = [];
    for (const issue of validateMCPTask(task)) {
        warnings.push(`MCP task "${task.id}": ${issue.field} — ${issue.message}`);
    }
    // The same `warnings` array is handed to the builders so they can append to it
    const modelEntries = options?.models ?? [];
    const providers = buildMCPProvider(task, modelEntries, warnings);
    const prompts = buildMCPPrompts(task);
    const tests = buildMCPTestCases(task, options, warnings);
    return { providers, tests, prompts, warnings };
}
|
|
40
|
+
// ---------------------------------------------------------------------------
|
|
41
|
+
// Prompt assembly
|
|
42
|
+
// ---------------------------------------------------------------------------
|
|
43
|
+
/**
 * Build the single prompt used in MCP mode.
 *
 * The prompt text is the first non-nullish value of: `task.prompt.text`,
 * `task.prompt.vars.task`, `task.description`, and finally a generated
 * "Test MCP server: <title>" fallback. It is always coerced to a string.
 *
 * @param task - MCP server task definition.
 * @returns A one-element array holding the prompt (`id`, `label`, `raw`).
 */
function buildMCPPrompts(task) {
    let text = task.prompt?.text;
    if (text == null) {
        text = task.prompt?.vars?.task;
    }
    if (text == null) {
        text = task.description;
    }
    if (text == null) {
        text = `Test MCP server: ${task.title}`;
    }
    const prompt = {
        id: "mcp-test",
        label: `MCP: ${task.title}`,
        raw: String(text),
    };
    return [prompt];
}
|
|
57
|
+
// ---------------------------------------------------------------------------
|
|
58
|
+
// Test case assembly
|
|
59
|
+
// ---------------------------------------------------------------------------
|
|
60
|
+
/**
 * Build the Promptfoo test cases for an MCP server task.
 *
 * Always emits a primary test case. When `task.multiTurn.turns` is non-empty,
 * a second "[multi-turn]" case is added that carries the turns in the
 * `__multiTurn` var. Both cases share the same compiled assertions, and the
 * `assert` key is omitted when there are none.
 *
 * @param task - MCP server task definition.
 * @param options - Optional compile options; `graderProvider` is forwarded to the assertion builder.
 * @param warnings - Mutable list; assertion-compilation warnings are appended to it.
 * @returns The list of test cases.
 */
function buildMCPTestCases(task, options, warnings) {
    const assertionContext = {
        capabilities: task.capabilities ?? [],
        graderProvider: options?.graderProvider,
        taskId: task.id,
    };
    // Compile assertions (GeneralizedAssertionDefinition[] is structurally
    // compatible with the assertion builder's input type)
    const assertions = [];
    if (task.assertions) {
        const { assertions: mapped, warnings: assertionWarnings } = buildMCPAssertions(task.assertions, assertionContext);
        assertions.push(...mapped);
        warnings.push(...assertionWarnings);
    }
    // Spread the user vars first and set `task` last: otherwise an explicit
    // null/undefined `vars.task` would overwrite the description/title fallback.
    const promptVars = task.prompt?.vars ?? {};
    const vars = {
        ...promptVars,
        task: promptVars.task ?? task.description ?? `Test: ${task.title}`,
    };
    const assertBlock = assertions.length > 0 ? { assert: assertions } : {};
    // Primary test case
    const tests = [
        {
            description: `${task.id} — ${task.title}`,
            vars,
            ...assertBlock,
        },
    ];
    // Multi-turn test case
    const turns = task.multiTurn?.turns;
    if (turns && turns.length > 0) {
        tests.push({
            description: `${task.id} — ${task.title} [multi-turn]`,
            vars: {
                ...vars,
                __multiTurn: turns,
            },
            ...assertBlock,
        });
    }
    return tests;
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP Server mode handler — directory barrel.
|
|
3
|
+
*
|
|
4
|
+
* MCPServerModeHandler — compilation rules for `mcp-server` evaluation mode.
|
|
5
|
+
*
|
|
6
|
+
* This is the first non-literacy mode handler, proving the compiler
|
|
7
|
+
* architecture works end-to-end. It translates MCP server task definitions
|
|
8
|
+
* into Promptfoo configuration with:
|
|
9
|
+
*
|
|
10
|
+
* - An MCP provider that wraps the server under test
|
|
11
|
+
* - Tool-call assertions compiled to Promptfoo `javascript` assertions
|
|
12
|
+
* - Server lifecycle management via Promptfoo provider hooks
|
|
13
|
+
* - Multi-turn conversation support via Promptfoo's `steps` syntax
|
|
14
|
+
*
|
|
15
|
+
* @see docs/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
|
|
16
|
+
* @see packages/core/src/types/eval-mode-config.ts — MCPServerModeConfig
|
|
17
|
+
* @see packages/core/src/types/generalized-task.ts — MCPServerTaskDefinition
|
|
18
|
+
*/
|
|
19
|
+
import type { ModeHandler } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
20
|
+
/** ModeHandler-conformant export for the mcp-server evaluation mode. */
|
|
21
|
+
export declare const handler: ModeHandler;
|
|
22
|
+
export type { MCPAssertionContext, MCPCompileOptions, MCPCompileResult, MCPValidationError, } from "./types.js";
|
|
23
|
+
export { buildMCPAssertions } from "./assertions.js";
|
|
24
|
+
export { compileMCPTask } from "./compiler.js";
|
|
25
|
+
export { validateMCPTask } from "./validation.js";
|
|
26
|
+
export { MCP_PROMPT_TEMPLATES } from "./prompts.js";
|
|
27
|
+
export { DEFAULT_MAX_TOOL_ROUNDS, MCP_PROVIDER_PATH, } from "./provider-config.js";
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP Server mode handler — directory barrel.
|
|
3
|
+
*
|
|
4
|
+
* MCPServerModeHandler — compilation rules for `mcp-server` evaluation mode.
|
|
5
|
+
*
|
|
6
|
+
* This is the first non-literacy mode handler, proving the compiler
|
|
7
|
+
* architecture works end-to-end. It translates MCP server task definitions
|
|
8
|
+
* into Promptfoo configuration with:
|
|
9
|
+
*
|
|
10
|
+
* - An MCP provider that wraps the server under test
|
|
11
|
+
* - Tool-call assertions compiled to Promptfoo `javascript` assertions
|
|
12
|
+
* - Server lifecycle management via Promptfoo provider hooks
|
|
13
|
+
* - Multi-turn conversation support via Promptfoo's `steps` syntax
|
|
14
|
+
*
|
|
15
|
+
* @see docs/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
|
|
16
|
+
* @see packages/core/src/types/eval-mode-config.ts — MCPServerModeConfig
|
|
17
|
+
* @see packages/core/src/types/generalized-task.ts — MCPServerTaskDefinition
|
|
18
|
+
*/
|
|
19
|
+
import { compileMCPTask } from "./compiler.js";
|
|
20
|
+
import { MCP_PROMPT_TEMPLATES } from "./prompts.js";
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// ModeHandler adapter
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
/**
 * ModeHandler-conformant export for the mcp-server evaluation mode.
 *
 * - `getPrompts()` returns the handler-owned prompt templates.
 * - `compileTask(task, ctx)` rejects any task whose `mode` is not
 *   `"mcp-server"`, then compiles it with the grader provider and models
 *   taken from `ctx`, returning `{ providers, tests, prompts, warnings }`.
 */
export const handler = {
    getPrompts() {
        return MCP_PROMPT_TEMPLATES;
    },
    compileTask(task, ctx) {
        const isMcpServerTask = "mode" in task && task.mode === "mcp-server";
        if (!isMcpServerTask) {
            throw new Error(`MCP server handler received task with mode "${task.mode ?? "undefined"}" — expected "mcp-server"`);
        }
        const compileOptions = {
            graderProvider: ctx.graderProvider,
            models: ctx.models,
        };
        // Return only the four fields that make up the handler's result
        const { providers, tests, prompts, warnings } = compileMCPTask(task, compileOptions);
        return { providers, tests, prompts, warnings };
    },
};
|
|
45
|
+
// Assertions
|
|
46
|
+
export { buildMCPAssertions } from "./assertions.js";
|
|
47
|
+
// Compilation
|
|
48
|
+
export { compileMCPTask } from "./compiler.js";
|
|
49
|
+
// Validation
|
|
50
|
+
export { validateMCPTask } from "./validation.js";
|
|
51
|
+
// Prompts
|
|
52
|
+
export { MCP_PROMPT_TEMPLATES } from "./prompts.js";
|
|
53
|
+
// Provider config
|
|
54
|
+
export { DEFAULT_MAX_TOOL_ROUNDS, MCP_PROVIDER_PATH, } from "./provider-config.js";
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Canonical MCP server prompt templates.
|
|
3
|
+
*
|
|
4
|
+
* Handler-owned prompts for MCP server evaluations. Instructs the model to
|
|
5
|
+
* interact with MCP tools rather than writing standalone code.
|
|
6
|
+
*/
|
|
7
|
+
import type { PromptTemplate } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
8
|
+
export declare const MCP_PROMPT_TEMPLATES: Record<string, PromptTemplate>;
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Canonical MCP server prompt templates.
|
|
3
|
+
*
|
|
4
|
+
* Handler-owned prompts for MCP server evaluations. Instructs the model to
|
|
5
|
+
* interact with MCP tools rather than writing standalone code.
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Prompt templates owned by the MCP server mode handler, keyed by template id.
 *
 * The single `mcp-server` template takes one variable, `task`, which is
 * substituted at the `{{task}}` placeholder.
 */
export const MCP_PROMPT_TEMPLATES = {
    "mcp-server": {
        id: "mcp-server",
        label: "MCP Server Tool Use",
        // Assembled line by line; the trailing empty entry yields the final newline
        template: [
            "You are an AI assistant with access to an MCP (Model Context Protocol) server that provides tools for interacting with a Sanity content backend.",
            "",
            "## Task",
            "{{task}}",
            "",
            "## Instructions",
            "",
            "1. Use the available MCP tools to complete the task",
            "2. Call tools with the correct parameters as described in their schemas",
            "3. Interpret tool responses and use the results to accomplish the goal",
            "4. If a tool returns an error, explain the issue clearly",
            "5. Prefer using specific tools over broad queries when possible",
            "",
            "Complete the task using the MCP tools provided:",
            "",
        ].join("\n"),
        variables: ["task"],
    },
};
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP server provider assembly — builds Promptfoo provider configs.
|
|
3
|
+
*/
|
|
4
|
+
import type { MCPServerTaskDefinition, ModeProviderEntry } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
5
|
+
import type { PromptfooProvider } from "../../promptfoo-compiler.js";
|
|
6
|
+
/** Default max tool rounds for MCP multi-turn execution */
|
|
7
|
+
export declare const DEFAULT_MAX_TOOL_ROUNDS = 5;
|
|
8
|
+
/** Provider path relative to eval package dist */
|
|
9
|
+
export declare const MCP_PROVIDER_PATH = "file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js";
|
|
10
|
+
/**
|
|
11
|
+
* Build custom MCP tool provider configs — one per model.
|
|
12
|
+
*
|
|
13
|
+
* Each provider uses the custom mcp-tool-provider.ts which implements a
|
|
14
|
+
* multi-turn tool execution loop. The LLM receives a prompt, discovers
|
|
15
|
+
* MCP tools, calls them, gets results, and continues until it produces
|
|
16
|
+
* a final text answer or exhausts maxToolRounds.
|
|
17
|
+
*
|
|
18
|
+
* Config shape passed to the custom provider:
|
|
19
|
+
* { model, mcpServer: { url, auth, name }, mcpTools, maxToolRounds, temperature, ... }
|
|
20
|
+
*/
|
|
21
|
+
export declare function buildMCPProvider(task: MCPServerTaskDefinition, models: ModeProviderEntry[], warnings: string[]): PromptfooProvider[];
|
|
22
|
+
/**
|
|
23
|
+
* Build the MCP server connection config for the custom provider.
|
|
24
|
+
*
|
|
25
|
+
* Shape: { url?, command?, name?, auth? }
|
|
26
|
+
* The custom mcp-tool-provider.ts uses this to connect to the MCP server.
|
|
27
|
+
*/
|
|
28
|
+
export declare function buildMCPServerConfig(task: MCPServerTaskDefinition, warnings: string[]): Record<string, unknown>;
|