@sanity/ailf 1.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +29 -12
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
- package/dist/_vendor/ailf-core/config-helpers.js +51 -2
- package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
- package/dist/_vendor/ailf-core/examples/index.js +213 -94
- package/dist/_vendor/ailf-core/index.d.ts +3 -2
- package/dist/_vendor/ailf-core/index.js +2 -1
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
- package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +7 -1
- package/dist/adapters/config-sources/ts-config-loader.js +21 -13
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
- package/dist/adapters/task-sources/index.d.ts +3 -4
- package/dist/adapters/task-sources/index.js +3 -4
- package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
- package/dist/adapters/task-sources/repo-schemas.js +228 -20
- package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
- package/dist/adapters/task-sources/repo-task-source.js +81 -122
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
- package/dist/adapters/task-sources/task-file-loader.js +21 -7
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/coverage-audit.js +3 -1
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +5 -4
- package/dist/commands/init.js +190 -25
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +15 -4
- package/dist/composition-root.js +100 -55
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/build-step-sequence.js +4 -2
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +32 -19
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +77 -26
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +51 -31
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
- package/dist/pipeline/compiler/literacy-bridge.js +2 -2
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
- package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
- package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/expand-tasks.d.ts +2 -2
- package/dist/pipeline/expand-tasks.js +2 -2
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +16 -9
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +16 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
- package/dist/pipeline/mirror-repo-tasks.js +10 -10
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +68 -30
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +32 -24
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* mcp-tool-provider.ts — Custom Promptfoo provider for MCP tool-use evaluation.
|
|
3
|
+
*
|
|
4
|
+
* Implements a multi-turn tool execution loop: the LLM receives a prompt,
|
|
5
|
+
* discovers MCP tools, calls them, gets results, and continues until it
|
|
6
|
+
* produces a final text answer or exhausts maxToolRounds.
|
|
7
|
+
*
|
|
8
|
+
* Promptfoo's built-in Anthropic/OpenAI providers with config.mcp only do
|
|
9
|
+
* single-turn tool calls. This provider fills that gap by managing the
|
|
10
|
+
* full conversation loop, similar to the agentic-provider.ts pattern.
|
|
11
|
+
*
|
|
12
|
+
* Promptfoo config usage:
|
|
13
|
+
*
|
|
14
|
+
* providers:
|
|
15
|
+
* - id: file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js
|
|
16
|
+
* label: "Claude Opus 4.6 + MCP"
|
|
17
|
+
* config:
|
|
18
|
+
* model: anthropic:messages:claude-opus-4-6
|
|
19
|
+
* maxToolRounds: 5
|
|
20
|
+
* temperature: 0.2
|
|
21
|
+
* max_tokens: 4096
|
|
22
|
+
* mcpServer:
|
|
23
|
+
* url: https://mcp.sanity.io
|
|
24
|
+
* auth: { type: bearer, token: "{{env.SANITY_API_TOKEN}}" }
|
|
25
|
+
* name: mcp-live-query-documents
|
|
26
|
+
* mcpTools: [query_documents, get_schema]
|
|
27
|
+
*/
|
|
28
|
+
import { fileURLToPath } from "node:url";
import { config as loadDotenv } from "dotenv";
|
|
29
|
+
// Load the .env file located four directory levels above this module's
// directory. `override: true` is passed through to dotenv so that (per
// dotenv's documented option) values from the file replace variables that
// are already set in process.env.
//
// The path is derived with fileURLToPath rather than URL#pathname: pathname
// keeps percent-encoding (a space stays "%20") and produces "/C:/..." on
// Windows, so the .env file would not be found for such install locations.
loadDotenv({
    override: true,
    path: fileURLToPath(new URL("../../../../.env", import.meta.url)),
});
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
// Provider
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
/**
 * Promptfoo provider that runs a multi-turn tool-use conversation between an
 * Anthropic model and the tools exposed by an MCP server.
 *
 * Recognised `config` keys (all read in this class): `mcpServer` (required),
 * `model`, `temperature`, `max_tokens`, `maxToolRounds`, `apiKey`, `mcpTools`.
 */
export default class MCPToolProvider {
    /** Provider configuration object taken from `options.config`. */
    config;
    /** Identifier returned by id(). */
    providerId;
    /**
     * @param {object} [options]
     * @param {object} [options.config] - Provider configuration; defaults to `{}`.
     * @param {string} [options.id] - Provider id; defaults to "mcp-tool-provider".
     */
    constructor(options = {}) {
        this.config = options.config || {};
        this.providerId = options.id || "mcp-tool-provider";
    }
    /** @returns {string} The provider identifier. */
    id() {
        return this.providerId;
    }
    /**
     * Entry point: validates the configuration and dispatches to the
     * model-specific loop.
     *
     * @param {string} prompt - The user prompt sent as the first message.
     * @param {unknown} _context - Unused.
     * @returns {Promise<object>} A result with `output` on success or `error` on failure.
     */
    async callApi(prompt, _context) {
        const mcpServerConfig = this.config.mcpServer;
        if (!mcpServerConfig) {
            return { error: "mcpServer config is required", output: undefined };
        }
        // Resolve model provider
        const modelId = this.config.model ||
            "anthropic:messages:claude-sonnet-4-20250514";
        if (modelId.startsWith("anthropic:")) {
            return this.runAnthropicLoop(prompt, mcpServerConfig, modelId);
        }
        // For now, only Anthropic is supported. OpenAI support can be added later.
        return {
            error: `MCP tool provider only supports Anthropic models for now. Got: ${modelId}`,
            output: undefined,
        };
    }
    // -------------------------------------------------------------------------
    // Anthropic multi-turn MCP tool loop
    // -------------------------------------------------------------------------
    /**
     * Runs the request → tool call → tool result loop against the Anthropic
     * Messages API until the model answers with text or the round budget is
     * used up. Every failure is reported as `{ error, output: undefined }`
     * rather than thrown; the MCP client is always cleaned up.
     *
     * @param {string} prompt - Initial user message.
     * @param {object} mcpServerConfig - `mcpServer` section of the provider config.
     * @param {string} modelId - Provider-style id, e.g. "anthropic:messages:claude-opus-4-6".
     * @returns {Promise<object>} Result with `output`, `metadata`, `tokenUsage`, or `error`.
     */
    async runAnthropicLoop(prompt, mcpServerConfig, modelId) {
        // Parse model name from provider ID (e.g., "anthropic:messages:claude-opus-4-6" → "claude-opus-4-6")
        const modelParts = modelId.split(":");
        const model = modelParts.length > 2
            ? modelParts.slice(2).join(":")
            : modelParts[modelParts.length - 1];
        const temperature = this.config.temperature ?? 0.2;
        const maxTokens = this.config.max_tokens || 4096;
        const maxToolRounds = this.config.maxToolRounds || 5;
        const apiKey = this.config.apiKey || process.env.ANTHROPIC_API_KEY;
        if (!apiKey) {
            return {
                error: "ANTHROPIC_API_KEY not set. Configure it in env or provider config.",
                output: undefined,
            };
        }
        // Connect to MCP server and discover tools
        let mcpClient;
        try {
            mcpClient = await this.connectMCP(mcpServerConfig);
        }
        catch (err) {
            return {
                error: `Failed to connect to MCP server: ${err instanceof Error ? err.message : String(err)}`,
                output: undefined,
            };
        }
        try {
            // Get available tools and convert to Anthropic format
            const mcpTools = mcpClient.getAllTools();
            const toolFilter = this.config.mcpTools;
            const filteredTools = toolFilter
                ? mcpTools.filter((t) => toolFilter.includes(t.name))
                : mcpTools;
            const tools = filteredTools.map((t) => ({
                name: t.name,
                description: t.description || `MCP tool: ${t.name}`,
                input_schema: t.inputSchema || { type: "object", properties: {} },
            }));
            if (tools.length === 0) {
                return {
                    error: "No MCP tools available after filtering. Check mcpTools config and server capabilities.",
                    output: undefined,
                };
            }
            /** Append a machine-readable tool call summary to output for assertion detection */
            const appendToolSummary = (text, log) => {
                if (log.length === 0) {
                    return text;
                }
                const names = JSON.stringify(log.map((tc) => tc.name));
                return `${text}\n\n<!-- MCP_TOOLS_CALLED: ${names} -->`;
            };
            const systemPrompt = "You are an AI assistant with access to tools provided by an MCP server. " +
                "Use the available tools to complete the task. Call tools with correct parameters, " +
                "interpret responses, and provide a complete answer.";
            const messages = [{ content: prompt, role: "user" }];
            let inputTokens = 0;
            let outputTokens = 0;
            const startTime = Date.now();
            const toolCallLog = [];
            /** Snapshot of the token counters accumulated so far. */
            const tokenUsage = () => ({
                completion: outputTokens,
                prompt: inputTokens,
                total: inputTokens + outputTokens,
            });
            for (let round = 0; round <= maxToolRounds; round++) {
                const isLastRound = round === maxToolRounds;
                // On the last round, ask explicitly for a final text answer
                if (isLastRound) {
                    const lastMsg = messages[messages.length - 1];
                    const synthesisText = "You've used the tools available. Based on the information gathered, " +
                        "provide your complete, final answer now.";
                    if (lastMsg?.role === "user" && Array.isArray(lastMsg.content)) {
                        // Fold the instruction into the pending tool-result turn
                        lastMsg.content.push({
                            type: "text",
                            text: synthesisText,
                        });
                    }
                    else {
                        messages.push({ content: synthesisText, role: "user" });
                    }
                }
                const body = {
                    max_tokens: maxTokens,
                    messages,
                    model,
                    system: systemPrompt,
                    temperature,
                    tools,
                };
                if (isLastRound) {
                    // Tool definitions stay in the request because the history already
                    // contains tool_use / tool_result blocks that refer to them;
                    // tool_choice "none" is what forces a text-only final answer.
                    body.tool_choice = { type: "none" };
                }
                let response;
                let data;
                try {
                    response = await fetch("https://api.anthropic.com/v1/messages", {
                        body: JSON.stringify(body),
                        headers: {
                            "anthropic-version": "2023-06-01",
                            "Content-Type": "application/json",
                            "x-api-key": apiKey,
                        },
                        method: "POST",
                    });
                    data = await response.json();
                }
                catch (err) {
                    // Network failure or a non-JSON body (e.g. a gateway error page):
                    // report it like every other failure instead of throwing.
                    const status = response ? ` (HTTP ${response.status})` : "";
                    const reason = err instanceof Error ? err.message : String(err);
                    return {
                        error: `Anthropic API request failed${status}: ${reason}`,
                        output: undefined,
                    };
                }
                if (data?.error) {
                    return {
                        error: data.error.message ??
                            `Anthropic API error: ${JSON.stringify(data.error)}`,
                        output: undefined,
                    };
                }
                if (!response.ok || data === null || typeof data !== "object") {
                    return {
                        error: `Anthropic API error: unexpected response (HTTP ${response.status})`,
                        output: undefined,
                    };
                }
                inputTokens += data.usage?.input_tokens ?? 0;
                outputTokens += data.usage?.output_tokens ?? 0;
                if (!data.content?.length) {
                    return {
                        cost: 0,
                        metadata: {
                            toolRounds: round,
                            toolCallLog,
                            latencyMs: Date.now() - startTime,
                        },
                        output: "",
                        tokenUsage: tokenUsage(),
                    };
                }
                // Add assistant response to history
                messages.push({ content: data.content, role: "assistant" });
                // Check if model wants to use tools
                const toolUseBlocks = data.content.filter((b) => b.type === "tool_use");
                if (data.stop_reason !== "tool_use" || toolUseBlocks.length === 0) {
                    // Model is done — extract text
                    const textBlocks = data.content.filter((b) => b.type === "text");
                    const rawOutput = textBlocks.map((b) => b.text || "").join("\n") || "";
                    return {
                        cost: 0,
                        metadata: {
                            toolRounds: round,
                            toolCallLog,
                            latencyMs: Date.now() - startTime,
                        },
                        output: appendToolSummary(rawOutput, toolCallLog),
                        tokenUsage: tokenUsage(),
                    };
                }
                // Execute each tool call via MCP, one at a time so the log keeps call order
                const toolResults = [];
                for (const toolUse of toolUseBlocks) {
                    const toolName = toolUse.name;
                    const toolInput = (toolUse.input || {});
                    try {
                        const result = await mcpClient.callTool(toolName, toolInput);
                        const content = result.error
                            ? JSON.stringify({ error: result.error })
                            : result.content;
                        toolCallLog.push({
                            name: toolName,
                            input: toolInput,
                            output: content,
                        });
                        toolResults.push({
                            content,
                            tool_use_id: toolUse.id,
                            type: "tool_result",
                        });
                    }
                    catch (err) {
                        // A failing tool is reported back to the model, not treated as fatal
                        const errMsg = err instanceof Error ? err.message : String(err);
                        toolCallLog.push({
                            name: toolName,
                            input: toolInput,
                            output: `Error: ${errMsg}`,
                        });
                        toolResults.push({
                            content: JSON.stringify({ error: errMsg }),
                            tool_use_id: toolUse.id,
                            type: "tool_result",
                        });
                    }
                }
                // Add tool results to conversation
                messages.push({ content: toolResults, role: "user" });
            }
            // Exhausted rounds: fall back to the text of the most recent assistant turn
            const lastAssistant = [...messages]
                .reverse()
                .find((m) => m.role === "assistant");
            let lastText = "";
            if (lastAssistant && Array.isArray(lastAssistant.content)) {
                lastText = lastAssistant.content
                    .filter((b) => b.type === "text")
                    .map((b) => b.text || "")
                    .join("\n");
            }
            return {
                cost: 0,
                metadata: {
                    toolRounds: maxToolRounds,
                    exhaustedRounds: true,
                    toolCallLog,
                    latencyMs: Date.now() - startTime,
                },
                output: appendToolSummary(lastText || "[Exhausted tool rounds without final answer]", toolCallLog),
                tokenUsage: tokenUsage(),
            };
        }
        finally {
            // Best-effort: a cleanup failure must not mask the evaluation result
            await mcpClient.cleanup().catch(() => { });
        }
    }
    // -------------------------------------------------------------------------
    // MCP client management
    // -------------------------------------------------------------------------
    /**
     * Connects to the configured MCP server and lists its tools.
     *
     * Uses the stdio transport when `command` is set, the streamable-HTTP
     * transport when `url` is set. `{{env.VAR}}` templates in the config are
     * resolved first. If connecting or listing tools fails, the transport is
     * closed before the error is rethrown.
     *
     * @param {object} serverConfig - `mcpServer` section of the provider config.
     * @returns {Promise<{getAllTools: Function, callTool: Function, cleanup: Function}>}
     * @throws {Error} When neither `command` nor `url` is configured, or when
     *   connecting / listing tools fails.
     */
    async connectMCP(serverConfig) {
        // The MCP SDK is imported dynamically, so it is only loaded when a
        // connection is actually made.
        const { Client } = await import("@modelcontextprotocol/sdk/client/index.js");
        const client = new Client({
            name: "ailf-mcp-eval",
            version: "1.0.0",
        });
        // Resolve auth — render {{env.VAR}} templates
        const resolvedConfig = this.resolveEnvTemplates(serverConfig);
        // Determine transport type
        let transport;
        if (resolvedConfig.command) {
            // stdio transport
            const { StdioClientTransport } = await import("@modelcontextprotocol/sdk/client/stdio.js");
            // trim() keeps leading whitespace from producing an empty executable name
            const [command, ...args] = String(resolvedConfig.command).trim().split(/\s+/);
            transport = new StdioClientTransport({
                command,
                args,
                env: process.env,
            });
        }
        else if (resolvedConfig.url) {
            // streamable-http transport
            const { StreamableHTTPClientTransport } = await import("@modelcontextprotocol/sdk/client/streamableHttp.js");
            const headers = {};
            const auth = resolvedConfig.auth;
            if (auth?.type === "bearer" && auth.token) {
                headers["Authorization"] = `Bearer ${auth.token}`;
            }
            transport = new StreamableHTTPClientTransport(new URL(String(resolvedConfig.url)), { requestInit: { headers } });
        }
        else {
            throw new Error("MCP server config must have either 'command' (stdio) or 'url' (http)");
        }
        /** Close the transport, ignoring any failure (best-effort teardown). */
        const closeQuietly = async () => {
            try {
                await transport.close();
            }
            catch {
                // nothing useful can be done if closing fails
            }
        };
        // Connect and discover tools; release the transport if either step fails
        let allTools;
        try {
            await client.connect(transport);
            const { tools: toolsList } = await client.listTools();
            allTools = toolsList.map((t) => ({
                name: t.name,
                description: t.description,
                inputSchema: t.inputSchema,
            }));
        }
        catch (err) {
            await closeQuietly();
            throw err;
        }
        return {
            getAllTools: () => allTools,
            callTool: async (name, args) => {
                const result = await client.callTool({ name, arguments: args });
                // Flatten the result content into one string
                let content = "";
                if (result?.content) {
                    if (Array.isArray(result.content)) {
                        content = result.content
                            .map((c) => c.text || JSON.stringify(c))
                            .join("\n");
                    }
                    else {
                        content = String(result.content);
                    }
                }
                return { content, error: result.isError ? content : undefined };
            },
            cleanup: closeQuietly,
        };
    }
    /**
     * Resolve {{env.VAR}} templates in config values.
     *
     * String values have every `{{env.NAME}}` replaced by `process.env.NAME`
     * (empty string when unset). Nested plain objects are processed
     * recursively; arrays and all other values are copied through untouched.
     *
     * @param {object} config - Configuration object to resolve.
     * @returns {object} A new object; the input is not modified.
     */
    resolveEnvTemplates(config) {
        const resolved = {};
        for (const [key, value] of Object.entries(config)) {
            if (typeof value === "string") {
                resolved[key] = value.replace(/\{\{env\.(\w+)\}\}/g, (_, varName) => {
                    return process.env[varName] || "";
                });
            }
            else if (value && typeof value === "object" && !Array.isArray(value)) {
                resolved[key] = this.resolveEnvTemplates(value);
            }
            else {
                resolved[key] = value;
            }
        }
        return resolved;
    }
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Preset loader — resolves and loads external PresetDefinition files.
|
|
3
|
+
*
|
|
4
|
+
* Supports two reference formats:
|
|
5
|
+
* - Relative path (./foo.ts, ../presets/bar.ts) → loaded from disk via jiti
|
|
6
|
+
* - npm package name (my-preset-package) → resolved via Node require
|
|
7
|
+
*
|
|
8
|
+
* Uses jiti for synchronous TypeScript/JavaScript loading, matching the
|
|
9
|
+
* sync constraint of createAppContext() and createRegistry().
|
|
10
|
+
*
|
|
11
|
+
* @see packages/eval/src/composition-root.ts — where this is called
|
|
12
|
+
*/
|
|
13
|
+
import type { PresetDefinition } from "../../_vendor/ailf-core/index.d.ts";
|
|
14
|
+
/**
|
|
15
|
+
* Load external presets from file paths or npm package names.
|
|
16
|
+
*
|
|
17
|
+
* @param refs - Preset references (relative paths or package names)
|
|
18
|
+
* @param rootDir - Base directory for resolving relative paths
|
|
19
|
+
* @returns Loaded PresetDefinition array (preserves input order)
|
|
20
|
+
* @throws If a preset file is not found or doesn't export a valid PresetDefinition
|
|
21
|
+
*/
|
|
22
|
+
// Synchronous: returns the loaded presets directly (an array, not a Promise).
export declare function loadExternalPresets(refs: string[], rootDir: string): PresetDefinition[];
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Preset loader — resolves and loads external PresetDefinition files.
|
|
3
|
+
*
|
|
4
|
+
* Supports two reference formats:
|
|
5
|
+
* - Relative path (./foo.ts, ../presets/bar.ts) → loaded from disk via jiti
|
|
6
|
+
* - npm package name (my-preset-package) → resolved via Node require
|
|
7
|
+
*
|
|
8
|
+
* Uses jiti for synchronous TypeScript/JavaScript loading, matching the
|
|
9
|
+
* sync constraint of createAppContext() and createRegistry().
|
|
10
|
+
*
|
|
11
|
+
* @see packages/eval/src/composition-root.ts — where this is called
|
|
12
|
+
*/
|
|
13
|
+
import { existsSync } from "fs";
import { isAbsolute, relative, resolve, sep } from "path";
import { pathToFileURL } from "url";
import { createJiti } from "jiti";
|
|
17
|
+
/**
 * Error type raised by the preset loader itself, so callers (and the loader's
 * own catch block) can tell it apart from errors thrown by third-party code.
 */
class PresetLoadError extends Error {
    name = "PresetLoadError";
    constructor(message) {
        super(message);
    }
}
|
|
24
|
+
/**
 * Load every referenced external preset, in the order given.
 *
 * @param refs - Preset references (relative paths or package names)
 * @param rootDir - Base directory for resolving relative paths
 * @returns One loaded PresetDefinition per reference, in input order
 * @throws If any reference cannot be loaded or is not a valid PresetDefinition
 */
export function loadExternalPresets(refs, rootDir) {
    return Array.from(refs, (ref) => loadSinglePreset(ref, rootDir));
}
|
|
40
|
+
/**
 * Load and validate one preset reference.
 *
 * A reference starting with "./", "../" or "/" is treated as a file path and
 * resolved against `rootDir`; it must stay inside `rootDir` and must exist.
 * Anything else is handed to jiti unchanged (package name).
 *
 * @param ref - Preset reference (path or package name)
 * @param rootDir - Base directory for resolving paths and for the jiti loader
 * @returns The loaded preset object (has at least `name`, `mode`, `manifest`)
 * @throws PresetLoadError when the path escapes the root, the file is missing,
 *   or the module does not export a valid preset
 * @throws Error (with `cause`) when loading the module itself fails
 */
function loadSinglePreset(ref, rootDir) {
    const isPath = ref.startsWith("./") || ref.startsWith("../") || ref.startsWith("/");
    const filePath = isPath ? resolve(rootDir, ref) : ref;
    // Containment check: resolved path must be within rootDir to prevent
    // path traversal (e.g., "../../other-project/secret.ts")
    if (isPath) {
        const resolvedRoot = resolve(rootDir);
        // Compare by path segments, not by string prefix: a plain startsWith()
        // would accept a sibling such as "<root>-evil/preset.ts".
        const fromRoot = relative(resolvedRoot, filePath);
        const escapesRoot = fromRoot === ".." ||
            fromRoot.startsWith(`..${sep}`) ||
            isAbsolute(fromRoot); // e.g. a different drive on Windows
        if (escapesRoot) {
            throw new PresetLoadError(`Preset "${ref}" resolves outside the project root (${resolvedRoot}). ` +
                `Preset paths must be within the project directory.`);
        }
        if (!existsSync(filePath)) {
            throw new PresetLoadError(`Preset file not found: ${filePath} (resolved from "${ref}" relative to ${rootDir})`);
        }
    }
    try {
        const jiti = createJiti(pathToFileURL(rootDir).href, {
            interopDefault: true,
            requireCache: true,
        });
        // jiti() is the synchronous loader
        const mod = jiti(filePath);
        const preset = extractDefault(mod);
        if (!preset || typeof preset !== "object") {
            throw new PresetLoadError(`Preset "${ref}" does not export a default value. ` +
                `Use \`export default definePreset({ ... })\`.`);
        }
        // Validate minimal shape
        const p = preset;
        if (!p.name || !p.mode || !p.manifest) {
            throw new PresetLoadError(`Preset "${ref}" is missing required fields. ` +
                `A preset must have at least \`name\`, \`mode\`, and \`manifest\`.`);
        }
        return p;
    }
    catch (cause) {
        if (cause instanceof PresetLoadError) {
            throw cause; // Re-throw our own errors
        }
        // Wrap anything else so the failing reference is named; keep the original as cause
        const message = cause instanceof Error ? cause.message : String(cause);
        throw new Error(`Failed to load preset "${ref}": ${message}`, {
            cause: cause,
        });
    }
}
|
|
85
|
+
/**
 * Pick the preset value out of a loaded module.
 *
 * - null / undefined → undefined
 * - non-object values → returned as-is
 * - objects with a `default` key (ESM shape) → that key's value
 * - any other object (CJS shape) → the object itself
 */
function extractDefault(mod) {
    if (mod == null) {
        return undefined;
    }
    const hasDefaultKey = typeof mod === "object" && "default" in mod;
    return hasDefaultKey ? mod.default : mod;
}
|
|
@@ -1,20 +1,17 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* sanity-literacy preset —
|
|
2
|
+
* sanity-literacy preset — Sanity-specific domain configuration for literacy evaluation.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
* -
|
|
6
|
-
* - Prompt templates (with-docs, without-docs, agentic)
|
|
7
|
-
* - Rubric templates (task-completion, code-correctness, doc-coverage)
|
|
8
|
-
* - Scoring profiles (default, output-only)
|
|
4
|
+
* This is a domain preset that targets the `literacy` mode base. It provides
|
|
5
|
+
* Sanity-specific configuration:
|
|
9
6
|
* - Sanity doc source definitions (production, branch, local)
|
|
10
7
|
* - Product feature registry for coverage auditing
|
|
11
8
|
* - DocFetcher factory (SanityDocFetcher)
|
|
12
|
-
* -
|
|
9
|
+
* - Sanity fixture resolver (sanity:// scheme)
|
|
13
10
|
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
11
|
+
* Evaluation methodology (rubrics, scoring, prompts) is inherited from the
|
|
12
|
+
* `literacy` mode base — see mode-bases/literacy.ts.
|
|
16
13
|
*
|
|
17
|
-
* @see docs/exec-plans/architecture-overhaul/phase-8-scoring-storage-presets.md
|
|
14
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-8-scoring-storage-presets.md
|
|
18
15
|
*/
|
|
19
16
|
import { type PresetDefinition } from "../../../_vendor/ailf-core/index.d.ts";
|
|
20
17
|
export interface SanityLiteracyPresetOptions {
|