@sanity/ailf 2.0.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +28 -23
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
- package/dist/_vendor/ailf-core/config-helpers.js +29 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
- package/dist/_vendor/ailf-core/examples/index.js +208 -114
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
- package/dist/_vendor/ailf-tasks/cli.js +61 -0
- package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
- package/dist/_vendor/ailf-tasks/index.js +16 -0
- package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
- package/dist/_vendor/ailf-tasks/parser.js +73 -0
- package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
- package/dist/_vendor/ailf-tasks/schemas.js +180 -0
- package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
- package/dist/_vendor/ailf-tasks/validation.js +162 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +6 -1
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
- package/dist/adapters/task-sources/index.d.ts +1 -2
- package/dist/adapters/task-sources/index.js +1 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
- package/dist/adapters/task-sources/repo-schemas.js +2 -2
- package/dist/adapters/task-sources/repo-task-source.js +1 -1
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
- package/dist/adapters/task-sources/task-file-loader.js +20 -6
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +2 -3
- package/dist/commands/init.js +56 -170
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/composition-root.d.ts +2 -3
- package/dist/composition-root.js +27 -14
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +30 -16
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +50 -15
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +52 -32
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/checks.d.ts +8 -3
- package/dist/pipeline/checks.js +23 -3
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
- package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
- package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
- package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +15 -8
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +15 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/pipeline/mirror-repo-tasks.js +1 -1
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +67 -29
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +25 -25
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
* All functions accept rootDir as a parameter — no module-level constants.
|
|
26
26
|
* No process.argv parsing. No env var fallbacks.
|
|
27
27
|
*
|
|
28
|
-
* @see docs/exec-plans/eliminate-lib-layer.md
|
|
28
|
+
* @see docs/archive/exec-plans/eliminate-lib-layer.md
|
|
29
29
|
*/
|
|
30
30
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
31
31
|
import { join } from "path";
|
|
@@ -454,40 +454,55 @@ function readAndNormalizeResults(resultsPath, log) {
|
|
|
454
454
|
resultCount: wrapper.results.length,
|
|
455
455
|
stats: wrapper.stats,
|
|
456
456
|
});
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
457
|
+
// Normalize results. Errored tests (null gradingResult) get a synthetic
|
|
458
|
+
// zero-score result so they flow through scoring and failure mode
|
|
459
|
+
// classification as "api-error" instead of being silently dropped.
|
|
460
|
+
const results = [];
|
|
461
|
+
let synthesizedCount = 0;
|
|
462
|
+
for (const r of wrapper.results) {
|
|
463
|
+
const base = {
|
|
464
|
+
cost: r.cost ?? 0,
|
|
465
|
+
description: r.testCase?.description ?? "unknown",
|
|
466
|
+
metadata: r.metadata,
|
|
467
|
+
providerId: r.provider?.id,
|
|
468
|
+
providerLabel: r.provider?.label,
|
|
469
|
+
response: r.response ?? { output: "" },
|
|
470
|
+
vars: r.vars ?? r.testCase?.vars ?? {},
|
|
471
|
+
};
|
|
472
|
+
if (r.gradingResult === null || r.gradingResult === undefined) {
|
|
473
|
+
// Synthesize a zero-score result so errored tests are visible in
|
|
474
|
+
// scoring, gap analysis, and failure mode classification.
|
|
475
|
+
const errorMsg = r.error ?? "unknown error (null gradingResult)";
|
|
476
|
+
synthesizedCount++;
|
|
477
|
+
const providerLabel = r.provider?.label ?? r.provider?.id ?? "";
|
|
478
|
+
_log.warn(`⚠ [api-error] ${providerLabel ? `[${providerLabel}] ` : ""}"${base.description}" — ${errorMsg.slice(0, 150)}`);
|
|
479
|
+
results.push({
|
|
480
|
+
...base,
|
|
481
|
+
gradingResult: {
|
|
482
|
+
pass: false,
|
|
483
|
+
componentResults: [
|
|
484
|
+
{
|
|
485
|
+
assertion: { type: "llm-rubric" },
|
|
486
|
+
pass: false,
|
|
487
|
+
reason: `[api-error] ${errorMsg}`,
|
|
488
|
+
score: 0,
|
|
489
|
+
},
|
|
490
|
+
],
|
|
491
|
+
},
|
|
492
|
+
});
|
|
488
493
|
}
|
|
494
|
+
else {
|
|
495
|
+
results.push({ ...base, gradingResult: r.gradingResult });
|
|
496
|
+
}
|
|
497
|
+
}
|
|
498
|
+
_log.debug("Normalized results", {
|
|
499
|
+
totalResults: wrapper.results.length,
|
|
500
|
+
synthesizedApiErrors: synthesizedCount,
|
|
501
|
+
});
|
|
502
|
+
if (synthesizedCount > 0) {
|
|
503
|
+
_log.warn(`⚠ Synthesized ${synthesizedCount} zero-score result(s) for errored tests (api-error)`);
|
|
489
504
|
}
|
|
490
|
-
return
|
|
505
|
+
return results;
|
|
491
506
|
}
|
|
492
507
|
/**
|
|
493
508
|
* Core scoring logic: takes a pre-filtered array of TestResult and produces
|
|
@@ -805,12 +820,34 @@ function computeTestSummary(resultsPath) {
|
|
|
805
820
|
failed++;
|
|
806
821
|
}
|
|
807
822
|
}
|
|
823
|
+
// Extract per-test timing from latencyMs (when available from Promptfoo)
|
|
824
|
+
const durations = rawResults
|
|
825
|
+
.filter((r) => typeof r.latencyMs === "number")
|
|
826
|
+
.map((r) => ({
|
|
827
|
+
task: r.testCase?.description ?? "unknown",
|
|
828
|
+
model: r.provider?.label ?? r.provider?.id ?? "unknown",
|
|
829
|
+
durationMs: r.latencyMs,
|
|
830
|
+
}));
|
|
831
|
+
let timing;
|
|
832
|
+
if (durations.length > 0) {
|
|
833
|
+
const sorted = durations.map((d) => d.durationMs).sort((a, b) => a - b);
|
|
834
|
+
const medianMs = sorted[Math.floor(sorted.length / 2)];
|
|
835
|
+
const p95Ms = sorted[Math.floor(sorted.length * 0.95)];
|
|
836
|
+
const maxMs = sorted[sorted.length - 1];
|
|
837
|
+
// Flag tests exceeding 2x median (min 60s) as "slow"
|
|
838
|
+
const slowThreshold = Math.max(medianMs * 2, 60_000);
|
|
839
|
+
const slowTests = durations
|
|
840
|
+
.filter((d) => d.durationMs > slowThreshold)
|
|
841
|
+
.sort((a, b) => b.durationMs - a.durationMs);
|
|
842
|
+
timing = { medianMs, p95Ms, maxMs, slowTests };
|
|
843
|
+
}
|
|
808
844
|
return {
|
|
809
845
|
total: rawResults.length,
|
|
810
846
|
passed,
|
|
811
847
|
failed,
|
|
812
848
|
errored,
|
|
813
849
|
...(errors.length > 0 ? { errors } : {}),
|
|
850
|
+
...(timing ? { timing } : {}),
|
|
814
851
|
};
|
|
815
852
|
}
|
|
816
853
|
function printPerModelReport(perModel, log) {
|
|
@@ -23,10 +23,15 @@ export declare function checkContextsExist(rootDir: string, areas: string[]): Va
|
|
|
23
23
|
*/
|
|
24
24
|
export declare function checkEnvironment(rootDir: string): ValidationIssue[];
|
|
25
25
|
/**
|
|
26
|
-
* Check that the
|
|
27
|
-
*
|
|
26
|
+
* Check that the generated promptfoo config for a given mode exists.
|
|
27
|
+
*
|
|
28
|
+
* When `mode` is provided, checks only for that mode's config file
|
|
29
|
+
* (e.g. `promptfooconfig.agent-harness.yaml` for mode `"agent-harness"`).
|
|
30
|
+
*
|
|
31
|
+
* When `mode` is omitted, falls back to the legacy literacy check:
|
|
32
|
+
* baseline `promptfooconfig.yaml` (required) plus optional observed/agentic.
|
|
28
33
|
*/
|
|
29
|
-
export declare function checkGeneratedConfigsExist(rootDir: string): ValidationIssue[];
|
|
34
|
+
export declare function checkGeneratedConfigsExist(rootDir: string, mode?: string): ValidationIssue[];
|
|
30
35
|
/**
|
|
31
36
|
* Check that the eval results JSON file exists, is valid JSON, and contains
|
|
32
37
|
* a `results` array.
|
package/dist/pipeline/checks.js
CHANGED
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
import { config as loadEnv } from "dotenv";
|
|
9
9
|
import { existsSync, readFileSync, statSync } from "fs";
|
|
10
10
|
import { join, resolve } from "path";
|
|
11
|
+
import { configFileForMode } from "./eval-constants.js";
|
|
11
12
|
// ---------------------------------------------------------------------------
|
|
12
13
|
// Precondition: contexts exist for each feature area
|
|
13
14
|
// ---------------------------------------------------------------------------
|
|
@@ -109,11 +110,30 @@ export function checkEnvironment(rootDir) {
|
|
|
109
110
|
// Postcondition: score summary is valid
|
|
110
111
|
// ---------------------------------------------------------------------------
|
|
111
112
|
/**
|
|
112
|
-
* Check that the
|
|
113
|
-
*
|
|
113
|
+
* Check that the generated promptfoo config for a given mode exists.
|
|
114
|
+
*
|
|
115
|
+
* When `mode` is provided, checks only for that mode's config file
|
|
116
|
+
* (e.g. `promptfooconfig.agent-harness.yaml` for mode `"agent-harness"`).
|
|
117
|
+
*
|
|
118
|
+
* When `mode` is omitted, falls back to the legacy literacy check:
|
|
119
|
+
* baseline `promptfooconfig.yaml` (required) plus optional observed/agentic.
|
|
114
120
|
*/
|
|
115
|
-
export function checkGeneratedConfigsExist(rootDir) {
|
|
121
|
+
export function checkGeneratedConfigsExist(rootDir, mode) {
|
|
116
122
|
const issues = [];
|
|
123
|
+
if (mode) {
|
|
124
|
+
const configName = configFileForMode(mode);
|
|
125
|
+
const configPath = resolve(rootDir, configName);
|
|
126
|
+
if (!existsSync(configPath)) {
|
|
127
|
+
issues.push({
|
|
128
|
+
message: `Config '${configName}' not found for mode '${mode}'. Run the pipeline to generate it.`,
|
|
129
|
+
path: configPath,
|
|
130
|
+
severity: "error",
|
|
131
|
+
source: "checkGeneratedConfigsExist",
|
|
132
|
+
});
|
|
133
|
+
}
|
|
134
|
+
return issues;
|
|
135
|
+
}
|
|
136
|
+
// Legacy literacy check: baseline required, observed/agentic optional
|
|
117
137
|
const baselinePath = resolve(rootDir, "promptfooconfig.yaml");
|
|
118
138
|
if (!existsSync(baselinePath)) {
|
|
119
139
|
issues.push({
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/chronic-failures.ts
|
|
3
|
+
*
|
|
4
|
+
* Aggregates error data across recent reports to identify tasks that
|
|
5
|
+
* consistently fail (error rate >= threshold). This catches structurally
|
|
6
|
+
* broken tasks — wrong model config, tasks too complex for the provider,
|
|
7
|
+
* persistent API incompatibility — that would otherwise be invisible.
|
|
8
|
+
*
|
|
9
|
+
* @see docs/exec-plans/eval-pipeline-timeout-resilience.md — Phase 5
|
|
10
|
+
*/
|
|
11
|
+
import type { ReportStore } from "../report-store.js";
|
|
12
|
+
export interface ChronicFailureEntry {
|
|
13
|
+
/** Task ID */
|
|
14
|
+
task: string;
|
|
15
|
+
/** Error rate as a fraction (0–1) */
|
|
16
|
+
errorRate: number;
|
|
17
|
+
/** Number of analyzed runs in which this task errored (`errorCount`), out of all runs analyzed (`totalRuns`) */
|
|
18
|
+
errorCount: number;
|
|
19
|
+
totalRuns: number;
|
|
20
|
+
/** Which models are affected and how often */
|
|
21
|
+
modelBreakdown: {
|
|
22
|
+
model: string;
|
|
23
|
+
errorCount: number;
|
|
24
|
+
}[];
|
|
25
|
+
/** Most common error message */
|
|
26
|
+
commonError: string;
|
|
27
|
+
}
|
|
28
|
+
export interface ChronicFailureReport {
|
|
29
|
+
/** Number of reports analyzed */
|
|
30
|
+
lookback: number;
|
|
31
|
+
/** Threshold used for classification */
|
|
32
|
+
threshold: number;
|
|
33
|
+
/** Tasks whose error rate meets or exceeds the threshold */
|
|
34
|
+
failures: ChronicFailureEntry[];
|
|
35
|
+
/** Total reports found (may be less than lookback if not enough history) */
|
|
36
|
+
reportsFound: number;
|
|
37
|
+
}
|
|
38
|
+
export interface ChronicFailureOptions {
|
|
39
|
+
/** Number of recent reports to analyze (default: 10) */
|
|
40
|
+
lookback?: number;
|
|
41
|
+
/** Error rate threshold (0–1) for "chronic" classification (default: 0.5) */
|
|
42
|
+
threshold?: number;
|
|
43
|
+
}
|
|
44
|
+
/**
|
|
45
|
+
* Query recent reports and identify tasks with chronic failures.
|
|
46
|
+
*
|
|
47
|
+
* @param reportStore - The report store to query
|
|
48
|
+
* @param options - Lookback window and threshold
|
|
49
|
+
* @returns Chronic failure report; when no reports are found, `failures` is empty and `reportsFound` is 0
|
|
50
|
+
*/
|
|
51
|
+
export declare function detectChronicFailures(reportStore: ReportStore, options?: ChronicFailureOptions): Promise<ChronicFailureReport>;
|
|
52
|
+
/**
|
|
53
|
+
* Format a chronic failure report for console output.
|
|
54
|
+
*/
|
|
55
|
+
export declare function formatChronicFailuresConsole(report: ChronicFailureReport): string;
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/chronic-failures.ts
|
|
3
|
+
*
|
|
4
|
+
* Aggregates error data across recent reports to identify tasks that
|
|
5
|
+
* consistently fail (error rate >= threshold). This catches structurally
|
|
6
|
+
* broken tasks — wrong model config, tasks too complex for the provider,
|
|
7
|
+
* persistent API incompatibility — that would otherwise be invisible.
|
|
8
|
+
*
|
|
9
|
+
* @see docs/exec-plans/eval-pipeline-timeout-resilience.md — Phase 5
|
|
10
|
+
*/
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
// Public API
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
/**
 * Query recent reports and identify tasks with chronic failures.
 *
 * A task is classified as chronic when the fraction of analyzed reports in
 * which it produced at least one error is greater than or equal to
 * `threshold` (the comparison is inclusive).
 *
 * @param reportStore - The report store to query; only its
 *   `queryRecentErrors(lookback)` method is used.
 * @param options - Lookback window (default: 10) and threshold (default: 0.5)
 * @returns Chronic failure report. Never null: when no reports are found the
 *   report has an empty `failures` list and `reportsFound` of 0.
 */
export async function detectChronicFailures(reportStore, options = {}) {
    const lookback = options.lookback ?? 10;
    const threshold = options.threshold ?? 0.5;
    const reports = await reportStore.queryRecentErrors(lookback);
    const totalRuns = reports.length;
    if (totalRuns === 0) {
        return { lookback, threshold, failures: [], reportsFound: 0 };
    }
    // Aggregate errors by task
    const taskErrors = new Map();
    for (const report of reports) {
        // Tolerate a null/missing `errors` field so a single sparse report
        // cannot abort the whole analysis.
        for (const error of report.errors ?? []) {
            let entry = taskErrors.get(error.task);
            if (!entry) {
                entry = {
                    runsWith: new Set(),
                    modelErrors: new Map(),
                    messageCounts: new Map(),
                };
                taskErrors.set(error.task, entry);
            }
            // A Set keyed by report ID counts each run once, however many
            // errors the task produced within it.
            entry.runsWith.add(report.reportId);
            entry.modelErrors.set(error.model, (entry.modelErrors.get(error.model) ?? 0) + 1);
            // Coerce non-string messages instead of throwing on `.slice`, and
            // truncate so near-identical long messages group together.
            const message = typeof error.error === "string"
                ? error.error
                : String(error.error ?? "unknown");
            const truncated = message.slice(0, 200);
            entry.messageCounts.set(truncated, (entry.messageCounts.get(truncated) ?? 0) + 1);
        }
    }
    // Identify chronic failures (error rate >= threshold)
    const failures = [];
    for (const [task, data] of taskErrors) {
        const errorCount = data.runsWith.size;
        const errorRate = errorCount / totalRuns;
        if (errorRate >= threshold) {
            // Find the most common error message; on ties the message seen
            // first wins (Map iteration follows insertion order).
            let commonError = "unknown";
            let highestCount = 0;
            for (const [message, count] of data.messageCounts) {
                if (count > highestCount) {
                    commonError = message;
                    highestCount = count;
                }
            }
            const modelBreakdown = [...data.modelErrors.entries()]
                .map(([model, modelErrorCount]) => ({ model, errorCount: modelErrorCount }))
                .sort((a, b) => b.errorCount - a.errorCount);
            failures.push({
                task,
                errorRate,
                errorCount,
                totalRuns,
                modelBreakdown,
                commonError,
            });
        }
    }
    // Sort by error rate descending
    failures.sort((a, b) => b.errorRate - a.errorRate);
    return { lookback, threshold, failures, reportsFound: totalRuns };
}
|
|
77
|
+
// ---------------------------------------------------------------------------
|
|
78
|
+
// Formatting
|
|
79
|
+
// ---------------------------------------------------------------------------
|
|
80
|
+
/**
 * Format a chronic failure report for console output.
 *
 * Emits a title and rule, then either a single "no chronic failures" line or
 * one section per failing task (error rate, affected models, most common
 * error, suggested action).
 *
 * @param report - Chronic failure report; reads `reportsFound`, `threshold`
 *   and `failures`.
 * @returns The report as a newline-joined string.
 */
export function formatChronicFailuresConsole(report) {
    const thresholdPercent = (report.threshold * 100).toFixed(0);
    const lines = [
        `Chronic Failure Report (last ${report.reportsFound} runs)`,
        "━".repeat(50),
        "",
    ];
    if (report.failures.length === 0) {
        lines.push(` ✅ No chronic failures detected (all tasks below ${thresholdPercent}% error threshold)`);
        lines.push("");
        return lines.join("\n");
    }
    // Classification is inclusive (rate >= threshold), so the heading uses
    // "≥" rather than ">" to match what is actually listed.
    lines.push(` ⚠ ${report.failures.length} task(s) with chronic failures (≥${thresholdPercent}% error rate):`);
    lines.push("");
    for (const failure of report.failures) {
        const ratePercent = (failure.errorRate * 100).toFixed(0);
        const models = failure.modelBreakdown
            .map((m) => `${m.model} (${m.errorCount})`)
            .join(", ");
        lines.push(
            ` ${failure.task}`,
            ` Error rate: ${failure.errorCount}/${failure.totalRuns} runs (${ratePercent}%)`,
            ` Models affected: ${models}`,
            ` Common error: "${failure.commonError}"`,
            " Suggested action: Increase timeoutMs for affected models or simplify task",
            "",
        );
    }
    return lines.join("\n");
}
|
|
@@ -194,6 +194,39 @@ describe("compileMCPTask", () => {
|
|
|
194
194
|
const server = serverCfg(result.providers[0]);
|
|
195
195
|
assert.deepEqual(server.auth, { type: "bearer", token: "{{env.MY_TOKEN}}" });
|
|
196
196
|
});
|
|
197
|
+
it("maps headers to mcpServer config", () => {
|
|
198
|
+
const result = compileMCPTask(makeMinimalMCPTask({
|
|
199
|
+
serverConfig: {
|
|
200
|
+
transport: "streamable-http",
|
|
201
|
+
url: "https://mcp.example.com",
|
|
202
|
+
headers: {
|
|
203
|
+
Authorization: "Bearer {{env.MY_TOKEN}}",
|
|
204
|
+
"X-Custom": "value",
|
|
205
|
+
},
|
|
206
|
+
},
|
|
207
|
+
}), { models: TEST_MODELS });
|
|
208
|
+
const server = serverCfg(result.providers[0]);
|
|
209
|
+
assert.deepEqual(server.headers, {
|
|
210
|
+
Authorization: "Bearer {{env.MY_TOKEN}}",
|
|
211
|
+
"X-Custom": "value",
|
|
212
|
+
});
|
|
213
|
+
});
|
|
214
|
+
it("passes both headers and auth when both present", () => {
|
|
215
|
+
const result = compileMCPTask(makeMinimalMCPTask({
|
|
216
|
+
serverConfig: {
|
|
217
|
+
transport: "streamable-http",
|
|
218
|
+
url: "https://mcp.example.com",
|
|
219
|
+
headers: { "X-Custom": "value" },
|
|
220
|
+
auth: { type: "bearer", token: "{{env.MY_TOKEN}}" },
|
|
221
|
+
},
|
|
222
|
+
}), { models: TEST_MODELS });
|
|
223
|
+
const server = serverCfg(result.providers[0]);
|
|
224
|
+
assert.deepEqual(server.headers, { "X-Custom": "value" });
|
|
225
|
+
assert.deepEqual(server.auth, {
|
|
226
|
+
type: "bearer",
|
|
227
|
+
token: "{{env.MY_TOKEN}}",
|
|
228
|
+
});
|
|
229
|
+
});
|
|
197
230
|
it("maps capabilities to mcpTools config", () => {
|
|
198
231
|
const result = compileMCPTask(makeMinimalMCPTask({
|
|
199
232
|
capabilities: ["query_documents", "get_schema"],
|
|
@@ -9,7 +9,6 @@
|
|
|
9
9
|
import assert from "node:assert/strict";
|
|
10
10
|
import { describe, it } from "node:test";
|
|
11
11
|
import { tmpdir } from "os";
|
|
12
|
-
import { LiteracyVariant } from "../../normalize-mode.js";
|
|
13
12
|
import { compileToPromptfoo } from "../promptfoo-compiler.js";
|
|
14
13
|
// ---------------------------------------------------------------------------
|
|
15
14
|
// Helpers
|
|
@@ -123,8 +122,8 @@ describe("compileToPromptfoo", () => {
|
|
|
123
122
|
const result = compileToPromptfoo(graph, {
|
|
124
123
|
mode: "literacy",
|
|
125
124
|
models: makeModels([
|
|
126
|
-
{ id: "model-a", label: "A", modes: [
|
|
127
|
-
{ id: "model-b", label: "B", modes: ["
|
|
125
|
+
{ id: "model-a", label: "A", modes: ["literacy"] },
|
|
126
|
+
{ id: "model-b", label: "B", modes: ["mcp-server"] },
|
|
128
127
|
]),
|
|
129
128
|
rootDir: tmpdir(),
|
|
130
129
|
});
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* task-bridge.test.ts — Unit tests for the TaskDefinition ↔ LiteracyTaskDefinition bridge.
|
|
3
|
+
*
|
|
4
|
+
* Covers round-trip fidelity, edge cases (missing optionals, all optionals),
|
|
5
|
+
* assertion type mapping, and all four CanonicalDocRef / GeneralizedDocRef variants.
|
|
6
|
+
*
|
|
7
|
+
* Run: npx tsx --test src/pipeline/compiler/__tests__/task-bridge.test.ts
|
|
8
|
+
*/
|
|
9
|
+
export {};
|