@sanity/ailf 2.0.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +28 -23
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
- package/dist/_vendor/ailf-core/config-helpers.js +29 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
- package/dist/_vendor/ailf-core/examples/index.js +208 -114
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
- package/dist/_vendor/ailf-tasks/cli.js +61 -0
- package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
- package/dist/_vendor/ailf-tasks/index.js +16 -0
- package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
- package/dist/_vendor/ailf-tasks/parser.js +73 -0
- package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
- package/dist/_vendor/ailf-tasks/schemas.js +180 -0
- package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
- package/dist/_vendor/ailf-tasks/validation.js +162 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +6 -1
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
- package/dist/adapters/task-sources/index.d.ts +1 -2
- package/dist/adapters/task-sources/index.js +1 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
- package/dist/adapters/task-sources/repo-schemas.js +2 -2
- package/dist/adapters/task-sources/repo-task-source.js +1 -1
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
- package/dist/adapters/task-sources/task-file-loader.js +20 -6
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +2 -3
- package/dist/commands/init.js +56 -170
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/composition-root.d.ts +2 -3
- package/dist/composition-root.js +27 -14
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +30 -16
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +50 -15
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +52 -32
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/checks.d.ts +8 -3
- package/dist/pipeline/checks.js +23 -3
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
- package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
- package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
- package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +15 -8
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +15 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/pipeline/mirror-repo-tasks.js +1 -1
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +67 -29
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +25 -25
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline step: Run promptfoo evaluation.
|
|
3
|
+
*
|
|
4
|
+
* Preconditions: config files and context files exist
|
|
5
|
+
* Postconditions: eval-results.json exists and is valid
|
|
6
|
+
*
|
|
7
|
+
* Cache key: promptfooconfig*.yaml + contexts/*.md + tasks/*.yaml +
|
|
8
|
+
* canonical contexts + reference solutions + config/models.yaml
|
|
9
|
+
* Cache outputs: results/latest/eval-results*.json
|
|
10
|
+
*
|
|
11
|
+
* Remote cache: When local cache misses and a Sanity token is available,
|
|
12
|
+
* the step queries the Content Lake for a report with a matching eval
|
|
13
|
+
* fingerprint. On a hit, the cached score-summary.json is written to disk
|
|
14
|
+
* and the eval + calculate-scores steps are skipped entirely.
|
|
15
|
+
*
|
|
16
|
+
* @see docs/design-docs/content-lake-eval-caching.md
|
|
17
|
+
*/
|
|
18
|
+
import { execSync } from "child_process";
|
|
19
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
20
|
+
import { dirname, resolve } from "path";
|
|
21
|
+
import { fileURLToPath } from "url";
|
|
22
|
+
import { getStepInputPaths, hashFiles, lookupCache, recordCache, } from "../cache.js";
|
|
23
|
+
import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../checks.js";
|
|
24
|
+
import { computeEvalFingerprint } from "../eval-fingerprint.js";
|
|
25
|
+
import { resolveMappings } from "../resolve-mappings.js";
|
|
26
|
+
// Directory containing this module, derived from import.meta.url
// (ES modules do not provide a built-in __dirname).
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
27
|
+
// Base directory three levels above this module's directory; results paths
// are resolved against it. NOTE(review): presumably the package root —
// confirm against the build layout.
const ROOT = resolve(__dirname, "..", "..", "..");
|
|
28
|
+
// Promptfoo config file name for each eval mode. The keys (agentic,
// baseline, observed) mirror the keys of RESULTS_FILES.
// NOTE(review): usage is outside this view — presumably resolved against ROOT.
const CONFIG_FILES = {
|
|
29
|
+
agentic: "promptfooconfig.agentic.yaml",
|
|
30
|
+
baseline: "promptfooconfig.yaml",
|
|
31
|
+
observed: "promptfooconfig.observed.yaml",
|
|
32
|
+
};
|
|
33
|
+
/**
 * Each mode writes eval results to a different file (set in the config's outputPath).
 *
 * Values are paths relative to ROOT: extractShareUrl() resolves them with
 * resolve(ROOT, RESULTS_FILES[mode]). The keys (agentic, baseline, observed)
 * mirror the keys of CONFIG_FILES.
 */
|
|
34
|
+
export const RESULTS_FILES = {
|
|
35
|
+
agentic: "results/latest/eval-results-agentic.json",
|
|
36
|
+
baseline: "results/latest/eval-results.json",
|
|
37
|
+
observed: "results/latest/eval-results-observed.json",
|
|
38
|
+
};
|
|
39
|
+
/**
 * Build the promptfoo filter flags appended to the eval command for debug runs.
 *
 * The returned text is a shell command fragment, so every interpolated value
 * is made shell-safe (POSIX quoting, matching the single quotes this function
 * has always emitted for the pattern):
 *   - `pattern` is always single-quoted, with embedded single quotes escaped;
 *   - `sample` / `firstN` are emitted verbatim when they are plain tokens
 *     (word characters, `.`, `%`, `-`) and single-quoted otherwise.
 * Output for ordinary values is therefore unchanged.
 *
 * @param {object|null|undefined} debug - Debug options. Only `enabled`,
 *   `pattern`, `sample` and `firstN` are read.
 * @returns {string} `""` when debugging is disabled; otherwise the flags
 *   joined by spaces with a single leading space. Falls back to
 *   `--filter-first-n 2` when no filter option is set.
 */
export function buildFilterFlags(debug) {
    if (!debug?.enabled) {
        return "";
    }
    // POSIX single-quote escaping: close the quote, emit an escaped quote, reopen.
    const quote = (value) => `'${String(value).replace(/'/g, "'\\''")}'`;
    // Plain tokens such as 5 or "10%" pass through untouched so that normal
    // numeric flags render exactly as before; anything else gets quoted.
    const quoteIfNeeded = (value) => {
        const text = String(value);
        return /^[\w.%-]+$/.test(text) ? text : quote(text);
    };
    const flags = [];
    if (debug.pattern) {
        flags.push(`--filter-pattern ${quote(debug.pattern)}`);
    }
    if (debug.sample) {
        flags.push(`--filter-sample ${quoteIfNeeded(debug.sample)}`);
    }
    if (debug.firstN) {
        flags.push(`--filter-first-n ${quoteIfNeeded(debug.firstN)}`);
    }
    // Default: first 2 tests when no other filters specified
    if (flags.length === 0) {
        flags.push("--filter-first-n 2");
    }
    return ` ${flags.join(" ")}`;
}
|
|
58
|
+
/**
 * Extract the Promptfoo share URL from the eval results JSON.
 *
 * Promptfoo writes a `shareableUrl` field into the results file when
 * `PROMPTFOO_API_KEY` is set. This replaces the previous approach of
 * scraping the URL from a captured log file (which required piping
 * through `tee` and broke TTY progress reporting).
 *
 * This is a best-effort read: an unrecognized mode, a missing results file,
 * an unreadable file, invalid JSON, or an absent field all yield `undefined`.
 *
 * @param {string} mode - Key into RESULTS_FILES selecting the results file.
 * @returns {*} The `shareableUrl` value from the results file, or `undefined`.
 */
export function extractShareUrl(mode) {
    const relativePath = RESULTS_FILES[mode];
    // An unknown mode has no results path; path.resolve() would throw a
    // TypeError on a non-string segment, so bail out the same way every
    // other failure path here does.
    if (typeof relativePath !== "string") {
        return undefined;
    }
    const resultsPath = resolve(ROOT, relativePath);
    if (!existsSync(resultsPath)) {
        return undefined;
    }
    try {
        const data = JSON.parse(readFileSync(resultsPath, "utf-8"));
        return data.shareableUrl ?? undefined;
    }
    catch {
        // Unreadable or malformed results: treat as "no share URL".
        return undefined;
    }
}
|
|
79
|
+
// ---------------------------------------------------------------------------
|
|
80
|
+
// Post-eval error scanning
|
|
81
|
+
// ---------------------------------------------------------------------------
|
|
82
|
+
export async function runEval(mode, debug, concurrency, noCache = false, remoteCacheOpts) {
|
|
83
|
+
const start = Date.now();
|
|
84
|
+
// Precondition: config file exists
|
|
85
|
+
const configIssues = checkGeneratedConfigsExist(ROOT);
|
|
86
|
+
const configErrors = configIssues.filter((i) => i.severity === "error");
|
|
87
|
+
if (configErrors.length > 0) {
|
|
88
|
+
return {
|
|
89
|
+
stepResult: {
|
|
90
|
+
durationMs: Date.now() - start,
|
|
91
|
+
error: `Config files missing: ${configErrors.map((e) => e.message).join("; ")}`,
|
|
92
|
+
status: "failed",
|
|
93
|
+
},
|
|
94
|
+
};
|
|
95
|
+
}
|
|
96
|
+
// Precondition: canonical context files exist for all mapped tasks
|
|
97
|
+
const mappings = resolveMappings(ROOT);
|
|
98
|
+
const taskIds = Object.values(mappings.feature_areas).flatMap((area) => area.tasks.map((t) => t.id));
|
|
99
|
+
const contextIssues = checkCanonicalContextsExist(ROOT, taskIds);
|
|
100
|
+
const contextErrors = contextIssues.filter((i) => i.severity === "error");
|
|
101
|
+
if (contextErrors.length > 0) {
|
|
102
|
+
return {
|
|
103
|
+
stepResult: {
|
|
104
|
+
durationMs: Date.now() - start,
|
|
105
|
+
error: `Context files missing. Run 'pnpm fetch-docs' first. ${contextErrors.map((e) => e.message).join("; ")}`,
|
|
106
|
+
status: "failed",
|
|
107
|
+
},
|
|
108
|
+
};
|
|
109
|
+
}
|
|
110
|
+
// -----------------------------------------------------------------------
|
|
111
|
+
// Compute eval fingerprint (used for both remote cache + provenance)
|
|
112
|
+
// Only for non-debug runs — debug runs use test subsets.
|
|
113
|
+
// -----------------------------------------------------------------------
|
|
114
|
+
let evalFingerprint;
|
|
115
|
+
if (!debug?.enabled && remoteCacheOpts?.graderModel) {
|
|
116
|
+
try {
|
|
117
|
+
evalFingerprint = computeEvalFingerprint({
|
|
118
|
+
filter: remoteCacheOpts.filter,
|
|
119
|
+
graderModel: remoteCacheOpts.graderModel,
|
|
120
|
+
mode,
|
|
121
|
+
rootDir: ROOT,
|
|
122
|
+
});
|
|
123
|
+
}
|
|
124
|
+
catch (err) {
|
|
125
|
+
console.warn(` ⚠️ Could not compute eval fingerprint: ${err instanceof Error ? err.message : String(err)}`);
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
// -----------------------------------------------------------------------
|
|
129
|
+
// Cache check — local first, then remote
|
|
130
|
+
// -----------------------------------------------------------------------
|
|
131
|
+
// Local cache check — skip eval if inputs unchanged (biggest cost saver).
|
|
132
|
+
// Each mode gets its own cache key so that in `full` mode, a fresh agentic
|
|
133
|
+
// cache doesn't force baseline to re-run (or vice versa).
|
|
134
|
+
const cacheKey = `eval-${mode}`;
|
|
135
|
+
if (!noCache) {
|
|
136
|
+
const cacheResult = lookupCache(ROOT, cacheKey);
|
|
137
|
+
if (cacheResult.hit) {
|
|
138
|
+
return {
|
|
139
|
+
evalFingerprint,
|
|
140
|
+
stepResult: {
|
|
141
|
+
durationMs: Date.now() - start,
|
|
142
|
+
status: "success",
|
|
143
|
+
summary: `Skipped (cached) — ${cacheResult.entry.summary}`,
|
|
144
|
+
},
|
|
145
|
+
};
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
// Remote cache check — query Content Lake for matching fingerprint
|
|
149
|
+
if (evalFingerprint &&
|
|
150
|
+
!noCache &&
|
|
151
|
+
!remoteCacheOpts?.noRemoteCache &&
|
|
152
|
+
remoteCacheOpts?.sanityToken) {
|
|
153
|
+
const remoteCacheResult = await checkRemoteCache(evalFingerprint, remoteCacheOpts.sanityToken);
|
|
154
|
+
if (remoteCacheResult) {
|
|
155
|
+
return {
|
|
156
|
+
evalFingerprint,
|
|
157
|
+
remoteCacheHit: true,
|
|
158
|
+
stepResult: {
|
|
159
|
+
durationMs: Date.now() - start,
|
|
160
|
+
status: "success",
|
|
161
|
+
summary: `Skipped (remote cache hit) — reusing report ${remoteCacheResult.reportId} from ${remoteCacheResult.completedAt}`,
|
|
162
|
+
},
|
|
163
|
+
};
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
// Execute — run promptfoo directly with inherited stdio so the TTY
|
|
167
|
+
// progress bar works in interactive terminals and the CI progress
|
|
168
|
+
// reporter works in CI environments. Previously this was piped through
|
|
169
|
+
// `tee` to capture a log file for share-URL extraction, but `tee`
|
|
170
|
+
// destroyed TTY detection, disabling all progress output. The share URL
|
|
171
|
+
// is now read from the eval results JSON (`shareableUrl` field) instead.
|
|
172
|
+
//
|
|
173
|
+
// Sharing is enabled by default (via PROMPTFOO_API_KEY / cloud config).
|
|
174
|
+
// We set PROMPTFOO_DISABLE_SHARE_EMAIL_REQUEST=1 to prevent promptfoo's
|
|
175
|
+
// interactive email prompt from blocking the terminal in local TTY
|
|
176
|
+
// environments. In CI, isCI() already guards against the prompt, but
|
|
177
|
+
// the env var provides defense-in-depth for all execution contexts.
|
|
178
|
+
const configFile = CONFIG_FILES[mode];
|
|
179
|
+
const filterFlags = buildFilterFlags(debug);
|
|
180
|
+
const concurrencyFlag = concurrency ? ` --max-concurrency ${concurrency}` : "";
|
|
181
|
+
const noCacheFlag = noCache ? " --no-cache" : "";
|
|
182
|
+
const evalCmd = `dotenv -e ../../.env -o -- promptfoo eval --config ${configFile}${filterFlags}${concurrencyFlag}${noCacheFlag}`;
|
|
183
|
+
let exitCode = 0;
|
|
184
|
+
try {
|
|
185
|
+
execSync(evalCmd, {
|
|
186
|
+
cwd: ROOT,
|
|
187
|
+
env: {
|
|
188
|
+
...process.env,
|
|
189
|
+
PROMPTFOO_DISABLE_SHARE_EMAIL_REQUEST: "1",
|
|
190
|
+
},
|
|
191
|
+
stdio: "inherit",
|
|
192
|
+
});
|
|
193
|
+
}
|
|
194
|
+
catch (err) {
|
|
195
|
+
// promptfoo exits 100 when assertions fail — that's expected, not an error
|
|
196
|
+
exitCode =
|
|
197
|
+
err !== null && typeof err === "object" && "status" in err
|
|
198
|
+
? err.status
|
|
199
|
+
: 1;
|
|
200
|
+
if (exitCode !== 100) {
|
|
201
|
+
return {
|
|
202
|
+
evalFingerprint,
|
|
203
|
+
stepResult: {
|
|
204
|
+
durationMs: Date.now() - start,
|
|
205
|
+
error: `promptfoo eval failed with exit code ${exitCode}`,
|
|
206
|
+
status: "failed",
|
|
207
|
+
},
|
|
208
|
+
};
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
// Postcondition: results file exists and is valid
|
|
212
|
+
const resultsIssues = checkResultsExist(ROOT, RESULTS_FILES[mode]);
|
|
213
|
+
const resultsErrors = resultsIssues.filter((i) => i.severity === "error");
|
|
214
|
+
if (resultsErrors.length > 0) {
|
|
215
|
+
return {
|
|
216
|
+
evalFingerprint,
|
|
217
|
+
stepResult: {
|
|
218
|
+
durationMs: Date.now() - start,
|
|
219
|
+
error: `Postcondition failed: ${resultsErrors.map((e) => e.message).join("; ")}`,
|
|
220
|
+
status: "failed",
|
|
221
|
+
},
|
|
222
|
+
};
|
|
223
|
+
}
|
|
224
|
+
// Scan results for errors and surface them clearly
|
|
225
|
+
const errorSummary = scanResultsForErrors(resolve(ROOT, RESULTS_FILES[mode]));
|
|
226
|
+
if (errorSummary) {
|
|
227
|
+
console.log();
|
|
228
|
+
console.log(errorSummary);
|
|
229
|
+
}
|
|
230
|
+
const durationMs = Date.now() - start;
|
|
231
|
+
const summary = `Evaluation complete (mode: ${mode}${debug?.enabled ? ", debug" : ""})`;
|
|
232
|
+
// Record cache — only for non-debug runs (debug uses a subset of tests).
|
|
233
|
+
// Uses per-mode cache key so baseline and agentic are independently cached.
|
|
234
|
+
if (!noCache && !debug?.enabled) {
|
|
235
|
+
const inputPaths = getStepInputPaths(ROOT, cacheKey);
|
|
236
|
+
const inputHash = hashFiles(inputPaths);
|
|
237
|
+
recordCache(ROOT, cacheKey, inputHash, summary, durationMs, [
|
|
238
|
+
RESULTS_FILES[mode],
|
|
239
|
+
]);
|
|
240
|
+
}
|
|
241
|
+
return {
|
|
242
|
+
evalFingerprint,
|
|
243
|
+
stepResult: { durationMs, status: "success", summary },
|
|
244
|
+
};
|
|
245
|
+
}
|
|
246
|
+
// ---------------------------------------------------------------------------
|
|
247
|
+
// Remote cache helpers
|
|
248
|
+
// ---------------------------------------------------------------------------
|
|
249
|
+
/**
 * Look up a stored report whose eval fingerprint matches `fingerprint`.
 *
 * When a report is found, its `summary` is serialized to
 * `results/latest/score-summary.json` under ROOT (creating the directory if
 * needed) so that later pipeline steps can read it as if the eval had just
 * completed.
 *
 * Any failure (import, query, or file write) is logged as a warning and
 * reported to the caller as a miss; this function does not throw.
 *
 * @param fingerprint Eval fingerprint string to search for
 * @param sanityToken Token handed to the ReportStore constructor
 * @returns `{ completedAt, reportId }` on a hit, `null` on a miss or error
 */
async function checkRemoteCache(fingerprint, sanityToken) {
    try {
        const reportStoreModule = await import("../../report-store.js");
        const { AILF_REPORT_DATASET, AILF_REPORT_PROJECT_ID } = process.env;
        const store = new reportStoreModule.ReportStore({
            dataset: AILF_REPORT_DATASET ?? undefined,
            projectId: AILF_REPORT_PROJECT_ID ?? undefined,
            token: sanityToken,
        });

        // Time only the lookup itself so the log reflects query latency.
        const queriedAt = Date.now();
        const match = await store.findByFingerprint(fingerprint);
        const elapsedMs = Date.now() - queriedAt;

        if (match) {
            const { completedAt, id: reportId, summary } = match;

            // Persist the cached summary where downstream steps expect it.
            const latestDir = resolve(ROOT, "results", "latest");
            if (!existsSync(latestDir)) {
                mkdirSync(latestDir, { recursive: true });
            }
            const summaryPath = resolve(latestDir, "score-summary.json");
            writeFileSync(summaryPath, JSON.stringify(summary, null, 2));

            console.log(` ✅ Remote cache hit — reusing report ${reportId} from ${completedAt}`);
            console.log(` ℹ️ Fingerprint: ${fingerprint.slice(0, 16)}... (${elapsedMs}ms)`);
            console.log(" ⚠️ Cached scores are statistically equivalent, not identical");
            return { completedAt, reportId };
        }

        console.log(` ℹ️ Remote cache miss — no report matches fingerprint (${elapsedMs}ms)`);
        return null;
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.warn(` ⚠️ Remote cache check failed: ${reason}`);
        return null;
    }
}
|
|
292
|
+
/**
 * Build a human-readable report of eval results that errored before grading.
 *
 * Reads the results JSON at `resultsPath` and collects every entry of
 * `results.results` whose `gradingResult` is exactly `null`. Entries with a
 * non-null `gradingResult` are ordinary pass/fail outcomes (even when
 * `error` is set) and are not reported.
 *
 * This is a best-effort reporter: it never throws on bad input. Entries that
 * are not objects (for example a literal `null` in the array) are skipped
 * rather than dereferenced, but they still count toward the total.
 *
 * @param {string} resultsPath Path to the eval results JSON file
 * @returns {string | null} A boxed, newline-joined summary, or `null` when
 *   the file is missing, unreadable, not valid JSON, has no
 *   `results.results` array, or contains no errored entries
 */
function scanResultsForErrors(resultsPath) {
    if (!existsSync(resultsPath))
        return null;
    let file;
    try {
        file = JSON.parse(readFileSync(resultsPath, "utf-8"));
    }
    catch {
        return null;
    }
    const results = file?.results?.results;
    if (!Array.isArray(results))
        return null;
    const errored = [];
    for (const r of results) {
        // A malformed entry (null or a primitive) has nothing to report;
        // skip it instead of throwing on property access.
        if (r === null || typeof r !== "object")
            continue;
        // Only a null gradingResult marks an entry that errored before
        // producing a scorable response.
        if (r.gradingResult !== null)
            continue;
        const desc = r.testCase?.description ?? r.description ?? "unknown";
        const provider = r.provider?.label ?? r.provider?.id ?? "unknown";
        // Cap the message at 200 characters so one long error payload does
        // not flood the summary.
        const errorMsg = r.error
            ? (typeof r.error === "string" ? r.error : JSON.stringify(r.error)).slice(0, 200)
            : "Provider returned no scorable result";
        errored.push({ description: desc, error: errorMsg, provider });
    }
    if (errored.length === 0)
        return null;
    const total = results.length;
    const lines = [];
    lines.push(` ┌─────────────────────────────────────────────────────────────`);
    lines.push(` │ ⚠️ ${errored.length} of ${total} eval result(s) errored (no gradingResult)`);
    lines.push(` │`);
    for (const e of errored) {
        lines.push(` │ ✗ [${e.provider}] ${e.description}`);
        lines.push(` │ → ${e.error}`);
    }
    // Add a troubleshooting hint once a quarter or more of the results errored.
    const errorRate = Math.round((errored.length / total) * 100);
    if (errorRate >= 25) {
        lines.push(` │`);
        lines.push(` │ 🔥 High error rate (${errorRate}%) — check API keys, rate limits,`);
        lines.push(` │ or model availability. Errored results are excluded from scoring.`);
    }
    lines.push(` └─────────────────────────────────────────────────────────────`);
    return lines.join("\n");
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline step: Fetch documentation from Sanity CMS.
|
|
3
|
+
*
|
|
4
|
+
* Preconditions: tasks have inline canonical_docs
|
|
5
|
+
* Postconditions: canonical context files exist for all mapped tasks
|
|
6
|
+
*
|
|
7
|
+
* Cache key: tasks/*.yaml + config/sources.yaml + config/models.yaml
|
|
8
|
+
* Cache outputs: contexts/canonical/*.md files
|
|
9
|
+
*/
|
|
10
|
+
import type { StepResult } from "../types.js";
|
|
11
|
+
/**
 * Run the fetch-docs pipeline step.
 *
 * @param source Optional value forwarded as `--source` to `pnpm fetch-docs`
 * @param noCache When true, skips both the cache lookup and the cache record (default: false)
 * @returns Step result with `status` "success" or "failed"
 */
export declare function runFetchDocs(source?: string, noCache?: boolean): Promise<StepResult>;
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline step: Fetch documentation from Sanity CMS.
|
|
3
|
+
*
|
|
4
|
+
* Preconditions: tasks have inline canonical_docs
|
|
5
|
+
* Postconditions: canonical context files exist for all mapped tasks
|
|
6
|
+
*
|
|
7
|
+
* Cache key: tasks/*.yaml + config/sources.yaml + config/models.yaml
|
|
8
|
+
* Cache outputs: contexts/canonical/*.md files
|
|
9
|
+
*/
|
|
10
|
+
import { execSync } from "child_process";
|
|
11
|
+
import { dirname, resolve } from "path";
|
|
12
|
+
import { fileURLToPath } from "url";
|
|
13
|
+
import { lookupCache, recordCache } from "../cache.js";
|
|
14
|
+
import { checkCanonicalContextsExist } from "../checks.js";
|
|
15
|
+
import { resolveMappings } from "../resolve-mappings.js";
|
|
16
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
17
|
+
const ROOT = resolve(__dirname, "..", "..", "..");
|
|
18
|
+
/**
 * Run the fetch-docs pipeline step.
 *
 * Flow: verify that the resolved mappings contain at least one task, return
 * early on a cache hit (unless `noCache`), shell out to `pnpm fetch-docs`,
 * verify that canonical context files exist for every mapped task, then
 * record the run in the cache (unless `noCache`).
 *
 * @param source Optional value forwarded as `--source` to `pnpm fetch-docs`
 * @param noCache When true, skips both the cache lookup and the cache record
 * @returns Step result with `durationMs`, `status`, and either `summary` or `error`
 */
export async function runFetchDocs(source, noCache = false) {
    const start = Date.now();
    const STEP = "fetch-docs";

    // Precondition: the mappings must contain at least one task.
    let taskCount = 0;
    for (const area of Object.values(resolveMappings(ROOT).feature_areas)) {
        taskCount += area.tasks.length;
    }
    if (taskCount === 0) {
        return {
            durationMs: Date.now() - start,
            error: "No tasks with canonical_docs found in task files. Add canonical_docs to your task definitions.",
            status: "failed",
        };
    }

    // Cache lookup.
    if (!noCache) {
        const { hit, entry } = lookupCache(ROOT, STEP);
        if (hit) {
            return {
                durationMs: Date.now() - start,
                status: "success",
                summary: `Skipped (cached) — ${entry.summary}`,
            };
        }
    }

    // Run the fetch script; its output streams straight to this terminal.
    try {
        const sourceFlag = source ? ` --source ${source}` : "";
        execSync(`pnpm fetch-docs${sourceFlag}`, {
            cwd: ROOT,
            env: process.env,
            stdio: "inherit",
        });
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        return {
            durationMs: Date.now() - start,
            error: `fetch-docs failed: ${reason}`,
            status: "failed",
        };
    }

    // Postcondition: every mapped task has a canonical context file.
    // Mappings are resolved again in case the fetch changed them.
    const taskIds = [];
    for (const area of Object.values(resolveMappings(ROOT).feature_areas)) {
        for (const task of area.tasks) {
            taskIds.push(task.id);
        }
    }
    const missing = checkCanonicalContextsExist(ROOT, taskIds).filter((issue) => issue.severity === "error");
    if (missing.length > 0) {
        const details = missing.map((issue) => issue.message).join("; ");
        return {
            durationMs: Date.now() - start,
            error: `Postcondition failed: ${details}`,
            status: "failed",
        };
    }

    const durationMs = Date.now() - start;
    const summary = `Fetched canonical contexts for ${taskIds.length} tasks`;

    // Record the run so an unchanged rerun can be skipped.
    if (!noCache) {
        const cache = await import("../cache.js");
        const inputHash = cache.hashFiles(cache.getStepInputPaths(ROOT, STEP));
        const outputPaths = taskIds.map((id) => `contexts/canonical/${id}.md`);
        outputPaths.push("contexts/document-manifest.json");
        recordCache(ROOT, STEP, inputHash, summary, durationMs, outputPaths);
    }
    return { durationMs, status: "success", summary };
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline step: Generate promptfoo config files from config/models.yaml.
|
|
3
|
+
*
|
|
4
|
+
* Preconditions: config/models.yaml is valid
|
|
5
|
+
* Postconditions: promptfooconfig*.yaml files exist
|
|
6
|
+
*
|
|
7
|
+
* Cache key: config/models.yaml + config/sources.yaml + tasks/*.yaml
|
|
8
|
+
* Cache outputs: promptfooconfig*.yaml files
|
|
9
|
+
*/
|
|
10
|
+
import type { StepResult } from "../types.js";
|
|
11
|
+
/**
 * Run the generate-configs pipeline step.
 *
 * @param source Optional value forwarded as `--source` to `pnpm generate-configs`
 * @param noCache When true, skips both the cache lookup and the cache record (default: false)
 * @returns Step result with `status` "success" or "failed"
 */
export declare function runGenerateConfigs(source?: string, noCache?: boolean): StepResult;
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline step: Generate promptfoo config files from config/models.yaml.
|
|
3
|
+
*
|
|
4
|
+
* Preconditions: config/models.yaml is valid
|
|
5
|
+
* Postconditions: promptfooconfig*.yaml files exist
|
|
6
|
+
*
|
|
7
|
+
* Cache key: config/models.yaml + config/sources.yaml + tasks/*.yaml
|
|
8
|
+
* Cache outputs: promptfooconfig*.yaml files
|
|
9
|
+
*/
|
|
10
|
+
import { execSync } from "child_process";
|
|
11
|
+
import { dirname, resolve } from "path";
|
|
12
|
+
import { fileURLToPath } from "url";
|
|
13
|
+
import { getStepInputPaths, hashFiles, lookupCache, recordCache, } from "../cache.js";
|
|
14
|
+
import { checkGeneratedConfigsExist } from "../checks.js";
|
|
15
|
+
import { validateModelsYaml } from "../validate.js";
|
|
16
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
17
|
+
const ROOT = resolve(__dirname, "..", "..", "..");
|
|
18
|
+
/**
 * Run the generate-configs pipeline step.
 *
 * Flow: validate config/models.yaml, return early on a cache hit (unless
 * `noCache`), shell out to `pnpm generate-configs`, verify the generated
 * config files exist, then record the run in the cache (unless `noCache`).
 * The filter context from `buildFilterContext()` is passed to both the cache
 * lookup and the input hash.
 *
 * @param source Optional value forwarded as `--source` to `pnpm generate-configs`
 * @param noCache When true, skips both the cache lookup and the cache record
 * @returns Step result with `durationMs`, `status`, and either `summary` or `error`
 */
export function runGenerateConfigs(source, noCache = false) {
    const start = Date.now();
    const STEP = "generate-configs";
    const errorsOnly = (issues) => issues.filter((issue) => issue.severity === "error");
    const describe = (issues) => issues.map((issue) => issue.message).join("; ");

    // Precondition: config/models.yaml must validate without errors.
    const modelErrors = errorsOnly(validateModelsYaml(ROOT));
    if (modelErrors.length > 0) {
        return {
            durationMs: Date.now() - start,
            error: `config/models.yaml validation failed: ${describe(modelErrors)}`,
            status: "failed",
        };
    }

    // The filter env vars are part of the cache identity, so a scoped run
    // never reuses an entry from a differently-scoped run.
    const filterContext = buildFilterContext();
    if (!noCache) {
        const { hit, entry } = lookupCache(ROOT, STEP, filterContext);
        if (hit) {
            return {
                durationMs: Date.now() - start,
                status: "success",
                summary: `Skipped (cached) — ${entry.summary}`,
            };
        }
    }

    // Run the generator; its output streams straight to this terminal.
    try {
        const sourceFlag = source ? ` --source ${source}` : "";
        execSync(`pnpm generate-configs${sourceFlag}`, {
            cwd: ROOT,
            env: process.env,
            stdio: "inherit",
        });
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        return {
            durationMs: Date.now() - start,
            error: `generate-configs failed: ${reason}`,
            status: "failed",
        };
    }

    // Postcondition: the generated config files exist.
    const configErrors = errorsOnly(checkGeneratedConfigsExist(ROOT));
    if (configErrors.length > 0) {
        return {
            durationMs: Date.now() - start,
            error: `Postcondition failed: ${describe(configErrors)}`,
            status: "failed",
        };
    }

    const durationMs = Date.now() - start;
    const summary = "Generated promptfoo config files";

    // Record the run so an unchanged rerun can be skipped.
    if (!noCache) {
        const inputHash = hashFiles(getStepInputPaths(ROOT, STEP), filterContext);
        recordCache(ROOT, STEP, inputHash, summary, durationMs, [
            "promptfooconfig.yaml",
            "promptfooconfig.observed.yaml",
            "promptfooconfig.agentic.yaml",
        ]);
    }
    return { durationMs, status: "success", summary };
}
|
|
84
|
+
/**
 * Derive cache-context strings from the filter environment variables.
 *
 * Each of EVAL_FILTER_AREAS and EVAL_FILTER_TASKS that is set to a non-empty
 * value contributes one entry (`areas:<value>` / `tasks:<value>`, in that
 * order), so runs scoped differently end up with different cache keys.
 *
 * @returns Array of context strings; empty when neither variable is set
 */
function buildFilterContext() {
    const { EVAL_FILTER_AREAS, EVAL_FILTER_TASKS } = process.env;
    const candidates = [
        ["areas", EVAL_FILTER_AREAS],
        ["tasks", EVAL_FILTER_TASKS],
    ];
    return candidates
        .filter(([, value]) => Boolean(value))
        .map(([label, value]) => `${label}:${value}`);
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline step: Measure grader consistency via replicated grading.
|
|
3
|
+
*
|
|
4
|
+
* This step is OPTIONAL — it only runs when --grader-replications N is passed.
|
|
5
|
+
* It re-runs grading assertions N additional times on the same model responses
|
|
6
|
+
* and measures score variance across replications.
|
|
7
|
+
*
|
|
8
|
+
* Preconditions: eval-results.json exists (model responses to re-grade)
|
|
9
|
+
* Postconditions: grader-consistency.json written to results/latest/
|
|
10
|
+
*
|
|
11
|
+
* Not cached: Each run involves fresh API calls to the grader model.
|
|
12
|
+
* The whole point is to measure variance, so caching would defeat the purpose.
|
|
13
|
+
*/
|
|
14
|
+
import type { EvalMode, StepResult } from "../types.js";
|
|
15
|
+
/**
|
|
16
|
+
* Run grader consistency analysis.
|
|
17
|
+
*
|
|
18
|
+
* @param replications Number of additional grading replications (default: 5)
|
|
19
|
+
* @param mode Eval mode — determines which results file to read
|
|
20
|
+
*/
|
|
21
|
+
export declare function runGraderConsistency(replications?: number, mode?: EvalMode): StepResult;
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline step: Measure grader consistency via replicated grading.
|
|
3
|
+
*
|
|
4
|
+
* This step is OPTIONAL — it only runs when --grader-replications N is passed.
|
|
5
|
+
* It re-runs grading assertions N additional times on the same model responses
|
|
6
|
+
* and measures score variance across replications.
|
|
7
|
+
*
|
|
8
|
+
* Preconditions: eval-results.json exists (model responses to re-grade)
|
|
9
|
+
* Postconditions: grader-consistency.json written to results/latest/
|
|
10
|
+
*
|
|
11
|
+
* Not cached: Each run involves fresh API calls to the grader model.
|
|
12
|
+
* The whole point is to measure variance, so caching would defeat the purpose.
|
|
13
|
+
*/
|
|
14
|
+
import { execSync } from "child_process";
|
|
15
|
+
import { existsSync } from "fs";
|
|
16
|
+
import { dirname, resolve } from "path";
|
|
17
|
+
import { fileURLToPath } from "url";
|
|
18
|
+
import { checkResultsExist } from "../checks.js";
|
|
19
|
+
import { RESULTS_FILES } from "./eval-step.js";
|
|
20
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
21
|
+
const ROOT = resolve(__dirname, "..", "..", "..");
|
|
22
|
+
/**
 * Run the grader consistency analysis step.
 *
 * Checks that the eval results file for the given mode exists, runs
 * `src/lib/grader-consistency.ts` via `tsx` against it, and then verifies
 * that `results/latest/grader-consistency.json` was produced.
 *
 * @param replications Number of additional grading replications (default: 5)
 * @param mode Eval mode that selects the results file; "full" uses the
 *   baseline results (default: "baseline")
 * @returns Step result with `durationMs`, `status`, and either `summary` or `error`
 */
export function runGraderConsistency(replications = 5, mode = "baseline") {
    const start = Date.now();

    // "full" has no results file of its own here; analyze the baseline run.
    const resultsFile = RESULTS_FILES[mode === "full" ? "baseline" : mode];

    // Precondition: there are eval results to re-grade.
    const missing = checkResultsExist(ROOT, resultsFile).filter((issue) => issue.severity === "error");
    if (missing.length > 0) {
        const details = missing.map((issue) => issue.message).join("; ");
        return {
            durationMs: Date.now() - start,
            error: `Results missing: ${details}. Run eval first.`,
            status: "failed",
        };
    }

    // Run the analysis script; its output streams straight to this terminal.
    const command = `tsx src/lib/grader-consistency.ts --replications ${replications} --results ${resultsFile}`;
    try {
        execSync(command, { cwd: ROOT, env: process.env, stdio: "inherit" });
    }
    catch (err) {
        // Use the thrown object's `status` when it has one; otherwise report 1.
        const hasStatus = typeof err === "object" && err !== null && "status" in err;
        const code = hasStatus ? err.status : 1;
        return {
            durationMs: Date.now() - start,
            error: `grader-consistency failed with exit code ${code}`,
            status: "failed",
        };
    }

    // Postcondition: the script wrote its output file.
    const outputPath = resolve(ROOT, "results", "latest", "grader-consistency.json");
    if (existsSync(outputPath)) {
        return {
            durationMs: Date.now() - start,
            status: "success",
            summary: `Grader consistency analysis complete (${replications} replications)`,
        };
    }
    return {
        durationMs: Date.now() - start,
        error: "grader-consistency.json was not created",
        status: "failed",
    };
}
|