@sanity/ailf 2.0.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +28 -23
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
- package/dist/_vendor/ailf-core/config-helpers.js +29 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
- package/dist/_vendor/ailf-core/examples/index.js +208 -114
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
- package/dist/_vendor/ailf-tasks/cli.js +61 -0
- package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
- package/dist/_vendor/ailf-tasks/index.js +16 -0
- package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
- package/dist/_vendor/ailf-tasks/parser.js +73 -0
- package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
- package/dist/_vendor/ailf-tasks/schemas.js +180 -0
- package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
- package/dist/_vendor/ailf-tasks/validation.js +162 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +6 -1
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
- package/dist/adapters/task-sources/index.d.ts +1 -2
- package/dist/adapters/task-sources/index.js +1 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
- package/dist/adapters/task-sources/repo-schemas.js +2 -2
- package/dist/adapters/task-sources/repo-task-source.js +1 -1
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
- package/dist/adapters/task-sources/task-file-loader.js +20 -6
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +2 -3
- package/dist/commands/init.js +56 -170
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/composition-root.d.ts +2 -3
- package/dist/composition-root.js +27 -14
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +30 -16
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +50 -15
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +52 -32
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/checks.d.ts +8 -3
- package/dist/pipeline/checks.js +23 -3
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
- package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
- package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
- package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +15 -8
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +15 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/pipeline/mirror-repo-tasks.js +1 -1
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +67 -29
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +25 -25
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -10,11 +10,13 @@
|
|
|
10
10
|
* and stores a `releaseAutoScope` entry in PipelineState. Downstream
|
|
11
11
|
* steps (GenerateConfigsStep, RunEvalStep) use this to narrow scope.
|
|
12
12
|
*/
|
|
13
|
-
import { mkdirSync, writeFileSync } from "fs";
|
|
13
|
+
import { existsSync, mkdirSync, writeFileSync } from "fs";
|
|
14
14
|
import { join } from "path";
|
|
15
15
|
import { isIdRef, isPathRef, isSlugRef, } from "../../_vendor/ailf-core/index.js";
|
|
16
16
|
import { getStepInputPaths } from "../../pipeline/cache.js";
|
|
17
|
+
import { buildCacheContext } from "../cache-context.js";
|
|
17
18
|
import { checkCanonicalContextsExist } from "../../pipeline/checks.js";
|
|
19
|
+
import { loadPipelineTasks } from "../load-pipeline-tasks.js";
|
|
18
20
|
import { loadSource } from "../../sources.js";
|
|
19
21
|
import { configToSourceOverrides } from "../config-to-source-overrides.js";
|
|
20
22
|
export class FetchDocsStep {
|
|
@@ -27,8 +29,15 @@ export class FetchDocsStep {
|
|
|
27
29
|
return { status: "skipped", reason: "--skip-fetch" };
|
|
28
30
|
}
|
|
29
31
|
const start = Date.now();
|
|
30
|
-
//
|
|
31
|
-
|
|
32
|
+
// Load tasks from the filesystem — the same source GenerateConfigsStep
|
|
33
|
+
// uses. This replaces ctx.taskSource (ContentLakeTaskSource) which may
|
|
34
|
+
// have no ailf.task documents, causing a mismatch where generated
|
|
35
|
+
// configs reference context files that were never fetched.
|
|
36
|
+
const allTasks = await loadPipelineTasks({
|
|
37
|
+
rootDir: ctx.config.rootDir,
|
|
38
|
+
mode: ctx.config.mode,
|
|
39
|
+
repoTasksPath: ctx.config.repoTasksPath,
|
|
40
|
+
});
|
|
32
41
|
// Bridge: narrow to literacy tasks for canonical doc access
|
|
33
42
|
const literacyTasks = allTasks.filter((t) => t.mode === "literacy");
|
|
34
43
|
const tasksWithDocs = literacyTasks.filter((t) => (t.context?.docs?.length ?? 0) > 0);
|
|
@@ -71,6 +80,21 @@ export class FetchDocsStep {
|
|
|
71
80
|
if (result.metadata) {
|
|
72
81
|
writeMetadataFiles(ctx.config.rootDir, result.metadata);
|
|
73
82
|
}
|
|
83
|
+
// Capture metadata files (mode-specific extras)
|
|
84
|
+
if (ctx.collector.extrasEnabled) {
|
|
85
|
+
const contextsDir = join(ctx.config.rootDir, "contexts");
|
|
86
|
+
for (const [type, filename] of [
|
|
87
|
+
["document-manifest", "document-manifest.json"],
|
|
88
|
+
["release-impact", "release-impact.json"],
|
|
89
|
+
["document-overlay", "document-overlay.json"],
|
|
90
|
+
["url-fetch", "url-fetch.json"],
|
|
91
|
+
]) {
|
|
92
|
+
const filePath = join(contextsDir, filename);
|
|
93
|
+
if (existsSync(filePath)) {
|
|
94
|
+
ctx.collector.captureFile("fetch-docs", type, filePath);
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
}
|
|
74
98
|
}
|
|
75
99
|
catch (err) {
|
|
76
100
|
return {
|
|
@@ -117,19 +141,9 @@ export class FetchDocsStep {
|
|
|
117
141
|
cacheInputs(ctx) {
|
|
118
142
|
return getStepInputPaths(ctx.config.rootDir, "fetch-docs");
|
|
119
143
|
}
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
// ---------------------------------------------------------------------------
|
|
124
|
-
function buildFilter(ctx) {
|
|
125
|
-
const { areas, tasks, tags } = ctx.config;
|
|
126
|
-
if (!areas && !tasks && !tags)
|
|
127
|
-
return undefined;
|
|
128
|
-
return {
|
|
129
|
-
...(areas ? { areas } : {}),
|
|
130
|
-
...(tasks ? { taskIds: tasks } : {}),
|
|
131
|
-
...(tags ? { tags } : {}),
|
|
132
|
-
};
|
|
144
|
+
cacheContext(ctx) {
|
|
145
|
+
return buildCacheContext(ctx.config);
|
|
146
|
+
}
|
|
133
147
|
}
|
|
134
148
|
/**
|
|
135
149
|
* Write metadata files returned by DocFetcher to the contexts/ directory.
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
*
|
|
15
15
|
* This is an optional step — failure doesn't stop the pipeline.
|
|
16
16
|
*/
|
|
17
|
-
import { existsSync, readFileSync, writeFileSync } from "fs";
|
|
17
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
18
18
|
import { join, resolve } from "path";
|
|
19
19
|
import { isSlugRef } from "../../_vendor/ailf-core/index.js";
|
|
20
20
|
export class GapAnalysisStep {
|
|
@@ -56,7 +56,9 @@ export class GapAnalysisStep {
|
|
|
56
56
|
console.log(formatFailureModesConsole(failureModeReport));
|
|
57
57
|
const gapReport = buildGapAnalysisReport(failureModeReport, scoreSummary.scores);
|
|
58
58
|
console.log(formatGapAnalysisConsole(gapReport));
|
|
59
|
-
|
|
59
|
+
// Write user-facing artifacts to outputDir (respects --output-dir)
|
|
60
|
+
const outDir = ctx.config.outputDir;
|
|
61
|
+
mkdirSync(outDir, { recursive: true });
|
|
60
62
|
writeFileSync(join(outDir, "failure-modes.json"), JSON.stringify(failureModeReport, null, 2));
|
|
61
63
|
writeFileSync(join(outDir, "gap-analysis.json"), JSON.stringify(gapReport, null, 2));
|
|
62
64
|
const manifestPath = resolve(root, "contexts", "document-manifest.json");
|
|
@@ -166,6 +168,15 @@ export class GapAnalysisStep {
|
|
|
166
168
|
scores: enrichedScores,
|
|
167
169
|
};
|
|
168
170
|
writeFileSync(scoreSummaryPath, JSON.stringify(enrichedSummary, null, 2));
|
|
171
|
+
// Capture gap analysis artifacts
|
|
172
|
+
const failureModesPath = join(outDir, "failure-modes.json");
|
|
173
|
+
if (existsSync(failureModesPath)) {
|
|
174
|
+
ctx.collector.captureFile("gap-analysis", "failure-modes", failureModesPath);
|
|
175
|
+
}
|
|
176
|
+
const gapReportPath = join(outDir, "gap-analysis.json");
|
|
177
|
+
if (existsSync(gapReportPath)) {
|
|
178
|
+
ctx.collector.captureFile("gap-analysis", "gap-report", gapReportPath);
|
|
179
|
+
}
|
|
169
180
|
const gapCount = gapReport.gaps.length;
|
|
170
181
|
const classRate = failureModeReport.classificationRate.toFixed(0);
|
|
171
182
|
return {
|
|
@@ -8,8 +8,12 @@
|
|
|
8
8
|
* When the variant is "full", the handler is called twice (baseline + agentic)
|
|
9
9
|
* and three YAML files are written. Other modes produce one YAML file.
|
|
10
10
|
*/
|
|
11
|
+
import { existsSync } from "node:fs";
|
|
12
|
+
import { resolve } from "node:path";
|
|
11
13
|
import { LiteracyVariant } from "../../pipeline/normalize-mode.js";
|
|
14
|
+
import { modelMatchesLiteracyVariant } from "../../pipeline/compiler/mode-bases/literacy.js";
|
|
12
15
|
import { getStepInputPaths } from "../../pipeline/cache.js";
|
|
16
|
+
import { buildCacheContext } from "../cache-context.js";
|
|
13
17
|
import { checkGeneratedConfigsExist } from "../../pipeline/checks.js";
|
|
14
18
|
import { validateModelsYaml } from "../../pipeline/validate.js";
|
|
15
19
|
import { loadSource } from "../../sources.js";
|
|
@@ -85,21 +89,14 @@ export class GenerateConfigsStep {
|
|
|
85
89
|
// ---------------------------------------------------------------------------
|
|
86
90
|
async compileLiteracyVariants(ctx, handler, tasks, models, providers, start) {
|
|
87
91
|
ctx.logger.info(`Compiling ${tasks.length} literacy task(s) via registry handler...`);
|
|
88
|
-
// Filter models per variant
|
|
92
|
+
// Filter models per variant using shared literacy variant matcher
|
|
89
93
|
const baselineModels = models.models
|
|
90
|
-
.filter((m) =>
|
|
91
|
-
.map((m) => ({
|
|
92
|
-
id: m.id,
|
|
93
|
-
label: m.label,
|
|
94
|
-
}));
|
|
94
|
+
.filter((m) => modelMatchesLiteracyVariant(m, "baseline"))
|
|
95
|
+
.map((m) => ({ id: m.id, label: m.label }));
|
|
95
96
|
const agenticModels = models.models
|
|
96
|
-
.filter((m) =>
|
|
97
|
-
m
|
|
98
|
-
m.
|
|
99
|
-
.map((m) => ({
|
|
100
|
-
id: m.id,
|
|
101
|
-
label: m.label,
|
|
102
|
-
}));
|
|
97
|
+
.filter((m) => modelMatchesLiteracyVariant(m, "agentic-naive") ||
|
|
98
|
+
modelMatchesLiteracyVariant(m, "agentic-optimized"))
|
|
99
|
+
.map((m) => ({ id: m.id, label: m.label }));
|
|
103
100
|
// Load rubric config for template resolution
|
|
104
101
|
let rubricConfig;
|
|
105
102
|
try {
|
|
@@ -137,6 +134,14 @@ export class GenerateConfigsStep {
|
|
|
137
134
|
maxConcurrency: models.maxConcurrency,
|
|
138
135
|
logger: ctx.logger,
|
|
139
136
|
});
|
|
137
|
+
// Capture generated config files (use configFileForMode for legacy naming)
|
|
138
|
+
const { configFileForMode } = await import("../../pipeline/eval-constants.js");
|
|
139
|
+
for (const variant of ["baseline", "agentic", "observed"]) {
|
|
140
|
+
const configPath = resolve(ctx.config.rootDir, configFileForMode(variant));
|
|
141
|
+
if (existsSync(configPath)) {
|
|
142
|
+
ctx.collector.captureFile("generate-configs", `promptfoo-config-${variant}`, configPath, { mode: "literacy", variant });
|
|
143
|
+
}
|
|
144
|
+
}
|
|
140
145
|
return this.checkLiteracyPostconditions(ctx, start);
|
|
141
146
|
}
|
|
142
147
|
// ---------------------------------------------------------------------------
|
|
@@ -168,6 +173,18 @@ export class GenerateConfigsStep {
|
|
|
168
173
|
maxConcurrency: models.maxConcurrency,
|
|
169
174
|
logger: ctx.logger,
|
|
170
175
|
});
|
|
176
|
+
// Capture generated config file
|
|
177
|
+
const configPath = resolve(ctx.config.rootDir, `promptfooconfig.${mode}.yaml`);
|
|
178
|
+
if (existsSync(configPath)) {
|
|
179
|
+
ctx.collector.captureFile("generate-configs", "promptfoo-config", configPath, { mode });
|
|
180
|
+
}
|
|
181
|
+
// Capture mode-specific test artifacts (extras)
|
|
182
|
+
if (ctx.collector.extrasEnabled) {
|
|
183
|
+
const testsPath = resolve(ctx.config.rootDir, "results", "latest", `${mode}-tests.json`);
|
|
184
|
+
if (existsSync(testsPath)) {
|
|
185
|
+
ctx.collector.captureFile("generate-configs", `${mode}-tests`, testsPath, { mode });
|
|
186
|
+
}
|
|
187
|
+
}
|
|
171
188
|
return {
|
|
172
189
|
durationMs: Date.now() - start,
|
|
173
190
|
status: "success",
|
|
@@ -180,8 +197,11 @@ export class GenerateConfigsStep {
|
|
|
180
197
|
async loadTasks(ctx, mode, state) {
|
|
181
198
|
const { resolve } = await import("path");
|
|
182
199
|
const { discoverTsTaskFiles, loadTsTaskFile } = await import("../../adapters/task-sources/task-file-loader.js");
|
|
183
|
-
|
|
184
|
-
|
|
200
|
+
const { resolveVendoredSubdir } = await import("../../pipeline/compiler/config-loader.js");
|
|
201
|
+
// Discover task files from the mode-specific directory and --repo-tasks-path.
|
|
202
|
+
// Use vendored copies in dist/ when @sanity/ailf-core isn't resolvable
|
|
203
|
+
// (i.e., running outside the monorepo via npx).
|
|
204
|
+
const tasksDir = resolveVendoredSubdir(ctx.config.rootDir, `tasks/${mode}`);
|
|
185
205
|
const dirs = [tasksDir];
|
|
186
206
|
// Also search --repo-tasks-path (e.g., .ailf/tasks/) for repo-based tasks
|
|
187
207
|
if (ctx.config.repoTasksPath) {
|
|
@@ -191,6 +211,7 @@ export class GenerateConfigsStep {
|
|
|
191
211
|
}
|
|
192
212
|
}
|
|
193
213
|
const tasks = [];
|
|
214
|
+
const skippedByMode = new Map();
|
|
194
215
|
for (const dir of dirs) {
|
|
195
216
|
const files = discoverTsTaskFiles(dir);
|
|
196
217
|
for (const file of files) {
|
|
@@ -201,9 +222,20 @@ export class GenerateConfigsStep {
|
|
|
201
222
|
if (!("mode" in task) || task.mode === mode) {
|
|
202
223
|
tasks.push(task);
|
|
203
224
|
}
|
|
225
|
+
else {
|
|
226
|
+
const taskMode = task.mode ?? "unknown";
|
|
227
|
+
skippedByMode.set(taskMode, (skippedByMode.get(taskMode) ?? 0) + 1);
|
|
228
|
+
}
|
|
204
229
|
}
|
|
205
230
|
}
|
|
206
231
|
}
|
|
232
|
+
if (skippedByMode.size > 0) {
|
|
233
|
+
const total = [...skippedByMode.values()].reduce((a, b) => a + b, 0);
|
|
234
|
+
const summary = [...skippedByMode.entries()]
|
|
235
|
+
.map(([m, n]) => `${n} ${m}`)
|
|
236
|
+
.join(", ");
|
|
237
|
+
ctx.logger.warn(` ⚠ Skipped ${total} task(s) with non-matching mode (${summary}). Current pipeline mode: ${mode}. Run with --mode <mode> to include them.`);
|
|
238
|
+
}
|
|
207
239
|
// Apply area/task/tag filters
|
|
208
240
|
const filtered = this.applyFilters(ctx, tasks);
|
|
209
241
|
// Release auto-scope
|
|
@@ -280,6 +312,9 @@ export class GenerateConfigsStep {
|
|
|
280
312
|
cacheInputs(ctx) {
|
|
281
313
|
return getStepInputPaths(ctx.config.rootDir, "generate-configs");
|
|
282
314
|
}
|
|
315
|
+
cacheContext(ctx) {
|
|
316
|
+
return buildCacheContext(ctx.config);
|
|
317
|
+
}
|
|
283
318
|
}
|
|
284
319
|
// ---------------------------------------------------------------------------
|
|
285
320
|
// Helpers
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* exist before evaluation begins.
|
|
11
11
|
*
|
|
12
12
|
* @see packages/eval/src/pipeline/mirror-repo-tasks.ts
|
|
13
|
-
* @see docs/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
|
|
13
|
+
* @see docs/archive/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
|
|
14
14
|
*/
|
|
15
15
|
import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
16
16
|
export declare class MirrorRepoTasksStep implements PipelineStep {
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* exist before evaluation begins.
|
|
11
11
|
*
|
|
12
12
|
* @see packages/eval/src/pipeline/mirror-repo-tasks.ts
|
|
13
|
-
* @see docs/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
|
|
13
|
+
* @see docs/archive/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
|
|
14
14
|
*/
|
|
15
15
|
import { getSanityClient } from "../../sanity/client.js";
|
|
16
16
|
import { detectGitContext, mirrorRepoTasks, } from "../../pipeline/mirror-repo-tasks.js";
|
|
@@ -115,12 +115,31 @@ export class PublishReportStep {
|
|
|
115
115
|
};
|
|
116
116
|
// Share reportId with downstream steps (CallbackStep + orchestrator job update)
|
|
117
117
|
state.reportId = reportId;
|
|
118
|
+
// Capture report object (Tier 2)
|
|
119
|
+
ctx.collector.capture("publish-report", "report-object", report);
|
|
120
|
+
// Capture auto-comparison if present (Tier 2)
|
|
121
|
+
if (comparison) {
|
|
122
|
+
ctx.collector.capture("publish-report", "auto-comparison", comparison);
|
|
123
|
+
}
|
|
118
124
|
// Write to store (system of record — best-effort, P5)
|
|
119
125
|
const sanityResult = ctx.reportStore
|
|
120
126
|
? await ctx.reportStore.write(report)
|
|
121
127
|
: null;
|
|
122
128
|
// Run sinks (fire-and-forget, P6)
|
|
123
129
|
const publishResult = await runSinks(report, ctx);
|
|
130
|
+
// Capture sink results (Tier 2)
|
|
131
|
+
if (publishResult.sinkResults.length > 0) {
|
|
132
|
+
ctx.collector.capture("publish-report", "sink-results", {
|
|
133
|
+
sinkCount: publishResult.sinkResults.length,
|
|
134
|
+
results: publishResult.sinkResults.map((r) => ({
|
|
135
|
+
name: r.name,
|
|
136
|
+
status: r.result.status,
|
|
137
|
+
...(r.result.status === "success" ? { detail: r.result.detail } : {}),
|
|
138
|
+
...(r.result.status === "failed" ? { error: r.result.error } : {}),
|
|
139
|
+
...(r.result.status === "skipped" ? { reason: r.result.reason } : {}),
|
|
140
|
+
})),
|
|
141
|
+
});
|
|
142
|
+
}
|
|
124
143
|
// Build result summary
|
|
125
144
|
const parts = [];
|
|
126
145
|
if (sanityResult) {
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* Calls pure functions from pipeline/readiness-report.ts directly.
|
|
5
5
|
* Optional step — failure doesn't stop the pipeline.
|
|
6
6
|
*/
|
|
7
|
-
import { existsSync, readFileSync, writeFileSync } from "fs";
|
|
7
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
8
8
|
import { resolve } from "path";
|
|
9
9
|
import { tryLoadConfigFile } from "../../pipeline/compiler/config-loader.js";
|
|
10
10
|
import { formatReadinessMarkdown, generateReadinessReport, } from "../../pipeline/readiness-report.js";
|
|
@@ -37,7 +37,8 @@ export class ReadinessStep {
|
|
|
37
37
|
}
|
|
38
38
|
const scoreSummary = JSON.parse(readFileSync(scoreSummaryPath, "utf-8"));
|
|
39
39
|
const thresholdConfig = ThresholdConfigSchema.parse(thresholdsLoaded.data);
|
|
40
|
-
|
|
40
|
+
// Read gap-analysis.json from outputDir (gap-analysis step writes there)
|
|
41
|
+
const gapPath = resolve(ctx.config.outputDir, "gap-analysis.json");
|
|
41
42
|
const gapAnalysis = existsSync(gapPath)
|
|
42
43
|
? JSON.parse(readFileSync(gapPath, "utf-8"))
|
|
43
44
|
: undefined;
|
|
@@ -60,7 +61,11 @@ export class ReadinessStep {
|
|
|
60
61
|
console.log(md);
|
|
61
62
|
}
|
|
62
63
|
if (readinessLines.length > 0) {
|
|
63
|
-
|
|
64
|
+
// Write to outputDir (respects --output-dir)
|
|
65
|
+
mkdirSync(ctx.config.outputDir, { recursive: true });
|
|
66
|
+
const readinessPath = resolve(ctx.config.outputDir, "readiness-report.md");
|
|
67
|
+
writeFileSync(readinessPath, readinessLines.join("\n---\n\n"));
|
|
68
|
+
ctx.collector.captureFile("readiness", "readiness-report", readinessPath);
|
|
64
69
|
}
|
|
65
70
|
const passCount = readinessAreas.filter((area) => {
|
|
66
71
|
const areaScore = scoreSummary.scores.find((s) => s.feature === area);
|
|
@@ -4,10 +4,10 @@
|
|
|
4
4
|
* Calls generatePrComment() from pipeline/pr-comment.ts with typed options.
|
|
5
5
|
* No env bridge or process.argv manipulation needed.
|
|
6
6
|
*/
|
|
7
|
-
import {
|
|
7
|
+
import { existsSync, mkdirSync } from "node:fs";
|
|
8
|
+
import { dirname, resolve } from "path";
|
|
8
9
|
import { checkScoreSummaryValid } from "../../pipeline/checks.js";
|
|
9
10
|
import { generatePrComment } from "../../pipeline/pr-comment.js";
|
|
10
|
-
const DEFAULT_REPORT_PATH = "results/latest/pr-comment.md";
|
|
11
11
|
export class ReportStep {
|
|
12
12
|
name = "report";
|
|
13
13
|
check() {
|
|
@@ -15,7 +15,7 @@ export class ReportStep {
|
|
|
15
15
|
}
|
|
16
16
|
async execute(ctx) {
|
|
17
17
|
const start = Date.now();
|
|
18
|
-
// Precondition: score summary exists
|
|
18
|
+
// Precondition: score summary exists (intermediate files stay in rootDir)
|
|
19
19
|
const summaryIssues = checkScoreSummaryValid(ctx.config.rootDir);
|
|
20
20
|
const summaryErrors = summaryIssues.filter((i) => i.severity === "error");
|
|
21
21
|
if (summaryErrors.length > 0) {
|
|
@@ -25,7 +25,12 @@ export class ReportStep {
|
|
|
25
25
|
status: "failed",
|
|
26
26
|
};
|
|
27
27
|
}
|
|
28
|
-
|
|
28
|
+
// User-facing output: --output flag wins, else outputDir
|
|
29
|
+
const resolvedOutput = ctx.config.outputPath ?? resolve(ctx.config.outputDir, "pr-comment.md");
|
|
30
|
+
// Ensure outputDir exists before writing (it may be a custom --output-dir
|
|
31
|
+
// that hasn't been created yet — writePipelineResult runs after the
|
|
32
|
+
// orchestrator returns, so we can't rely on it).
|
|
33
|
+
mkdirSync(dirname(resolvedOutput), { recursive: true });
|
|
29
34
|
try {
|
|
30
35
|
generatePrComment({
|
|
31
36
|
outputPath: resolvedOutput,
|
|
@@ -40,6 +45,14 @@ export class ReportStep {
|
|
|
40
45
|
status: "failed",
|
|
41
46
|
};
|
|
42
47
|
}
|
|
48
|
+
// Capture report artifacts
|
|
49
|
+
if (existsSync(resolvedOutput)) {
|
|
50
|
+
ctx.collector.captureFile("report", "pr-comment", resolvedOutput);
|
|
51
|
+
}
|
|
52
|
+
const pipelineResultPath = resolve(ctx.config.outputDir, "pipeline-result.json");
|
|
53
|
+
if (existsSync(pipelineResultPath)) {
|
|
54
|
+
ctx.collector.captureFile("report", "pipeline-result", pipelineResultPath);
|
|
55
|
+
}
|
|
43
56
|
return {
|
|
44
57
|
durationMs: Date.now() - start,
|
|
45
58
|
status: "success",
|
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
import { existsSync, mkdirSync, writeFileSync } from "fs";
|
|
9
9
|
import { resolve } from "path";
|
|
10
10
|
import { getStepInputPaths } from "../../pipeline/cache.js";
|
|
11
|
+
import { buildCacheContext } from "../cache-context.js";
|
|
11
12
|
import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../../pipeline/checks.js";
|
|
12
13
|
import { computeEvalFingerprint } from "../../pipeline/eval-fingerprint.js";
|
|
13
14
|
import { buildFilterFlags, configFileForMode, resultsFileForMode, scanResultsForErrors, } from "../../pipeline/eval-constants.js";
|
|
@@ -28,7 +29,7 @@ export class RunEvalStep {
|
|
|
28
29
|
const start = Date.now();
|
|
29
30
|
const { rootDir, debug, concurrency, noCache } = ctx.config;
|
|
30
31
|
// Precondition: config file exists
|
|
31
|
-
const configIssues = checkGeneratedConfigsExist(rootDir);
|
|
32
|
+
const configIssues = checkGeneratedConfigsExist(rootDir, this.mode);
|
|
32
33
|
const configErrors = configIssues.filter((i) => i.severity === "error");
|
|
33
34
|
if (configErrors.length > 0) {
|
|
34
35
|
return {
|
|
@@ -38,38 +39,41 @@ export class RunEvalStep {
|
|
|
38
39
|
};
|
|
39
40
|
}
|
|
40
41
|
// Precondition: canonical context files exist for filtered tasks.
|
|
41
|
-
//
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
42
|
+
// Only applies to literacy mode — other modes don't use canonical doc contexts.
|
|
43
|
+
if (this.mode === "literacy") {
|
|
44
|
+
// Must apply the same area/task filter as fetch-docs so we only
|
|
45
|
+
// check contexts that were actually fetched.
|
|
46
|
+
const filter = ctx.config.areas || ctx.config.tasks || ctx.config.tags
|
|
47
|
+
? {
|
|
48
|
+
...(ctx.config.areas ? { areas: ctx.config.areas } : {}),
|
|
49
|
+
...(ctx.config.tasks ? { taskIds: ctx.config.tasks } : {}),
|
|
50
|
+
...(ctx.config.tags ? { tags: ctx.config.tags } : {}),
|
|
51
|
+
}
|
|
52
|
+
: undefined;
|
|
53
|
+
let tasks = await ctx.taskSource.loadTasks(filter);
|
|
54
|
+
// Release auto-scope: narrow to affected tasks (mirrors GenerateConfigsStep)
|
|
55
|
+
if (state.releaseAutoScope && !ctx.config.noAutoScope) {
|
|
56
|
+
const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
|
|
57
|
+
tasks = tasks.filter((t) => scopedIds.has(t.id));
|
|
58
|
+
}
|
|
59
|
+
// Only check context files for tasks that have canonical docs.
|
|
60
|
+
// Tasks without canonical docs are skipped by FetchDocsStep (they
|
|
61
|
+
// have no docs to fetch), so no context file is written for them.
|
|
62
|
+
// The generated Promptfoo config still includes their "without-docs"
|
|
63
|
+
// variant (testing model knowledge alone), which doesn't need a
|
|
64
|
+
// context file.
|
|
65
|
+
// Bridge: narrow to literacy tasks with docs
|
|
66
|
+
const tasksWithDocs = tasks.filter((t) => t.mode === "literacy" && (t.context?.docs?.length ?? 0) > 0);
|
|
67
|
+
const taskIds = tasksWithDocs.map((t) => t.id);
|
|
68
|
+
const contextIssues = checkCanonicalContextsExist(rootDir, taskIds);
|
|
69
|
+
const contextErrors = contextIssues.filter((i) => i.severity === "error");
|
|
70
|
+
if (contextErrors.length > 0) {
|
|
71
|
+
return {
|
|
72
|
+
durationMs: Date.now() - start,
|
|
73
|
+
error: `Context files missing. Run fetch-docs first. ${contextErrors.map((e) => e.message).join("; ")}`,
|
|
74
|
+
status: "failed",
|
|
75
|
+
};
|
|
48
76
|
}
|
|
49
|
-
: undefined;
|
|
50
|
-
let tasks = await ctx.taskSource.loadTasks(filter);
|
|
51
|
-
// Release auto-scope: narrow to affected tasks (mirrors GenerateConfigsStep)
|
|
52
|
-
if (state.releaseAutoScope && !ctx.config.noAutoScope) {
|
|
53
|
-
const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
|
|
54
|
-
tasks = tasks.filter((t) => scopedIds.has(t.id));
|
|
55
|
-
}
|
|
56
|
-
// Only check context files for tasks that have canonical docs.
|
|
57
|
-
// Tasks without canonical docs are skipped by FetchDocsStep (they
|
|
58
|
-
// have no docs to fetch), so no context file is written for them.
|
|
59
|
-
// The generated Promptfoo config still includes their "without-docs"
|
|
60
|
-
// variant (testing model knowledge alone), which doesn't need a
|
|
61
|
-
// context file.
|
|
62
|
-
// Bridge: narrow to literacy tasks with docs
|
|
63
|
-
const tasksWithDocs = tasks.filter((t) => t.mode === "literacy" && (t.context?.docs?.length ?? 0) > 0);
|
|
64
|
-
const taskIds = tasksWithDocs.map((t) => t.id);
|
|
65
|
-
const contextIssues = checkCanonicalContextsExist(rootDir, taskIds);
|
|
66
|
-
const contextErrors = contextIssues.filter((i) => i.severity === "error");
|
|
67
|
-
if (contextErrors.length > 0) {
|
|
68
|
-
return {
|
|
69
|
-
durationMs: Date.now() - start,
|
|
70
|
-
error: `Context files missing. Run fetch-docs first. ${contextErrors.map((e) => e.message).join("; ")}`,
|
|
71
|
-
status: "failed",
|
|
72
|
-
};
|
|
73
77
|
}
|
|
74
78
|
// -----------------------------------------------------------------
|
|
75
79
|
// Compute eval fingerprint (for remote cache + provenance)
|
|
@@ -109,6 +113,11 @@ export class RunEvalStep {
|
|
|
109
113
|
// required eval modes were satisfied from the remote cache.
|
|
110
114
|
state.remoteCacheHits ??= new Set();
|
|
111
115
|
state.remoteCacheHits.add(this.mode);
|
|
116
|
+
// Capture the restored score-summary from remote cache
|
|
117
|
+
const cachedSummaryPath = resolve(rootDir, "results", "latest", "score-summary.json");
|
|
118
|
+
if (existsSync(cachedSummaryPath)) {
|
|
119
|
+
ctx.collector.captureFile("run-eval", "score-summary-cached", cachedSummaryPath, { source: "remote-cache", mode: this.mode });
|
|
120
|
+
}
|
|
112
121
|
return {
|
|
113
122
|
durationMs: Date.now() - start,
|
|
114
123
|
status: "success",
|
|
@@ -143,6 +152,7 @@ export class RunEvalStep {
|
|
|
143
152
|
configPath: configFile,
|
|
144
153
|
env: subprocessEnv,
|
|
145
154
|
filterFlags: filterFlags.trim() || undefined,
|
|
155
|
+
maxDurationMs: ctx.config.evalBudgetMs,
|
|
146
156
|
});
|
|
147
157
|
// Check if results were written despite non-zero exit
|
|
148
158
|
if (result.status === "failed") {
|
|
@@ -172,6 +182,13 @@ export class RunEvalStep {
|
|
|
172
182
|
console.log();
|
|
173
183
|
console.log(errorSummary);
|
|
174
184
|
}
|
|
185
|
+
// Capture eval results
|
|
186
|
+
const resultsPath = resolve(rootDir, resultsFileForMode(this.mode));
|
|
187
|
+
if (existsSync(resultsPath)) {
|
|
188
|
+
ctx.collector.captureFile("run-eval", `eval-results-${this.mode}`, resultsPath, {
|
|
189
|
+
mode: this.mode,
|
|
190
|
+
});
|
|
191
|
+
}
|
|
175
192
|
const durationMs = Date.now() - start;
|
|
176
193
|
return {
|
|
177
194
|
durationMs,
|
|
@@ -182,6 +199,9 @@ export class RunEvalStep {
|
|
|
182
199
|
cacheInputs(ctx) {
|
|
183
200
|
return getStepInputPaths(ctx.config.rootDir, `eval-${this.mode}`);
|
|
184
201
|
}
|
|
202
|
+
cacheContext(ctx) {
|
|
203
|
+
return buildCacheContext(ctx.config);
|
|
204
|
+
}
|
|
185
205
|
}
|
|
186
206
|
// ---------------------------------------------------------------------------
|
|
187
207
|
// Remote cache helpers
|
|
@@ -47,6 +47,12 @@ export const CANONICAL_DOC_MAP = {
|
|
|
47
47
|
// ---------------------------------------------------------------------------
|
|
48
48
|
export function detectFeatureArea(description) {
|
|
49
49
|
const desc = description.toLowerCase();
|
|
50
|
+
if (desc.includes("portable text"))
|
|
51
|
+
return "portable-text";
|
|
52
|
+
if (desc.includes("content lake"))
|
|
53
|
+
return "content-lake";
|
|
54
|
+
if (desc.includes("image handling") || desc.includes("image asset"))
|
|
55
|
+
return "image-handling";
|
|
50
56
|
if (desc.includes("studio"))
|
|
51
57
|
return "studio-setup";
|
|
52
58
|
if (desc.includes("visual") ||
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
* layered on top for ambiguous cases when higher precision is needed.
|
|
14
14
|
*
|
|
15
15
|
* @see docs/design-docs/scenario-matrix/per-document-attribution.md
|
|
16
|
-
* @see docs/exec-plans/scenario-matrix-implementation/phase-2-impact-scenarios.md
|
|
16
|
+
* @see docs/archive/exec-plans/scenario-matrix-implementation/phase-2-impact-scenarios.md
|
|
17
17
|
*/
|
|
18
18
|
import type { AttributionReport, ComparisonReport } from "./types.js";
|
|
19
19
|
import type { ResolvedMappings } from "./resolve-mappings.js";
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
* layered on top for ambiguous cases when higher precision is needed.
|
|
14
14
|
*
|
|
15
15
|
* @see docs/design-docs/scenario-matrix/per-document-attribution.md
|
|
16
|
-
* @see docs/exec-plans/scenario-matrix-implementation/phase-2-impact-scenarios.md
|
|
16
|
+
* @see docs/archive/exec-plans/scenario-matrix-implementation/phase-2-impact-scenarios.md
|
|
17
17
|
*/
|
|
18
18
|
// ---------------------------------------------------------------------------
|
|
19
19
|
// Public API
|
package/dist/pipeline/cache.js
CHANGED
|
@@ -125,15 +125,18 @@ export function getStepInputPaths(rootDir, step) {
|
|
|
125
125
|
}
|
|
126
126
|
}
|
|
127
127
|
// Task files (contain assertions and test definitions).
|
|
128
|
-
//
|
|
129
|
-
// explicitly above per mode.
|
|
128
|
+
// Task files live in tasks/{mode}/ subdirectories (e.g., tasks/literacy/)
|
|
130
129
|
const tasksDir = r("tasks");
|
|
131
130
|
if (existsSync(tasksDir)) {
|
|
132
|
-
const
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
131
|
+
for (const entry of readdirSync(tasksDir)) {
|
|
132
|
+
const subDir = join(tasksDir, entry);
|
|
133
|
+
if (statSync(subDir).isDirectory()) {
|
|
134
|
+
const taskFiles = readdirSync(subDir)
|
|
135
|
+
.filter((f) => /\.(task\.ts|task\.js)$/.test(f))
|
|
136
|
+
.map((f) => join(subDir, f));
|
|
137
|
+
paths.push(...taskFiles);
|
|
138
|
+
}
|
|
139
|
+
}
|
|
137
140
|
}
|
|
138
141
|
// Reference solutions (used by grader assertions)
|
|
139
142
|
const refDir = r("canonical/reference-solutions");
|
|
@@ -155,12 +158,18 @@ export function getStepInputPaths(rootDir, step) {
|
|
|
155
158
|
if (modelsPath2)
|
|
156
159
|
paths.push(modelsPath2);
|
|
157
160
|
// Include all task files (they define feature areas)
|
|
161
|
+
// Task files live in tasks/{mode}/ subdirectories (e.g., tasks/literacy/)
|
|
158
162
|
const tasksDir = r("tasks");
|
|
159
163
|
if (existsSync(tasksDir)) {
|
|
160
|
-
const
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
+
for (const entry of readdirSync(tasksDir)) {
|
|
165
|
+
const subDir = join(tasksDir, entry);
|
|
166
|
+
if (statSync(subDir).isDirectory()) {
|
|
167
|
+
const taskFiles = readdirSync(subDir)
|
|
168
|
+
.filter((f) => /\.(task\.ts|task\.js)$/.test(f))
|
|
169
|
+
.map((f) => join(subDir, f));
|
|
170
|
+
paths.push(...taskFiles);
|
|
171
|
+
}
|
|
172
|
+
}
|
|
164
173
|
}
|
|
165
174
|
return paths;
|
|
166
175
|
}
|
|
@@ -175,10 +184,15 @@ export function getStepInputPaths(rootDir, step) {
|
|
|
175
184
|
paths.push(sourcesPath2);
|
|
176
185
|
const tasksDir = r("tasks");
|
|
177
186
|
if (existsSync(tasksDir)) {
|
|
178
|
-
const
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
187
|
+
for (const entry of readdirSync(tasksDir)) {
|
|
188
|
+
const subDir = join(tasksDir, entry);
|
|
189
|
+
if (statSync(subDir).isDirectory()) {
|
|
190
|
+
const taskFiles = readdirSync(subDir)
|
|
191
|
+
.filter((f) => /\.(task\.ts|task\.js)$/.test(f))
|
|
192
|
+
.map((f) => join(subDir, f));
|
|
193
|
+
paths.push(...taskFiles);
|
|
194
|
+
}
|
|
195
|
+
}
|
|
182
196
|
}
|
|
183
197
|
return paths;
|
|
184
198
|
}
|
|
@@ -38,6 +38,8 @@ export interface RawTestResult {
|
|
|
38
38
|
componentResults: ComponentResult[];
|
|
39
39
|
pass: boolean;
|
|
40
40
|
};
|
|
41
|
+
/** Per-test latency in ms (populated by Promptfoo when available) */
|
|
42
|
+
latencyMs?: number;
|
|
41
43
|
metadata?: Record<string, unknown>;
|
|
42
44
|
provider?: {
|
|
43
45
|
id?: string;
|