@sanity/ailf 1.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +29 -12
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
- package/dist/_vendor/ailf-core/config-helpers.js +51 -2
- package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
- package/dist/_vendor/ailf-core/examples/index.js +213 -94
- package/dist/_vendor/ailf-core/index.d.ts +3 -2
- package/dist/_vendor/ailf-core/index.js +2 -1
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
- package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +7 -1
- package/dist/adapters/config-sources/ts-config-loader.js +21 -13
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
- package/dist/adapters/task-sources/index.d.ts +3 -4
- package/dist/adapters/task-sources/index.js +3 -4
- package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
- package/dist/adapters/task-sources/repo-schemas.js +228 -20
- package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
- package/dist/adapters/task-sources/repo-task-source.js +81 -122
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
- package/dist/adapters/task-sources/task-file-loader.js +21 -7
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/coverage-audit.js +3 -1
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +5 -4
- package/dist/commands/init.js +190 -25
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +15 -4
- package/dist/composition-root.js +100 -55
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/build-step-sequence.js +4 -2
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +32 -19
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +77 -26
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +51 -31
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
- package/dist/pipeline/compiler/literacy-bridge.js +2 -2
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
- package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
- package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/expand-tasks.d.ts +2 -2
- package/dist/pipeline/expand-tasks.js +2 -2
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +16 -9
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +16 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
- package/dist/pipeline/mirror-repo-tasks.js +10 -10
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +68 -30
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +32 -24
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -8,8 +8,12 @@
|
|
|
8
8
|
* When the variant is "full", the handler is called twice (baseline + agentic)
|
|
9
9
|
* and three YAML files are written. Other modes produce one YAML file.
|
|
10
10
|
*/
|
|
11
|
+
import { existsSync } from "node:fs";
|
|
12
|
+
import { resolve } from "node:path";
|
|
11
13
|
import { LiteracyVariant } from "../../pipeline/normalize-mode.js";
|
|
14
|
+
import { modelMatchesLiteracyVariant } from "../../pipeline/compiler/mode-bases/literacy.js";
|
|
12
15
|
import { getStepInputPaths } from "../../pipeline/cache.js";
|
|
16
|
+
import { buildCacheContext } from "../cache-context.js";
|
|
13
17
|
import { checkGeneratedConfigsExist } from "../../pipeline/checks.js";
|
|
14
18
|
import { validateModelsYaml } from "../../pipeline/validate.js";
|
|
15
19
|
import { loadSource } from "../../sources.js";
|
|
@@ -85,21 +89,14 @@ export class GenerateConfigsStep {
|
|
|
85
89
|
// ---------------------------------------------------------------------------
|
|
86
90
|
async compileLiteracyVariants(ctx, handler, tasks, models, providers, start) {
|
|
87
91
|
ctx.logger.info(`Compiling ${tasks.length} literacy task(s) via registry handler...`);
|
|
88
|
-
// Filter models per variant
|
|
92
|
+
// Filter models per variant using shared literacy variant matcher
|
|
89
93
|
const baselineModels = models.models
|
|
90
|
-
.filter((m) =>
|
|
91
|
-
.map((m) => ({
|
|
92
|
-
id: m.id,
|
|
93
|
-
label: m.label,
|
|
94
|
-
}));
|
|
94
|
+
.filter((m) => modelMatchesLiteracyVariant(m, "baseline"))
|
|
95
|
+
.map((m) => ({ id: m.id, label: m.label }));
|
|
95
96
|
const agenticModels = models.models
|
|
96
|
-
.filter((m) =>
|
|
97
|
-
m
|
|
98
|
-
m.
|
|
99
|
-
.map((m) => ({
|
|
100
|
-
id: m.id,
|
|
101
|
-
label: m.label,
|
|
102
|
-
}));
|
|
97
|
+
.filter((m) => modelMatchesLiteracyVariant(m, "agentic-naive") ||
|
|
98
|
+
modelMatchesLiteracyVariant(m, "agentic-optimized"))
|
|
99
|
+
.map((m) => ({ id: m.id, label: m.label }));
|
|
103
100
|
// Load rubric config for template resolution
|
|
104
101
|
let rubricConfig;
|
|
105
102
|
try {
|
|
@@ -137,6 +134,14 @@ export class GenerateConfigsStep {
|
|
|
137
134
|
maxConcurrency: models.maxConcurrency,
|
|
138
135
|
logger: ctx.logger,
|
|
139
136
|
});
|
|
137
|
+
// Capture generated config files (use configFileForMode for legacy naming)
|
|
138
|
+
const { configFileForMode } = await import("../../pipeline/eval-constants.js");
|
|
139
|
+
for (const variant of ["baseline", "agentic", "observed"]) {
|
|
140
|
+
const configPath = resolve(ctx.config.rootDir, configFileForMode(variant));
|
|
141
|
+
if (existsSync(configPath)) {
|
|
142
|
+
ctx.collector.captureFile("generate-configs", `promptfoo-config-${variant}`, configPath, { mode: "literacy", variant });
|
|
143
|
+
}
|
|
144
|
+
}
|
|
140
145
|
return this.checkLiteracyPostconditions(ctx, start);
|
|
141
146
|
}
|
|
142
147
|
// ---------------------------------------------------------------------------
|
|
@@ -144,13 +149,18 @@ export class GenerateConfigsStep {
|
|
|
144
149
|
// ---------------------------------------------------------------------------
|
|
145
150
|
async compileSingleMode(ctx, handler, tasks, mode, models, start) {
|
|
146
151
|
ctx.logger.info(`Compiling ${tasks.length} ${mode} task(s) via registry handler...`);
|
|
152
|
+
// Filter models to those that declare this mode in their modes array
|
|
153
|
+
const modeModels = models.models
|
|
154
|
+
.filter((m) => !m.modes || m.modes.includes(mode))
|
|
155
|
+
.map((m) => ({
|
|
156
|
+
id: m.id,
|
|
157
|
+
label: m.label,
|
|
158
|
+
config: m.config,
|
|
159
|
+
}));
|
|
147
160
|
const merged = this.compileAll(handler, tasks, {
|
|
148
161
|
rootDir: ctx.config.rootDir,
|
|
149
162
|
graderProvider: models.grader.id,
|
|
150
|
-
models:
|
|
151
|
-
id: m.id,
|
|
152
|
-
label: m.label,
|
|
153
|
-
})),
|
|
163
|
+
models: modeModels,
|
|
154
164
|
});
|
|
155
165
|
for (const w of merged.warnings) {
|
|
156
166
|
ctx.logger.warn(` ⚠ ${w}`);
|
|
@@ -163,6 +173,18 @@ export class GenerateConfigsStep {
|
|
|
163
173
|
maxConcurrency: models.maxConcurrency,
|
|
164
174
|
logger: ctx.logger,
|
|
165
175
|
});
|
|
176
|
+
// Capture generated config file
|
|
177
|
+
const configPath = resolve(ctx.config.rootDir, `promptfooconfig.${mode}.yaml`);
|
|
178
|
+
if (existsSync(configPath)) {
|
|
179
|
+
ctx.collector.captureFile("generate-configs", "promptfoo-config", configPath, { mode });
|
|
180
|
+
}
|
|
181
|
+
// Capture mode-specific test artifacts (extras)
|
|
182
|
+
if (ctx.collector.extrasEnabled) {
|
|
183
|
+
const testsPath = resolve(ctx.config.rootDir, "results", "latest", `${mode}-tests.json`);
|
|
184
|
+
if (existsSync(testsPath)) {
|
|
185
|
+
ctx.collector.captureFile("generate-configs", `${mode}-tests`, testsPath, { mode });
|
|
186
|
+
}
|
|
187
|
+
}
|
|
166
188
|
return {
|
|
167
189
|
durationMs: Date.now() - start,
|
|
168
190
|
status: "success",
|
|
@@ -175,19 +197,45 @@ export class GenerateConfigsStep {
|
|
|
175
197
|
async loadTasks(ctx, mode, state) {
|
|
176
198
|
const { resolve } = await import("path");
|
|
177
199
|
const { discoverTsTaskFiles, loadTsTaskFile } = await import("../../adapters/task-sources/task-file-loader.js");
|
|
178
|
-
const
|
|
179
|
-
|
|
200
|
+
const { resolveVendoredSubdir } = await import("../../pipeline/compiler/config-loader.js");
|
|
201
|
+
// Discover task files from the mode-specific directory and --repo-tasks-path.
|
|
202
|
+
// Use vendored copies in dist/ when @sanity/ailf-core isn't resolvable
|
|
203
|
+
// (i.e., running outside the monorepo via npx).
|
|
204
|
+
const tasksDir = resolveVendoredSubdir(ctx.config.rootDir, `tasks/${mode}`);
|
|
205
|
+
const dirs = [tasksDir];
|
|
206
|
+
// Also search --repo-tasks-path (e.g., .ailf/tasks/) for repo-based tasks
|
|
207
|
+
if (ctx.config.repoTasksPath) {
|
|
208
|
+
const repoDir = resolve(ctx.config.repoTasksPath);
|
|
209
|
+
if (!dirs.includes(repoDir)) {
|
|
210
|
+
dirs.push(repoDir);
|
|
211
|
+
}
|
|
212
|
+
}
|
|
180
213
|
const tasks = [];
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
214
|
+
const skippedByMode = new Map();
|
|
215
|
+
for (const dir of dirs) {
|
|
216
|
+
const files = discoverTsTaskFiles(dir);
|
|
217
|
+
for (const file of files) {
|
|
218
|
+
const raw = await loadTsTaskFile(file);
|
|
219
|
+
for (const t of raw.tasks) {
|
|
220
|
+
const task = t;
|
|
221
|
+
// Filter to matching mode (skip tasks from other modes in same dir)
|
|
222
|
+
if (!("mode" in task) || task.mode === mode) {
|
|
223
|
+
tasks.push(task);
|
|
224
|
+
}
|
|
225
|
+
else {
|
|
226
|
+
const taskMode = task.mode ?? "unknown";
|
|
227
|
+
skippedByMode.set(taskMode, (skippedByMode.get(taskMode) ?? 0) + 1);
|
|
228
|
+
}
|
|
188
229
|
}
|
|
189
230
|
}
|
|
190
231
|
}
|
|
232
|
+
if (skippedByMode.size > 0) {
|
|
233
|
+
const total = [...skippedByMode.values()].reduce((a, b) => a + b, 0);
|
|
234
|
+
const summary = [...skippedByMode.entries()]
|
|
235
|
+
.map(([m, n]) => `${n} ${m}`)
|
|
236
|
+
.join(", ");
|
|
237
|
+
ctx.logger.warn(` ⚠ Skipped ${total} task(s) with non-matching mode (${summary}). Current pipeline mode: ${mode}. Run with --mode <mode> to include them.`);
|
|
238
|
+
}
|
|
191
239
|
// Apply area/task/tag filters
|
|
192
240
|
const filtered = this.applyFilters(ctx, tasks);
|
|
193
241
|
// Release auto-scope
|
|
@@ -264,6 +312,9 @@ export class GenerateConfigsStep {
|
|
|
264
312
|
cacheInputs(ctx) {
|
|
265
313
|
return getStepInputPaths(ctx.config.rootDir, "generate-configs");
|
|
266
314
|
}
|
|
315
|
+
cacheContext(ctx) {
|
|
316
|
+
return buildCacheContext(ctx.config);
|
|
317
|
+
}
|
|
267
318
|
}
|
|
268
319
|
// ---------------------------------------------------------------------------
|
|
269
320
|
// Helpers
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* exist before evaluation begins.
|
|
11
11
|
*
|
|
12
12
|
* @see packages/eval/src/pipeline/mirror-repo-tasks.ts
|
|
13
|
-
* @see docs/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
|
|
13
|
+
* @see docs/archive/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
|
|
14
14
|
*/
|
|
15
15
|
import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
16
16
|
export declare class MirrorRepoTasksStep implements PipelineStep {
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* exist before evaluation begins.
|
|
11
11
|
*
|
|
12
12
|
* @see packages/eval/src/pipeline/mirror-repo-tasks.ts
|
|
13
|
-
* @see docs/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
|
|
13
|
+
* @see docs/archive/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
|
|
14
14
|
*/
|
|
15
15
|
import { getSanityClient } from "../../sanity/client.js";
|
|
16
16
|
import { detectGitContext, mirrorRepoTasks, } from "../../pipeline/mirror-repo-tasks.js";
|
|
@@ -115,12 +115,31 @@ export class PublishReportStep {
|
|
|
115
115
|
};
|
|
116
116
|
// Share reportId with downstream steps (CallbackStep + orchestrator job update)
|
|
117
117
|
state.reportId = reportId;
|
|
118
|
+
// Capture report object (Tier 2)
|
|
119
|
+
ctx.collector.capture("publish-report", "report-object", report);
|
|
120
|
+
// Capture auto-comparison if present (Tier 2)
|
|
121
|
+
if (comparison) {
|
|
122
|
+
ctx.collector.capture("publish-report", "auto-comparison", comparison);
|
|
123
|
+
}
|
|
118
124
|
// Write to store (system of record — best-effort, P5)
|
|
119
125
|
const sanityResult = ctx.reportStore
|
|
120
126
|
? await ctx.reportStore.write(report)
|
|
121
127
|
: null;
|
|
122
128
|
// Run sinks (fire-and-forget, P6)
|
|
123
129
|
const publishResult = await runSinks(report, ctx);
|
|
130
|
+
// Capture sink results (Tier 2)
|
|
131
|
+
if (publishResult.sinkResults.length > 0) {
|
|
132
|
+
ctx.collector.capture("publish-report", "sink-results", {
|
|
133
|
+
sinkCount: publishResult.sinkResults.length,
|
|
134
|
+
results: publishResult.sinkResults.map((r) => ({
|
|
135
|
+
name: r.name,
|
|
136
|
+
status: r.result.status,
|
|
137
|
+
...(r.result.status === "success" ? { detail: r.result.detail } : {}),
|
|
138
|
+
...(r.result.status === "failed" ? { error: r.result.error } : {}),
|
|
139
|
+
...(r.result.status === "skipped" ? { reason: r.result.reason } : {}),
|
|
140
|
+
})),
|
|
141
|
+
});
|
|
142
|
+
}
|
|
124
143
|
// Build result summary
|
|
125
144
|
const parts = [];
|
|
126
145
|
if (sanityResult) {
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* Calls pure functions from pipeline/readiness-report.ts directly.
|
|
5
5
|
* Optional step — failure doesn't stop the pipeline.
|
|
6
6
|
*/
|
|
7
|
-
import { existsSync, readFileSync, writeFileSync } from "fs";
|
|
7
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
8
8
|
import { resolve } from "path";
|
|
9
9
|
import { tryLoadConfigFile } from "../../pipeline/compiler/config-loader.js";
|
|
10
10
|
import { formatReadinessMarkdown, generateReadinessReport, } from "../../pipeline/readiness-report.js";
|
|
@@ -37,7 +37,8 @@ export class ReadinessStep {
|
|
|
37
37
|
}
|
|
38
38
|
const scoreSummary = JSON.parse(readFileSync(scoreSummaryPath, "utf-8"));
|
|
39
39
|
const thresholdConfig = ThresholdConfigSchema.parse(thresholdsLoaded.data);
|
|
40
|
-
|
|
40
|
+
// Read gap-analysis.json from outputDir (gap-analysis step writes there)
|
|
41
|
+
const gapPath = resolve(ctx.config.outputDir, "gap-analysis.json");
|
|
41
42
|
const gapAnalysis = existsSync(gapPath)
|
|
42
43
|
? JSON.parse(readFileSync(gapPath, "utf-8"))
|
|
43
44
|
: undefined;
|
|
@@ -60,7 +61,11 @@ export class ReadinessStep {
|
|
|
60
61
|
console.log(md);
|
|
61
62
|
}
|
|
62
63
|
if (readinessLines.length > 0) {
|
|
63
|
-
|
|
64
|
+
// Write to outputDir (respects --output-dir)
|
|
65
|
+
mkdirSync(ctx.config.outputDir, { recursive: true });
|
|
66
|
+
const readinessPath = resolve(ctx.config.outputDir, "readiness-report.md");
|
|
67
|
+
writeFileSync(readinessPath, readinessLines.join("\n---\n\n"));
|
|
68
|
+
ctx.collector.captureFile("readiness", "readiness-report", readinessPath);
|
|
64
69
|
}
|
|
65
70
|
const passCount = readinessAreas.filter((area) => {
|
|
66
71
|
const areaScore = scoreSummary.scores.find((s) => s.feature === area);
|
|
@@ -4,10 +4,10 @@
|
|
|
4
4
|
* Calls generatePrComment() from pipeline/pr-comment.ts with typed options.
|
|
5
5
|
* No env bridge or process.argv manipulation needed.
|
|
6
6
|
*/
|
|
7
|
-
import {
|
|
7
|
+
import { existsSync, mkdirSync } from "node:fs";
|
|
8
|
+
import { dirname, resolve } from "path";
|
|
8
9
|
import { checkScoreSummaryValid } from "../../pipeline/checks.js";
|
|
9
10
|
import { generatePrComment } from "../../pipeline/pr-comment.js";
|
|
10
|
-
const DEFAULT_REPORT_PATH = "results/latest/pr-comment.md";
|
|
11
11
|
export class ReportStep {
|
|
12
12
|
name = "report";
|
|
13
13
|
check() {
|
|
@@ -15,7 +15,7 @@ export class ReportStep {
|
|
|
15
15
|
}
|
|
16
16
|
async execute(ctx) {
|
|
17
17
|
const start = Date.now();
|
|
18
|
-
// Precondition: score summary exists
|
|
18
|
+
// Precondition: score summary exists (intermediate files stay in rootDir)
|
|
19
19
|
const summaryIssues = checkScoreSummaryValid(ctx.config.rootDir);
|
|
20
20
|
const summaryErrors = summaryIssues.filter((i) => i.severity === "error");
|
|
21
21
|
if (summaryErrors.length > 0) {
|
|
@@ -25,7 +25,12 @@ export class ReportStep {
|
|
|
25
25
|
status: "failed",
|
|
26
26
|
};
|
|
27
27
|
}
|
|
28
|
-
|
|
28
|
+
// User-facing output: --output flag wins, else outputDir
|
|
29
|
+
const resolvedOutput = ctx.config.outputPath ?? resolve(ctx.config.outputDir, "pr-comment.md");
|
|
30
|
+
// Ensure outputDir exists before writing (it may be a custom --output-dir
|
|
31
|
+
// that hasn't been created yet — writePipelineResult runs after the
|
|
32
|
+
// orchestrator returns, so we can't rely on it).
|
|
33
|
+
mkdirSync(dirname(resolvedOutput), { recursive: true });
|
|
29
34
|
try {
|
|
30
35
|
generatePrComment({
|
|
31
36
|
outputPath: resolvedOutput,
|
|
@@ -40,6 +45,14 @@ export class ReportStep {
|
|
|
40
45
|
status: "failed",
|
|
41
46
|
};
|
|
42
47
|
}
|
|
48
|
+
// Capture report artifacts
|
|
49
|
+
if (existsSync(resolvedOutput)) {
|
|
50
|
+
ctx.collector.captureFile("report", "pr-comment", resolvedOutput);
|
|
51
|
+
}
|
|
52
|
+
const pipelineResultPath = resolve(ctx.config.outputDir, "pipeline-result.json");
|
|
53
|
+
if (existsSync(pipelineResultPath)) {
|
|
54
|
+
ctx.collector.captureFile("report", "pipeline-result", pipelineResultPath);
|
|
55
|
+
}
|
|
43
56
|
return {
|
|
44
57
|
durationMs: Date.now() - start,
|
|
45
58
|
status: "success",
|
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
import { existsSync, mkdirSync, writeFileSync } from "fs";
|
|
9
9
|
import { resolve } from "path";
|
|
10
10
|
import { getStepInputPaths } from "../../pipeline/cache.js";
|
|
11
|
+
import { buildCacheContext } from "../cache-context.js";
|
|
11
12
|
import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../../pipeline/checks.js";
|
|
12
13
|
import { computeEvalFingerprint } from "../../pipeline/eval-fingerprint.js";
|
|
13
14
|
import { buildFilterFlags, configFileForMode, resultsFileForMode, scanResultsForErrors, } from "../../pipeline/eval-constants.js";
|
|
@@ -38,38 +39,41 @@ export class RunEvalStep {
|
|
|
38
39
|
};
|
|
39
40
|
}
|
|
40
41
|
// Precondition: canonical context files exist for filtered tasks.
|
|
41
|
-
//
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
42
|
+
// Only applies to literacy mode — other modes don't use canonical doc contexts.
|
|
43
|
+
if (this.mode === "literacy") {
|
|
44
|
+
// Must apply the same area/task filter as fetch-docs so we only
|
|
45
|
+
// check contexts that were actually fetched.
|
|
46
|
+
const filter = ctx.config.areas || ctx.config.tasks || ctx.config.tags
|
|
47
|
+
? {
|
|
48
|
+
...(ctx.config.areas ? { areas: ctx.config.areas } : {}),
|
|
49
|
+
...(ctx.config.tasks ? { taskIds: ctx.config.tasks } : {}),
|
|
50
|
+
...(ctx.config.tags ? { tags: ctx.config.tags } : {}),
|
|
51
|
+
}
|
|
52
|
+
: undefined;
|
|
53
|
+
let tasks = await ctx.taskSource.loadTasks(filter);
|
|
54
|
+
// Release auto-scope: narrow to affected tasks (mirrors GenerateConfigsStep)
|
|
55
|
+
if (state.releaseAutoScope && !ctx.config.noAutoScope) {
|
|
56
|
+
const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
|
|
57
|
+
tasks = tasks.filter((t) => scopedIds.has(t.id));
|
|
58
|
+
}
|
|
59
|
+
// Only check context files for tasks that have canonical docs.
|
|
60
|
+
// Tasks without canonical docs are skipped by FetchDocsStep (they
|
|
61
|
+
// have no docs to fetch), so no context file is written for them.
|
|
62
|
+
// The generated Promptfoo config still includes their "without-docs"
|
|
63
|
+
// variant (testing model knowledge alone), which doesn't need a
|
|
64
|
+
// context file.
|
|
65
|
+
// Bridge: narrow to literacy tasks with docs
|
|
66
|
+
const tasksWithDocs = tasks.filter((t) => t.mode === "literacy" && (t.context?.docs?.length ?? 0) > 0);
|
|
67
|
+
const taskIds = tasksWithDocs.map((t) => t.id);
|
|
68
|
+
const contextIssues = checkCanonicalContextsExist(rootDir, taskIds);
|
|
69
|
+
const contextErrors = contextIssues.filter((i) => i.severity === "error");
|
|
70
|
+
if (contextErrors.length > 0) {
|
|
71
|
+
return {
|
|
72
|
+
durationMs: Date.now() - start,
|
|
73
|
+
error: `Context files missing. Run fetch-docs first. ${contextErrors.map((e) => e.message).join("; ")}`,
|
|
74
|
+
status: "failed",
|
|
75
|
+
};
|
|
48
76
|
}
|
|
49
|
-
: undefined;
|
|
50
|
-
let tasks = await ctx.taskSource.loadTasks(filter);
|
|
51
|
-
// Release auto-scope: narrow to affected tasks (mirrors GenerateConfigsStep)
|
|
52
|
-
if (state.releaseAutoScope && !ctx.config.noAutoScope) {
|
|
53
|
-
const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
|
|
54
|
-
tasks = tasks.filter((t) => scopedIds.has(t.id));
|
|
55
|
-
}
|
|
56
|
-
// Only check context files for tasks that have canonical docs.
|
|
57
|
-
// Tasks without canonical docs are skipped by FetchDocsStep (they
|
|
58
|
-
// have no docs to fetch), so no context file is written for them.
|
|
59
|
-
// The generated Promptfoo config still includes their "without-docs"
|
|
60
|
-
// variant (testing model knowledge alone), which doesn't need a
|
|
61
|
-
// context file.
|
|
62
|
-
// Bridge: narrow to literacy tasks with docs
|
|
63
|
-
const tasksWithDocs = tasks.filter((t) => t.mode === "literacy" && (t.context?.docs?.length ?? 0) > 0);
|
|
64
|
-
const taskIds = tasksWithDocs.map((t) => t.id);
|
|
65
|
-
const contextIssues = checkCanonicalContextsExist(rootDir, taskIds);
|
|
66
|
-
const contextErrors = contextIssues.filter((i) => i.severity === "error");
|
|
67
|
-
if (contextErrors.length > 0) {
|
|
68
|
-
return {
|
|
69
|
-
durationMs: Date.now() - start,
|
|
70
|
-
error: `Context files missing. Run fetch-docs first. ${contextErrors.map((e) => e.message).join("; ")}`,
|
|
71
|
-
status: "failed",
|
|
72
|
-
};
|
|
73
77
|
}
|
|
74
78
|
// -----------------------------------------------------------------
|
|
75
79
|
// Compute eval fingerprint (for remote cache + provenance)
|
|
@@ -109,6 +113,11 @@ export class RunEvalStep {
|
|
|
109
113
|
// required eval modes were satisfied from the remote cache.
|
|
110
114
|
state.remoteCacheHits ??= new Set();
|
|
111
115
|
state.remoteCacheHits.add(this.mode);
|
|
116
|
+
// Capture the restored score-summary from remote cache
|
|
117
|
+
const cachedSummaryPath = resolve(rootDir, "results", "latest", "score-summary.json");
|
|
118
|
+
if (existsSync(cachedSummaryPath)) {
|
|
119
|
+
ctx.collector.captureFile("run-eval", "score-summary-cached", cachedSummaryPath, { source: "remote-cache", mode: this.mode });
|
|
120
|
+
}
|
|
112
121
|
return {
|
|
113
122
|
durationMs: Date.now() - start,
|
|
114
123
|
status: "success",
|
|
@@ -143,6 +152,7 @@ export class RunEvalStep {
|
|
|
143
152
|
configPath: configFile,
|
|
144
153
|
env: subprocessEnv,
|
|
145
154
|
filterFlags: filterFlags.trim() || undefined,
|
|
155
|
+
maxDurationMs: ctx.config.evalBudgetMs,
|
|
146
156
|
});
|
|
147
157
|
// Check if results were written despite non-zero exit
|
|
148
158
|
if (result.status === "failed") {
|
|
@@ -172,6 +182,13 @@ export class RunEvalStep {
|
|
|
172
182
|
console.log();
|
|
173
183
|
console.log(errorSummary);
|
|
174
184
|
}
|
|
185
|
+
// Capture eval results
|
|
186
|
+
const resultsPath = resolve(rootDir, resultsFileForMode(this.mode));
|
|
187
|
+
if (existsSync(resultsPath)) {
|
|
188
|
+
ctx.collector.captureFile("run-eval", `eval-results-${this.mode}`, resultsPath, {
|
|
189
|
+
mode: this.mode,
|
|
190
|
+
});
|
|
191
|
+
}
|
|
175
192
|
const durationMs = Date.now() - start;
|
|
176
193
|
return {
|
|
177
194
|
durationMs,
|
|
@@ -182,6 +199,9 @@ export class RunEvalStep {
|
|
|
182
199
|
cacheInputs(ctx) {
|
|
183
200
|
return getStepInputPaths(ctx.config.rootDir, `eval-${this.mode}`);
|
|
184
201
|
}
|
|
202
|
+
cacheContext(ctx) {
|
|
203
|
+
return buildCacheContext(ctx.config);
|
|
204
|
+
}
|
|
185
205
|
}
|
|
186
206
|
// ---------------------------------------------------------------------------
|
|
187
207
|
// Remote cache helpers
|
|
@@ -47,6 +47,12 @@ export const CANONICAL_DOC_MAP = {
|
|
|
47
47
|
// ---------------------------------------------------------------------------
|
|
48
48
|
export function detectFeatureArea(description) {
|
|
49
49
|
const desc = description.toLowerCase();
|
|
50
|
+
if (desc.includes("portable text"))
|
|
51
|
+
return "portable-text";
|
|
52
|
+
if (desc.includes("content lake"))
|
|
53
|
+
return "content-lake";
|
|
54
|
+
if (desc.includes("image handling") || desc.includes("image asset"))
|
|
55
|
+
return "image-handling";
|
|
50
56
|
if (desc.includes("studio"))
|
|
51
57
|
return "studio-setup";
|
|
52
58
|
if (desc.includes("visual") ||
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
* layered on top for ambiguous cases when higher precision is needed.
|
|
14
14
|
*
|
|
15
15
|
* @see docs/design-docs/scenario-matrix/per-document-attribution.md
|
|
16
|
-
* @see docs/exec-plans/scenario-matrix-implementation/phase-2-impact-scenarios.md
|
|
16
|
+
* @see docs/archive/exec-plans/scenario-matrix-implementation/phase-2-impact-scenarios.md
|
|
17
17
|
*/
|
|
18
18
|
import type { AttributionReport, ComparisonReport } from "./types.js";
|
|
19
19
|
import type { ResolvedMappings } from "./resolve-mappings.js";
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
* layered on top for ambiguous cases when higher precision is needed.
|
|
14
14
|
*
|
|
15
15
|
* @see docs/design-docs/scenario-matrix/per-document-attribution.md
|
|
16
|
-
* @see docs/exec-plans/scenario-matrix-implementation/phase-2-impact-scenarios.md
|
|
16
|
+
* @see docs/archive/exec-plans/scenario-matrix-implementation/phase-2-impact-scenarios.md
|
|
17
17
|
*/
|
|
18
18
|
// ---------------------------------------------------------------------------
|
|
19
19
|
// Public API
|
package/dist/pipeline/cache.js
CHANGED
|
@@ -125,15 +125,18 @@ export function getStepInputPaths(rootDir, step) {
|
|
|
125
125
|
}
|
|
126
126
|
}
|
|
127
127
|
// Task files (contain assertions and test definitions).
|
|
128
|
-
//
|
|
129
|
-
// explicitly above per mode.
|
|
128
|
+
// Task files live in tasks/{mode}/ subdirectories (e.g., tasks/literacy/)
|
|
130
129
|
const tasksDir = r("tasks");
|
|
131
130
|
if (existsSync(tasksDir)) {
|
|
132
|
-
const
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
131
|
+
for (const entry of readdirSync(tasksDir)) {
|
|
132
|
+
const subDir = join(tasksDir, entry);
|
|
133
|
+
if (statSync(subDir).isDirectory()) {
|
|
134
|
+
const taskFiles = readdirSync(subDir)
|
|
135
|
+
.filter((f) => /\.(task\.ts|task\.js)$/.test(f))
|
|
136
|
+
.map((f) => join(subDir, f));
|
|
137
|
+
paths.push(...taskFiles);
|
|
138
|
+
}
|
|
139
|
+
}
|
|
137
140
|
}
|
|
138
141
|
// Reference solutions (used by grader assertions)
|
|
139
142
|
const refDir = r("canonical/reference-solutions");
|
|
@@ -155,12 +158,18 @@ export function getStepInputPaths(rootDir, step) {
|
|
|
155
158
|
if (modelsPath2)
|
|
156
159
|
paths.push(modelsPath2);
|
|
157
160
|
// Include all task files (they define feature areas)
|
|
161
|
+
// Task files live in tasks/{mode}/ subdirectories (e.g., tasks/literacy/)
|
|
158
162
|
const tasksDir = r("tasks");
|
|
159
163
|
if (existsSync(tasksDir)) {
|
|
160
|
-
const
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
+
for (const entry of readdirSync(tasksDir)) {
|
|
165
|
+
const subDir = join(tasksDir, entry);
|
|
166
|
+
if (statSync(subDir).isDirectory()) {
|
|
167
|
+
const taskFiles = readdirSync(subDir)
|
|
168
|
+
.filter((f) => /\.(task\.ts|task\.js)$/.test(f))
|
|
169
|
+
.map((f) => join(subDir, f));
|
|
170
|
+
paths.push(...taskFiles);
|
|
171
|
+
}
|
|
172
|
+
}
|
|
164
173
|
}
|
|
165
174
|
return paths;
|
|
166
175
|
}
|
|
@@ -175,10 +184,15 @@ export function getStepInputPaths(rootDir, step) {
|
|
|
175
184
|
paths.push(sourcesPath2);
|
|
176
185
|
const tasksDir = r("tasks");
|
|
177
186
|
if (existsSync(tasksDir)) {
|
|
178
|
-
const
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
187
|
+
for (const entry of readdirSync(tasksDir)) {
|
|
188
|
+
const subDir = join(tasksDir, entry);
|
|
189
|
+
if (statSync(subDir).isDirectory()) {
|
|
190
|
+
const taskFiles = readdirSync(subDir)
|
|
191
|
+
.filter((f) => /\.(task\.ts|task\.js)$/.test(f))
|
|
192
|
+
.map((f) => join(subDir, f));
|
|
193
|
+
paths.push(...taskFiles);
|
|
194
|
+
}
|
|
195
|
+
}
|
|
182
196
|
}
|
|
183
197
|
return paths;
|
|
184
198
|
}
|
|
@@ -38,6 +38,8 @@ export interface RawTestResult {
|
|
|
38
38
|
componentResults: ComponentResult[];
|
|
39
39
|
pass: boolean;
|
|
40
40
|
};
|
|
41
|
+
/** Per-test latency in ms (populated by Promptfoo when available) */
|
|
42
|
+
latencyMs?: number;
|
|
41
43
|
metadata?: Record<string, unknown>;
|
|
42
44
|
provider?: {
|
|
43
45
|
id?: string;
|