@sanity/ailf 2.0.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +28 -23
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
- package/dist/_vendor/ailf-core/config-helpers.js +29 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
- package/dist/_vendor/ailf-core/examples/index.js +208 -114
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
- package/dist/_vendor/ailf-tasks/cli.js +61 -0
- package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
- package/dist/_vendor/ailf-tasks/index.js +16 -0
- package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
- package/dist/_vendor/ailf-tasks/parser.js +73 -0
- package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
- package/dist/_vendor/ailf-tasks/schemas.js +180 -0
- package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
- package/dist/_vendor/ailf-tasks/validation.js +162 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +6 -1
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
- package/dist/adapters/task-sources/index.d.ts +1 -2
- package/dist/adapters/task-sources/index.js +1 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
- package/dist/adapters/task-sources/repo-schemas.js +2 -2
- package/dist/adapters/task-sources/repo-task-source.js +1 -1
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
- package/dist/adapters/task-sources/task-file-loader.js +20 -6
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +2 -3
- package/dist/commands/init.js +56 -170
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/composition-root.d.ts +2 -3
- package/dist/composition-root.js +27 -14
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +30 -16
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +50 -15
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +52 -32
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/checks.d.ts +8 -3
- package/dist/pipeline/checks.js +23 -3
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
- package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
- package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
- package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +15 -8
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +15 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/pipeline/mirror-repo-tasks.js +1 -1
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +67 -29
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +25 -25
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -20,3 +20,4 @@ export * from "./examples/index.js";
|
|
|
20
20
|
// ---------------------------------------------------------------------------
|
|
21
21
|
export { defineConfig, defineFeatures, defineModeBase, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./config-helpers.js";
|
|
22
22
|
export { env } from "./env-helper.js";
|
|
23
|
+
export { NoOpArtifactCollector } from "./artifact-capture/noop-collector.js";
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Port: ArtifactCollector — captures pipeline artifacts during execution.
|
|
3
|
+
*
|
|
4
|
+
* Injected into AppContext. When capture is disabled (default), the
|
|
5
|
+
* composition root provides NoOpArtifactCollector. When --capture is
|
|
6
|
+
* set, provides FilesystemArtifactCollector.
|
|
7
|
+
*
|
|
8
|
+
* Design principles:
|
|
9
|
+
* - P1: Zero-cost when off (no-op stub)
|
|
10
|
+
* - P2: Capture, don't intercept (steps call capture() explicitly)
|
|
11
|
+
* - P5: Non-blocking (failures swallowed, never block the pipeline)
|
|
12
|
+
*/
|
|
13
|
+
/**
|
|
14
|
+
* The contract for artifact capture during pipeline execution.
|
|
15
|
+
*
|
|
16
|
+
* Steps call capture() for in-memory data and captureFile() for
|
|
17
|
+
* artifacts already on disk. The orchestrator calls flush() once
|
|
18
|
+
* at pipeline end to write everything to the configured destination.
|
|
19
|
+
*/
|
|
20
|
+
export interface ArtifactCollector {
|
|
21
|
+
/**
|
|
22
|
+
* Record an in-memory artifact produced during pipeline execution.
|
|
23
|
+
*
|
|
24
|
+
* Callers need not check `enabled` before calling — the NoOp
|
|
25
|
+
* implementation is zero-cost, so unconditional calls are safe.
|
|
26
|
+
*
|
|
27
|
+
* @param step - Pipeline step name (e.g., "run-eval")
|
|
28
|
+
* @param type - Artifact type identifier (e.g., "eval-results")
|
|
29
|
+
* @param data - Content to serialize (JSON or text)
|
|
30
|
+
* @param meta - Optional metadata (variant, model, etc.)
|
|
31
|
+
*/
|
|
32
|
+
capture(step: string, type: string, data: unknown, meta?: Record<string, unknown>): void;
|
|
33
|
+
/**
|
|
34
|
+
* Record a file reference for an artifact already on disk.
|
|
35
|
+
* The file is copied into the capture directory on flush().
|
|
36
|
+
*
|
|
37
|
+
* @param step - Pipeline step name
|
|
38
|
+
* @param type - Artifact type identifier
|
|
39
|
+
* @param filePath - Absolute path to the existing file
|
|
40
|
+
* @param meta - Optional metadata
|
|
41
|
+
*/
|
|
42
|
+
captureFile(step: string, type: string, filePath: string, meta?: Record<string, unknown>): void;
|
|
43
|
+
/**
|
|
44
|
+
* Flush all captured artifacts to the configured destination.
|
|
45
|
+
* Called once at pipeline end by the orchestrator.
|
|
46
|
+
*/
|
|
47
|
+
flush(): Promise<CaptureFlushResult>;
|
|
48
|
+
/** Whether capture is active */
|
|
49
|
+
readonly enabled: boolean;
|
|
50
|
+
/** Whether mode-specific extras are being captured */
|
|
51
|
+
readonly extrasEnabled: boolean;
|
|
52
|
+
}
|
|
53
|
+
/** Result of flushing captured artifacts to the destination. */
|
|
54
|
+
export interface CaptureFlushResult {
|
|
55
|
+
/** Total number of artifacts captured */
|
|
56
|
+
artifactCount: number;
|
|
57
|
+
/** Output path (directory or .tar.gz) */
|
|
58
|
+
destination: string;
|
|
59
|
+
/** Total bytes written (uncompressed) */
|
|
60
|
+
totalBytes: number;
|
|
61
|
+
/** Whether output was compressed */
|
|
62
|
+
compressed: boolean;
|
|
63
|
+
}
|
|
64
|
+
/** A single entry in the capture manifest. */
|
|
65
|
+
export interface ArtifactManifestEntry {
|
|
66
|
+
/** Pipeline step that produced this artifact */
|
|
67
|
+
step: string;
|
|
68
|
+
/** Artifact type identifier */
|
|
69
|
+
type: string;
|
|
70
|
+
/** Relative path within the capture directory */
|
|
71
|
+
path: string;
|
|
72
|
+
/** ISO 8601 timestamp of when capture() was called */
|
|
73
|
+
capturedAt: string;
|
|
74
|
+
/** Byte size of the artifact */
|
|
75
|
+
bytes: number;
|
|
76
|
+
/** Content format */
|
|
77
|
+
format: "json" | "markdown" | "text";
|
|
78
|
+
/** Optional metadata */
|
|
79
|
+
meta?: Record<string, unknown>;
|
|
80
|
+
}
|
|
81
|
+
/** The manifest.json written to each capture directory. */
|
|
82
|
+
export interface ArtifactManifest {
|
|
83
|
+
version: 1;
|
|
84
|
+
captureId: string;
|
|
85
|
+
startedAt: string;
|
|
86
|
+
completedAt: string;
|
|
87
|
+
pipeline: {
|
|
88
|
+
mode: string;
|
|
89
|
+
variant?: string;
|
|
90
|
+
source?: string;
|
|
91
|
+
areas?: string[];
|
|
92
|
+
};
|
|
93
|
+
artifacts: ArtifactManifestEntry[];
|
|
94
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Port: ArtifactCollector — captures pipeline artifacts during execution.
|
|
3
|
+
*
|
|
4
|
+
* Injected into AppContext. When capture is disabled (default), the
|
|
5
|
+
* composition root provides NoOpArtifactCollector. When --capture is
|
|
6
|
+
* set, provides FilesystemArtifactCollector.
|
|
7
|
+
*
|
|
8
|
+
* Design principles:
|
|
9
|
+
* - P1: Zero-cost when off (no-op stub)
|
|
10
|
+
* - P2: Capture, don't intercept (steps call capture() explicitly)
|
|
11
|
+
* - P5: Non-blocking (failures swallowed, never block the pipeline)
|
|
12
|
+
*/
|
|
13
|
+
export {};
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Types for cross-run capture comparison.
|
|
3
|
+
*
|
|
4
|
+
* The CaptureComparator reads two capture directories (baseline + experiment)
|
|
5
|
+
* and produces a CaptureDiffReport. Types are defined in core so external
|
|
6
|
+
* tooling can consume diff reports without depending on the eval package.
|
|
7
|
+
*
|
|
8
|
+
* Implementation lives in packages/eval/src/artifact-capture/comparator.ts.
|
|
9
|
+
*/
|
|
10
|
+
/** How deeply to compare artifacts. */
|
|
11
|
+
export type ComparisonMode = "strict" | "structural" | "inventory";
|
|
12
|
+
/** Configurable thresholds for comparison. */
|
|
13
|
+
export interface ComparisonOptions {
|
|
14
|
+
/** Comparison depth: inventory (existence), structural (shape), strict (content) */
|
|
15
|
+
mode: ComparisonMode;
|
|
16
|
+
/** Score regression thresholds */
|
|
17
|
+
scoreThresholds?: {
|
|
18
|
+
/** Maximum allowed aggregate score delta (percentage points, default 5) */
|
|
19
|
+
aggregate: number;
|
|
20
|
+
/** Maximum allowed per-task score drop (points, default 10) */
|
|
21
|
+
perTask: number;
|
|
22
|
+
};
|
|
23
|
+
/** Timing regression thresholds */
|
|
24
|
+
timingThresholds?: {
|
|
25
|
+
/** Multiplier — flag steps exceeding this ratio (default 2.0) */
|
|
26
|
+
multiplier: number;
|
|
27
|
+
/** Per-step overrides (step name → custom multiplier) */
|
|
28
|
+
perStep?: Record<string, number>;
|
|
29
|
+
};
|
|
30
|
+
/** JSON structural diff depth (default 3) */
|
|
31
|
+
jsonDiffDepth?: number;
|
|
32
|
+
/** Additional ephemeral fields to ignore (merged with defaults) */
|
|
33
|
+
ephemeralFields?: string[];
|
|
34
|
+
}
|
|
35
|
+
/** Inventory diff — which artifacts exist in each capture. */
|
|
36
|
+
export interface InventoryDiff {
|
|
37
|
+
/** Artifact types in experiment but not in baseline */
|
|
38
|
+
added: string[];
|
|
39
|
+
/** Artifact types in baseline but not in experiment */
|
|
40
|
+
removed: string[];
|
|
41
|
+
/** Artifact types present in both */
|
|
42
|
+
common: string[];
|
|
43
|
+
}
|
|
44
|
+
/** A single structural change in a JSON artifact. */
|
|
45
|
+
export interface JsonDiffEntry {
|
|
46
|
+
/** Dot-notation path to the changed value (e.g., "config.mode") */
|
|
47
|
+
path: string;
|
|
48
|
+
/** Value in baseline (undefined if key is added) */
|
|
49
|
+
baseline?: unknown;
|
|
50
|
+
/** Value in experiment (undefined if key is removed) */
|
|
51
|
+
experiment?: unknown;
|
|
52
|
+
}
|
|
53
|
+
/** Content diff for a single artifact. */
|
|
54
|
+
export interface ArtifactContentDiff {
|
|
55
|
+
/** Artifact type identifier (step/type) */
|
|
56
|
+
artifactKey: string;
|
|
57
|
+
/** Content format */
|
|
58
|
+
format: "json" | "markdown" | "text";
|
|
59
|
+
/** Structural changes (JSON) or line diff summary (text/markdown) */
|
|
60
|
+
changes: JsonDiffEntry[] | {
|
|
61
|
+
addedLines: number;
|
|
62
|
+
removedLines: number;
|
|
63
|
+
};
|
|
64
|
+
}
|
|
65
|
+
/** Score comparison between two captures. */
|
|
66
|
+
export interface ScoreComparison {
|
|
67
|
+
/** Baseline aggregate score */
|
|
68
|
+
baselineMean: number;
|
|
69
|
+
/** Experiment aggregate score */
|
|
70
|
+
currentMean: number;
|
|
71
|
+
/** Signed score delta (current - baseline), in absolute rather than relative terms */
|
|
72
|
+
delta: number;
|
|
73
|
+
/** Per-task score deltas */
|
|
74
|
+
perTask: {
|
|
75
|
+
task: string;
|
|
76
|
+
baseline: number;
|
|
77
|
+
current: number;
|
|
78
|
+
delta: number;
|
|
79
|
+
}[];
|
|
80
|
+
/** Tasks that breached configured thresholds */
|
|
81
|
+
breaches: string[];
|
|
82
|
+
}
|
|
83
|
+
/** Timing comparison between two captures. */
|
|
84
|
+
export interface TimingComparison {
|
|
85
|
+
/** Total pipeline duration delta in ms */
|
|
86
|
+
totalDeltaMs: number;
|
|
87
|
+
/** Per-step timing */
|
|
88
|
+
perStep: {
|
|
89
|
+
step: string;
|
|
90
|
+
baselineMs: number;
|
|
91
|
+
currentMs: number;
|
|
92
|
+
ratio: number;
|
|
93
|
+
}[];
|
|
94
|
+
/** Steps that breached the timing multiplier threshold */
|
|
95
|
+
breaches: string[];
|
|
96
|
+
}
|
|
97
|
+
/** Metadata comparison between two captures. */
|
|
98
|
+
export interface MetadataComparison {
|
|
99
|
+
/** Whether pipeline modes match */
|
|
100
|
+
modeMatch: boolean;
|
|
101
|
+
/** Whether pipeline variants match */
|
|
102
|
+
variantMatch: boolean;
|
|
103
|
+
/** Config key differences */
|
|
104
|
+
configDiffs: JsonDiffEntry[];
|
|
105
|
+
}
|
|
106
|
+
/** Security scan results. */
|
|
107
|
+
export interface SecurityScan {
|
|
108
|
+
/** Whether any potential secret leaks were found */
|
|
109
|
+
leaksFound: boolean;
|
|
110
|
+
/** Details of each violation */
|
|
111
|
+
violations: {
|
|
112
|
+
/** Relative artifact file path */
|
|
113
|
+
file: string;
|
|
114
|
+
/** Description of the finding */
|
|
115
|
+
detail: string;
|
|
116
|
+
}[];
|
|
117
|
+
}
|
|
118
|
+
/** The full diff report produced by CaptureComparator. */
|
|
119
|
+
export interface CaptureDiffReport {
|
|
120
|
+
/** Are the two captures semantically equivalent? */
|
|
121
|
+
equivalent: boolean;
|
|
122
|
+
/** Human-readable summary (1-3 sentences) */
|
|
123
|
+
summary: string;
|
|
124
|
+
/** Comparison mode used */
|
|
125
|
+
mode: ComparisonMode;
|
|
126
|
+
/** Artifact inventory diff */
|
|
127
|
+
inventory: InventoryDiff;
|
|
128
|
+
/** Content diffs for common artifacts (structural/strict modes only) */
|
|
129
|
+
content?: ArtifactContentDiff[];
|
|
130
|
+
/** Score comparison (if score-summary exists in both captures) */
|
|
131
|
+
scores?: ScoreComparison;
|
|
132
|
+
/** Timing comparison (if pipeline-context exists in both captures) */
|
|
133
|
+
timing?: TimingComparison;
|
|
134
|
+
/** Metadata comparison */
|
|
135
|
+
metadata?: MetadataComparison;
|
|
136
|
+
/** Security scan results */
|
|
137
|
+
security: SecurityScan;
|
|
138
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Types for cross-run capture comparison.
|
|
3
|
+
*
|
|
4
|
+
* The CaptureComparator reads two capture directories (baseline + experiment)
|
|
5
|
+
* and produces a CaptureDiffReport. Types are defined in core so external
|
|
6
|
+
* tooling can consume diff reports without depending on the eval package.
|
|
7
|
+
*
|
|
8
|
+
* Implementation lives in packages/eval/src/artifact-capture/comparator.ts.
|
|
9
|
+
*/
|
|
10
|
+
export {};
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
* as downstream consumers are converted to use them.
|
|
13
13
|
*/
|
|
14
14
|
import type { DebugOptions, EvalMode, PluginRegistry } from "../types/index.js";
|
|
15
|
+
import type { ArtifactCollector } from "./artifact-collector.js";
|
|
15
16
|
import type { CacheStore } from "./cache-store.js";
|
|
16
17
|
import type { DocFetcher } from "./doc-fetcher.js";
|
|
17
18
|
import type { EvalRunner } from "./eval-runner.js";
|
|
@@ -78,6 +79,8 @@ export interface ResolvedConfig {
|
|
|
78
79
|
noRemoteCache: boolean;
|
|
79
80
|
/** Grader replications for consistency measurement */
|
|
80
81
|
graderReplications?: number;
|
|
82
|
+
/** Base directory for user-facing pipeline output artifacts. */
|
|
83
|
+
outputDir: string;
|
|
81
84
|
/** Output path override */
|
|
82
85
|
outputPath?: string;
|
|
83
86
|
/** Doc source URL overrides */
|
|
@@ -90,6 +93,12 @@ export interface ResolvedConfig {
|
|
|
90
93
|
searchMode: "off" | "open" | "origin-only";
|
|
91
94
|
/** Eval concurrency */
|
|
92
95
|
concurrency?: number;
|
|
96
|
+
/**
|
|
97
|
+
* Maximum wall-clock time per eval step in ms.
|
|
98
|
+
* When exceeded, the subprocess is killed and partial results are used.
|
|
99
|
+
* Sourced from models config `evalBudgetMs`.
|
|
100
|
+
*/
|
|
101
|
+
evalBudgetMs?: number;
|
|
93
102
|
/** Promptfoo URL from eval output */
|
|
94
103
|
promptfooUrl?: string;
|
|
95
104
|
/** Sanity dataset override */
|
|
@@ -109,7 +118,7 @@ export interface ResolvedConfig {
|
|
|
109
118
|
/** Before option for comparison */
|
|
110
119
|
beforeOption?: string;
|
|
111
120
|
/** Task source adapter selection */
|
|
112
|
-
taskSourceType?: "content-lake" | "repo"
|
|
121
|
+
taskSourceType?: "content-lake" | "repo";
|
|
113
122
|
/** Path to repo-based tasks directory (e.g., .ailf/tasks/) */
|
|
114
123
|
repoTasksPath?: string;
|
|
115
124
|
/** Report store project ID from .ailf/config.yaml reportStore block */
|
|
@@ -142,6 +151,14 @@ export interface ResolvedConfig {
|
|
|
142
151
|
apiKey?: string;
|
|
143
152
|
/** External preset file paths or npm package names to load */
|
|
144
153
|
presets?: string[];
|
|
154
|
+
/** Whether artifact capture is enabled for this run (default: false) */
|
|
155
|
+
captureEnabled?: boolean;
|
|
156
|
+
/** Base directory for capture output (default: results/captures/) */
|
|
157
|
+
captureDir?: string;
|
|
158
|
+
/** Whether to compress capture output to tar.gz (default: true) */
|
|
159
|
+
captureCompress?: boolean;
|
|
160
|
+
/** Whether to include mode-specific extra artifacts (default: true) */
|
|
161
|
+
captureExtras?: boolean;
|
|
145
162
|
}
|
|
146
163
|
/**
|
|
147
164
|
* Application context — the complete dependency carrier.
|
|
@@ -158,6 +175,8 @@ export interface ResolvedConfig {
|
|
|
158
175
|
export interface AppContext {
|
|
159
176
|
/** Evaluation caching (filesystem + optional Content Lake fallback) */
|
|
160
177
|
readonly cache?: CacheStore;
|
|
178
|
+
/** Artifact capture collector (no-op when --capture is not set) */
|
|
179
|
+
readonly collector: ArtifactCollector;
|
|
161
180
|
/** Resolved pipeline configuration */
|
|
162
181
|
readonly config: ResolvedConfig;
|
|
163
182
|
/** Documentation context fetcher */
|
|
@@ -15,6 +15,12 @@ export interface EvalRunConfig {
|
|
|
15
15
|
concurrency?: number;
|
|
16
16
|
/** Environment variables to pass to the eval process */
|
|
17
17
|
env?: Record<string, string>;
|
|
18
|
+
/**
|
|
19
|
+
* Maximum wall-clock time for this eval subprocess in ms.
|
|
20
|
+
* When exceeded, the process is killed and partial results are used.
|
|
21
|
+
* Default: no limit (backward compatible).
|
|
22
|
+
*/
|
|
23
|
+
maxDurationMs?: number;
|
|
18
24
|
}
|
|
19
25
|
export interface EvalRunner {
|
|
20
26
|
/** Run an evaluation and return the step result */
|
|
@@ -4,6 +4,8 @@
|
|
|
4
4
|
* Ports define the contracts between the domain kernel and the outside world.
|
|
5
5
|
* Adapters (in packages/eval) implement these interfaces.
|
|
6
6
|
*/
|
|
7
|
+
export type { ArtifactCollector, ArtifactManifest, ArtifactManifestEntry, CaptureFlushResult, } from "./artifact-collector.js";
|
|
8
|
+
export type { ArtifactContentDiff, CaptureDiffReport, ComparisonMode, ComparisonOptions, InventoryDiff, JsonDiffEntry, MetadataComparison, ScoreComparison, SecurityScan, TimingComparison, } from "./capture-comparator.js";
|
|
7
9
|
export type { CacheEntryMetadata, CacheKey, CacheLookupResult, CacheRecordInput, CacheStore, } from "./cache-store.js";
|
|
8
10
|
export type { ConfigSource } from "./config-source.js";
|
|
9
11
|
export type { AppContext, ReportSinkPort, ReportStorePort, ResolvedConfig, } from "./context.js";
|
|
@@ -38,6 +38,17 @@ export interface PipelineStep {
|
|
|
38
38
|
* When defined, the StepRunner computes a hash and checks the cache.
|
|
39
39
|
*/
|
|
40
40
|
cacheInputs?(ctx: AppContext): string[];
|
|
41
|
+
/**
|
|
42
|
+
* Cache context strings — non-file state that participates in cache key
|
|
43
|
+
* computation (e.g., mode, variant, area/task/tag filters).
|
|
44
|
+
*
|
|
45
|
+
* Without these, two runs with different CLI flags but identical config
|
|
46
|
+
* files would share a cache entry, causing cross-mode or cross-area
|
|
47
|
+
* contamination.
|
|
48
|
+
*
|
|
49
|
+
* When undefined, only file content determines the cache key.
|
|
50
|
+
*/
|
|
51
|
+
cacheContext?(ctx: AppContext): string[];
|
|
41
52
|
/**
|
|
42
53
|
* Whether this step is optional — a failure in an optional step
|
|
43
54
|
* does not stop the pipeline.
|
|
@@ -2,9 +2,9 @@
|
|
|
2
2
|
* Port: Where task definitions come from.
|
|
3
3
|
*
|
|
4
4
|
* Adapters:
|
|
5
|
-
* -
|
|
6
|
-
* -
|
|
7
|
-
* -
|
|
5
|
+
* - ContentLakeTaskSource — GROQ query against Sanity Content Lake
|
|
6
|
+
* - RepoTaskSource — reads .ailf/tasks/*.task.ts files
|
|
7
|
+
* - TsTaskFileLoader — reads tasks/{mode}/*.task.ts (eval package)
|
|
8
8
|
*
|
|
9
9
|
* The key invariant: the pipeline orchestrator and all downstream steps
|
|
10
10
|
* work with GeneralizedTaskDefinition[] regardless of where they came from.
|
|
@@ -2,9 +2,9 @@
|
|
|
2
2
|
* Port: Where task definitions come from.
|
|
3
3
|
*
|
|
4
4
|
* Adapters:
|
|
5
|
-
* -
|
|
6
|
-
* -
|
|
7
|
-
* -
|
|
5
|
+
* - ContentLakeTaskSource — GROQ query against Sanity Content Lake
|
|
6
|
+
* - RepoTaskSource — reads .ailf/tasks/*.task.ts files
|
|
7
|
+
* - TsTaskFileLoader — reads tasks/{mode}/*.task.ts (eval package)
|
|
8
8
|
*
|
|
9
9
|
* The key invariant: the pipeline orchestrator and all downstream steps
|
|
10
10
|
* work with GeneralizedTaskDefinition[] regardless of where they came from.
|
|
@@ -77,10 +77,15 @@ export declare const PipelineRequestSchema: z.ZodObject<{
|
|
|
77
77
|
taskMode: z.ZodOptional<z.ZodEnum<{
|
|
78
78
|
inline: "inline";
|
|
79
79
|
"content-lake": "content-lake";
|
|
80
|
-
yaml: "yaml";
|
|
81
80
|
}>>;
|
|
82
81
|
tasks: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
83
82
|
urls: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
83
|
+
variant: z.ZodOptional<z.ZodEnum<{
|
|
84
|
+
baseline: "baseline";
|
|
85
|
+
agentic: "agentic";
|
|
86
|
+
observed: "observed";
|
|
87
|
+
full: "full";
|
|
88
|
+
}>>;
|
|
84
89
|
presets: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
85
90
|
}, z.core.$strip>;
|
|
86
91
|
/** Inferred TypeScript type for a pipeline request payload. */
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
* @see packages/eval/src/pipeline/map-request-to-config.ts — maps to ResolvedConfig
|
|
14
14
|
*/
|
|
15
15
|
import { z } from "zod";
|
|
16
|
-
import { RAW_EVAL_MODES } from "../../ailf-shared/index.js";
|
|
16
|
+
import { LITERACY_VARIANTS, RAW_EVAL_MODES } from "../../ailf-shared/index.js";
|
|
17
17
|
// ---------------------------------------------------------------------------
|
|
18
18
|
// Debug options — boolean shorthand or structured object
|
|
19
19
|
// ---------------------------------------------------------------------------
|
|
@@ -86,9 +86,21 @@ export const PipelineRequestSchema = z.object({
|
|
|
86
86
|
searchMode: z.enum(["off", "open", "origin-only"]).optional(),
|
|
87
87
|
source: z.string().optional(),
|
|
88
88
|
sourceReportId: z.string().optional(),
|
|
89
|
-
taskMode: z.enum(["content-lake", "
|
|
89
|
+
taskMode: z.enum(["content-lake", "inline"]).optional(),
|
|
90
90
|
tasks: z.array(z.string()).optional(),
|
|
91
91
|
urls: z.array(z.string().url()).optional(),
|
|
92
|
+
/**
|
|
93
|
+
* Literacy variant — only meaningful when mode is "literacy".
|
|
94
|
+
*
|
|
95
|
+
* When provided with a canonical mode (`mode: "literacy"`), this field
|
|
96
|
+
* specifies the variant directly. When mode is a legacy alias (e.g.,
|
|
97
|
+
* `mode: "baseline"`), the variant is derived from the mode name and
|
|
98
|
+
* this field is ignored.
|
|
99
|
+
*
|
|
100
|
+
* Prefer explicit `mode: "literacy", variant: "baseline"` over the
|
|
101
|
+
* legacy `mode: "baseline"` form.
|
|
102
|
+
*/
|
|
103
|
+
variant: z.enum(LITERACY_VARIANTS).optional(),
|
|
92
104
|
/** External preset file paths or npm package names to load */
|
|
93
105
|
presets: z.array(z.string()).optional(),
|
|
94
106
|
});
|
|
@@ -6,7 +6,9 @@
|
|
|
6
6
|
* Extracted from packages/eval/src/lib/generate-configs.ts during
|
|
7
7
|
* the Ports & Adapters migration (Phase 4e).
|
|
8
8
|
*/
|
|
9
|
+
import type { EvalMode } from "../../ailf-shared/index.d.ts";
|
|
9
10
|
import type { ModelEntry } from "../types/index.js";
|
|
11
|
+
import type { ModeBase } from "../types/plugin-registry.js";
|
|
10
12
|
/**
|
|
11
13
|
* Extract the raw API model name from a Promptfoo provider ID.
|
|
12
14
|
*
|
|
@@ -38,4 +40,17 @@ export declare function mergeConfig(defaults: Record<string, unknown>, modelConf
|
|
|
38
40
|
*
|
|
39
41
|
* Models without a `modes` field match all modes.
|
|
40
42
|
*/
|
|
41
|
-
export declare function modelMatchesMode(model: ModelEntry, mode:
|
|
43
|
+
export declare function modelMatchesMode(model: ModelEntry, mode: EvalMode): boolean;
|
|
44
|
+
/**
|
|
45
|
+
* Resolve which variants a model participates in for a given mode.
|
|
46
|
+
*
|
|
47
|
+
* Resolution rules:
|
|
48
|
+
* - If the mode has no variants defined → returns `undefined` (no variant filtering)
|
|
49
|
+
* - If the model specifies variants for this mode → returns that whitelist
|
|
50
|
+
* - If the model omits variants for this mode → returns ALL mode variants (default)
|
|
51
|
+
*
|
|
52
|
+
* @param model - The model entry from models config
|
|
53
|
+
* @param modeBase - The mode base (contains variant definitions)
|
|
54
|
+
* @returns Array of variant IDs, or `undefined` if the mode has no variants
|
|
55
|
+
*/
|
|
56
|
+
export declare function resolveModelVariants(model: ModelEntry, modeBase: ModeBase): string[] | undefined;
|
|
@@ -84,3 +84,24 @@ export function modelMatchesMode(model, mode) {
|
|
|
84
84
|
}
|
|
85
85
|
return model.modes.includes(mode);
|
|
86
86
|
}
|
|
87
|
+
/**
|
|
88
|
+
* Resolve which variants a model participates in for a given mode.
|
|
89
|
+
*
|
|
90
|
+
* Resolution rules:
|
|
91
|
+
* - If the mode has no variants defined → returns `undefined` (no variant filtering)
|
|
92
|
+
* - If the model specifies variants for this mode → returns that whitelist
|
|
93
|
+
* - If the model omits variants for this mode → returns ALL mode variants (default)
|
|
94
|
+
*
|
|
95
|
+
* @param model - The model entry from models config
|
|
96
|
+
* @param modeBase - The mode base (contains variant definitions)
|
|
97
|
+
* @returns Array of variant IDs, or `undefined` if the mode has no variants
|
|
98
|
+
*/
|
|
99
|
+
export function resolveModelVariants(model, modeBase) {
|
|
100
|
+
const modeVariants = modeBase.mode.variants;
|
|
101
|
+
if (!modeVariants || modeVariants.length === 0)
|
|
102
|
+
return undefined;
|
|
103
|
+
const allVariantIds = modeVariants.map((v) => v.id);
|
|
104
|
+
const modeId = modeBase.mode.id;
|
|
105
|
+
const explicit = model.variants?.[modeId];
|
|
106
|
+
return explicit ?? allVariantIds;
|
|
107
|
+
}
|
|
@@ -10,4 +10,4 @@
|
|
|
10
10
|
export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
|
|
11
11
|
export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
|
|
12
12
|
export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, type AggregationStrategy, type AreaScore, type AssertionScore, type DimensionScore, type EnsembleGradingConfig, type GraderTransitionConfig, type TaskScore, type TaskScoreOptions, } from "./scoring-engine.js";
|
|
13
|
-
export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "./config-helpers.js";
|
|
13
|
+
export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
|
|
@@ -10,4 +10,4 @@
|
|
|
10
10
|
export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
|
|
11
11
|
export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
|
|
12
12
|
export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, } from "./scoring-engine.js";
|
|
13
|
-
export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "./config-helpers.js";
|
|
13
|
+
export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
|
|
@@ -65,6 +65,15 @@ export function classifyRubric(component) {
|
|
|
65
65
|
*/
|
|
66
66
|
export function detectFeatureArea(description) {
|
|
67
67
|
const desc = description.toLowerCase();
|
|
68
|
+
if (desc.includes("portable text")) {
|
|
69
|
+
return "portable-text";
|
|
70
|
+
}
|
|
71
|
+
if (desc.includes("content lake")) {
|
|
72
|
+
return "content-lake";
|
|
73
|
+
}
|
|
74
|
+
if (desc.includes("image handling") || desc.includes("image asset")) {
|
|
75
|
+
return "image-handling";
|
|
76
|
+
}
|
|
68
77
|
if (desc.includes("studio")) {
|
|
69
78
|
return "studio-setup";
|
|
70
79
|
}
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
*
|
|
9
9
|
* @see docs/design-docs/architecture-overhaul/domain-model.md (canonical)
|
|
10
10
|
* @see docs/design-docs/architecture-overhaul/test-definition.md (authoring surfaces)
|
|
11
|
-
* @see docs/exec-plans/architecture-overhaul/phase-0-foundation-types.md (task 0h)
|
|
11
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-0-foundation-types.md (task 0h)
|
|
12
12
|
*/
|
|
13
13
|
/** Difficulty level for a task */
|
|
14
14
|
export type TaskDifficulty = "basic" | "intermediate" | "advanced";
|
|
@@ -178,6 +178,17 @@ export interface MCPServerTaskDefinition extends TaskCommonFields {
|
|
|
178
178
|
url?: string;
|
|
179
179
|
/** Environment variables for the server process */
|
|
180
180
|
env?: Record<string, string>;
|
|
181
|
+
/**
|
|
182
|
+
* HTTP headers for remote transports (sse / streamable-http).
|
|
183
|
+
* Merged on top of any auth-derived headers, so explicit values
|
|
184
|
+
* here take precedence over `auth`-generated headers.
|
|
185
|
+
*
|
|
186
|
+
* Values support `{{env.VAR}}` template syntax for secrets.
|
|
187
|
+
*
|
|
188
|
+
* @example
|
|
189
|
+
* headers: { Authorization: "Bearer {{env.SANITY_API_TOKEN}}" }
|
|
190
|
+
*/
|
|
191
|
+
headers?: Record<string, string>;
|
|
181
192
|
/** Startup timeout in milliseconds */
|
|
182
193
|
startupTimeoutMs?: number;
|
|
183
194
|
/**
|
|
@@ -8,6 +8,6 @@
|
|
|
8
8
|
*
|
|
9
9
|
* @see docs/design-docs/architecture-overhaul/domain-model.md (canonical)
|
|
10
10
|
* @see docs/design-docs/architecture-overhaul/test-definition.md (authoring surfaces)
|
|
11
|
-
* @see docs/exec-plans/architecture-overhaul/phase-0-foundation-types.md (task 0h)
|
|
11
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-0-foundation-types.md (task 0h)
|
|
12
12
|
*/
|
|
13
13
|
export {};
|
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
* Ports & Adapters migration (Phase 0c). The original file is now a
|
|
10
10
|
* re-export barrel that preserves backward compatibility.
|
|
11
11
|
*/
|
|
12
|
-
import type { DocumentRef as _DocumentRef, EvalMode
|
|
12
|
+
import type { DocumentRef as _DocumentRef, EvalMode } from "../../ailf-shared/index.d.ts";
|
|
13
13
|
export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "./scoring-input.js";
|
|
14
14
|
export type { DocumentRef } from "../../ailf-shared/index.d.ts";
|
|
15
15
|
export type { StoredBaseline, StoredReport, StoredRun, StoredTaskResult, StoredTrace, SchemaVersioned, } from "./storage-schema.js";
|
|
@@ -25,7 +25,6 @@ export type { ArtifactId, Brand, Err, FixtureId, IdValidationError, NewReportId,
|
|
|
25
25
|
export { err, fixtureId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
26
26
|
export type { AgentHarnessTaskDefinition, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PathDocRef, PerspectiveDocRef, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
|
|
27
27
|
type DocumentRef = _DocumentRef;
|
|
28
|
-
type EvalMode = _EvalMode;
|
|
29
28
|
/** Aggregated retrieval metrics for a feature area */
|
|
30
29
|
export interface AreaRetrievalMetrics {
|
|
31
30
|
area: string;
|
|
@@ -119,7 +118,7 @@ export interface FailureModeReport {
|
|
|
119
118
|
totalJudgments: number;
|
|
120
119
|
}
|
|
121
120
|
/** Failure mode classification for a low-scoring judgment */
|
|
122
|
-
export type FailureModeType = "incorrect-docs" | "missing-docs" | "model-limitation" | "outdated-docs" | "poor-structure" | "unclassified";
|
|
121
|
+
export type FailureModeType = "api-error" | "incorrect-docs" | "missing-docs" | "model-limitation" | "outdated-docs" | "poor-structure" | "unclassified";
|
|
123
122
|
/** Per-feature-area score breakdown */
|
|
124
123
|
export interface FeatureScore {
|
|
125
124
|
/**
|
|
@@ -352,11 +351,40 @@ export interface ModelEntry {
|
|
|
352
351
|
env?: string;
|
|
353
352
|
id: string;
|
|
354
353
|
label: string;
|
|
355
|
-
|
|
354
|
+
/**
|
|
355
|
+
* Which evaluation modes this model participates in.
|
|
356
|
+
*
|
|
357
|
+
* Values must be canonical eval mode names (e.g., "literacy", "mcp-server").
|
|
358
|
+
* When omitted, the model participates in all modes.
|
|
359
|
+
*/
|
|
360
|
+
modes?: EvalMode[];
|
|
361
|
+
/**
|
|
362
|
+
* Per-provider timeout in ms. Emitted into Promptfoo provider config.
|
|
363
|
+
* Default: 300_000 (5 min, matching Promptfoo's built-in default).
|
|
364
|
+
*/
|
|
365
|
+
timeoutMs?: number;
|
|
366
|
+
/**
|
|
367
|
+
* Per-mode variant whitelist. Keys are eval mode IDs, values are arrays
|
|
368
|
+
* of variant IDs to include for that mode.
|
|
369
|
+
*
|
|
370
|
+
* When a model enrolls in a mode (via `modes`) but does not specify
|
|
371
|
+
* variants for it here, ALL variants defined by the mode base are included.
|
|
372
|
+
*
|
|
373
|
+
* Only meaningful for modes that define variants (e.g., literacy has
|
|
374
|
+
* "baseline", "agentic", "observed", "full").
|
|
375
|
+
* Ignored for modes without variants.
|
|
376
|
+
*/
|
|
377
|
+
variants?: Partial<Record<EvalMode, string[]>>;
|
|
356
378
|
}
|
|
357
379
|
/** Parsed config/models.yaml structure */
|
|
358
380
|
export interface ModelsConfig {
|
|
359
381
|
defaults: Record<string, unknown>;
|
|
382
|
+
/**
|
|
383
|
+
* Maximum wall-clock time per eval step (all tests for one mode) in ms.
|
|
384
|
+
* When exceeded, the subprocess is killed and partial results are used.
|
|
385
|
+
* Default: no limit (backward compatible).
|
|
386
|
+
*/
|
|
387
|
+
evalBudgetMs?: number;
|
|
360
388
|
grader: {
|
|
361
389
|
id: string;
|
|
362
390
|
label?: string;
|
|
@@ -507,6 +535,21 @@ export interface TestSummary {
|
|
|
507
535
|
task: string;
|
|
508
536
|
error: string;
|
|
509
537
|
}[];
|
|
538
|
+
/** Per-test timing statistics (when latencyMs is available from Promptfoo) */
|
|
539
|
+
timing?: {
|
|
540
|
+
/** Median test duration in ms */
|
|
541
|
+
medianMs: number;
|
|
542
|
+
/** 95th percentile test duration in ms */
|
|
543
|
+
p95Ms: number;
|
|
544
|
+
/** Maximum test duration in ms */
|
|
545
|
+
maxMs: number;
|
|
546
|
+
/** Tests that exceeded the slow threshold (2x median, min 60s) */
|
|
547
|
+
slowTests: {
|
|
548
|
+
task: string;
|
|
549
|
+
model: string;
|
|
550
|
+
durationMs: number;
|
|
551
|
+
}[];
|
|
552
|
+
};
|
|
510
553
|
}
|
|
511
554
|
/** Token usage and estimated cost for a pipeline run. */
|
|
512
555
|
export interface PipelineUsage {
|