@sanity/ailf 2.0.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +28 -23
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
- package/dist/_vendor/ailf-core/config-helpers.js +29 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
- package/dist/_vendor/ailf-core/examples/index.js +208 -114
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
- package/dist/_vendor/ailf-tasks/cli.js +61 -0
- package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
- package/dist/_vendor/ailf-tasks/index.js +16 -0
- package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
- package/dist/_vendor/ailf-tasks/parser.js +73 -0
- package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
- package/dist/_vendor/ailf-tasks/schemas.js +180 -0
- package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
- package/dist/_vendor/ailf-tasks/validation.js +162 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +6 -1
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
- package/dist/adapters/task-sources/index.d.ts +1 -2
- package/dist/adapters/task-sources/index.js +1 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
- package/dist/adapters/task-sources/repo-schemas.js +2 -2
- package/dist/adapters/task-sources/repo-task-source.js +1 -1
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
- package/dist/adapters/task-sources/task-file-loader.js +20 -6
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +2 -3
- package/dist/commands/init.js +56 -170
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/composition-root.d.ts +2 -3
- package/dist/composition-root.js +27 -14
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +30 -16
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +50 -15
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +52 -32
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/checks.d.ts +8 -3
- package/dist/pipeline/checks.js +23 -3
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
- package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
- package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
- package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +15 -8
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +15 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/pipeline/mirror-repo-tasks.js +1 -1
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +67 -29
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +25 -25
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline step: Publish evaluation report to the report store.
|
|
3
|
+
*
|
|
4
|
+
* This step wraps ScoreSummary + provenance into a Report, writes it to
|
|
5
|
+
* the Sanity Content Lake (system of record), optionally auto-compares
|
|
6
|
+
* against the most recent comparable baseline, and fans out to configured
|
|
7
|
+
* sinks (BigQuery, Slack, webhooks, etc.).
|
|
8
|
+
*
|
|
9
|
+
* Opt-in via `--publish` flag or `AILF_PUBLISH=1` environment variable.
|
|
10
|
+
* Without this flag, the pipeline writes results locally only (unchanged
|
|
11
|
+
* from current behavior).
|
|
12
|
+
*
|
|
13
|
+
* Design principles:
|
|
14
|
+
* - P1: Reports are immutable events (write-once to Sanity)
|
|
15
|
+
* - P5: Local-first (pipeline never fails because of a store write)
|
|
16
|
+
* - P6: Sinks are fire-and-forget (failures logged, not thrown)
|
|
17
|
+
*
|
|
18
|
+
* Preconditions: score-summary.json exists and is valid
|
|
19
|
+
* Postconditions: Report written to Sanity (best-effort), sinks notified
|
|
20
|
+
*
|
|
21
|
+
* @see docs/design-docs/report-store/architecture.md
|
|
22
|
+
* @see docs/design-docs/report-store/sink-architecture.md
|
|
23
|
+
*/
|
|
24
|
+
import { type ProvenanceInput } from "../provenance.js";
|
|
25
|
+
import type { PromptfooUrlEntry, StepResult } from "../types.js";
|
|
26
|
+
export interface PublishOptions {
|
|
27
|
+
/** Whether this is a debug run (debug runs don't store fingerprints) */
|
|
28
|
+
debug?: boolean;
|
|
29
|
+
/** Evaluation fingerprint override (computed externally by the pipeline) */
|
|
30
|
+
evalFingerprint?: string;
|
|
31
|
+
/** @deprecated Use `promptfooUrls` — kept for backward compatibility */
|
|
32
|
+
promptfooUrl?: string;
|
|
33
|
+
/** Per-mode Promptfoo share URLs */
|
|
34
|
+
promptfooUrls?: PromptfooUrlEntry[];
|
|
35
|
+
/** Override provenance input (for testing or custom workflows) */
|
|
36
|
+
provenanceInput?: Partial<ProvenanceInput>;
|
|
37
|
+
/** Sanity dataset for report storage (independent of eval dataset); when omitted, the `AILF_REPORT_DATASET` environment variable is used */
|
|
38
|
+
reportDataset?: string;
|
|
39
|
+
/** Sanity project ID for report storage (independent of eval project); when omitted, the `AILF_REPORT_PROJECT_ID` environment variable is used */
|
|
40
|
+
reportProjectId?: string;
|
|
41
|
+
/** Sanity API token for writes; when omitted, falls back to `AILF_REPORT_SANITY_API_TOKEN`, then `SANITY_API_TOKEN` */
|
|
42
|
+
sanityToken?: string;
|
|
43
|
+
/** Optional human-supplied tag, copied onto the published report's `tag` field */
|
|
44
|
+
tag?: string;
|
|
45
|
+
}
|
|
46
|
+
/**
|
|
47
|
+
* Run the publish-report pipeline step.
|
|
48
|
+
* `pipelineStart` is an epoch-millisecond timestamp; the report's `durationMs` is computed as `Date.now() - pipelineStart`.
|
|
49
|
+
* 1. Read score-summary.json (resolves to a `failed` result if the precondition check reports errors or the file cannot be read/parsed)
|
|
50
|
+
* 2. Build provenance from pipeline context
|
|
51
|
+
* 3. Create Report with generated UUID v7 ID
|
|
52
|
+
* 4. Auto-compare against most recent comparable baseline
|
|
53
|
+
* 5. Write to Sanity Content Lake (system of record)
|
|
54
|
+
* 6. Fan out to configured sinks (fire-and-forget)
|
|
55
|
+
* 7. Return step result with report ID and sink outcomes
|
|
56
|
+
*/
|
|
57
|
+
export declare function runPublishReport(pipelineStart: number, options?: PublishOptions): Promise<StepResult>;
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline step: Publish evaluation report to the report store.
|
|
3
|
+
*
|
|
4
|
+
* This step wraps ScoreSummary + provenance into a Report, writes it to
|
|
5
|
+
* the Sanity Content Lake (system of record), optionally auto-compares
|
|
6
|
+
* against the most recent comparable baseline, and fans out to configured
|
|
7
|
+
* sinks (BigQuery, Slack, webhooks, etc.).
|
|
8
|
+
*
|
|
9
|
+
* Opt-in via `--publish` flag or `AILF_PUBLISH=1` environment variable.
|
|
10
|
+
* Without this flag, the pipeline writes results locally only (unchanged
|
|
11
|
+
* from current behavior).
|
|
12
|
+
*
|
|
13
|
+
* Design principles:
|
|
14
|
+
* - P1: Reports are immutable events (write-once to Sanity)
|
|
15
|
+
* - P5: Local-first (pipeline never fails because of a store write)
|
|
16
|
+
* - P6: Sinks are fire-and-forget (failures logged, not thrown)
|
|
17
|
+
*
|
|
18
|
+
* Preconditions: score-summary.json exists and is valid
|
|
19
|
+
* Postconditions: Report written to Sanity (best-effort), sinks notified
|
|
20
|
+
*
|
|
21
|
+
* @see docs/design-docs/report-store/architecture.md
|
|
22
|
+
* @see docs/design-docs/report-store/sink-architecture.md
|
|
23
|
+
*/
|
|
24
|
+
import { readFileSync } from "fs";
|
|
25
|
+
import { dirname, resolve } from "path";
|
|
26
|
+
import { fileURLToPath } from "url";
|
|
27
|
+
import { checkScoreSummaryValid } from "../checks.js";
|
|
28
|
+
import { buildProvenance } from "../provenance.js";
|
|
29
|
+
import { generateReportId, ReportStore } from "../../report-store.js";
|
|
30
|
+
import { loadSinks } from "../../sinks/index.js";
|
|
31
|
+
import { withRetry } from "../../sinks/retry.js";
|
|
32
|
+
const __dirname = dirname(fileURLToPath(import.meta.url)); // ES modules have no built-in __dirname; derive this module's directory from its file URL
|
|
33
|
+
const ROOT = resolve(__dirname, "..", "..", ".."); // three directories above this module; results/latest/score-summary.json is resolved against it (presumably the package root — verify)
|
|
34
|
+
/**
 * Run the publish-report pipeline step.
 *
 * 1. Check and read results/latest/score-summary.json
 * 2. Build provenance from the summary and the pipeline context
 * 3. Create the report object with a freshly generated report ID
 * 4. Auto-compare against a baseline through the report store
 * 5. Write the report through the report store
 * 6. Fan the report out to the configured sinks
 * 7. Return a step result naming the report ID, baseline delta and sink outcomes
 *
 * @param pipelineStart - Epoch milliseconds at which the pipeline started; used
 *   for the report's `durationMs`. If the subtraction does not yield a finite
 *   number, the duration of this step is used instead so the report never
 *   carries `NaN`.
 * @param options - Optional overrides (`sanityToken`, `reportDataset`,
 *   `reportProjectId`, `tag`) plus anything `buildProvenanceInput` reads.
 * @returns A step result with `durationMs`, `status`, and `summary` or `error`.
 */
export async function runPublishReport(pipelineStart, options = {}) {
    const start = Date.now();
    // Precondition: score summary exists
    const summaryErrors = checkScoreSummaryValid(ROOT).filter((issue) => issue.severity === "error");
    if (summaryErrors.length > 0) {
        return {
            durationMs: Date.now() - start,
            error: `Score summary missing: ${summaryErrors.map((e) => e.message).join("; ")}`,
            status: "failed",
        };
    }
    // Read score summary
    let summary;
    try {
        const summaryPath = resolve(ROOT, "results", "latest", "score-summary.json");
        summary = JSON.parse(readFileSync(summaryPath, "utf-8"));
    }
    catch (err) {
        return {
            durationMs: Date.now() - start,
            error: `Failed to read score-summary.json: ${err instanceof Error ? err.message : String(err)}`,
            status: "failed",
        };
    }
    // Build provenance
    const provenance = buildProvenance(buildProvenanceInput(summary, options));
    // Create report
    const now = new Date().toISOString();
    const reportId = generateReportId();
    // A missing or non-numeric pipelineStart would make this NaN; fall back to
    // the duration of this step so the stored report stays well-formed.
    const pipelineElapsed = Date.now() - pipelineStart;
    const durationMs = Number.isFinite(pipelineElapsed) ? pipelineElapsed : Date.now() - start;
    // Initialize report store — uses AILF_REPORT_* env vars, independent of
    // SANITY_DATASET/SANITY_PROJECT_ID which control doc evaluation.
    const token = options.sanityToken ??
        process.env.AILF_REPORT_SANITY_API_TOKEN ??
        process.env.SANITY_API_TOKEN;
    const dataset = options.reportDataset ?? process.env.AILF_REPORT_DATASET;
    const projectId = options.reportProjectId ?? process.env.AILF_REPORT_PROJECT_ID;
    const store = new ReportStore({ dataset, projectId, token });
    // Auto-compare against most recent comparable baseline
    const comparison = await store.autoCompare(summary, provenance, now);
    const report = {
        comparison: comparison ?? undefined,
        completedAt: now,
        durationMs,
        id: reportId,
        provenance,
        summary,
        tag: options.tag,
    };
    // Write to Sanity (system of record — best-effort): a falsy result means
    // nothing was written, which is reported in the summary rather than thrown.
    const sanityResult = await store.write(report);
    // Run sinks; individual sink failures are reported, never thrown.
    const publishResult = await runSinks(report);
    // Build result summary
    const parts = [];
    parts.push(sanityResult
        ? `report:${sanityResult}`
        : "Sanity write skipped (no token or unreachable)");
    // Only format the delta when it is a real number: by this point the report
    // has been written and delivered, so a malformed comparison must not throw.
    const delta = comparison?.deltas?.overall;
    if (typeof delta === "number" && Number.isFinite(delta)) {
        const sign = delta >= 0 ? "+" : "";
        parts.push(`vs baseline: ${sign}${delta.toFixed(1)}`);
    }
    if (publishResult.sinkResults.length > 0) {
        const succeeded = publishResult.sinkResults.filter((r) => r.result.status === "success").length;
        const total = publishResult.sinkResults.length;
        parts.push(`sinks: ${succeeded}/${total}`);
    }
    return {
        durationMs: Date.now() - start,
        status: "success",
        summary: `Published — ${parts.join(", ")}`,
    };
}
|
|
124
|
+
// ---------------------------------------------------------------------------
|
|
125
|
+
// Provenance input builder
|
|
126
|
+
// ---------------------------------------------------------------------------
|
|
127
|
+
/**
 * Split a comma-separated environment value into trimmed, non-empty entries.
 *
 * @param value - Raw environment value, possibly undefined.
 * @returns The list of entries, or undefined when the value is unset or empty.
 */
function splitCommaSeparatedEnv(value) {
    if (!value) {
        return undefined;
    }
    return value
        .split(",")
        .map((entry) => entry.trim())
        .filter(Boolean);
}
/**
 * Assemble provenance input from the score summary and pipeline context.
 *
 * Reads EVAL_MODE, SANITY_DOCUMENT_IDS, EVAL_FILTER_TASKS, SANITY_DATASET,
 * SANITY_PERSPECTIVE and SANITY_PROJECT_ID from the environment; values found
 * on `summary.source` take precedence over the environment for source fields.
 *
 * @param summary - Parsed score summary; `summary.scores` supplies the areas
 *   and the optional `summary.source` supplies source metadata.
 * @param options - Step options (`debug`, `evalFingerprint`, `promptfooUrl`,
 *   `promptfooUrls`, `provenanceInput`). Keys of `options.provenanceInput`
 *   are spread last and therefore override the computed fields.
 * @returns The provenance input object.
 */
function buildProvenanceInput(summary, options) {
    const areas = summary.scores.map((s) => s.feature);
    const mode = process.env.EVAL_MODE ?? "baseline";
    // Comma-separated document IDs from the environment.
    const sanityDocumentIds = splitCommaSeparatedEnv(process.env.SANITY_DOCUMENT_IDS);
    // Comma-separated task filter from the environment.
    const taskIds = splitCommaSeparatedEnv(process.env.EVAL_FILTER_TASKS);
    // Resolve the base URL once so baseUrl and llmsTxt cannot drift apart.
    const baseUrl = summary.source?.baseUrl ?? "https://www.sanity.io/docs";
    // Build source from summary metadata or env
    const source = {
        baseUrl,
        dataset: summary.source?.dataset ?? process.env.SANITY_DATASET ?? "next",
        documentIds: [],
        // Trim trailing slashes so a base URL ending in "/" does not yield "//llms.txt".
        llmsTxt: `${String(baseUrl).replace(/\/+$/, "")}/llms.txt`,
        name: summary.source?.name ?? "production",
        perspective: summary.source?.perspective ??
            process.env.SANITY_PERSPECTIVE ??
            undefined,
        priorityDomain: "sanity.io",
        projectId: summary.source?.projectId ?? process.env.SANITY_PROJECT_ID ?? "3do82whm",
        studioOrigin: "https://admin.sanity.io",
        urls: [],
    };
    // Pass through eval fingerprint for cross-environment cache lookup.
    // Debug runs don't store fingerprints (they evaluate a subset of tests
    // and would produce misleading cache hits for full runs).
    const evalFingerprint = options.debug ? undefined : options.evalFingerprint;
    return {
        areas,
        evalFingerprint,
        mode,
        promptfooUrl: options.promptfooUrl,
        promptfooUrls: options.promptfooUrls,
        rootDir: ROOT,
        sanityDocumentIds,
        source,
        taskIds,
        ...options.provenanceInput,
    };
}
|
|
181
|
+
// ---------------------------------------------------------------------------
|
|
182
|
+
// Sink runner
|
|
183
|
+
// ---------------------------------------------------------------------------
|
|
184
|
+
/**
 * Fan out a report to all configured sinks.
 *
 * Sinks come from `loadSinks()`. Each sink's optional `healthCheck` is run
 * first; an unhealthy or throwing check only produces a warning and does not
 * stop delivery. Every sink's `publish` is then invoked through `withRetry`,
 * all in parallel. Failures are logged and recorded in the result but never
 * thrown, so they cannot block the pipeline.
 *
 * @param report - The report to deliver.
 * @returns `{ report, sinkResults }`, where `sinkResults` holds one
 *   `{ name, result }` entry per sink, in the order the sinks were loaded.
 */
async function runSinks(report) {
    const sinks = loadSinks();
    const sinkResults = [];
    if (sinks.length === 0) {
        return { report, sinkResults };
    }
    // Health check all sinks first (warnings only — delivery proceeds regardless)
    for (const sink of sinks) {
        if (!sink.healthCheck) {
            continue;
        }
        try {
            const health = await sink.healthCheck();
            if (!health.healthy) {
                console.warn(` ⚠️ Sink ${sink.name} health check failed: ${health.reason}`);
            }
        }
        catch (err) {
            console.warn(` ⚠️ Sink ${sink.name} health check error: ${err instanceof Error ? err.message : String(err)}`);
        }
    }
    // Publish to all sinks in parallel
    const settled = await Promise.allSettled(sinks.map(async (sink) => {
        const result = await withRetry(() => sink.publish(report));
        return { name: sink.name, result };
    }));
    // Promise.allSettled keeps input order, so settled[i] belongs to sinks[i];
    // that lets a rejected delivery be attributed to its sink by name.
    for (const [index, outcome] of settled.entries()) {
        if (outcome.status === "fulfilled") {
            sinkResults.push(outcome.value);
            const { name, result } = outcome.value;
            if (result.status === "failed") {
                console.warn(` ⚠️ Sink ${name} failed: ${result.error}`);
            }
            else if (result.status === "skipped") {
                console.log(` ⏭️ Sink ${name} skipped: ${result.reason}`);
            }
            else {
                console.log(` ✅ Sink ${name} delivered${result.detail ? ` (${result.detail})` : ""}`);
            }
            continue;
        }
        // Rejection — not expected when withRetry converts errors to results,
        // but recorded as a failure just in case.
        const error = outcome.reason instanceof Error
            ? outcome.reason.message
            : String(outcome.reason);
        const name = sinks[index]?.name ?? "unknown";
        sinkResults.push({
            name,
            result: { error, status: "failed" },
        });
        console.warn(` ⚠️ Sink ${name} delivery error: ${error}`);
    }
    return { report, sinkResults };
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline step: Generate PR comment / report from scores.
|
|
3
|
+
*
|
|
4
|
+
* Preconditions: score-summary.json exists
|
|
5
|
+
* Postconditions: report markdown generated
|
|
6
|
+
*
|
|
7
|
+
* Cache key: results/latest/score-summary.json
|
|
8
|
+
* Note: Report is always regenerated (not cached) since it may include
|
|
9
|
+
* dynamic data like Promptfoo URLs. The cache infrastructure is wired up
|
|
10
|
+
* for consistency but reports are cheap to generate.
|
|
11
|
+
*/
|
|
12
|
+
import type { StepResult } from "../types.js";
|
|
13
|
+
export declare function runReport(outputPath?: string, promptfooUrl?: string): StepResult;
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline step: Generate PR comment / report from scores.
|
|
3
|
+
*
|
|
4
|
+
* Preconditions: score-summary.json exists
|
|
5
|
+
* Postconditions: report markdown generated
|
|
6
|
+
*
|
|
7
|
+
* Cache key: results/latest/score-summary.json
|
|
8
|
+
* Note: Report is always regenerated (not cached) since it may include
|
|
9
|
+
* dynamic data like Promptfoo URLs. The cache infrastructure is wired up
|
|
10
|
+
* for consistency but reports are cheap to generate.
|
|
11
|
+
*/
|
|
12
|
+
import { execSync } from "child_process";
|
|
13
|
+
import { dirname, resolve } from "path";
|
|
14
|
+
import { fileURLToPath } from "url";
|
|
15
|
+
import { checkScoreSummaryValid } from "../checks.js";
|
|
16
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
17
|
+
const ROOT = resolve(__dirname, "..", "..", "..");
|
|
18
|
+
const DEFAULT_REPORT_PATH = resolve(ROOT, "results/latest/pr-comment.md");
|
|
19
|
+
/**
 * Quote a single value so the shell passes it to the child as one argument.
 *
 * @param value - Value to quote; it is converted to a string first.
 * @returns The quoted argument for the current platform's default shell.
 */
function quoteShellArg(value) {
    const text = String(value);
    if (process.platform === "win32") {
        // cmd.exe: wrap in double quotes and double any embedded quote.
        return `"${text.replace(/"/g, '""')}"`;
    }
    // POSIX sh: single quotes are literal; close, emit an escaped quote, reopen.
    return `'${text.replace(/'/g, "'\\''")}'`;
}
/**
 * Pipeline step: generate the PR-comment report from the score summary.
 *
 * Runs `pnpm pr-comment` synchronously from ROOT. The report always goes to a
 * file, and the child's stdout is ignored so the markdown is not dumped into
 * the terminal.
 *
 * @param outputPath - Destination file; defaults to DEFAULT_REPORT_PATH.
 * @param promptfooUrl - Optional URL forwarded as `--promptfoo-url`.
 * @returns A step result with `durationMs`, `status`, and `summary` or `error`.
 */
export function runReport(outputPath, promptfooUrl) {
    const start = Date.now();
    // Precondition: score summary exists
    const summaryErrors = checkScoreSummaryValid(ROOT).filter((issue) => issue.severity === "error");
    if (summaryErrors.length > 0) {
        return {
            durationMs: Date.now() - start,
            error: `Score summary missing: ${summaryErrors.map((e) => e.message).join("; ")}`,
            status: "failed",
        };
    }
    // Always write to a file — use the caller's path or a default.
    const resolvedOutput = outputPath ?? DEFAULT_REPORT_PATH;
    // Execute — reports are always regenerated (cheap, may include dynamic URLs)
    try {
        // Both values are quoted: execSync hands the string to a shell, where an
        // unquoted path with spaces is split and a URL containing "&", "?" or ";"
        // is interpreted as shell syntax.
        const outputArg = ` --output ${quoteShellArg(resolvedOutput)}`;
        const urlArg = promptfooUrl ? ` --promptfoo-url ${quoteShellArg(promptfooUrl)}` : "";
        execSync(`pnpm pr-comment${outputArg}${urlArg}`, {
            cwd: ROOT,
            env: process.env,
            stdio: ["inherit", "ignore", "inherit"],
        });
    }
    catch (err) {
        return {
            durationMs: Date.now() - start,
            error: `pr-comment failed: ${err instanceof Error ? err.message : String(err)}`,
            status: "failed",
        };
    }
    return {
        durationMs: Date.now() - start,
        status: "success",
        summary: `Report written to ${resolvedOutput}`,
    };
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline step: Update QUALITY_SCORE.md from score summary.
|
|
3
|
+
*
|
|
4
|
+
* Preconditions: score-summary.json exists and is valid
|
|
5
|
+
* Postconditions: QUALITY_SCORE.md is updated with latest scores
|
|
6
|
+
*
|
|
7
|
+
* Note: Not cached — writing to a git-tracked file is intentionally
|
|
8
|
+
* always re-executed to ensure the file reflects the latest scores.
|
|
9
|
+
*/
|
|
10
|
+
import type { StepResult } from "../types.js";
|
|
11
|
+
export declare function runUpdateQualityScores(): Promise<StepResult>;
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline step: Update QUALITY_SCORE.md from score summary.
|
|
3
|
+
*
|
|
4
|
+
* Preconditions: score-summary.json exists and is valid
|
|
5
|
+
* Postconditions: QUALITY_SCORE.md is updated with latest scores
|
|
6
|
+
*
|
|
7
|
+
* Note: Not cached — writing to a git-tracked file is intentionally
|
|
8
|
+
* always re-executed to ensure the file reflects the latest scores.
|
|
9
|
+
*/
|
|
10
|
+
import { dirname, resolve } from "path";
|
|
11
|
+
import { fileURLToPath } from "url";
|
|
12
|
+
import { checkScoreSummaryValid } from "../checks.js";
|
|
13
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
14
|
+
const ROOT = resolve(__dirname, "..", "..", "..");
|
|
15
|
+
/**
 * Pipeline step: update QUALITY_SCORE.md from the score summary.
 *
 * Checks the score summary first, then loads the update script on demand and
 * runs it. Any failure — a precondition error, a script that cannot be
 * loaded, a script that throws, or a script reporting `success: false` — is
 * returned as a `failed` step result rather than thrown.
 *
 * @returns A step result with `durationMs`, `status`, and `summary` or `error`.
 */
export async function runUpdateQualityScores() {
    const start = Date.now();
    // Precondition: score summary exists and is valid
    const summaryErrors = checkScoreSummaryValid(ROOT).filter((issue) => issue.severity === "error");
    if (summaryErrors.length > 0) {
        return {
            durationMs: Date.now() - start,
            error: `Score summary missing or invalid: ${summaryErrors.map((e) => e.message).join("; ")}`,
            status: "failed",
        };
    }
    let result;
    try {
        // Dynamic import to avoid loading file-writing code at module parse time
        const { updateQualityScores } = await import("../../scripts/update-quality-scores.js");
        result = updateQualityScores();
    }
    catch (err) {
        // A load failure or a throwing script becomes a failed result here
        // instead of rejecting the promise.
        return {
            durationMs: Date.now() - start,
            error: `Failed to update quality scores: ${err instanceof Error ? err.message : String(err)}`,
            status: "failed",
        };
    }
    if (!result.success) {
        return {
            durationMs: Date.now() - start,
            error: result.message,
            status: "failed",
        };
    }
    return {
        durationMs: Date.now() - start,
        status: "success",
        summary: result.message,
    };
}
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
* The actual evaluation execution is handled by the pipeline orchestrator —
|
|
19
19
|
* this module is pure computation on inputs and outputs.
|
|
20
20
|
*
|
|
21
|
-
* @see docs/exec-plans/scenario-matrix-implementation/phase-4-content-release-integration.md
|
|
21
|
+
* @see docs/archive/exec-plans/scenario-matrix-implementation/phase-4-content-release-integration.md
|
|
22
22
|
* @see docs/design-docs/scenario-matrix/per-document-attribution.md
|
|
23
23
|
*/
|
|
24
24
|
import type { AttributionReport, LOOCostEstimate, LOOResult, TaskAttribution } from "./types.js";
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
* The actual evaluation execution is handled by the pipeline orchestrator —
|
|
19
19
|
* this module is pure computation on inputs and outputs.
|
|
20
20
|
*
|
|
21
|
-
* @see docs/exec-plans/scenario-matrix-implementation/phase-4-content-release-integration.md
|
|
21
|
+
* @see docs/archive/exec-plans/scenario-matrix-implementation/phase-4-content-release-integration.md
|
|
22
22
|
* @see docs/design-docs/scenario-matrix/per-document-attribution.md
|
|
23
23
|
*/
|
|
24
24
|
// ---------------------------------------------------------------------------
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* publish-report-step.ts) is responsible for loading the threshold config
|
|
9
9
|
* and score summary.
|
|
10
10
|
*
|
|
11
|
-
* @see docs/exec-plans/scenario-matrix-implementation/phase-5-readiness-thresholds.md
|
|
11
|
+
* @see docs/archive/exec-plans/scenario-matrix-implementation/phase-5-readiness-thresholds.md
|
|
12
12
|
*/
|
|
13
13
|
import type { ThresholdConfig } from "./schemas.js";
|
|
14
14
|
import type { ComparisonReport, ScoreSummary, ThresholdEvaluation, ThresholdViolation } from "./types.js";
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* publish-report-step.ts) is responsible for loading the threshold config
|
|
9
9
|
* and score summary.
|
|
10
10
|
*
|
|
11
|
-
* @see docs/exec-plans/scenario-matrix-implementation/phase-5-readiness-thresholds.md
|
|
11
|
+
* @see docs/archive/exec-plans/scenario-matrix-implementation/phase-5-readiness-thresholds.md
|
|
12
12
|
*/
|
|
13
13
|
// ---------------------------------------------------------------------------
|
|
14
14
|
// Severity priority for sorting (higher = more severe)
|
|
@@ -160,8 +160,21 @@ export function validateModelsYaml(rootDir) {
|
|
|
160
160
|
if (!model.label) {
|
|
161
161
|
issues.push(error(source, `models[${i}] is missing 'label'`, path.join(rootDir, "config")));
|
|
162
162
|
}
|
|
163
|
+
if (model.timeoutMs !== undefined &&
|
|
164
|
+
(typeof model.timeoutMs !== "number" ||
|
|
165
|
+
!Number.isInteger(model.timeoutMs) ||
|
|
166
|
+
model.timeoutMs <= 0)) {
|
|
167
|
+
issues.push(error(source, `models[${i}] has invalid 'timeoutMs' (must be a positive integer)`, path.join(rootDir, "config")));
|
|
168
|
+
}
|
|
163
169
|
}
|
|
164
170
|
}
|
|
171
|
+
// Check evalBudgetMs
|
|
172
|
+
if (data.evalBudgetMs !== undefined &&
|
|
173
|
+
(typeof data.evalBudgetMs !== "number" ||
|
|
174
|
+
!Number.isInteger(data.evalBudgetMs) ||
|
|
175
|
+
data.evalBudgetMs <= 0)) {
|
|
176
|
+
issues.push(error(source, "config/models has invalid 'evalBudgetMs' (must be a positive integer)", path.join(rootDir, "config")));
|
|
177
|
+
}
|
|
165
178
|
// Check grader
|
|
166
179
|
if (!data.grader) {
|
|
167
180
|
issues.push(error(source, "config/models is missing a 'grader' section", path.join(rootDir, "config")));
|
package/dist/report-store.d.ts
CHANGED
|
@@ -94,6 +94,23 @@ export declare class ReportStore {
|
|
|
94
94
|
* @returns The report ID on success, null on failure (logged, not thrown)
|
|
95
95
|
*/
|
|
96
96
|
write(report: Report): Promise<null | ReportId>;
|
|
97
|
+
/**
|
|
98
|
+
* Query error arrays from the last N reports for chronic failure detection.
|
|
99
|
+
*
|
|
100
|
+
* Returns a simplified shape with just the report ID and error details,
|
|
101
|
+
* suitable for aggregation by the chronic-failures module.
|
|
102
|
+
*
|
|
103
|
+
* @param limit - Number of recent reports to query (default: 10)
|
|
104
|
+
* @returns Array of { reportId, errors } or empty array on failure
|
|
105
|
+
*/
|
|
106
|
+
queryRecentErrors(limit?: number): Promise<{
|
|
107
|
+
reportId: string;
|
|
108
|
+
errors: {
|
|
109
|
+
task: string;
|
|
110
|
+
model: string;
|
|
111
|
+
error: string;
|
|
112
|
+
}[];
|
|
113
|
+
}[]>;
|
|
97
114
|
}
|
|
98
115
|
/**
|
|
99
116
|
* Generate a UUID v7 (time-sortable) for report identification.
|
package/dist/report-store.js
CHANGED
|
@@ -213,6 +213,30 @@ export class ReportStore {
|
|
|
213
213
|
return null;
|
|
214
214
|
}
|
|
215
215
|
}
|
|
216
|
+
/**
|
|
217
|
+
* Query error arrays from the last N reports for chronic failure detection.
|
|
218
|
+
*
|
|
219
|
+
* Returns a simplified shape with just the report ID and error details,
|
|
220
|
+
* suitable for aggregation by the chronic-failures module.
|
|
221
|
+
*
|
|
222
|
+
* @param limit - Number of recent reports to query (default: 10)
|
|
223
|
+
* @returns Array of { reportId, errors } or empty array on failure
|
|
224
|
+
*/
|
|
225
|
+
async queryRecentErrors(limit = 10) {
|
|
226
|
+
try {
|
|
227
|
+
const groq = `*[_type == $type && defined(summary.testSummary.errors)]
|
|
228
|
+
| order(completedAt desc) [0...$limit] {
|
|
229
|
+
"reportId": reportId,
|
|
230
|
+
"errors": summary.testSummary.errors[] { task, model, error }
|
|
231
|
+
}`;
|
|
232
|
+
const results = await this.client.fetch(groq, { limit, type: REPORT_TYPE });
|
|
233
|
+
return results ?? [];
|
|
234
|
+
}
|
|
235
|
+
catch (error) {
|
|
236
|
+
console.warn(` ⚠️ Failed to query recent errors: ${error instanceof Error ? error.message : String(error)}`);
|
|
237
|
+
return [];
|
|
238
|
+
}
|
|
239
|
+
}
|
|
216
240
|
}
|
|
217
241
|
/**
|
|
218
242
|
* Generate a UUID v7 (time-sortable) for report identification.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* agent-behavior-report.ts
|
|
3
|
+
*
|
|
4
|
+
* Standalone script that reads Promptfoo evaluation results containing
|
|
5
|
+
* agent behavior observation data and generates a detailed report.
|
|
6
|
+
*
|
|
7
|
+
* This provides deeper analysis than the summary included in the main
|
|
8
|
+
* calculate-scores report, including:
|
|
9
|
+
*
|
|
10
|
+
* - Per-task behavior breakdown (which specific pages each task visited)
|
|
11
|
+
* - Canonical doc coverage (did the agent find the "right" docs?)
|
|
12
|
+
* - Request timeline and latency analysis
|
|
13
|
+
* - Search strategy analysis
|
|
14
|
+
* - Cross-task navigation pattern detection
|
|
15
|
+
*
|
|
16
|
+
* Usage:
|
|
17
|
+
* tsx src/scripts/agent-behavior-report.ts [results-path]
|
|
18
|
+
*/
|
|
19
|
+
import "dotenv/config";
|