@sanity/ailf 1.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +29 -12
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
- package/dist/_vendor/ailf-core/config-helpers.js +51 -2
- package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
- package/dist/_vendor/ailf-core/examples/index.js +213 -94
- package/dist/_vendor/ailf-core/index.d.ts +3 -2
- package/dist/_vendor/ailf-core/index.js +2 -1
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
- package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +7 -1
- package/dist/adapters/config-sources/ts-config-loader.js +21 -13
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
- package/dist/adapters/task-sources/index.d.ts +3 -4
- package/dist/adapters/task-sources/index.js +3 -4
- package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
- package/dist/adapters/task-sources/repo-schemas.js +228 -20
- package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
- package/dist/adapters/task-sources/repo-task-source.js +81 -122
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
- package/dist/adapters/task-sources/task-file-loader.js +21 -7
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/coverage-audit.js +3 -1
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +5 -4
- package/dist/commands/init.js +190 -25
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +15 -4
- package/dist/composition-root.js +100 -55
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/build-step-sequence.js +4 -2
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +32 -19
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +77 -26
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +51 -31
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
- package/dist/pipeline/compiler/literacy-bridge.js +2 -2
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
- package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
- package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/expand-tasks.d.ts +2 -2
- package/dist/pipeline/expand-tasks.js +2 -2
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +16 -9
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +16 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
- package/dist/pipeline/mirror-repo-tasks.js +10 -10
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +68 -30
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +32 -24
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* grader-validate.ts
|
|
3
|
+
*
|
|
4
|
+
* CLI script for validating grader accuracy against human reference grades
|
|
5
|
+
* (Phase 2 of grader reliability).
|
|
6
|
+
*
|
|
7
|
+
* Loads human-graded reference samples from canonical/grader-references/,
|
|
8
|
+
* runs the grader model on each sample, and compares against human scores.
|
|
9
|
+
*
|
|
10
|
+
* Usage:
|
|
11
|
+
* pnpm grader-validate # validate with default grader
|
|
12
|
+
* pnpm grader-validate --grader openai:gpt-5.5 # validate a candidate grader
|
|
13
|
+
* pnpm grader-validate --threshold 15 # custom MAE threshold
|
|
14
|
+
*
|
|
15
|
+
* Reads: canonical/grader-references/*.yaml
|
|
16
|
+
* Reads: config/models.yaml (for default grader model)
|
|
17
|
+
* Writes: results/latest/grader-validation.json
|
|
18
|
+
*/
|
|
19
|
+
import "dotenv/config";
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* grader-validate.ts
|
|
3
|
+
*
|
|
4
|
+
* CLI script for validating grader accuracy against human reference grades
|
|
5
|
+
* (Phase 2 of grader reliability).
|
|
6
|
+
*
|
|
7
|
+
* Loads human-graded reference samples from canonical/grader-references/,
|
|
8
|
+
* runs the grader model on each sample, and compares against human scores.
|
|
9
|
+
*
|
|
10
|
+
* Usage:
|
|
11
|
+
* pnpm grader-validate # validate with default grader
|
|
12
|
+
* pnpm grader-validate --grader openai:gpt-5.5 # validate a candidate grader
|
|
13
|
+
* pnpm grader-validate --threshold 15 # custom MAE threshold
|
|
14
|
+
*
|
|
15
|
+
* Reads: canonical/grader-references/*.yaml
|
|
16
|
+
* Reads: config/models.yaml (for default grader model)
|
|
17
|
+
* Writes: results/latest/grader-validation.json
|
|
18
|
+
*/
|
|
19
|
+
// oxlint-disable-next-line import/no-unassigned-import -- side-effect: loads .env into process.env
|
|
20
|
+
import "dotenv/config";
|
|
21
|
+
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
|
|
22
|
+
import { dirname, join, resolve } from "path";
|
|
23
|
+
import { fileURLToPath } from "url";
|
|
24
|
+
import { load } from "js-yaml";
|
|
25
|
+
import { classifyCorrelation, validateGrader, } from "../pipeline/grader-validation.js";
|
|
26
|
+
import { gradeOnce, loadGraderModel } from "./grader-api.js";
|
|
27
|
+
// ES modules expose no __dirname global; derive this module's directory from its URL.
const __dirname = dirname(fileURLToPath(new URL(import.meta.url)));
|
|
28
|
+
// Directory two levels above this module's directory.
const ROOT = join(__dirname, "..", "..");
|
|
29
|
+
// ---------------------------------------------------------------------------
|
|
30
|
+
// CLI argument parsing
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
// Command-line arguments with the first two argv entries (runtime and script path) dropped.
const args = process.argv.slice(2);
|
|
33
|
+
/**
 * Report whether the switch `--<name>` appears among the CLI arguments.
 *
 * @param {string} name - Flag name without the leading dashes.
 * @returns {boolean} True when `--<name>` is present in `args`.
 */
function getFlag(name) {
    const flag = `--${name}`;
    return args.some((token) => token === flag);
}
|
|
36
|
+
/**
 * Look up the value that follows `--<name>` in the CLI arguments.
 *
 * @param {string} name - Option name without the leading dashes.
 * @returns {string | undefined} The argument immediately after the first
 *   `--<name>`, or undefined when the option is absent or is the last argument.
 */
function getOption(name) {
    const position = args.indexOf(`--${name}`);
    if (position === -1) {
        return undefined;
    }
    const valueIndex = position + 1;
    if (valueIndex >= args.length) {
        return undefined;
    }
    return args[valueIndex];
}
|
|
40
|
+
// Optional grader model id from --grader; undefined means "use the configured default".
const graderOverride = getOption("grader");
const thresholdStr = getOption("threshold");
// MAE pass/fail threshold; falls back to 10 when --threshold is not supplied.
const maeThreshold = thresholdStr ? parseFloat(thresholdStr) : 10;
// The help text advertises "-h", so accept the single-dash form in addition
// to "--help" and the previously accepted "--h".
const showHelp = getFlag("help") || getFlag("h") || args.includes("-h");
// Reject an unusable threshold up front: a NaN threshold would otherwise only
// surface after every grading call has already been made. Skipped when help
// was requested so that the usage text is still shown.
if (!showHelp && (!Number.isFinite(maeThreshold) || maeThreshold < 0)) {
    console.error(`❌ Invalid --threshold value '${thresholdStr}': expected a non-negative number.`);
    process.exit(1);
}
|
|
44
|
+
if (showHelp) {
    // Print the usage text verbatim, then exit successfully before any
    // grading work is started.
    const usage = `
Usage: pnpm grader-validate [options]

Validate grader accuracy against human reference grades.

Options:
--grader <model> Grader model to validate (default: from config/models.yaml)
--threshold <n> MAE threshold for pass/fail (default: 10)
--help, -h Show this help

Examples:
pnpm grader-validate # validate current grader
pnpm grader-validate --grader openai:gpt-5.5 # test a candidate
pnpm grader-validate --threshold 15 # lenient threshold
`;
    console.log(usage);
    process.exit(0);
}
|
|
62
|
+
// ---------------------------------------------------------------------------
|
|
63
|
+
// Load reference grades
|
|
64
|
+
// ---------------------------------------------------------------------------
|
|
65
|
+
// ---------------------------------------------------------------------------
|
|
66
|
+
// Dimension mapping
|
|
67
|
+
// ---------------------------------------------------------------------------
|
|
68
|
+
/**
 * Load all human reference grades from `<ROOT>/canonical/grader-references`.
 *
 * Every `.yaml` / `.yml` file in that directory is parsed in sorted filename
 * order. A file whose top level is a list contributes each of its items; a
 * file whose top level is a mapping contributes that single object. Files
 * with any other top-level value (for example an empty file) are skipped
 * with a warning.
 *
 * Exits the process with code 1 when the directory does not exist or holds
 * no YAML files. A read or parse failure is rethrown unchanged after the
 * offending file has been named on stderr.
 *
 * @returns {object[]} The collected reference-grade entries (not validated here).
 */
function loadReferenceGrades() {
    const refsDir = join(ROOT, "canonical", "grader-references");
    if (!existsSync(refsDir)) {
        console.error(`❌ Reference grades directory not found: ${refsDir}`);
        console.error("Create canonical/grader-references/ with YAML reference files.");
        console.error("See docs/exec-plans/completed/grader-reliability.md — Phase 2.");
        process.exit(1);
    }
    const files = readdirSync(refsDir)
        .filter((f) => f.endsWith(".yaml") || f.endsWith(".yml"))
        .sort();
    if (files.length === 0) {
        console.error(`❌ No YAML files found in ${refsDir}`);
        process.exit(1);
    }
    const allGrades = [];
    for (const file of files) {
        const filePath = join(refsDir, file);
        let parsed;
        try {
            parsed = load(readFileSync(filePath, "utf-8"));
        }
        catch (err) {
            // The underlying error does not necessarily say which file was
            // being processed; name it, then rethrow the original error.
            console.error(`❌ Failed to read or parse reference file: ${filePath}`);
            throw err;
        }
        if (Array.isArray(parsed)) {
            allGrades.push(...parsed);
        }
        else if (typeof parsed === "object" && parsed !== null) {
            allGrades.push(parsed);
        }
        else {
            // Previously dropped without a trace; surface it so that a
            // reference file cannot go missing from the analysis unnoticed.
            console.error(` ⚠ Skipping ${file}: expected a YAML list or mapping of reference grades`);
        }
    }
    return allGrades;
}
|
|
97
|
+
// ---------------------------------------------------------------------------
|
|
98
|
+
// Main: grade every reference sample and validate against human scores
|
|
99
|
+
// ---------------------------------------------------------------------------
|
|
100
|
+
/**
 * Entry point: run the grader over every reference sample and compare its
 * scores with the human scores.
 *
 * Steps:
 *  1. Resolve the grader model (the --grader override, else `loadGraderModel().id`).
 *  2. Load the reference grades and count the response × rubric judgments.
 *  3. Call `gradeOnce` sequentially for each rubric whose dimension is known;
 *     a null score is counted as a failed call and left out of the analysis.
 *  4. Pass the collected grades to `validateGrader`, print the report and
 *     write the result to `<ROOT>/results/latest/grader-validation.json`.
 *
 * Exits with code 1 when no grades were collected or when the result does
 * not pass the MAE threshold.
 *
 * @returns {Promise<void>}
 * @throws {Error} When a reference entry has no `rubrics` list.
 */
async function main() {
    console.log("=== Grader Validation ===\n");
    // Resolve grader model
    const graderModel = graderOverride ?? loadGraderModel().id;
    console.log(` Grader: ${graderModel}`);
    console.log(` Threshold: MAE < ${maeThreshold}`);
    // Load reference grades
    const rawGrades = loadReferenceGrades();
    console.log(` Samples: ${rawGrades.length} reference-graded responses`);
    // Count total rubric judgments
    let totalJudgments = 0;
    for (const rg of rawGrades) {
        // Reference entries are not validated when loaded; fail with a
        // message that names the entry instead of an opaque TypeError on
        // `.length`, and do so before any grading call is made.
        if (!Array.isArray(rg?.rubrics)) {
            throw new Error(`Reference grade for task '${rg?.taskId ?? "unknown"}' has no 'rubrics' list`);
        }
        totalJudgments += rg.rubrics.length;
    }
    console.log(` Judgments: ${totalJudgments} (response × rubric pairs)`);
    // Rough estimate using a flat 0.005 per judgment.
    const estimatedCost = totalJudgments * 0.005;
    console.log(` Est. cost: ~$${estimatedCost.toFixed(2)}`);
    console.log();
    // Grade each reference sample
    console.log(" Running grader on reference samples...");
    const grades = [];
    let completed = 0;
    let failed = 0;
    for (const ref of rawGrades) {
        for (const rubric of ref.rubrics) {
            const dimension = mapDimension(rubric.dimension);
            if (!dimension) {
                console.error(` ⚠ Unknown dimension '${rubric.dimension}' — skipping`);
                continue;
            }
            const graderScore = await gradeOnce(graderModel, ref.response, rubric.rubricText);
            completed++;
            // Rewrite the progress line in place every fifth call and at the end.
            if (completed % 5 === 0 || completed === totalJudgments) {
                process.stdout.write(`\r Progress: ${completed}/${totalJudgments}`);
            }
            if (graderScore === null) {
                failed++;
                continue;
            }
            grades.push({
                area: ref.area,
                dimension,
                graderScore,
                humanScore: rubric.humanScore,
                taskId: ref.taskId,
                // Only attach notes when the rubric actually has some.
                ...(rubric.notes && { notes: rubric.notes }),
            });
        }
    }
    console.log(); // newline after progress
    if (failed > 0) {
        console.log(` ⚠ ${failed} grading calls failed (excluded from analysis)`);
    }
    console.log();
    if (grades.length === 0) {
        console.error("❌ No grades to analyze.");
        process.exit(1);
    }
    // Validate
    const result = validateGrader(grades, graderModel, { maeThreshold });
    // Print report
    printReport(result);
    // Write output
    const outDir = join(ROOT, "results", "latest");
    mkdirSync(outDir, { recursive: true });
    const outPath = join(outDir, "grader-validation.json");
    writeFileSync(outPath, JSON.stringify(result, null, 2));
    console.log(`\n 📄 Results written to ${outPath}`);
    // Exit with error code if threshold not met
    if (!result.passesThreshold) {
        console.error(`\n ❌ VALIDATION FAILED: MAE ${result.overallMae} exceeds threshold ${maeThreshold}`);
        process.exit(1);
    }
}
|
|
174
|
+
// ---------------------------------------------------------------------------
|
|
175
|
+
// Dimension mapping
|
|
176
|
+
// ---------------------------------------------------------------------------
|
|
177
|
+
/**
 * Translate a kebab-case rubric dimension name into the camelCase key used
 * for the per-dimension results.
 *
 * @param {string} dim - Dimension name as written in a reference file.
 * @returns {"codeCorrectness" | "docCoverage" | "taskCompletion" | null}
 *   The matching key, or null when the dimension is not recognised.
 */
function mapDimension(dim) {
    // A Map (rather than a plain object) keeps inherited property names such
    // as "constructor" from ever matching.
    const knownDimensions = new Map([
        ["code-correctness", "codeCorrectness"],
        ["doc-coverage", "docCoverage"],
        ["task-completion", "taskCompletion"],
    ]);
    return knownDimensions.has(dim) ? knownDimensions.get(dim) : null;
}
|
|
189
|
+
// ---------------------------------------------------------------------------
|
|
190
|
+
// Report formatting
|
|
191
|
+
// ---------------------------------------------------------------------------
|
|
192
|
+
/**
 * Print the grader validation report to stdout.
 *
 * Sections, in order: a banner with the grader model and observation count,
 * the overall metrics (MAE, correlation, bias), a per-dimension table, the
 * pass/fail verdict and — when there are any — up to five of the largest
 * grader/human disagreements.
 *
 * @param {object} result - Validation result; the fields read here are
 *   graderModel, totalObservations, overallMae, overallCorrelation,
 *   overallBias, perDimension, passesThreshold, maeThreshold and
 *   largestDisagreements.
 * @returns {void}
 */
function printReport(result) {
    const heavyRule = "=".repeat(80);
    const lightRule = "-".repeat(80);
    // Prefix strictly positive numbers with "+"; others print unchanged.
    const withSign = (value) => (value > 0 ? `+${value}` : `${value}`);
    // Section heading: rule, title, rule, blank line.
    const printSection = (title) => {
        console.log(lightRule);
        console.log(title);
        console.log(lightRule);
        console.log();
    };
    console.log(heavyRule);
    console.log(" GRADER VALIDATION REPORT");
    console.log(heavyRule);
    console.log();
    console.log(` Grader: ${result.graderModel}`);
    console.log(` Observations: ${result.totalObservations}`);
    console.log();
    // Overall metrics
    printSection("OVERALL METRICS");
    let biasMeaning = "no bias";
    if (result.overallBias > 0) {
        biasMeaning = "grader scores higher";
    }
    else if (result.overallBias < 0) {
        biasMeaning = "grader scores lower";
    }
    console.log(` MAE: ${result.overallMae} points`);
    console.log(` Correlation: r=${result.overallCorrelation} (${classifyCorrelation(result.overallCorrelation)})`);
    console.log(` Bias: ${withSign(result.overallBias)} (${biasMeaning})`);
    console.log();
    // Per-dimension table
    printSection("PER-DIMENSION VALIDITY");
    console.log("| Dimension | MAE | Correlation | Quality | Bias | Count |");
    console.log("|------------------|-------|-------------|-----------|--------|-------|");
    const rows = [
        ["Task Completion", result.perDimension.taskCompletion],
        ["Code Correctness", result.perDimension.codeCorrectness],
        ["Doc Coverage", result.perDimension.docCoverage],
    ];
    for (const [label, stats] of rows) {
        const cells = [
            label.padEnd(16),
            String(stats.mae).padStart(5),
            `r=${String(stats.correlation).padStart(9)}`,
            classifyCorrelation(stats.correlation).padEnd(9),
            withSign(stats.bias).padStart(6),
            String(stats.count).padStart(5),
        ];
        console.log(`| ${cells.join(" | ")} |`);
    }
    console.log();
    // Pass/fail verdict
    printSection("VERDICT");
    const verdict = result.passesThreshold
        ? ` ✅ PASSED: MAE ${result.overallMae} < threshold ${result.maeThreshold}`
        : ` ❌ FAILED: MAE ${result.overallMae} >= threshold ${result.maeThreshold}`;
    console.log(verdict);
    console.log();
    // Largest disagreements (at most five)
    const topN = Math.min(5, result.largestDisagreements.length);
    if (topN === 0) {
        return;
    }
    printSection(`TOP ${topN} LARGEST DISAGREEMENTS`);
    result.largestDisagreements.slice(0, topN).forEach((entry, index) => {
        console.log(` ${index + 1}. ${entry.taskId} — ${entry.dimension}`);
        console.log(` Human=${entry.humanScore}, Grader=${entry.graderScore} (${withSign(entry.signedError)})`);
        if (entry.notes) {
            console.log(` Note: ${entry.notes}`);
        }
    });
    console.log();
}
|
|
260
|
+
// Only run when invoked directly
|
|
261
|
+
// Start the CLI only when this file itself is the script being executed
// (source or compiled form), so that merely importing the module does not
// trigger a grading run.
if (["grader-validate.ts", "grader-validate.js"].some((scriptName) => process.argv[1]?.endsWith(scriptName))) {
    main().catch((error) => {
        console.error("❌ Fatal error:", error);
        process.exit(1);
    });
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* measure-retrieval.ts
|
|
3
|
+
*
|
|
4
|
+
* Evaluates retrieval quality by comparing what Sanity's text search
|
|
5
|
+
* returns against the manually-annotated canonical documents for each
|
|
6
|
+
* evaluation task. Produces Recall@K and NDCG@K metrics.
|
|
7
|
+
*
|
|
8
|
+
* This answers: "Can a retriever find the docs an LLM actually needs?"
|
|
9
|
+
*/
|
|
10
|
+
import "dotenv/config";
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* measure-retrieval.ts
|
|
3
|
+
*
|
|
4
|
+
* Evaluates retrieval quality by comparing what Sanity's text search
|
|
5
|
+
* returns against the manually-annotated canonical documents for each
|
|
6
|
+
* evaluation task. Produces Recall@K and NDCG@K metrics.
|
|
7
|
+
*
|
|
8
|
+
* This answers: "Can a retriever find the docs an LLM actually needs?"
|
|
9
|
+
*/
|
|
10
|
+
// oxlint-disable-next-line import/no-unassigned-import -- side-effect: loads .env into process.env
|
|
11
|
+
import "dotenv/config";
|
|
12
|
+
import { writeFileSync, mkdirSync } from "fs";
|
|
13
|
+
import { join, dirname } from "path";
|
|
14
|
+
import { resolveMappings, } from "../pipeline/resolve-mappings.js";
|
|
15
|
+
import { getSanityClient } from "../sanity/client.js";
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
// Metrics: NDCG@K
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
/**
 * Compute binary-relevance NDCG@k for a ranked list of retrieved documents.
 *
 * A retrieved document scores a gain of 1 when it is one of the canonical
 * documents, discounted by log2(rank + 1) for 1-based ranks. Each canonical
 * document is credited at most once, so a slug that appears several times in
 * `retrieved` cannot push the score above 1. The ideal DCG is likewise based
 * on the number of distinct canonical documents.
 *
 * @param {string[]} canonical - Identifiers of the relevant documents.
 * @param {string[]} retrieved - Retrieved identifiers, best match first.
 * @param {number} k - Rank cutoff.
 * @returns {number} NDCG@k in [0, 1]; 0 when there are no canonical documents.
 */
function calculateNDCG(canonical, retrieved, k) {
    const canonicalSet = new Set(canonical);
    // Canonical documents that have already contributed gain.
    const credited = new Set();
    // Discounted Cumulative Gain
    let dcg = 0;
    const depth = Math.min(k, retrieved.length);
    for (let i = 0; i < depth; i++) {
        const doc = retrieved[i];
        if (canonicalSet.has(doc) && !credited.has(doc)) {
            credited.add(doc);
            dcg += 1 / Math.log2(i + 2); // +2 because log2(1) = 0
        }
    }
    // Ideal DCG: every distinct canonical document ranked first.
    let idcg = 0;
    const idealHits = Math.min(k, canonicalSet.size);
    for (let i = 0; i < idealHits; i++) {
        idcg += 1 / Math.log2(i + 2);
    }
    return idcg === 0 ? 0 : dcg / idcg;
}
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
// Metrics: Recall@K
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
/**
 * Compute Recall@k: the fraction of canonical documents that appear among
 * the first `k` retrieved documents.
 *
 * @param {string[]} canonical - Identifiers of the relevant documents.
 * @param {string[]} retrieved - Retrieved identifiers, best match first.
 * @param {number} k - Rank cutoff.
 * @returns {number} Hits divided by `canonical.length`; 0 when `canonical` is empty.
 */
function calculateRecall(canonical, retrieved, k) {
    const topK = new Set(retrieved.slice(0, k));
    let found = 0;
    for (const doc of canonical) {
        if (topK.has(doc)) {
            found += 1;
        }
    }
    if (canonical.length === 0) {
        return 0;
    }
    return found / canonical.length;
}
|
|
43
|
+
/**
 * Entry point: measure retrieval quality for every evaluation task.
 *
 * For each task in each feature area returned by `resolveMappings`, the task
 * description is used as the search query, the top 10 results are fetched via
 * `retrieveDocsForQuery`, and Recall@5, Recall@10 and NDCG@10 are computed
 * against the task's canonical document slugs. Averages are reported per
 * feature area and overall, and everything is written to
 * `<ROOT>/results/latest/retrieval-results.json`.
 *
 * @returns {Promise<void>}
 */
async function main() {
    console.log("=== Sanity AI Literacy — Retrieval Quality Measurement ===\n");
    // Convert the module URL with fileURLToPath rather than reading
    // URL#pathname: pathname stays percent-encoded (a space becomes "%20")
    // and carries a leading slash before the drive letter on Windows.
    // Imported locally because this module does not import "url" at the top.
    const { fileURLToPath } = await import("url");
    const ROOT = join(dirname(fileURLToPath(import.meta.url)), "..", "..");
    // Arithmetic mean that yields 0 for an empty list instead of NaN, so the
    // summary stays printable and JSON-serialisable when there are no tasks.
    const mean = (values) => values.length === 0
        ? 0
        : values.reduce((sum, value) => sum + value, 0) / values.length;
    const mappings = resolveMappings(ROOT);
    const results = [];
    for (const [area, areaData] of Object.entries(mappings.feature_areas)) {
        console.log(`Feature area: ${area}`);
        for (const task of areaData.tasks) {
            const canonicalSlugs = task.canonical_docs.map((d) => d.slug);
            // Use the task description as a search query
            const retrieved = await retrieveDocsForQuery(task.description, 10);
            const result = {
                canonical_docs: canonicalSlugs,
                feature_area: area,
                ndcg_at_10: calculateNDCG(canonicalSlugs, retrieved, 10),
                recall_at_5: calculateRecall(canonicalSlugs, retrieved, 5),
                recall_at_10: calculateRecall(canonicalSlugs, retrieved, 10),
                retrieved_docs: retrieved,
                task_id: task.id,
            };
            results.push(result);
            console.log(` ${task.id}:`);
            console.log(` Recall@5: ${(result.recall_at_5 * 100).toFixed(1)}%`);
            console.log(` Recall@10: ${(result.recall_at_10 * 100).toFixed(1)}%`);
            console.log(` NDCG@10: ${(result.ndcg_at_10 * 100).toFixed(1)}%`);
        }
        console.log();
    }
    // -----------------------------------------------------------------------
    // Aggregate by feature area
    // -----------------------------------------------------------------------
    const byArea = {};
    for (const area of Object.keys(mappings.feature_areas)) {
        const areaResults = results.filter((r) => r.feature_area === area);
        if (areaResults.length === 0)
            continue;
        byArea[area] = {
            avg_ndcg_at_10: mean(areaResults.map((r) => r.ndcg_at_10)),
            avg_recall_at_5: mean(areaResults.map((r) => r.recall_at_5)),
            avg_recall_at_10: mean(areaResults.map((r) => r.recall_at_10)),
            task_count: areaResults.length,
        };
    }
    // -----------------------------------------------------------------------
    // Overall
    // -----------------------------------------------------------------------
    const overall = {
        avg_ndcg_at_10: mean(results.map((r) => r.ndcg_at_10)),
        avg_recall_at_5: mean(results.map((r) => r.recall_at_5)),
        avg_recall_at_10: mean(results.map((r) => r.recall_at_10)),
    };
    // -----------------------------------------------------------------------
    // Print summary
    // -----------------------------------------------------------------------
    console.log("=".repeat(70));
    console.log("RETRIEVAL QUALITY SUMMARY");
    console.log("=".repeat(70));
    console.log();
    console.log("| Feature Area | Recall@5 | Recall@10 | NDCG@10 | Tasks |");
    console.log("|---------------------|----------|-----------|---------|-------|");
    for (const [area, stats] of Object.entries(byArea)) {
        console.log(`| ${area.padEnd(19)} | ${(stats.avg_recall_at_5 * 100).toFixed(1).padStart(7)}% | ` +
            `${(stats.avg_recall_at_10 * 100).toFixed(1).padStart(8)}% | ` +
            `${(stats.avg_ndcg_at_10 * 100).toFixed(1).padStart(6)}% | ` +
            `${stats.task_count.toString().padStart(5)} |`);
    }
    console.log();
    console.log(`Overall: Recall@5=${(overall.avg_recall_at_5 * 100).toFixed(1)}% ` +
        `Recall@10=${(overall.avg_recall_at_10 * 100).toFixed(1)}% ` +
        `NDCG@10=${(overall.avg_ndcg_at_10 * 100).toFixed(1)}%`);
    // -----------------------------------------------------------------------
    // Persist results
    // -----------------------------------------------------------------------
    const summary = { by_area: byArea, overall, results };
    const outDir = join(ROOT, "results", "latest");
    mkdirSync(outDir, { recursive: true });
    writeFileSync(join(outDir, "retrieval-results.json"), JSON.stringify(summary, null, 2));
    console.log("\nResults written to results/latest/retrieval-results.json");
}
|
|
123
|
+
// ---------------------------------------------------------------------------
|
|
124
|
+
// Retrieval via Sanity text search
|
|
125
|
+
// ---------------------------------------------------------------------------
|
|
126
|
+
/**
 * Run a scored text search over published `article` documents and return the
 * slugs of the best matches.
 *
 * The GROQ query excludes documents under the `drafts.` path, boosts title
 * matches (weight 3) over body-text matches (weight 1), orders by descending
 * score and keeps the first `k` results.
 *
 * @param {string} query - Search text, bound to the `$query` parameter.
 * @param {number} [k=10] - Maximum number of results, bound to `$k`.
 * @returns {Promise<Array>} The `slug` value of each result, in ranked order.
 */
async function retrieveDocsForQuery(query, k = 10) {
    const groq = `
*[_type == "article" && !(_id in path("drafts.**"))]
| score(
boost(title match $query, 3),
boost(pt::text(content) match $query, 1)
)
| order(_score desc)
[0...$k] {
"slug": slug.current,
_score
}
`;
    const sanity = getSanityClient();
    const hits = await sanity.fetch(groq, { k, query });
    return hits.map((hit) => hit.slug);
}
|
|
142
|
+
// Run the measurement; any rejection that escapes main() is logged and turned
// into a non-zero exit code.
main().catch((error) => {
    console.error("Fatal error:", error);
    process.exit(1);
});
|
|
@@ -19,6 +19,6 @@
|
|
|
19
19
|
* - SANITY_API_TOKEN with write access to the project
|
|
20
20
|
* - SANITY_PROJECT_ID and SANITY_DATASET configured (or defaults used)
|
|
21
21
|
*
|
|
22
|
-
* @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
|
|
22
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
|
|
23
23
|
*/
|
|
24
24
|
export {};
|
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
* - SANITY_API_TOKEN with write access to the project
|
|
20
20
|
* - SANITY_PROJECT_ID and SANITY_DATASET configured (or defaults used)
|
|
21
21
|
*
|
|
22
|
-
* @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
|
|
22
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
|
|
23
23
|
*/
|
|
24
24
|
import { config as dotenvConfig } from "dotenv";
|
|
25
25
|
import { existsSync } from "fs";
|
|
@@ -19,6 +19,6 @@
|
|
|
19
19
|
* - SANITY_API_TOKEN (or AILF_REPORT_SANITY_API_TOKEN) with write access
|
|
20
20
|
* - SANITY_PROJECT_ID and SANITY_DATASET configured
|
|
21
21
|
*
|
|
22
|
-
* @see docs/exec-plans/tasks-as-content/phase-3-migration.md
|
|
22
|
+
* @see docs/archive/exec-plans/tasks-as-content/phase-3-migration.md
|
|
23
23
|
*/
|
|
24
24
|
export {};
|
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
* - SANITY_API_TOKEN (or AILF_REPORT_SANITY_API_TOKEN) with write access
|
|
20
20
|
* - SANITY_PROJECT_ID and SANITY_DATASET configured
|
|
21
21
|
*
|
|
22
|
-
* @see docs/exec-plans/tasks-as-content/phase-3-migration.md
|
|
22
|
+
* @see docs/archive/exec-plans/tasks-as-content/phase-3-migration.md
|
|
23
23
|
*/
|
|
24
24
|
import { config as dotenvConfig } from "dotenv";
|
|
25
25
|
import { existsSync, readFileSync } from "fs";
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline.ts
|
|
3
|
+
*
|
|
4
|
+
* CLI orchestrator for the modular evaluation pipeline.
|
|
5
|
+
* Runs steps in sequence with validation between each.
|
|
6
|
+
*
|
|
7
|
+
* This is the single entry point for both local and CI evaluation.
|
|
8
|
+
* The CI workflow (eval.yml) calls this script, then layers on
|
|
9
|
+
* CI-specific post-steps (PR comment posting, artifact upload).
|
|
10
|
+
*
|
|
11
|
+
* Usage:
|
|
12
|
+
* pnpm pipeline # full baseline pipeline
|
|
13
|
+
* pnpm pipeline --dry-run # validate only, no execution
|
|
14
|
+
* pnpm pipeline --skip-fetch # reuse cached doc contexts
|
|
15
|
+
* pnpm pipeline --skip-eval # recalculate from existing results
|
|
16
|
+
* pnpm pipeline --mode agentic # run agentic pipeline
|
|
17
|
+
* pnpm pipeline --mode observed # run observed pipeline
|
|
18
|
+
* pnpm pipeline --source staging # use staging doc source
|
|
19
|
+
* pnpm pipeline --debug # run first 2 tests only (fast)
|
|
20
|
+
* pnpm pipeline --debug-n 5 # run first 5 tests
|
|
21
|
+
* pnpm pipeline --debug-pattern "Blog" # filter by description
|
|
22
|
+
* pnpm pipeline --debug-sample 3 # random sample of 3 tests
|
|
23
|
+
* pnpm pipeline --no-cache # bypass caching, force re-run
|
|
24
|
+
* pnpm pipeline --concurrency 64 # override max parallel API calls
|
|
25
|
+
* pnpm pipeline --area groq,frameworks # only evaluate these areas
|
|
26
|
+
* pnpm pipeline --task groq-blog-queries # only evaluate this task
|
|
27
|
+
* pnpm pipeline --changed-docs groq-introduction,how-queries-work
|
|
28
|
+
* # auto-scope to affected tasks
|
|
29
|
+
* pnpm pipeline --url https://... # override docs base URL
|
|
30
|
+
* pnpm pipeline --sanity-dataset staging # override Sanity dataset
|
|
31
|
+
* pnpm pipeline --sanity-project abc123 # override Sanity project ID
|
|
32
|
+
* pnpm pipeline --sanity-perspective agent-c7OKTk
|
|
33
|
+
* # evaluate a Sanity release
|
|
34
|
+
* pnpm pipeline --sanity-document <uuid>
|
|
35
|
+
* # evaluate specific document(s)
|
|
36
|
+
* pnpm pipeline --sanity-document <uuid> --sanity-documents <uuid>
|
|
37
|
+
* # singular and plural aliases work
|
|
38
|
+
* pnpm pipeline --header "X-Vercel-Protection-Bypass: <secret>"
|
|
39
|
+
* # custom HTTP header (repeatable)
|
|
40
|
+
* pnpm pipeline --allowed-origin my-branch.sanity.build
|
|
41
|
+
* # sandbox agent to this origin
|
|
42
|
+
* pnpm pipeline --before published # run before/after impact evaluation
|
|
43
|
+
* pnpm pipeline --before production # before = production source
|
|
44
|
+
* pnpm pipeline --before results/baselines/20260310.json # use existing scores
|
|
45
|
+
* pnpm pipeline --before latest-baseline # use most recent baseline
|
|
46
|
+
* pnpm pipeline --compare # compare scores against latest baseline
|
|
47
|
+
* pnpm pipeline --compare --compare-baseline <path> # compare against specific file
|
|
48
|
+
* pnpm pipeline --compare --threshold 5 # noise threshold for unchanged (default: 2)
|
|
49
|
+
* pnpm pipeline --output /tmp/report.md # write report to specific path
|
|
50
|
+
* pnpm pipeline --promptfoo-url <url> # include Promptfoo URL in report
|
|
51
|
+
* pnpm pipeline --gap-analysis # run failure mode + impact analysis
|
|
52
|
+
* pnpm pipeline --publish # write report to Sanity + fan out to sinks
|
|
53
|
+
* pnpm pipeline --publish --publish-tag "daily-2026-03-11" # tag the report
|
|
54
|
+
* pnpm pipeline --publish --report-dataset ailf-reports # report store dataset
|
|
55
|
+
* pnpm pipeline --publish --report-project abc123 # report store project
|
|
56
|
+
*
|
|
57
|
+
* Override precedence (highest wins):
|
|
58
|
+
* CLI flag (--url, --sanity-dataset, --sanity-project, --allowed-origin)
|
|
59
|
+
* → Environment variable (DOC_BASE_URL, SANITY_DATASET, SANITY_PROJECT_ID, DOC_ALLOWED_ORIGIN)
|
|
60
|
+
* → config/sources.yaml default value
|
|
61
|
+
*
|
|
62
|
+
* --header flags are additive and do not override env vars — they are
|
|
63
|
+
* always merged with any headers defined in DOC_HEADERS env var.
|
|
64
|
+
*
|
|
65
|
+
* Environment variable fallbacks (for CI):
|
|
66
|
+
* DEBUG_EVAL=1 → --debug
|
|
67
|
+
* DEBUG_EVAL_N=2 → --debug-n 2
|
|
68
|
+
* DEBUG_EVAL_PATTERN → --debug-pattern
|
|
69
|
+
* DEBUG_EVAL_SAMPLE → --debug-sample
|
|
70
|
+
* EVAL_FILTER_AREAS → --area
|
|
71
|
+
* EVAL_FILTER_TASKS → --task
|
|
72
|
+
* EVAL_CHANGED_DOCS → --changed-docs
|
|
73
|
+
* AILF_REPORT_DATASET → --report-dataset (report store, not eval)
|
|
74
|
+
* AILF_REPORT_PROJECT_ID → --report-project (report store, not eval)
|
|
75
|
+
*/
|
|
76
|
+
export {};
|