@sanity/ailf 2.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +28 -23
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
- package/dist/_vendor/ailf-core/config-helpers.js +29 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
- package/dist/_vendor/ailf-core/examples/index.js +208 -114
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
- package/dist/_vendor/ailf-tasks/cli.js +61 -0
- package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
- package/dist/_vendor/ailf-tasks/index.js +16 -0
- package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
- package/dist/_vendor/ailf-tasks/parser.js +73 -0
- package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
- package/dist/_vendor/ailf-tasks/schemas.js +180 -0
- package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
- package/dist/_vendor/ailf-tasks/validation.js +162 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +6 -1
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
- package/dist/adapters/task-sources/index.d.ts +1 -2
- package/dist/adapters/task-sources/index.js +1 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
- package/dist/adapters/task-sources/repo-schemas.js +2 -2
- package/dist/adapters/task-sources/repo-task-source.js +1 -1
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
- package/dist/adapters/task-sources/task-file-loader.js +20 -6
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +2 -3
- package/dist/commands/init.js +56 -170
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/composition-root.d.ts +2 -3
- package/dist/composition-root.js +27 -14
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +30 -16
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +50 -15
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +51 -31
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
- package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
- package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
- package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +15 -8
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +15 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/pipeline/mirror-repo-tasks.js +1 -1
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +67 -29
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +24 -24
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* grader-compare.ts
|
|
3
|
+
*
|
|
4
|
+
* CLI for inter-grader comparison (Phase 3 of grader reliability).
|
|
5
|
+
*
|
|
6
|
+
* Re-runs grading assertions on existing eval results using candidate grader
|
|
7
|
+
* models, then compares the resulting scores against the baseline grader.
|
|
8
|
+
*
|
|
9
|
+
* Usage:
|
|
10
|
+
* pnpm grader-compare # compare vs configured candidates
|
|
11
|
+
* pnpm grader-compare --candidate openai:gpt-5.5-preview
|
|
12
|
+
* pnpm grader-compare --candidate openai:gpt-5.5-preview --candidate anthropic:claude-4-opus
|
|
13
|
+
* pnpm grader-compare --results eval-results.json
|
|
14
|
+
* pnpm grader-compare --format json # machine-readable output
|
|
15
|
+
*
|
|
16
|
+
* Reads: results/latest/eval-results.json (model responses to re-grade)
|
|
17
|
+
* Reads: config/models.yaml (baseline grader + optional candidate list)
|
|
18
|
+
* Writes: results/latest/grader-comparison.json
|
|
19
|
+
*
|
|
20
|
+
* @see docs/exec-plans/completed/grader-reliability.md — Phase 3
|
|
21
|
+
*/
|
|
22
|
+
export {};
|
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* grader-compare.ts
|
|
3
|
+
*
|
|
4
|
+
* CLI for inter-grader comparison (Phase 3 of grader reliability).
|
|
5
|
+
*
|
|
6
|
+
* Re-runs grading assertions on existing eval results using candidate grader
|
|
7
|
+
* models, then compares the resulting scores against the baseline grader.
|
|
8
|
+
*
|
|
9
|
+
* Usage:
|
|
10
|
+
* pnpm grader-compare # compare vs configured candidates
|
|
11
|
+
* pnpm grader-compare --candidate openai:gpt-5.5-preview
|
|
12
|
+
* pnpm grader-compare --candidate openai:gpt-5.5-preview --candidate anthropic:claude-4-opus
|
|
13
|
+
* pnpm grader-compare --results eval-results.json
|
|
14
|
+
* pnpm grader-compare --format json # machine-readable output
|
|
15
|
+
*
|
|
16
|
+
* Reads: results/latest/eval-results.json (model responses to re-grade)
|
|
17
|
+
* Reads: config/models.yaml (baseline grader + optional candidate list)
|
|
18
|
+
* Writes: results/latest/grader-comparison.json
|
|
19
|
+
*
|
|
20
|
+
* @see docs/exec-plans/completed/grader-reliability.md — Phase 3
|
|
21
|
+
*/
|
|
22
|
+
import { existsSync, readFileSync, writeFileSync } from "fs";
|
|
23
|
+
import { dirname, join, resolve } from "path";
|
|
24
|
+
import { fileURLToPath } from "url";
|
|
25
|
+
import { load } from "js-yaml";
|
|
26
|
+
import { compareGraders, } from "../pipeline/grader-comparison.js";
|
|
27
|
+
import { classifyCorrelation } from "../pipeline/grader-validation.js";
|
|
28
|
+
import { gradeOnce } from "./grader-api.js";
|
|
29
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
30
|
+
const ROOT = resolve(__dirname, "..", "..");
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
// CLI argument parsing
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
// Raw CLI arguments: process.argv with its first two entries dropped.
const args = process.argv.slice(2);
|
|
35
|
+
/**
 * Collect every value supplied for a repeatable `--<name> <value>` option.
 *
 * A token that itself starts with `--` is never consumed as a value, so
 * `--candidate --format json` yields no candidate rather than "--format".
 *
 * @param {string} name - Option name without the leading dashes.
 * @returns {string[]} Values in command-line order; empty when none were given.
 */
function getAllOptions(name) {
    const flag = `--${name}`;
    const values = [];
    for (let i = 0; i < args.length; i++) {
        if (args[i] !== flag) {
            continue;
        }
        const value = args[i + 1];
        // Missing value (flag is the last token) or another flag: nothing to record.
        if (typeof value !== "string" || value.startsWith("--")) {
            continue;
        }
        values.push(value);
        // Step over the value we just consumed so it is not re-examined as a flag.
        i++;
    }
    return values;
}
|
|
45
|
+
/**
 * Report whether a boolean flag is present on the command line.
 *
 * Long form `--<name>` always matches. A single-character name also matches
 * the conventional short form `-<name>`, so `getFlag("h")` recognises the
 * `-h` alias that the help text advertises.
 *
 * @param {string} name - Flag name without leading dashes.
 * @returns {boolean} True when the flag appears anywhere in the arguments.
 */
function getFlag(name) {
    if (args.includes(`--${name}`)) {
        return true;
    }
    return name.length === 1 && args.includes(`-${name}`);
}
|
|
48
|
+
/**
 * Read the value of a single-valued `--<name> <value>` option.
 *
 * Only the first occurrence of the flag is considered. The token after it is
 * returned unless it is missing or is itself a `--flag`, in which case the
 * option is treated as not provided so callers fall back to their defaults.
 *
 * @param {string} name - Option name without the leading dashes.
 * @returns {string | undefined} The option value, or undefined when absent.
 */
function getOption(name) {
    const idx = args.indexOf(`--${name}`);
    if (idx === -1) {
        return undefined;
    }
    const value = args[idx + 1];
    // Guard against `--results --format json` swallowing "--format" as a path.
    if (typeof value !== "string" || value.startsWith("--")) {
        return undefined;
    }
    return value;
}
|
|
52
|
+
const candidateArgs = getAllOptions("candidate");
|
|
53
|
+
const resultsPath = getOption("results") ?? "results/latest/eval-results.json";
|
|
54
|
+
const format = getOption("format") ?? "table";
|
|
55
|
+
const outputPath = getOption("output");
|
|
56
|
+
const showHelp = getFlag("help") || getFlag("h");
|
|
57
|
+
if (showHelp) {
|
|
58
|
+
console.log(`
|
|
59
|
+
Usage: pnpm grader-compare [options]
|
|
60
|
+
|
|
61
|
+
Compare multiple grader models on the same evaluation responses.
|
|
62
|
+
|
|
63
|
+
Options:
|
|
64
|
+
--candidate <model> Candidate grader model ID (repeatable)
|
|
65
|
+
e.g., --candidate openai:gpt-5.5-preview
|
|
66
|
+
--results <path> Path to eval results (default: results/latest/eval-results.json)
|
|
67
|
+
--format <fmt> Output format: table (default) or json
|
|
68
|
+
--output <path> Write JSON report to file
|
|
69
|
+
--help, -h Show this help
|
|
70
|
+
|
|
71
|
+
If no --candidate flags are provided, reads grader-candidates from config/models.yaml.
|
|
72
|
+
|
|
73
|
+
Examples:
|
|
74
|
+
pnpm grader-compare --candidate openai:gpt-5.5-preview
|
|
75
|
+
pnpm grader-compare --candidate openai:gpt-5.5-preview --candidate anthropic:claude-4-opus
|
|
76
|
+
pnpm grader-compare --format json
|
|
77
|
+
`);
|
|
78
|
+
process.exit(0);
|
|
79
|
+
}
|
|
80
|
+
// ---------------------------------------------------------------------------
|
|
81
|
+
// Dimension classification
|
|
82
|
+
// ---------------------------------------------------------------------------
|
|
83
|
+
// DimensionName imported from pipeline/types.ts
|
|
84
|
+
// Ordered lookup table: the first pattern that matches decides the dimension.
const DIMENSION_PATTERNS = [
    { dimension: "taskCompletion", pattern: /task[_-]?completion/i },
    { dimension: "codeCorrectness", pattern: /code[_-]?correct/i },
    { dimension: "docCoverage", pattern: /doc[_-]?coverage/i },
];
/**
 * Classify a component result into a scoring dimension.
 *
 * The assertion's metric name is checked first; if it matches no pattern the
 * rubric text (`assertion.value`, when it is a string) is checked next.
 *
 * @param {{ assertion?: { metric?: unknown, value?: unknown } } | null | undefined} comp
 *   Component result; a missing component or assertion classifies as null.
 * @returns {string | null} The matching dimension name, or null when
 *   neither the metric nor the rubric identifies one.
 */
function classifyDimension(comp) {
    const assertion = comp?.assertion;
    // String() mirrors the coercion RegExp.prototype.test applies to its argument.
    const metric = String(assertion?.metric ?? "");
    const rubric = typeof assertion?.value === "string" ? assertion.value : "";
    // Metric takes precedence over rubric text; patterns are tried in table order.
    for (const text of [metric, rubric]) {
        const match = DIMENSION_PATTERNS.find(({ pattern }) => pattern.test(text));
        if (match) {
            return match.dimension;
        }
    }
    return null;
}
|
|
105
|
+
// ---------------------------------------------------------------------------
|
|
106
|
+
// Judgment extraction (same pattern as grader-consistency.ts)
|
|
107
|
+
// ---------------------------------------------------------------------------
|
|
108
|
+
/**
 * Derive the feature-area slug from a test description.
 *
 * Descriptions look like "[gold] Area Name — Task Description" (the
 * "[gold]"/"[baseline]" prefix is optional). The text before the first em
 * dash is trimmed, lower-cased and whitespace runs become hyphens.
 *
 * @param {string} description - Test description.
 * @returns {string} Area slug, or "unknown" when there is no em dash.
 */
function detectFeatureArea(description) {
    const withoutPrefix = description.replace(/^\[(?:gold|baseline)\]\s*/i, "");
    const dashAt = withoutPrefix.indexOf("—");
    if (dashAt === -1) {
        return "unknown";
    }
    const areaName = withoutPrefix.slice(0, dashAt).trim();
    return areaName.toLowerCase().replace(/\s+/g, "-");
}
|
|
118
|
+
/**
 * Build a task ID slug from a test description.
 *
 * The optional leading "[gold]" / "[baseline]" tag is removed, the rest is
 * lower-cased, whitespace runs become "-", every character outside
 * `a-z0-9-` is dropped, and the result is capped at 60 characters.
 *
 * @param {string} description - Test description, e.g.
 *   "[gold] Area Name — Task Description".
 * @returns {string} The slug (at most 60 characters).
 */
function detectTaskId(description) {
    const maxLength = 60;
    const withoutTag = description.replace(/^\[(?:gold|baseline)\]\s*/i, "");
    const hyphenated = withoutTag.toLowerCase().replace(/\s+/g, "-");
    const slug = hyphenated.replace(/[^a-z0-9-]/g, "");
    return slug.slice(0, maxLength);
}
|
|
128
|
+
// ---------------------------------------------------------------------------
|
|
129
|
+
// Judgment extraction from eval results
|
|
130
|
+
// ---------------------------------------------------------------------------
|
|
131
|
+
/**
 * Collect the gradable rubric judgments from an eval results object.
 *
 * Walks `evalResults.results.results`. Only entries whose description
 * contains "[gold]" (case-insensitive) are considered. Within those, only
 * `llm-rubric` components that classify into a known dimension and carry a
 * non-empty string rubric produce a judgment.
 *
 * @param {object} evalResults - Parsed eval results.
 * @returns {Array<object>} Judgments with `area`, `dimension`,
 *   `originalScore`, `providerId`, `responseText`, `rubricText`, `taskId`.
 */
function extractJudgments(evalResults) {
    const collected = [];
    for (const entry of evalResults.results?.results ?? []) {
        const description = entry.testCase?.description ?? entry.description ?? "";
        // Only gold tests (with-docs) are processed; baseline tests are skipped.
        const isGold = description.toLowerCase().includes("[gold]");
        if (isGold) {
            const area = detectFeatureArea(description);
            const taskId = detectTaskId(description);
            const providerId = entry.provider?.id;
            const responseText = entry.response?.output ?? "";
            for (const comp of entry.gradingResult?.componentResults ?? []) {
                if (comp.assertion?.type === "llm-rubric") {
                    const dimension = classifyDimension(comp);
                    const rawValue = comp.assertion.value;
                    const rubricText = typeof rawValue === "string" ? rawValue : "";
                    if (dimension && rubricText) {
                        // A missing or non-numeric score is recorded as 0.
                        const originalScore = typeof comp.score === "number" ? comp.score : 0;
                        collected.push({
                            area,
                            dimension,
                            originalScore,
                            providerId,
                            responseText,
                            rubricText,
                            taskId,
                        });
                    }
                }
            }
        }
    }
    return collected;
}
|
|
165
|
+
// ---------------------------------------------------------------------------
|
|
166
|
+
// Formatted output
|
|
167
|
+
// ---------------------------------------------------------------------------
|
|
168
|
+
/**
 * Print a human-readable comparison report to stdout.
 *
 * Emits a header naming the baseline and candidate graders, then for each
 * pairwise comparison the overall correlation / bias / mean absolute
 * difference followed by a per-dimension table, and finally the
 * recommendations section when there are any.
 *
 * @param {object} result - Comparison result with `baselineGrader`,
 *   `candidateGraders`, `pairwise` and `recommendations`.
 * @returns {void}
 */
function formatComparisonReport(result) {
    const rule = "-".repeat(80);
    // Prints a title framed by two horizontal rules.
    const printBanner = (title) => {
        console.log(rule);
        console.log(title);
        console.log(rule);
    };
    printBanner("COMPARISON RESULTS");
    console.log();
    console.log(` Baseline grader: ${result.baselineGrader}`);
    console.log(` Candidates: ${result.candidateGraders.join(", ")}`);
    console.log();
    for (const pair of result.pairwise) {
        printBanner(` ${pair.graderA} vs ${pair.graderB}`);
        console.log();
        console.log(` Overall:`);
        console.log(` Correlation: r=${pair.correlation} (${classifyCorrelation(pair.correlation)})`);
        const sign = pair.bias > 0 ? "+" : "";
        let biasNote;
        if (pair.bias > 0) {
            biasNote = "candidate grades higher";
        }
        else if (pair.bias < 0) {
            biasNote = "candidate grades lower";
        }
        else {
            biasNote = "no systematic bias";
        }
        console.log(` Bias: ${sign}${pair.bias} (${biasNote})`);
        console.log(` Mean Abs Diff: ${pair.meanAbsDiff} points`);
        console.log();
        // Per-dimension table
        console.log("| Dimension | Correlation | Quality | Bias | MAD | Count |");
        console.log("|------------------|-------------|-----------|--------|-------|-------|");
        const rows = [
            ["Task Completion", pair.perDimension.taskCompletion],
            ["Code Correctness", pair.perDimension.codeCorrectness],
            ["Doc Coverage", pair.perDimension.docCoverage],
        ];
        for (const [label, stats] of rows) {
            const quality = classifyCorrelation(stats.correlation);
            // Positive bias gets an explicit "+" so the sign is always visible.
            const biasText = stats.bias > 0 ? `+${stats.bias}` : `${stats.bias}`;
            const cells = [
                label.padEnd(16),
                `r=${String(stats.correlation).padStart(9)}`,
                quality.padEnd(9),
                biasText.padStart(6),
                String(stats.meanAbsDiff).padStart(5),
                String(stats.count).padStart(5),
            ];
            console.log(`| ${cells.join(" | ")} |`);
        }
        console.log();
    }
    // Recommendations
    if (result.recommendations.length === 0) {
        return;
    }
    printBanner("RECOMMENDATIONS");
    console.log();
    for (const rec of result.recommendations) {
        let icon;
        switch (rec.recommendation) {
            case "comparable":
                icon = "✅";
                break;
            case "divergent":
                icon = "⚠️";
                break;
            default:
                icon = "❌";
        }
        console.log(` ${icon} ${rec.modelId}: ${rec.recommendation}`);
        console.log(` ${rec.reason}`);
    }
    console.log();
}
|
|
221
|
+
// ---------------------------------------------------------------------------
|
|
222
|
+
// Config loading
|
|
223
|
+
// ---------------------------------------------------------------------------
|
|
224
|
+
/**
 * Load the baseline grader and the candidate graders.
 *
 * Reads `config/models.yaml` under ROOT and exits the process with code 1
 * when the file is missing. Candidates passed on the command line
 * (`candidateArgs`) take precedence over the `grader-candidates` list in
 * the config file.
 *
 * @returns {{ baselineGrader: { id: string, label: string },
 *             candidates: Array<{ id: string, label: string }> }}
 */
function loadConfig() {
    const modelsPath = join(ROOT, "config", "models.yaml");
    if (!existsSync(modelsPath)) {
        console.error("❌ config/models.yaml not found");
        process.exit(1);
    }
    const data = load(readFileSync(modelsPath, "utf-8"));
    // Default label for a model: the part of its id after the last ":".
    const shortName = (id) => id.split(":").pop() ?? id;
    const baselineGrader = {
        id: data?.grader?.id ?? "openai:gpt-5",
        label: data?.grader?.label ?? "GPT-5 (grader)",
    };
    // CLI candidates override config candidates
    const useCliCandidates = candidateArgs.length > 0;
    const candidates = useCliCandidates
        ? candidateArgs.map((id) => ({ id, label: shortName(id) }))
        : (data?.["grader-candidates"] ?? []).map((entry) => ({
            id: entry.id,
            label: entry.label ?? shortName(entry.id),
        }));
    return { baselineGrader, candidates };
}
|
|
253
|
+
// ---------------------------------------------------------------------------
|
|
254
|
+
// Main execution
|
|
255
|
+
// ---------------------------------------------------------------------------
|
|
256
|
+
/**
 * CLI entry point: re-grade existing eval responses with each candidate
 * grader and compare the scores against the baseline grader.
 *
 * Steps:
 *  1. Load the baseline grader and candidates (exit 1 if no candidates).
 *  2. Read and parse the eval results file (exit 1 if it is missing).
 *  3. Extract gold-test judgments (exit 1 if there are none).
 *  4. Build baseline scores from the original scores, scaled by 100 and
 *     rounded.
 *  5. Grade every judgment once with each candidate, sequentially,
 *     printing progress; failed gradings (null score) are counted and
 *     left out of that candidate's score set.
 *  6. Run the comparison and print it as JSON or as a formatted report.
 *  7. Best-effort save of the comparison to
 *     results/latest/grader-comparison.json.
 *
 * @returns {Promise<void>}
 */
async function main() {
    console.log("=".repeat(80));
    console.log(" INTER-GRADER COMPARISON");
    console.log("=".repeat(80));
    console.log();
    // Load config
    const { baselineGrader, candidates } = loadConfig();
    if (candidates.length === 0) {
        console.error("❌ No candidate graders specified. Use --candidate <model> or add grader-candidates to config/models.yaml.");
        process.exit(1);
    }
    console.log(` Baseline grader: ${baselineGrader.id} (${baselineGrader.label})`);
    for (const c of candidates) {
        console.log(` Candidate: ${c.id} (${c.label})`);
    }
    console.log();
    // Load eval results
    const evalResultsPath = resolve(ROOT, resultsPath);
    if (!existsSync(evalResultsPath)) {
        console.error(`❌ Eval results not found: ${evalResultsPath}`);
        console.error(" Run the evaluation pipeline first: pnpm pipeline");
        process.exit(1);
    }
    const evalResults = JSON.parse(readFileSync(evalResultsPath, "utf-8"));
    // Extract judgments
    const judgments = extractJudgments(evalResults);
    console.log(` Judgments found: ${judgments.length}`);
    if (judgments.length === 0) {
        console.error("❌ No gold-test judgments found in eval results.");
        process.exit(1);
    }
    // Build baseline scores from original eval results
    const baselineScores = judgments.map((j) => ({
        area: j.area,
        dimension: j.dimension,
        score: Math.round(j.originalScore * 100),
        taskId: j.taskId,
    }));
    const baselineScoreSet = {
        label: baselineGrader.label,
        modelId: baselineGrader.id,
        scores: baselineScores,
    };
    // Grade with each candidate
    const candidateScoreSets = [];
    for (const candidate of candidates) {
        console.log();
        console.log(` Grading with ${candidate.id}...`);
        const candidateScores = [];
        let completed = 0;
        let failed = 0;
        for (const j of judgments) {
            const score = await gradeOnce(candidate.id, j.responseText, j.rubricText);
            completed++;
            if (score !== null) {
                candidateScores.push({
                    area: j.area,
                    dimension: j.dimension,
                    score,
                    taskId: j.taskId,
                });
            }
            else {
                failed++;
            }
            // Refresh the single progress line every 10 gradings and at the end.
            if (completed % 10 === 0 || completed === judgments.length) {
                process.stdout.write(`\r Progress: ${completed}/${judgments.length}${failed > 0 ? ` (${failed} failed)` : ""}`);
            }
        }
        console.log();
        candidateScoreSets.push({
            label: candidate.label,
            modelId: candidate.id,
            scores: candidateScores,
        });
    }
    console.log();
    // Run comparison
    const comparison = compareGraders(baselineScoreSet, candidateScoreSets);
    // Output
    if (format === "json") {
        const json = JSON.stringify(comparison, null, 2);
        if (outputPath) {
            writeFileSync(outputPath, json);
            console.log(` ✅ Report written to ${outputPath}`);
        }
        else {
            console.log(json);
        }
    }
    else {
        formatComparisonReport(comparison);
    }
    // Write to results/latest/
    const resultFilePath = join(ROOT, "results", "latest", "grader-comparison.json");
    try {
        writeFileSync(resultFilePath, JSON.stringify(comparison, null, 2));
        console.log(` 📄 Report saved: ${resultFilePath}`);
    }
    catch (err) {
        // results/latest/ may not exist yet — that expected case stays silent.
        // Any other failure (permissions, disk full, ...) is reported so a
        // missing report file does not go unnoticed; it is still non-fatal.
        if (err?.code !== "ENOENT") {
            console.warn(` ⚠️ Could not save report to ${resultFilePath}: ${err?.message ?? err}`);
        }
    }
}
|
|
361
|
+
// Run main() only when this file is the script Node was asked to execute
// (source .ts or compiled .js), so importing the module has no side effects.
if (["grader-compare.ts", "grader-compare.js"].some((fileName) => process.argv[1]?.endsWith(fileName))) {
    main().catch(function reportFatal(err) {
        console.error("❌ Fatal error:", err);
        process.exit(1);
    });
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* grader-consistency.ts
|
|
3
|
+
*
|
|
4
|
+
* CLI script for measuring grader consistency (Phase 1 of grader reliability).
|
|
5
|
+
*
|
|
6
|
+
* Reads existing eval results, re-runs ONLY the grading assertions N additional
|
|
7
|
+
* times with the configured grader model, and analyzes score variance.
|
|
8
|
+
*
|
|
9
|
+
* This does NOT re-run the models under test — it only re-grades the same
|
|
10
|
+
* responses. Cost is low: ~$0.005 per grading call × N replications.
|
|
11
|
+
*
|
|
12
|
+
* Usage:
|
|
13
|
+
* pnpm grader-consistency # 5 replications (default)
|
|
14
|
+
* pnpm grader-consistency --replications 3 # custom count
|
|
15
|
+
* pnpm grader-consistency --results <path> # custom results file
|
|
16
|
+
*
|
|
17
|
+
* Reads: results/latest/eval-results.json (default)
|
|
18
|
+
* Writes: results/latest/grader-consistency.json
|
|
19
|
+
*/
|
|
20
|
+
import "dotenv/config";
|