@sanity/ailf 1.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +29 -12
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
- package/dist/_vendor/ailf-core/config-helpers.js +51 -2
- package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
- package/dist/_vendor/ailf-core/examples/index.js +213 -94
- package/dist/_vendor/ailf-core/index.d.ts +3 -2
- package/dist/_vendor/ailf-core/index.js +2 -1
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
- package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +7 -1
- package/dist/adapters/config-sources/ts-config-loader.js +21 -13
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
- package/dist/adapters/task-sources/index.d.ts +3 -4
- package/dist/adapters/task-sources/index.js +3 -4
- package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
- package/dist/adapters/task-sources/repo-schemas.js +228 -20
- package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
- package/dist/adapters/task-sources/repo-task-source.js +81 -122
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
- package/dist/adapters/task-sources/task-file-loader.js +21 -7
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/coverage-audit.js +3 -1
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +5 -4
- package/dist/commands/init.js +190 -25
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +15 -4
- package/dist/composition-root.js +100 -55
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/build-step-sequence.js +4 -2
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +32 -19
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +77 -26
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +51 -31
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
- package/dist/pipeline/compiler/literacy-bridge.js +2 -2
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
- package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
- package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/expand-tasks.d.ts +2 -2
- package/dist/pipeline/expand-tasks.js +2 -2
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +16 -9
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +16 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
- package/dist/pipeline/mirror-repo-tasks.js +10 -10
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +68 -30
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +32 -24
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* grader-compare.ts
|
|
3
|
+
*
|
|
4
|
+
* CLI for inter-grader comparison (Phase 3 of grader reliability).
|
|
5
|
+
*
|
|
6
|
+
* Re-runs grading assertions on existing eval results using candidate grader
|
|
7
|
+
* models, then compares the resulting scores against the baseline grader.
|
|
8
|
+
*
|
|
9
|
+
* Usage:
|
|
10
|
+
* pnpm grader-compare # compare vs configured candidates
|
|
11
|
+
* pnpm grader-compare --candidate openai:gpt-5.5-preview
|
|
12
|
+
* pnpm grader-compare --candidate openai:gpt-5.5-preview --candidate anthropic:claude-4-opus
|
|
13
|
+
* pnpm grader-compare --results eval-results.json
|
|
14
|
+
* pnpm grader-compare --format json # machine-readable output
|
|
15
|
+
*
|
|
16
|
+
* Reads: results/latest/eval-results.json (model responses to re-grade)
|
|
17
|
+
* Reads: config/models.yaml (baseline grader + optional candidate list)
|
|
18
|
+
* Writes: results/latest/grader-comparison.json
|
|
19
|
+
*
|
|
20
|
+
* @see docs/exec-plans/completed/grader-reliability.md — Phase 3
|
|
21
|
+
*/
|
|
22
|
+
// Empty export: this declaration file exposes no names; the statement only marks it as an ES module.
export {};
|
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* grader-compare.ts
|
|
3
|
+
*
|
|
4
|
+
* CLI for inter-grader comparison (Phase 3 of grader reliability).
|
|
5
|
+
*
|
|
6
|
+
* Re-runs grading assertions on existing eval results using candidate grader
|
|
7
|
+
* models, then compares the resulting scores against the baseline grader.
|
|
8
|
+
*
|
|
9
|
+
* Usage:
|
|
10
|
+
* pnpm grader-compare # compare vs configured candidates
|
|
11
|
+
* pnpm grader-compare --candidate openai:gpt-5.5-preview
|
|
12
|
+
* pnpm grader-compare --candidate openai:gpt-5.5-preview --candidate anthropic:claude-4-opus
|
|
13
|
+
* pnpm grader-compare --results eval-results.json
|
|
14
|
+
* pnpm grader-compare --format json # machine-readable output
|
|
15
|
+
*
|
|
16
|
+
* Reads: results/latest/eval-results.json (model responses to re-grade)
|
|
17
|
+
* Reads: config/models.yaml (baseline grader + optional candidate list)
|
|
18
|
+
* Writes: results/latest/grader-comparison.json
|
|
19
|
+
*
|
|
20
|
+
* @see docs/exec-plans/completed/grader-reliability.md — Phase 3
|
|
21
|
+
*/
|
|
22
|
+
import { existsSync, readFileSync, writeFileSync } from "fs";
|
|
23
|
+
import { dirname, join, resolve } from "path";
|
|
24
|
+
import { fileURLToPath } from "url";
|
|
25
|
+
import { load } from "js-yaml";
|
|
26
|
+
import { compareGraders, } from "../pipeline/grader-comparison.js";
|
|
27
|
+
import { classifyCorrelation } from "../pipeline/grader-validation.js";
|
|
28
|
+
import { gradeOnce } from "./grader-api.js";
|
|
29
|
+
// ES modules have no built-in __dirname, so derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Directory two levels above the one containing this script.
const ROOT = resolve(__dirname, "../..");
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
// CLI argument parsing
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
// Command-line arguments with the first two process.argv entries dropped.
const [, , ...args] = process.argv;
|
|
35
|
+
/**
 * Collect the value that follows every occurrence of a repeatable option.
 *
 * @param {string} name - Option name without the leading `--`.
 * @returns {string[]} The argument after each `--<name>` occurrence, in
 *   command-line order. An occurrence that is the very last argument has no
 *   value and is ignored.
 */
function getAllOptions(name) {
    const wanted = `--${name}`;
    const lastIndex = args.length - 1;
    return args.reduce((values, token, position) => {
        if (token === wanted && position < lastIndex) {
            values.push(args[position + 1]);
        }
        return values;
    }, []);
}
|
|
45
|
+
/**
 * Report whether a boolean `--<name>` flag is present on the command line.
 *
 * @param {string} name - Flag name without the leading `--`.
 * @returns {boolean} True when some argument is exactly `--<name>`.
 */
function getFlag(name) {
    const wanted = `--${name}`;
    return args.some((token) => token === wanted);
}
|
|
48
|
+
/**
 * Read the value of a single-valued `--<name> <value>` option.
 *
 * @param {string} name - Option name without the leading `--`.
 * @returns {string | undefined} The argument right after the first
 *   `--<name>`, or undefined when the option is absent or is the last
 *   argument.
 */
function getOption(name) {
    const position = args.indexOf(`--${name}`);
    const valuePosition = position + 1;
    if (position === -1 || valuePosition >= args.length) {
        return undefined;
    }
    return args[valuePosition];
}
|
|
52
|
+
// Parsed CLI options. Defaults apply when the option is not given.
const candidateArgs = getAllOptions("candidate");
const resultsPath = getOption("results") ?? "results/latest/eval-results.json";
const format = getOption("format") ?? "table";
const outputPath = getOption("output");
// The help text advertises `-h`, but getFlag() only matches `--`-prefixed
// flags, so the single-dash short form has to be checked explicitly.
// `--h` is still accepted for backward compatibility.
const showHelp = getFlag("help") || getFlag("h") || args.includes("-h");
|
|
57
|
+
// Print usage and exit successfully when help was requested.
if (showHelp) {
    const helpLines = [
        "",
        "Usage: pnpm grader-compare [options]",
        "",
        "Compare multiple grader models on the same evaluation responses.",
        "",
        "Options:",
        "--candidate <model> Candidate grader model ID (repeatable)",
        "e.g., --candidate openai:gpt-5.5-preview",
        "--results <path> Path to eval results (default: results/latest/eval-results.json)",
        "--format <fmt> Output format: table (default) or json",
        "--output <path> Write JSON report to file",
        "--help, -h Show this help",
        "",
        "If no --candidate flags are provided, reads grader-candidates from config/models.yaml.",
        "",
        "Examples:",
        "pnpm grader-compare --candidate openai:gpt-5.5-preview",
        "pnpm grader-compare --candidate openai:gpt-5.5-preview --candidate anthropic:claude-4-opus",
        "pnpm grader-compare --format json",
        "",
    ];
    console.log(helpLines.join("\n"));
    process.exit(0);
}
|
|
80
|
+
// ---------------------------------------------------------------------------
|
|
81
|
+
// Dimension classification
|
|
82
|
+
// ---------------------------------------------------------------------------
|
|
83
|
+
// DimensionName imported from pipeline/types.ts
|
|
84
|
+
// Ordered list of scoring dimensions and the case-insensitive pattern that
// identifies each one in a metric name or rubric text.
const DIMENSION_PATTERNS = [
    ["taskCompletion", /task[_-]?completion/i],
    ["codeCorrectness", /code[_-]?correct/i],
    ["docCoverage", /doc[_-]?coverage/i],
].map(([dimension, pattern]) => ({ dimension, pattern }));
|
|
89
|
+
/**
 * Map a component result to one of the dimensions in DIMENSION_PATTERNS.
 *
 * The assertion's metric name is tried against every pattern first; only if
 * none matches is the rubric text (the assertion value, when it is a string)
 * tried.
 *
 * @param {object} comp - Component result; `comp.assertion` is optional.
 * @returns {string | null} The matching dimension name, or null when neither
 *   the metric nor the rubric matches any pattern.
 */
function classifyDimension(comp) {
    const assertion = comp.assertion;
    const metricName = assertion?.metric ?? "";
    const rubricText = typeof assertion?.value === "string" ? assertion.value : "";
    // Order matters: a metric match always wins over a rubric match.
    for (const text of [metricName, rubricText]) {
        const hit = DIMENSION_PATTERNS.find(({ pattern }) => pattern.test(text));
        if (hit) {
            return hit.dimension;
        }
    }
    return null;
}
|
|
105
|
+
// ---------------------------------------------------------------------------
|
|
106
|
+
// Test description parsing helpers (used by judgment extraction)
|
|
107
|
+
// ---------------------------------------------------------------------------
|
|
108
|
+
/**
 * Derive a feature-area slug from a test description.
 *
 * Expected shape: "[gold] Area Name — Task Description" (the leading
 * "[gold]" / "[baseline]" tag is optional). The text before the first em
 * dash is trimmed, lower-cased, and has whitespace runs replaced by "-".
 *
 * @param {string} description - Test description.
 * @returns {string} The area slug, or "unknown" when the description contains
 *   no em dash.
 */
function detectFeatureArea(description) {
    const withoutTag = description.replace(/^\[(?:gold|baseline)\]\s*/i, "");
    const dashIndex = withoutTag.indexOf("—");
    if (dashIndex < 0) {
        return "unknown";
    }
    const areaName = withoutTag.slice(0, dashIndex).trim();
    return areaName.toLowerCase().replace(/\s+/g, "-");
}
|
|
118
|
+
/**
 * Build a task-ID slug from a test description.
 *
 * Strips a leading "[gold]" / "[baseline]" tag, lower-cases the rest, turns
 * whitespace runs into "-", drops every character outside [a-z0-9-], and
 * keeps at most the first 60 characters.
 *
 * @param {string} description - Test description, e.g.
 *   "[gold] Area Name — Task Description".
 * @returns {string} The slug (at most 60 characters).
 */
function detectTaskId(description) {
    const withoutTag = description.replace(/^\[(?:gold|baseline)\]\s*/i, "");
    const hyphenated = withoutTag.toLowerCase().replace(/\s+/g, "-");
    const slug = hyphenated.replace(/[^a-z0-9-]/g, "");
    return slug.slice(0, 60);
}
|
|
128
|
+
// ---------------------------------------------------------------------------
|
|
129
|
+
// Judgment extraction (same pattern as grader-consistency.ts)
|
|
130
|
+
// ---------------------------------------------------------------------------
|
|
131
|
+
/**
 * Collect one judgment per classifiable `llm-rubric` assertion found in the
 * gold tests of an eval-results document.
 *
 * A result is considered only when its description contains "[gold]"
 * (case-insensitive). A component is kept only when its assertion type is
 * "llm-rubric", classifyDimension() yields a dimension, and the assertion
 * value is a non-empty string.
 *
 * @param {object} evalResults - Parsed eval results; entries are read from
 *   `evalResults.results.results`.
 * @returns {Array<object>} Judgments with `area`, `dimension`,
 *   `originalScore` (0 when the component score is not a number),
 *   `providerId`, `responseText`, `rubricText` and `taskId`.
 */
function extractJudgments(evalResults) {
    const collected = [];
    for (const entry of evalResults.results?.results ?? []) {
        const description = entry.testCase?.description ?? entry.description ?? "";
        const isGoldTest = description.toLowerCase().includes("[gold]");
        if (isGoldTest) {
            const area = detectFeatureArea(description);
            const taskId = detectTaskId(description);
            const providerId = entry.provider?.id;
            for (const comp of entry.gradingResult?.componentResults ?? []) {
                if (comp.assertion?.type === "llm-rubric") {
                    const dimension = classifyDimension(comp);
                    const rubricValue = comp.assertion.value;
                    const rubricText = typeof rubricValue === "string" ? rubricValue : "";
                    if (dimension && rubricText) {
                        const hasNumericScore = typeof comp.score === "number";
                        collected.push({
                            area,
                            dimension,
                            originalScore: hasNumericScore ? comp.score : 0,
                            providerId,
                            responseText: entry.response?.output ?? "",
                            rubricText,
                            taskId,
                        });
                    }
                }
            }
        }
    }
    return collected;
}
|
|
165
|
+
// ---------------------------------------------------------------------------
|
|
166
|
+
// Formatted output
|
|
167
|
+
// ---------------------------------------------------------------------------
|
|
168
|
+
/**
 * Print a human-readable comparison report with console.log.
 *
 * Output consists of a summary of the graders involved, then for every
 * pairwise comparison the overall statistics plus a per-dimension table, and
 * finally the recommendations (when there are any).
 *
 * @param {object} result - Comparison result. Fields read here:
 *   `baselineGrader`, `candidateGraders` (array), `pairwise` (entries with
 *   `graderA`, `graderB`, `correlation`, `bias`, `meanAbsDiff`,
 *   `perDimension`) and `recommendations` (entries with `modelId`,
 *   `recommendation`, `reason`).
 * @returns {void}
 */
function formatComparisonReport(result) {
    const rule = "-".repeat(80);
    // [title, width] per table column. The widths are the ones used to pad the
    // data rows below; the header and separator are derived from them so all
    // three always line up.
    const columns = [
        ["Dimension", 16],
        ["Correlation", 11],
        ["Quality", 9],
        ["Bias", 6],
        ["MAD", 5],
        ["Count", 5],
    ];
    const header = `| ${columns.map(([title, width]) => title.padEnd(width)).join(" | ")} |`;
    const separator = `|${columns.map(([, width]) => "-".repeat(width + 2)).join("|")}|`;
    console.log(rule);
    console.log("COMPARISON RESULTS");
    console.log(rule);
    console.log();
    console.log(` Baseline grader: ${result.baselineGrader}`);
    console.log(` Candidates: ${result.candidateGraders.join(", ")}`);
    console.log();
    for (const pair of result.pairwise) {
        console.log(rule);
        console.log(` ${pair.graderA} vs ${pair.graderB}`);
        console.log(rule);
        console.log();
        console.log(` Overall:`);
        console.log(` Correlation: r=${pair.correlation} (${classifyCorrelation(pair.correlation)})`);
        const biasNote = pair.bias > 0
            ? "candidate grades higher"
            : pair.bias < 0
                ? "candidate grades lower"
                : "no systematic bias";
        console.log(` Bias: ${pair.bias > 0 ? "+" : ""}${pair.bias} (${biasNote})`);
        console.log(` Mean Abs Diff: ${pair.meanAbsDiff} points`);
        console.log();
        // Per-dimension table
        console.log(header);
        console.log(separator);
        const dims = [
            { data: pair.perDimension?.taskCompletion, name: "Task Completion" },
            { data: pair.perDimension?.codeCorrectness, name: "Code Correctness" },
            { data: pair.perDimension?.docCoverage, name: "Doc Coverage" },
        ];
        for (const { data, name } of dims) {
            // A dimension with no statistics is left out of the table rather
            // than crashing the whole report.
            if (!data) {
                continue;
            }
            const quality = classifyCorrelation(data.correlation);
            const biasStr = data.bias > 0 ? `+${data.bias}` : `${data.bias}`;
            console.log(`| ${name.padEnd(16)} | r=${String(data.correlation).padStart(9)} | ${quality.padEnd(9)} | ${biasStr.padStart(6)} | ${String(data.meanAbsDiff).padStart(5)} | ${String(data.count).padStart(5)} |`);
        }
        console.log();
    }
    // Recommendations
    if (result.recommendations.length > 0) {
        console.log(rule);
        console.log("RECOMMENDATIONS");
        console.log(rule);
        console.log();
        for (const rec of result.recommendations) {
            const icon = rec.recommendation === "comparable"
                ? "✅"
                : rec.recommendation === "divergent"
                    ? "⚠️"
                    : "❌";
            console.log(` ${icon} ${rec.modelId}: ${rec.recommendation}`);
            console.log(` ${rec.reason}`);
        }
        console.log();
    }
}
|
|
221
|
+
// ---------------------------------------------------------------------------
|
|
222
|
+
// Config loading
|
|
223
|
+
// ---------------------------------------------------------------------------
|
|
224
|
+
/**
 * Read config/models.yaml and resolve the baseline grader and the candidate
 * graders to compare against it.
 *
 * Exits the process with code 1 when the config file does not exist.
 * Candidates given with --candidate on the command line replace the
 * `grader-candidates` list from the config file.
 *
 * @returns {{ baselineGrader: { id: string, label: string },
 *   candidates: Array<{ id: string, label: string }> }}
 */
function loadConfig() {
    const modelsPath = join(ROOT, "config", "models.yaml");
    if (!existsSync(modelsPath)) {
        console.error("❌ config/models.yaml not found");
        process.exit(1);
    }
    const data = load(readFileSync(modelsPath, "utf-8"));
    const configuredGrader = data?.grader;
    const baselineGrader = {
        id: configuredGrader?.id ?? "openai:gpt-5",
        label: configuredGrader?.label ?? "GPT-5 (grader)",
    };
    // Default label: the part of the model ID after the last ":".
    const shortName = (modelId) => modelId.split(":").pop() ?? modelId;
    // CLI candidates override config candidates
    const candidates = candidateArgs.length > 0
        ? candidateArgs.map((id) => ({ id, label: shortName(id) }))
        : (data?.["grader-candidates"] ?? []).map((entry) => ({
            id: entry.id,
            label: entry.label ?? shortName(entry.id),
        }));
    return { baselineGrader, candidates };
}
|
|
253
|
+
// ---------------------------------------------------------------------------
|
|
254
|
+
// Main execution
|
|
255
|
+
// ---------------------------------------------------------------------------
|
|
256
|
+
/**
 * Run the inter-grader comparison end to end.
 *
 * Steps: load the grader config, read the eval results, extract the gold-test
 * judgments, re-grade every judgment with each candidate grader (one call at
 * a time), compare the candidate scores against the baseline scores, and
 * report the outcome.
 *
 * Output rules:
 *  - `--format json` without `--output` prints only the JSON report on
 *    stdout; banner and progress text go to stderr so the output stays
 *    machine-readable.
 *  - `--output <path>` writes the JSON report to that path in any format.
 *  - The report is also saved to results/latest/grader-comparison.json; a
 *    failure to do so is reported as a warning and is not fatal.
 *
 * Exits the process with code 1 when no candidates are configured, the eval
 * results file is missing, or no gold-test judgments are found.
 *
 * @returns {Promise<void>}
 */
async function main() {
    // When the JSON report itself is the stdout payload, keep every status
    // line off stdout.
    const jsonToStdout = format === "json" && !outputPath;
    const status = (...parts) => {
        if (jsonToStdout) {
            console.error(...parts);
        }
        else {
            console.log(...parts);
        }
    };
    const progressStream = jsonToStdout ? process.stderr : process.stdout;
    status("=".repeat(80));
    status(" INTER-GRADER COMPARISON");
    status("=".repeat(80));
    status();
    // Load config
    const { baselineGrader, candidates } = loadConfig();
    if (candidates.length === 0) {
        console.error("❌ No candidate graders specified. Use --candidate <model> or add grader-candidates to config/models.yaml.");
        process.exit(1);
    }
    status(` Baseline grader: ${baselineGrader.id} (${baselineGrader.label})`);
    for (const c of candidates) {
        status(` Candidate: ${c.id} (${c.label})`);
    }
    status();
    // Load eval results
    const evalResultsPath = resolve(ROOT, resultsPath);
    if (!existsSync(evalResultsPath)) {
        console.error(`❌ Eval results not found: ${evalResultsPath}`);
        console.error(" Run the evaluation pipeline first: pnpm pipeline");
        process.exit(1);
    }
    const evalResults = JSON.parse(readFileSync(evalResultsPath, "utf-8"));
    // Extract judgments
    const judgments = extractJudgments(evalResults);
    status(` Judgments found: ${judgments.length}`);
    if (judgments.length === 0) {
        console.error("❌ No gold-test judgments found in eval results.");
        process.exit(1);
    }
    // Build baseline scores from original eval results
    // (the stored score is multiplied by 100 and rounded to an integer).
    const baselineScores = judgments.map((j) => ({
        area: j.area,
        dimension: j.dimension,
        score: Math.round(j.originalScore * 100),
        taskId: j.taskId,
    }));
    const baselineScoreSet = {
        label: baselineGrader.label,
        modelId: baselineGrader.id,
        scores: baselineScores,
    };
    // Grade with each candidate
    const candidateScoreSets = [];
    for (const candidate of candidates) {
        status();
        status(` Grading with ${candidate.id}...`);
        const candidateScores = [];
        let completed = 0;
        let failed = 0;
        for (const j of judgments) {
            const score = await gradeOnce(candidate.id, j.responseText, j.rubricText);
            completed++;
            // A null score counts as a failed grading call and is left out of
            // the candidate's score set.
            if (score !== null) {
                candidateScores.push({
                    area: j.area,
                    dimension: j.dimension,
                    score,
                    taskId: j.taskId,
                });
            }
            else {
                failed++;
            }
            if (completed % 10 === 0 || completed === judgments.length) {
                progressStream.write(`\r Progress: ${completed}/${judgments.length}${failed > 0 ? ` (${failed} failed)` : ""}`);
            }
        }
        status();
        candidateScoreSets.push({
            label: candidate.label,
            modelId: candidate.id,
            scores: candidateScores,
        });
    }
    status();
    // Run comparison
    const comparison = compareGraders(baselineScoreSet, candidateScoreSets);
    const json = JSON.stringify(comparison, null, 2);
    // Output
    if (format === "json") {
        if (!outputPath) {
            console.log(json);
        }
    }
    else {
        formatComparisonReport(comparison);
    }
    if (outputPath) {
        writeFileSync(outputPath, json);
        status(` ✅ Report written to ${outputPath}`);
    }
    // Write to results/latest/
    const resultFilePath = join(ROOT, "results", "latest", "grader-comparison.json");
    try {
        writeFileSync(resultFilePath, json);
        status(` 📄 Report saved: ${resultFilePath}`);
    }
    catch (err) {
        // Best effort: results/latest/ may not exist yet. Tell the user instead
        // of failing silently.
        const reason = err instanceof Error ? err.message : String(err);
        console.warn(` ⚠️ Could not save report to ${resultFilePath}: ${reason}`);
    }
}
|
|
361
|
+
// Only run when invoked directly
|
|
362
|
+
// Start main() only when the script path given to Node ends with this file's
// name (source or compiled); any rejection is logged and exits with code 1.
if (["grader-compare.ts", "grader-compare.js"].some((fileName) => process.argv[1]?.endsWith(fileName))) {
    main().catch((err) => {
        console.error("❌ Fatal error:", err);
        process.exit(1);
    });
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* grader-consistency.ts
|
|
3
|
+
*
|
|
4
|
+
* CLI script for measuring grader consistency (Phase 1 of grader reliability).
|
|
5
|
+
*
|
|
6
|
+
* Reads existing eval results, re-runs ONLY the grading assertions N additional
|
|
7
|
+
* times with the configured grader model, and analyzes score variance.
|
|
8
|
+
*
|
|
9
|
+
* This does NOT re-run the models under test — it only re-grades the same
|
|
10
|
+
* responses. Cost is low: ~$0.005 per grading call × N replications.
|
|
11
|
+
*
|
|
12
|
+
* Usage:
|
|
13
|
+
* pnpm grader-consistency # 5 replications (default)
|
|
14
|
+
* pnpm grader-consistency --replications 3 # custom count
|
|
15
|
+
* pnpm grader-consistency --results <path> # custom results file
|
|
16
|
+
*
|
|
17
|
+
* Reads: results/latest/eval-results.json (default)
|
|
18
|
+
* Writes: results/latest/grader-consistency.json
|
|
19
|
+
*/
|
|
20
|
+
import "dotenv/config";
|