@sanity/ailf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -0
- package/bin/ailf.js +64 -0
- package/canonical/grader-references/README.md +88 -0
- package/canonical/grader-references/groq.yaml +234 -0
- package/canonical/grader-references/studio-setup.yaml +275 -0
- package/canonical/reference-solutions/.gitkeep +1 -0
- package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
- package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
- package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
- package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
- package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
- package/canonical/reference-solutions/groq/joins-references.ts +300 -0
- package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
- package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
- package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
- package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
- package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
- package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
- package/config/bigquery/README.md +74 -0
- package/config/bigquery/views/area_scores.sql +87 -0
- package/config/bigquery/views/reports.sql +49 -0
- package/config/features.yaml +116 -0
- package/config/models.yaml +115 -0
- package/config/prompts.yaml +75 -0
- package/config/rubrics.yaml +62 -0
- package/config/schedules.yaml +43 -0
- package/config/sinks.yaml +54 -0
- package/config/sources.yaml +51 -0
- package/config/thresholds.yaml +49 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
- package/dist/_vendor/ailf-core/examples/index.js +285 -0
- package/dist/_vendor/ailf-core/index.d.ts +17 -0
- package/dist/_vendor/ailf-core/index.js +17 -0
- package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
- package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
- package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
- package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
- package/dist/_vendor/ailf-core/ports/context.js +14 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
- package/dist/_vendor/ailf-core/ports/index.js +7 -0
- package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
- package/dist/_vendor/ailf-core/ports/logger.js +11 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
- package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/schemas/index.js +16 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
- package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
- package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
- package/dist/_vendor/ailf-core/services/index.js +12 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/scoring.js +222 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
- package/dist/_vendor/ailf-core/types/index.js +21 -0
- package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
- package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
- package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
- package/dist/_vendor/ailf-shared/document-ref.js +1 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -0
- package/dist/_vendor/ailf-shared/index.js +16 -0
- package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
- package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
- package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
- package/dist/_vendor/ailf-shared/score-grades.js +23 -0
- package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
- package/dist/adapters/cache/content-lake-cache.js +59 -0
- package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
- package/dist/adapters/cache/filesystem-cache.js +54 -0
- package/dist/adapters/cache/index.d.ts +2 -0
- package/dist/adapters/cache/index.js +2 -0
- package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
- package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
- package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
- package/dist/adapters/config-sources/file-config-adapter.js +96 -0
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +2 -0
- package/dist/adapters/doc-fetchers/index.d.ts +1 -0
- package/dist/adapters/doc-fetchers/index.js +1 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
- package/dist/adapters/eval-runners/index.d.ts +1 -0
- package/dist/adapters/eval-runners/index.js +1 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
- package/dist/adapters/index.d.ts +12 -0
- package/dist/adapters/index.js +12 -0
- package/dist/adapters/loggers/console-logger.d.ts +22 -0
- package/dist/adapters/loggers/console-logger.js +54 -0
- package/dist/adapters/loggers/index.d.ts +9 -0
- package/dist/adapters/loggers/index.js +9 -0
- package/dist/adapters/loggers/json-logger.d.ts +18 -0
- package/dist/adapters/loggers/json-logger.js +33 -0
- package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
- package/dist/adapters/loggers/quiet-logger.js +30 -0
- package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/composite-task-source.js +59 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
- package/dist/adapters/task-sources/index.d.ts +7 -0
- package/dist/adapters/task-sources/index.js +7 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
- package/dist/adapters/task-sources/repo-schemas.js +234 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
- package/dist/adapters/task-sources/repo-task-source.js +104 -0
- package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
- package/dist/adapters/task-sources/repo-trigger.js +153 -0
- package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
- package/dist/adapters/task-sources/repo-validation.js +164 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
- package/dist/adapters/task-sources/yaml-task-source.js +136 -0
- package/dist/agent-observer/agentic-provider.d.ts +132 -0
- package/dist/agent-observer/agentic-provider.js +983 -0
- package/dist/agent-observer/classifier.d.ts +62 -0
- package/dist/agent-observer/classifier.js +269 -0
- package/dist/agent-observer/index.d.ts +7 -0
- package/dist/agent-observer/index.js +4 -0
- package/dist/agent-observer/pricing.d.ts +35 -0
- package/dist/agent-observer/pricing.js +82 -0
- package/dist/agent-observer/provider.d.ts +77 -0
- package/dist/agent-observer/provider.js +151 -0
- package/dist/agent-observer/proxy.d.ts +91 -0
- package/dist/agent-observer/proxy.js +321 -0
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/agent-observer/types.d.ts +137 -0
- package/dist/agent-observer/types.js +16 -0
- package/dist/assertions/source-isolation.d.ts +72 -0
- package/dist/assertions/source-isolation.js +117 -0
- package/dist/cli.d.ts +24 -0
- package/dist/cli.js +199 -0
- package/dist/commands/agent-report.d.ts +5 -0
- package/dist/commands/agent-report.js +69 -0
- package/dist/commands/baseline.d.ts +9 -0
- package/dist/commands/baseline.js +141 -0
- package/dist/commands/cache.d.ts +13 -0
- package/dist/commands/cache.js +135 -0
- package/dist/commands/calculate-scores.d.ts +8 -0
- package/dist/commands/calculate-scores.js +48 -0
- package/dist/commands/compare.d.ts +8 -0
- package/dist/commands/compare.js +120 -0
- package/dist/commands/completion.d.ts +18 -0
- package/dist/commands/completion.js +260 -0
- package/dist/commands/coverage-audit.d.ts +7 -0
- package/dist/commands/coverage-audit.js +40 -0
- package/dist/commands/discovery-report.d.ts +10 -0
- package/dist/commands/discovery-report.js +44 -0
- package/dist/commands/eval.d.ts +9 -0
- package/dist/commands/eval.js +35 -0
- package/dist/commands/explain-handler.d.ts +34 -0
- package/dist/commands/explain-handler.js +719 -0
- package/dist/commands/fetch-docs.d.ts +8 -0
- package/dist/commands/fetch-docs.js +128 -0
- package/dist/commands/generate-configs.d.ts +8 -0
- package/dist/commands/generate-configs.js +46 -0
- package/dist/commands/grader/index.d.ts +11 -0
- package/dist/commands/grader/index.js +118 -0
- package/dist/commands/init.d.ts +19 -0
- package/dist/commands/init.js +150 -0
- package/dist/commands/interactive.d.ts +12 -0
- package/dist/commands/interactive.js +238 -0
- package/dist/commands/lookup-doc.d.ts +15 -0
- package/dist/commands/lookup-doc.js +84 -0
- package/dist/commands/measure-retrieval.d.ts +5 -0
- package/dist/commands/measure-retrieval.js +65 -0
- package/dist/commands/pipeline-action.d.ts +71 -0
- package/dist/commands/pipeline-action.js +305 -0
- package/dist/commands/pipeline.d.ts +62 -0
- package/dist/commands/pipeline.js +53 -0
- package/dist/commands/pr-comment.d.ts +8 -0
- package/dist/commands/pr-comment.js +47 -0
- package/dist/commands/publish.d.ts +26 -0
- package/dist/commands/publish.js +253 -0
- package/dist/commands/readiness-report.d.ts +10 -0
- package/dist/commands/readiness-report.js +104 -0
- package/dist/commands/shared/options.d.ts +29 -0
- package/dist/commands/shared/options.js +57 -0
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +16 -0
- package/dist/commands/validate-tasks.js +93 -0
- package/dist/commands/validate.d.ts +9 -0
- package/dist/commands/validate.js +73 -0
- package/dist/commands/webhook-server.d.ts +5 -0
- package/dist/commands/webhook-server.js +30 -0
- package/dist/commands/weekly-digest.d.ts +10 -0
- package/dist/commands/weekly-digest.js +104 -0
- package/dist/composition-root.d.ts +26 -0
- package/dist/composition-root.js +107 -0
- package/dist/interpolate.d.ts +26 -0
- package/dist/interpolate.js +70 -0
- package/dist/job-store.d.ts +104 -0
- package/dist/job-store.js +188 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.d.ts +27 -0
- package/dist/orchestration/build-app-context.js +81 -0
- package/dist/orchestration/build-step-sequence.d.ts +15 -0
- package/dist/orchestration/build-step-sequence.js +84 -0
- package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
- package/dist/orchestration/config-to-source-overrides.js +28 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/index.d.ts +11 -0
- package/dist/orchestration/index.js +11 -0
- package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
- package/dist/orchestration/pipeline-orchestrator.js +153 -0
- package/dist/orchestration/step-runner.d.ts +20 -0
- package/dist/orchestration/step-runner.js +88 -0
- package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
- package/dist/orchestration/steps/calculate-scores-step.js +95 -0
- package/dist/orchestration/steps/callback-step.d.ts +24 -0
- package/dist/orchestration/steps/callback-step.js +76 -0
- package/dist/orchestration/steps/compare-step.d.ts +14 -0
- package/dist/orchestration/steps/compare-step.js +92 -0
- package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
- package/dist/orchestration/steps/discovery-report-step.js +55 -0
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
- package/dist/orchestration/steps/fetch-docs-step.js +135 -0
- package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
- package/dist/orchestration/steps/gap-analysis-step.js +136 -0
- package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
- package/dist/orchestration/steps/generate-configs-step.js +85 -0
- package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
- package/dist/orchestration/steps/grader-consistency-step.js +64 -0
- package/dist/orchestration/steps/index.d.ts +19 -0
- package/dist/orchestration/steps/index.js +19 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
- package/dist/orchestration/steps/publish-report-step.js +216 -0
- package/dist/orchestration/steps/readiness-step.d.ts +13 -0
- package/dist/orchestration/steps/readiness-step.js +91 -0
- package/dist/orchestration/steps/report-step.d.ts +12 -0
- package/dist/orchestration/steps/report-step.js +49 -0
- package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
- package/dist/orchestration/steps/run-eval-step.js +195 -0
- package/dist/orchestration/steps/validate-step.d.ts +12 -0
- package/dist/orchestration/steps/validate-step.js +41 -0
- package/dist/pipeline/agent-behavior-report.d.ts +53 -0
- package/dist/pipeline/agent-behavior-report.js +132 -0
- package/dist/pipeline/attribution.d.ts +47 -0
- package/dist/pipeline/attribution.js +226 -0
- package/dist/pipeline/baseline.d.ts +37 -0
- package/dist/pipeline/baseline.js +141 -0
- package/dist/pipeline/cache.d.ts +101 -0
- package/dist/pipeline/cache.js +283 -0
- package/dist/pipeline/calculate-scores.d.ts +102 -0
- package/dist/pipeline/calculate-scores.js +1128 -0
- package/dist/pipeline/callback-delivery.d.ts +50 -0
- package/dist/pipeline/callback-delivery.js +89 -0
- package/dist/pipeline/checks.d.ts +39 -0
- package/dist/pipeline/checks.js +280 -0
- package/dist/pipeline/classify-url.d.ts +61 -0
- package/dist/pipeline/classify-url.js +93 -0
- package/dist/pipeline/compare.d.ts +31 -0
- package/dist/pipeline/compare.js +208 -0
- package/dist/pipeline/coverage-audit.d.ts +39 -0
- package/dist/pipeline/coverage-audit.js +165 -0
- package/dist/pipeline/degradations.d.ts +85 -0
- package/dist/pipeline/degradations.js +242 -0
- package/dist/pipeline/discovery-report.d.ts +55 -0
- package/dist/pipeline/discovery-report.js +178 -0
- package/dist/pipeline/eval-constants.d.ts +68 -0
- package/dist/pipeline/eval-constants.js +111 -0
- package/dist/pipeline/eval-fingerprint.d.ts +66 -0
- package/dist/pipeline/eval-fingerprint.js +175 -0
- package/dist/pipeline/expand-tasks.d.ts +220 -0
- package/dist/pipeline/expand-tasks.js +421 -0
- package/dist/pipeline/failure-modes.d.ts +46 -0
- package/dist/pipeline/failure-modes.js +348 -0
- package/dist/pipeline/fetch-url-content.d.ts +44 -0
- package/dist/pipeline/fetch-url-content.js +93 -0
- package/dist/pipeline/gap-analysis.d.ts +48 -0
- package/dist/pipeline/gap-analysis.js +231 -0
- package/dist/pipeline/generate-configs.d.ts +72 -0
- package/dist/pipeline/generate-configs.js +395 -0
- package/dist/pipeline/grader-api.d.ts +49 -0
- package/dist/pipeline/grader-api.js +200 -0
- package/dist/pipeline/grader-compare-runner.d.ts +44 -0
- package/dist/pipeline/grader-compare-runner.js +301 -0
- package/dist/pipeline/grader-comparison.d.ts +111 -0
- package/dist/pipeline/grader-comparison.js +161 -0
- package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
- package/dist/pipeline/grader-consistency-runner.js +270 -0
- package/dist/pipeline/grader-consistency.d.ts +103 -0
- package/dist/pipeline/grader-consistency.js +146 -0
- package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
- package/dist/pipeline/grader-sensitivity-runner.js +282 -0
- package/dist/pipeline/grader-sensitivity.d.ts +94 -0
- package/dist/pipeline/grader-sensitivity.js +144 -0
- package/dist/pipeline/grader-validate-runner.d.ts +38 -0
- package/dist/pipeline/grader-validate-runner.js +229 -0
- package/dist/pipeline/grader-validation.d.ts +107 -0
- package/dist/pipeline/grader-validation.js +169 -0
- package/dist/pipeline/map-request-to-config.d.ts +19 -0
- package/dist/pipeline/map-request-to-config.js +80 -0
- package/dist/pipeline/measure-retrieval.d.ts +59 -0
- package/dist/pipeline/measure-retrieval.js +111 -0
- package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
- package/dist/pipeline/mirror-repo-tasks.js +350 -0
- package/dist/pipeline/plan-format.d.ts +33 -0
- package/dist/pipeline/plan-format.js +202 -0
- package/dist/pipeline/plan.d.ts +169 -0
- package/dist/pipeline/plan.js +708 -0
- package/dist/pipeline/pr-comment.d.ts +19 -0
- package/dist/pipeline/pr-comment.js +502 -0
- package/dist/pipeline/probe.d.ts +52 -0
- package/dist/pipeline/probe.js +390 -0
- package/dist/pipeline/provenance.d.ts +47 -0
- package/dist/pipeline/provenance.js +146 -0
- package/dist/pipeline/readiness-report.d.ts +87 -0
- package/dist/pipeline/readiness-report.js +205 -0
- package/dist/pipeline/release-classification.d.ts +54 -0
- package/dist/pipeline/release-classification.js +238 -0
- package/dist/pipeline/release-report.d.ts +37 -0
- package/dist/pipeline/release-report.js +222 -0
- package/dist/pipeline/repo-eval-comment.d.ts +37 -0
- package/dist/pipeline/repo-eval-comment.js +165 -0
- package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
- package/dist/pipeline/repo-threshold-evaluator.js +162 -0
- package/dist/pipeline/resolve-mappings.d.ts +35 -0
- package/dist/pipeline/resolve-mappings.js +72 -0
- package/dist/pipeline/retrieval-metrics.d.ts +39 -0
- package/dist/pipeline/retrieval-metrics.js +136 -0
- package/dist/pipeline/reverse-mapping.d.ts +67 -0
- package/dist/pipeline/reverse-mapping.js +88 -0
- package/dist/pipeline/schemas.d.ts +9 -0
- package/dist/pipeline/schemas.js +9 -0
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +88 -0
- package/dist/pipeline/targeted-loo.js +203 -0
- package/dist/pipeline/thresholds.d.ts +27 -0
- package/dist/pipeline/thresholds.js +245 -0
- package/dist/pipeline/types.d.ts +10 -0
- package/dist/pipeline/types.js +10 -0
- package/dist/pipeline/validate.d.ts +67 -0
- package/dist/pipeline/validate.js +406 -0
- package/dist/pipeline/webhook-server.d.ts +37 -0
- package/dist/pipeline/webhook-server.js +133 -0
- package/dist/report-store.d.ts +84 -0
- package/dist/report-store.js +208 -0
- package/dist/sanity/client.d.ts +38 -0
- package/dist/sanity/client.js +86 -0
- package/dist/sanity/portable-text.d.ts +11 -0
- package/dist/sanity/portable-text.js +211 -0
- package/dist/sanity/queries.d.ts +133 -0
- package/dist/sanity/queries.js +300 -0
- package/dist/schedules/digest.d.ts +116 -0
- package/dist/schedules/digest.js +156 -0
- package/dist/schedules/index.d.ts +12 -0
- package/dist/schedules/index.js +10 -0
- package/dist/schedules/loader.d.ts +31 -0
- package/dist/schedules/loader.js +73 -0
- package/dist/schedules/schema.d.ts +9 -0
- package/dist/schedules/schema.js +9 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +21 -0
- package/dist/scripts/validate-task-sources.js +210 -0
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/bigquery/index.d.ts +131 -0
- package/dist/sinks/bigquery/index.js +222 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/index.d.ts +23 -0
- package/dist/sinks/index.js +18 -0
- package/dist/sinks/loader.d.ts +18 -0
- package/dist/sinks/loader.js +82 -0
- package/dist/sinks/retry.d.ts +24 -0
- package/dist/sinks/retry.js +52 -0
- package/dist/sinks/schema.d.ts +9 -0
- package/dist/sinks/schema.js +9 -0
- package/dist/sinks/slack/format.d.ts +65 -0
- package/dist/sinks/slack/format.js +327 -0
- package/dist/sinks/slack/index.d.ts +27 -0
- package/dist/sinks/slack/index.js +78 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +59 -0
- package/dist/sinks/types.js +44 -0
- package/dist/sinks/webhook/index.d.ts +19 -0
- package/dist/sinks/webhook/index.js +50 -0
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/sources.d.ts +104 -0
- package/dist/sources.js +292 -0
- package/dist/webhook/budget.d.ts +42 -0
- package/dist/webhook/budget.js +60 -0
- package/dist/webhook/debounce.d.ts +67 -0
- package/dist/webhook/debounce.js +76 -0
- package/dist/webhook/dispatch.d.ts +45 -0
- package/dist/webhook/dispatch.js +84 -0
- package/dist/webhook/eval-request-handler.d.ts +87 -0
- package/dist/webhook/eval-request-handler.js +181 -0
- package/dist/webhook/handler.d.ts +88 -0
- package/dist/webhook/handler.js +203 -0
- package/dist/webhook/index.d.ts +17 -0
- package/dist/webhook/index.js +12 -0
- package/dist/webhook/types.d.ts +109 -0
- package/dist/webhook/types.js +10 -0
- package/package.json +72 -0
- package/tasks/.expanded.agentic.yaml +51 -0
- package/tasks/.expanded.yaml +66 -0
- package/tasks/frameworks.yaml +98 -0
- package/tasks/functions.yaml +51 -0
- package/tasks/groq.yaml +216 -0
- package/tasks/nextjs-live.yaml +62 -0
- package/tasks/studio-setup.yaml +111 -0
- package/tasks/visual-editing.yaml +120 -0
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* grader-consistency-runner.ts
|
|
3
|
+
*
|
|
4
|
+
* Orchestration logic for grader consistency analysis.
|
|
5
|
+
*
|
|
6
|
+
* Reads existing eval results, re-runs ONLY the grading assertions N additional
|
|
7
|
+
* times with the configured grader model, and analyzes score variance.
|
|
8
|
+
*
|
|
9
|
+
* This does NOT re-run the models under test — it only re-grades the same
|
|
10
|
+
* responses. Cost is low: ~$0.005 per grading call × N replications.
|
|
11
|
+
*
|
|
12
|
+
* Migrated from lib/grader-consistency.ts — no process.argv, no process.exit(),
|
|
13
|
+
* no module-level constants.
|
|
14
|
+
*
|
|
15
|
+
* @see docs/exec-plans/completed/grader-reliability.md — Phase 1
|
|
16
|
+
*/
|
|
17
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
18
|
+
import { join } from "path";
|
|
19
|
+
import { gradeOnce } from "./grader-api.js";
|
|
20
|
+
import { analyzeConsistency, } from "./grader-consistency.js";
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// Rubric dimension classification (similar to calculate-scores)
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
/**
 * Map a graded component result onto one of the rubric score dimensions.
 *
 * Structured metadata (`assertion.metadata.dimension`) takes precedence. When
 * it is present but not one of the known dimension ids, the result is `null`
 * and the text fallback is NOT attempted. Without a metadata dimension, the
 * rubric text in `assertion.value` is matched case-insensitively against known
 * phrases.
 *
 * @param component Component result that may carry an `assertion` object.
 * @returns "codeCorrectness", "docCoverage", "taskCompletion", or `null` when
 *   the component cannot be attributed to a dimension.
 */
function classifyDimension(component) {
    const assertion = component.assertion;
    // Prefer structured metadata
    const metadata = assertion?.metadata;
    if (metadata?.dimension) {
        switch (metadata.dimension) {
            case "code-correctness":
                return "codeCorrectness";
            case "doc-coverage":
                return "docCoverage";
            case "task-completion":
                return "taskCompletion";
            default:
                return null;
        }
    }
    // Fallback: heuristic name matching.
    // Only string rubrics can be phrase-matched. A non-string value (object,
    // number, ...) has no `toLowerCase` and previously raised a TypeError here,
    // so it is reported as unclassifiable instead.
    const rawValue = assertion?.value;
    if (typeof rawValue !== "string") {
        return null;
    }
    const value = rawValue.toLowerCase();
    if (value.includes("task completion"))
        return "taskCompletion";
    if (value.includes("code correctness"))
        return "codeCorrectness";
    if (value.includes("documentation coverage") || value.includes("hallucinate"))
        return "docCoverage";
    return null;
}
|
|
49
|
+
// ---------------------------------------------------------------------------
|
|
50
|
+
// Grading judgment extraction
|
|
51
|
+
// ---------------------------------------------------------------------------
|
|
52
|
+
/**
 * Derive a feature-area id from a test description by keyword matching.
 *
 * The description is lower-cased and checked against an ordered list of rules;
 * the first rule that matches decides the area. Every rule looks for a
 * substring anywhere in the text except the "groq" rule, which only matches
 * when the description starts with "groq".
 *
 * @param description Test description text.
 * @returns The matching area id, or "other" when no rule matches.
 */
function detectFeatureArea(description) {
    const text = description.toLowerCase();
    // Order matters: earlier rules take precedence over later ones.
    const rules = [
        { area: "studio-setup", matches: () => text.includes("studio") },
        {
            area: "visual-editing",
            matches: () => text.includes("visual") ||
                text.includes("presentation") ||
                text.includes("live preview"),
        },
        {
            area: "functions",
            matches: () => text.includes("function") || text.includes("webhook"),
        },
        { area: "groq", matches: () => text.startsWith("groq") },
        {
            area: "nextjs-live",
            matches: () => text.includes("next") || text.includes("app router"),
        },
        {
            area: "frameworks",
            matches: () => text.includes("remix") ||
                text.includes("nuxt") ||
                text.includes("svelte"),
        },
    ];
    for (const rule of rules) {
        if (rule.matches()) {
            return rule.area;
        }
    }
    return "other";
}
|
|
72
|
+
/**
 * Extract all llm-rubric grading judgments from eval results.
 *
 * Walks `file.results.results` and emits one judgment per component result
 * that (a) is an `llm-rubric` assertion, (b) can be attributed to a rubric
 * dimension by `classifyDimension`, and (c) has a non-empty string rubric.
 * Only gold (with-docs) tests — those with non-blank `vars.docs` — are
 * included, to keep the analysis focused.
 *
 * @param file Parsed eval results file exposing `results.results`.
 * @returns Array of judgments with `area`, `description`, `dimension`,
 *   `originalScore`, `providerId`, `responseText`, and `rubricText`.
 */
export function extractGradingJudgments(file) {
    const judgments = [];
    for (const result of file.results.results) {
        const grading = result.gradingResult;
        if (!grading)
            continue;
        const description = result.testCase?.description ?? "unknown";
        const hasDocs = result.vars?.docs && result.vars.docs.trim().length > 0;
        // Only grade "gold" (with-docs) tests — baseline tests have abbreviated rubrics
        if (!hasDocs)
            continue;
        const area = detectFeatureArea(description);
        const providerId = result.provider?.id;
        // A grading result can lack `componentResults`; treat that as "no
        // judgments for this result" rather than letting the iteration throw
        // and abort the extraction for every other result.
        const components = grading.componentResults ?? [];
        for (const comp of components) {
            if (comp.assertion?.type !== "llm-rubric")
                continue;
            const dimension = classifyDimension(comp);
            if (!dimension)
                continue;
            const rubricText = typeof comp.assertion.value === "string" ? comp.assertion.value : "";
            if (!rubricText)
                continue;
            judgments.push({
                area,
                description,
                dimension,
                // A missing or non-numeric score is recorded as 0.
                originalScore: typeof comp.score === "number" ? comp.score : 0,
                providerId,
                responseText: result.response?.output ?? "",
                rubricText,
            });
        }
    }
    return judgments;
}
|
|
110
|
+
// ---------------------------------------------------------------------------
|
|
111
|
+
// Report formatting (pure)
|
|
112
|
+
// ---------------------------------------------------------------------------
|
|
113
|
+
/**
 * Render a grader consistency analysis as a plain-text report.
 *
 * Pure formatting: the report is assembled line by line and returned as a
 * single newline-joined string; nothing is printed here.
 *
 * Sections, in order: run summary, overall statistics, per-dimension table,
 * noise-threshold recommendation, and the first (up to five) entries of
 * `result.judgments`. No sorting happens here, so the caller is expected to
 * pass the judgments noisiest-first.
 *
 * @param result Consistency analysis to render
 * @param graderModel Identifier of the grader model, shown in the summary
 * @returns The formatted report
 */
export function formatConsistencyReport(result, graderModel) {
    // Threshold the recommendation is compared against; the report labels it DEFAULT_NOISE_THRESHOLD.
    const currentThreshold = 2;
    // Width of the "Dimension" column, shared by the header and every row so the table lines up.
    const nameWidth = 16;
    const heavyRule = "=".repeat(80);
    const lightRule = "-".repeat(80);
    const lines = [];
    const section = (title) => lines.push(lightRule, title, lightRule, "");
    lines.push(heavyRule, " GRADER CONSISTENCY REPORT", heavyRule, "");
    lines.push(` Grader model: ${graderModel}`);
    lines.push(` Replications: ${result.replications} (incl. original)`);
    lines.push(` Judgments: ${result.totalJudgments}`);
    lines.push("");
    // Overall stats
    section("OVERALL");
    lines.push(` Avg σ: ${result.avgStdDev}`);
    lines.push(` Max σ: ${result.maxStdDev}`);
    lines.push(` Avg range: ${result.avgRange} points`);
    lines.push("");
    // Per-dimension table
    section("PER-DIMENSION CONSISTENCY");
    lines.push(`| ${"Dimension".padEnd(nameWidth)} | Avg σ | Max σ | Avg Range | Judgments |`);
    lines.push("|------------------|-------|-------|-----------|-----------|");
    const dims = [
        { data: result.perDimension.taskCompletion, name: "Task Completion" },
        { data: result.perDimension.codeCorrectness, name: "Code Correctness" },
        { data: result.perDimension.docCoverage, name: "Doc Coverage" },
    ];
    for (const { data, name } of dims) {
        lines.push(`| ${name.padEnd(nameWidth)} | ${String(data.avgStdDev).padStart(5)} | ${String(data.maxStdDev).padStart(5)} | ${String(data.avgRange).padStart(9)} | ${String(data.judgmentCount).padStart(9)} |`);
    }
    lines.push("");
    // Noise threshold recommendation
    section("NOISE THRESHOLD RECOMMENDATION");
    lines.push(` Current default: ±${currentThreshold} (DEFAULT_NOISE_THRESHOLD)`);
    lines.push(` Recommended: ±${result.recommendedThreshold} (based on 2× max dimension σ)`);
    if (result.recommendedThreshold > currentThreshold) {
        lines.push(` ⚠ Current threshold may be too low — comparison deltas within ±${result.recommendedThreshold}`);
        lines.push(` should be treated as noise, not real changes.`);
    }
    else {
        lines.push(` ✅ Current threshold is adequate for this grader's consistency.`);
    }
    lines.push("");
    // Top 5 noisiest judgments (fewer when the analysis has fewer entries)
    const topN = Math.min(5, result.judgments.length);
    if (topN > 0) {
        section(`TOP ${topN} NOISIEST JUDGMENTS`);
        result.judgments.slice(0, topN).forEach((j, i) => {
            const provider = j.providerId ? ` [${j.providerId}]` : "";
            lines.push(` ${i + 1}. ${j.taskId}${provider}`);
            lines.push(` ${j.dimension}: σ=${j.stdDev}, range=${j.range} (${j.min}–${j.max}), mean=${j.mean}`);
        });
        lines.push("");
    }
    return lines.join("\n");
}
|
|
182
|
+
// ---------------------------------------------------------------------------
|
|
183
|
+
// Main runner
|
|
184
|
+
// ---------------------------------------------------------------------------
|
|
185
|
+
/**
 * Run the grader consistency analysis.
 *
 * Reads the eval results at `resultsPath`, re-grades each extracted judgment
 * `replications` times with the grader model named in the results config,
 * prints the formatted report, and writes the analysis to
 * `results/latest/grader-consistency.json` under `rootDir`.
 *
 * The original score of each judgment is kept as an extra data point, so a
 * fully successful judgment contributes `replications + 1` scores. Re-grade
 * calls that return `null` are counted as failed and left out.
 *
 * @param options `replications` (re-grades per judgment), `resultsPath`
 *   (eval results JSON file) and `rootDir` (base directory for the output)
 * @returns The consistency analysis that was written to disk
 * @throws Error if the results file is not found, `replications` is not a
 *   finite number of at least 2, the grader model cannot be determined, or no
 *   judgments are found
 */
export async function runGraderConsistency(options) {
    const { replications, resultsPath, rootDir } = options;
    console.log("=== Grader Consistency Analysis ===\n");
    // Validate inputs
    if (!existsSync(resultsPath)) {
        throw new Error(`Results file not found: ${resultsPath}. Run 'pnpm eval' first to generate results.`);
    }
    // NaN passes a plain `< 2` comparison (the re-grade loop would then never
    // run and every judgment would report σ=0), and Infinity would keep the
    // loop calling the grader forever.
    if (!Number.isFinite(replications) || replications < 2) {
        throw new Error("Need at least 2 replications for meaningful analysis.");
    }
    // Load eval results
    console.log(` Results: ${resultsPath}`);
    console.log(` Replications: ${replications}`);
    const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
    // Extract grader model: the rubric-specific provider wins over the general one
    const graderModel = file.config?.defaultTest?.options?.rubricProvider ??
        file.config?.defaultTest?.options?.provider;
    if (!graderModel) {
        throw new Error("Could not determine grader model from eval results config.");
    }
    console.log(` Grader: ${graderModel}`);
    // Extract judgments
    const judgments = extractGradingJudgments(file);
    console.log(` Judgments: ${judgments.length} (gold tests × rubric dimensions)`);
    if (judgments.length === 0) {
        throw new Error("No gradable judgments found in results.");
    }
    const totalCalls = judgments.length * replications;
    // Flat per-call figure used only for the printed estimate.
    const estimatedCost = totalCalls * 0.005;
    console.log(` API calls: ${totalCalls} (${judgments.length} × ${replications})`);
    console.log(` Est. cost: ~$${estimatedCost.toFixed(2)}`);
    console.log();
    // Re-grade each judgment N times
    console.log(` Running ${replications} replications per judgment...`);
    const gradings = [];
    let completed = 0;
    let failed = 0;
    for (const judgment of judgments) {
        // Include the original as replication #0.
        // NOTE(review): the original score comes from the stored results while the
        // rest come from gradeOnce — confirm both use the same scale.
        const scores = [judgment.originalScore];
        for (let i = 0; i < replications; i++) {
            const score = await gradeOnce(graderModel, judgment.responseText, judgment.rubricText);
            if (score !== null) {
                scores.push(score);
            }
            else {
                failed++;
            }
        }
        completed++;
        // Refresh the single-line progress display every 10 judgments and at the end.
        if (completed % 10 === 0 || completed === judgments.length) {
            const pct = Math.round((completed / judgments.length) * 100);
            process.stdout.write(`\r Progress: ${completed}/${judgments.length} (${pct}%)`);
        }
        gradings.push({
            area: judgment.area,
            dimension: judgment.dimension,
            providerId: judgment.providerId,
            scores,
            taskId: judgment.description,
        });
    }
    console.log(); // newline after progress
    if (failed > 0) {
        console.log(` ⚠ ${failed} grading calls failed (excluded from analysis)`);
    }
    console.log();
    // Analyze consistency
    const result = analyzeConsistency(gradings);
    // Print report
    console.log(formatConsistencyReport(result, graderModel));
    // Write output
    const outDir = join(rootDir, "results", "latest");
    mkdirSync(outDir, { recursive: true });
    const outPath = join(outDir, "grader-consistency.json");
    writeFileSync(outPath, JSON.stringify(result, null, 2));
    console.log(`\n 📄 Results written to ${outPath}`);
    return result;
}
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/grader-consistency.ts
|
|
3
|
+
*
|
|
4
|
+
* Pure computation module for measuring grader consistency.
|
|
5
|
+
*
|
|
6
|
+
* Takes replicated grading results (the same judgment run N times) and
|
|
7
|
+
* computes standard deviation, per-dimension consistency, and an
|
|
8
|
+
* empirically-derived noise threshold recommendation.
|
|
9
|
+
*
|
|
10
|
+
* This module has NO side effects — no file I/O, no API calls.
|
|
11
|
+
* It operates on pre-collected data only.
|
|
12
|
+
*
|
|
13
|
+
* @see docs/exec-plans/completed/grader-reliability.md — Phase 1
|
|
14
|
+
*/
|
|
15
|
+
/**
 * Consistency aggregates for a single scoring dimension.
 */
export interface DimensionConsistency {
    /** Mean of the per-judgment score ranges (max - min) in this dimension. */
    avgRange: number;
    /** Mean of the per-judgment standard deviations in this dimension. */
    avgStdDev: number;
    /** How many judgments were folded into this aggregate. */
    judgmentCount: number;
    /** Largest per-judgment standard deviation seen in this dimension. */
    maxStdDev: number;
}
|
|
26
|
+
/**
 * Top-level result of a grader consistency analysis.
 */
export interface GraderConsistency {
    /** Mean score range (max - min) over every analyzed judgment. */
    avgRange: number;
    /** Mean standard deviation over every analyzed judgment. */
    avgStdDev: number;
    /** Timestamp of when the analysis was produced. */
    generatedAt: string;
    /** Per-judgment details, ordered by stdDev descending (noisiest first). */
    judgments: Array<JudgmentConsistency>;
    /** Largest per-judgment standard deviation (worst-case noise). */
    maxStdDev: number;
    /** Aggregates broken down by scoring dimension. */
    perDimension: {
        taskCompletion: DimensionConsistency;
        codeCorrectness: DimensionConsistency;
        docCoverage: DimensionConsistency;
    };
    /** Suggested noise threshold for comparisons (2× the highest per-dimension avgStdDev). */
    recommendedThreshold: number;
    /** Replication count reported for the run. */
    replications: number;
    /** How many judgments were analyzed in total. */
    totalJudgments: number;
}
|
|
51
|
+
/**
 * Consistency statistics for one judgment.
 */
export interface JudgmentConsistency {
    /** Feature area the judgment belongs to. */
    area: string;
    /** Scoring dimension the rubric measures. */
    dimension: "codeCorrectness" | "docCoverage" | "taskCompletion";
    /** Highest score observed. */
    max: number;
    /** Mean score across replications. */
    mean: number;
    /** Lowest score observed. */
    min: number;
    /** Provider (model under test), when known. */
    providerId?: string;
    /** Spread of the scores (max - min). */
    range: number;
    /** Number of replications behind these statistics. */
    replications: number;
    /** Standard deviation across replications. */
    stdDev: number;
    /** Identifier of the task the judgment belongs to. */
    taskId: string;
}
|
|
74
|
+
/**
 * One replicated grading judgment: a single (response, rubric) pair that was
 * graded several times.
 */
export interface ReplicatedGrading {
    /** Feature area, derived from the task description. */
    area: string;
    /** Scoring dimension the rubric measures. */
    dimension: "codeCorrectness" | "docCoverage" | "taskCompletion";
    /** Provider (model under test) that produced the graded response, when known. */
    providerId?: string;
    /** Scores collected for this judgment, one per replication. */
    scores: Array<number>;
    /** Task this judgment belongs to. */
    taskId: string;
}
|
|
87
|
+
/**
 * Analyze a set of replicated gradings and produce a full consistency report.
 *
 * This is the main entry point of the module: it performs no file I/O and no
 * API calls and works only on the data passed in.
 *
 * The returned judgments are ordered by standard deviation, noisiest first.
 * An empty `gradings` array yields a report whose statistics are all zero.
 *
 * @param gradings Array of replicated grading judgments (each graded N times)
 * @returns GraderConsistency report with per-dimension and overall statistics
 */
|
|
95
|
+
export declare function analyzeConsistency(gradings: ReplicatedGrading[]): GraderConsistency;
|
|
96
|
+
/**
 * Compute consistency statistics for a single replicated judgment.
 *
 * @param grading The replicated judgment whose `scores` are summarized
 * @returns Min, max, range, mean and standard deviation of the scores
 *   (mean and standard deviation rounded to one decimal place), together
 *   with the judgment's identifying fields
 */
|
|
99
|
+
export declare function analyzeJudgment(grading: ReplicatedGrading): JudgmentConsistency;
|
|
100
|
+
/**
 * Compute the arithmetic mean of an array of numbers.
 *
 * @param values Numbers to average
 * @returns The mean, or 0 when `values` is empty
 */
|
|
101
|
+
export declare function mean(values: number[]): number;
|
|
102
|
+
/**
 * Compute the population standard deviation of an array of numbers.
 *
 * @param values Numbers to measure
 * @returns The standard deviation, or 0 when fewer than two values are given
 */
|
|
103
|
+
export declare function stdDev(values: number[]): number;
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/grader-consistency.ts
|
|
3
|
+
*
|
|
4
|
+
* Pure computation module for measuring grader consistency.
|
|
5
|
+
*
|
|
6
|
+
* Takes replicated grading results (the same judgment run N times) and
|
|
7
|
+
* computes standard deviation, per-dimension consistency, and an
|
|
8
|
+
* empirically-derived noise threshold recommendation.
|
|
9
|
+
*
|
|
10
|
+
* This module has NO side effects — no file I/O, no API calls.
|
|
11
|
+
* It operates on pre-collected data only.
|
|
12
|
+
*
|
|
13
|
+
* @see docs/exec-plans/completed/grader-reliability.md — Phase 1
|
|
14
|
+
*/
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// Pure computation
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
/**
 * Analyze a set of replicated gradings and produce a full consistency report.
 *
 * This is the main entry point — it performs no file I/O and no API calls.
 *
 * Overall averages are rounded to one decimal place. The reported
 * `replications` is the largest number of scores any single grading holds,
 * so one judgment with missing scores does not understate the run.
 *
 * @param gradings Array of replicated grading judgments (each graded N times)
 * @returns GraderConsistency report with per-dimension and overall statistics;
 *   every statistic is zero when `gradings` is empty
 */
export function analyzeConsistency(gradings) {
    if (gradings.length === 0) {
        const emptyDimension = () => ({
            avgRange: 0,
            avgStdDev: 0,
            judgmentCount: 0,
            maxStdDev: 0,
        });
        return {
            avgRange: 0,
            avgStdDev: 0,
            generatedAt: new Date().toISOString(),
            judgments: [],
            maxStdDev: 0,
            perDimension: {
                codeCorrectness: emptyDimension(),
                docCoverage: emptyDimension(),
                taskCompletion: emptyDimension(),
            },
            recommendedThreshold: 0,
            replications: 0,
            totalJudgments: 0,
        };
    }
    // Analyze each judgment
    const judgments = gradings.map((grading) => analyzeJudgment(grading));
    // Group by dimension and aggregate each group
    const summarize = (dimension) => aggregateDimension(judgments.filter((j) => j.dimension === dimension));
    const perDimension = {
        codeCorrectness: summarize("codeCorrectness"),
        docCoverage: summarize("docCoverage"),
        taskCompletion: summarize("taskCompletion"),
    };
    // Overall stats
    const allStdDevs = judgments.map((j) => j.stdDev);
    const allRanges = judgments.map((j) => j.range);
    // Gradings can hold different numbers of scores, so take the largest
    // count instead of trusting whichever grading happens to come first.
    const replications = gradings.reduce((most, grading) => Math.max(most, grading.scores.length), 0);
    // Recommended threshold: 2× the worst (highest) per-dimension avgStdDev.
    // This means a comparison delta must exceed 2σ of the noisiest dimension
    // to be classified as a real change rather than grader variance.
    const maxDimensionAvgStdDev = Math.max(perDimension.taskCompletion.avgStdDev, perDimension.codeCorrectness.avgStdDev, perDimension.docCoverage.avgStdDev);
    const recommendedThreshold = Math.ceil(maxDimensionAvgStdDev * 2);
    // Sort a copy by stdDev descending (noisiest first)
    const sortedJudgments = [...judgments].sort((a, b) => b.stdDev - a.stdDev);
    return {
        avgRange: Math.round(mean(allRanges) * 10) / 10,
        avgStdDev: Math.round(mean(allStdDevs) * 10) / 10,
        generatedAt: new Date().toISOString(),
        judgments: sortedJudgments,
        // Folded rather than spread into Math.max so very large inputs cannot
        // hit the engine's argument-count limit. The list is non-empty here.
        maxStdDev: allStdDevs.reduce((worst, sd) => Math.max(worst, sd)),
        perDimension,
        recommendedThreshold,
        replications,
        totalJudgments: judgments.length,
    };
}
|
|
95
|
+
/**
 * Compute consistency statistics for a single replicated judgment.
 *
 * Mean and standard deviation are rounded to one decimal place. When the
 * grading holds no scores, min, max and range are reported as 0 rather than
 * the ±Infinity that `Math.min()` / `Math.max()` return for no arguments.
 *
 * @param grading Replicated judgment whose `scores` are summarized
 * @returns Statistics for the judgment plus its identifying fields;
 *   `providerId` is only present when the grading has one
 */
export function analyzeJudgment(grading) {
    const scores = grading.scores;
    const hasScores = scores.length > 0;
    const avg = mean(scores);
    const sd = stdDev(scores);
    const min = hasScores ? Math.min(...scores) : 0;
    const max = hasScores ? Math.max(...scores) : 0;
    return {
        area: grading.area,
        dimension: grading.dimension,
        max,
        mean: Math.round(avg * 10) / 10,
        min,
        range: max - min,
        replications: scores.length,
        stdDev: Math.round(sd * 10) / 10,
        taskId: grading.taskId,
        ...(grading.providerId && { providerId: grading.providerId }),
    };
}
|
|
117
|
+
/**
 * Compute the arithmetic mean of an array of numbers.
 *
 * @param values Numbers to average
 * @returns The mean, or 0 for an empty array
 */
export function mean(values) {
    const count = values.length;
    if (count === 0) {
        return 0;
    }
    let total = 0;
    for (const value of values) {
        total += value;
    }
    return total / count;
}
|
|
123
|
+
/**
 * Compute the population standard deviation of an array of numbers.
 *
 * @param values Numbers to measure
 * @returns The standard deviation, or 0 when fewer than two values are given
 */
export function stdDev(values) {
    const count = values.length;
    if (count < 2) {
        return 0;
    }
    // First pass: the average of the values.
    let total = 0;
    for (const value of values) {
        total += value;
    }
    const average = total / count;
    // Second pass: the average squared distance from that average.
    let squaredTotal = 0;
    for (const value of values) {
        squaredTotal += (value - average) ** 2;
    }
    return Math.sqrt(squaredTotal / count);
}
|
|
131
|
+
/**
 * Roll judgment-level statistics up into a per-dimension summary.
 *
 * @param judgments Judgment statistics belonging to one dimension
 * @returns Average range and average standard deviation (each rounded to one
 *   decimal place), the judgment count and the largest standard deviation;
 *   all zero when `judgments` is empty
 */
function aggregateDimension(judgments) {
    const judgmentCount = judgments.length;
    if (judgmentCount === 0) {
        return { avgRange: 0, avgStdDev: 0, judgmentCount: 0, maxStdDev: 0 };
    }
    // Average of a non-empty list, rounded to one decimal place.
    const roundedAverage = (numbers) => {
        const total = numbers.reduce((sum, v) => sum + v, 0);
        return Math.round((total / numbers.length) * 10) / 10;
    };
    const deviations = judgments.map((entry) => entry.stdDev);
    const spreads = judgments.map((entry) => entry.range);
    return {
        avgRange: roundedAverage(spreads),
        avgStdDev: roundedAverage(deviations),
        judgmentCount,
        maxStdDev: Math.max(...deviations),
    };
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/grader-sensitivity-runner.ts
|
|
3
|
+
*
|
|
4
|
+
* Orchestration module for grader sensitivity (discrimination power) testing
|
|
5
|
+
* (Phase 4).
|
|
6
|
+
*
|
|
7
|
+
* Discovers canonical reference solutions, applies programmatic degradations,
|
|
8
|
+
* grades each original/degraded pair, and calls `analyzeSensitivity()` from
|
|
9
|
+
* the pure computation module.
|
|
10
|
+
*
|
|
11
|
+
* Migrated from lib/grader-sensitivity.ts — no process.argv, no process.exit(),
|
|
12
|
+
* no module-level constants. Accepts rootDir as parameter.
|
|
13
|
+
*
|
|
14
|
+
* @see docs/exec-plans/completed/grader-reliability.md — Phase 4
|
|
15
|
+
*/
|
|
16
|
+
import { type GraderSensitivityResult } from "./grader-sensitivity.js";
|
|
17
|
+
/**
 * Options accepted by the grader sensitivity runner.
 */
export interface GraderSensitivityRunnerOptions {
    /** Restrict the run to one feature area (e.g., "groq"). */
    areaFilter?: string;
    /** Output format. */
    format?: "json" | "table";
    /** Custom path for the output. */
    outputPath?: string;
    /** Root directory of the eval package. */
    rootDir: string;
}
|
|
27
|
+
/**
 * Format a GraderSensitivityResult as a human-readable table report.
 * Returns a string — does NOT print to console.
 *
 * @param result The sensitivity analysis to render
 * @returns The report text
 */
|
|
31
|
+
export declare function formatSensitivityReport(result: GraderSensitivityResult): string;
|
|
32
|
+
/**
 * Run grader sensitivity analysis.
 *
 * Discovers reference solutions, applies degradations, grades each pair
 * with the configured grader, and analyzes discrimination power.
 *
 * @param options Runner options; `rootDir` is required, the rest are optional
 * @returns A promise resolving to the sensitivity analysis result
 * @throws Error if no reference solutions found or no pairs generated
 */
|
|
40
|
+
export declare function runGraderSensitivity(options: GraderSensitivityRunnerOptions): Promise<GraderSensitivityResult>;
|