@sanity/ailf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -0
- package/bin/ailf.js +64 -0
- package/canonical/grader-references/README.md +88 -0
- package/canonical/grader-references/groq.yaml +234 -0
- package/canonical/grader-references/studio-setup.yaml +275 -0
- package/canonical/reference-solutions/.gitkeep +1 -0
- package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
- package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
- package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
- package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
- package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
- package/canonical/reference-solutions/groq/joins-references.ts +300 -0
- package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
- package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
- package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
- package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
- package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
- package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
- package/config/bigquery/README.md +74 -0
- package/config/bigquery/views/area_scores.sql +87 -0
- package/config/bigquery/views/reports.sql +49 -0
- package/config/features.yaml +116 -0
- package/config/models.yaml +115 -0
- package/config/prompts.yaml +75 -0
- package/config/rubrics.yaml +62 -0
- package/config/schedules.yaml +43 -0
- package/config/sinks.yaml +54 -0
- package/config/sources.yaml +51 -0
- package/config/thresholds.yaml +49 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
- package/dist/_vendor/ailf-core/examples/index.js +285 -0
- package/dist/_vendor/ailf-core/index.d.ts +17 -0
- package/dist/_vendor/ailf-core/index.js +17 -0
- package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
- package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
- package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
- package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
- package/dist/_vendor/ailf-core/ports/context.js +14 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
- package/dist/_vendor/ailf-core/ports/index.js +7 -0
- package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
- package/dist/_vendor/ailf-core/ports/logger.js +11 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
- package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/schemas/index.js +16 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
- package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
- package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
- package/dist/_vendor/ailf-core/services/index.js +12 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/scoring.js +222 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
- package/dist/_vendor/ailf-core/types/index.js +21 -0
- package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
- package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
- package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
- package/dist/_vendor/ailf-shared/document-ref.js +1 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -0
- package/dist/_vendor/ailf-shared/index.js +16 -0
- package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
- package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
- package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
- package/dist/_vendor/ailf-shared/score-grades.js +23 -0
- package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
- package/dist/adapters/cache/content-lake-cache.js +59 -0
- package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
- package/dist/adapters/cache/filesystem-cache.js +54 -0
- package/dist/adapters/cache/index.d.ts +2 -0
- package/dist/adapters/cache/index.js +2 -0
- package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
- package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
- package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
- package/dist/adapters/config-sources/file-config-adapter.js +96 -0
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +2 -0
- package/dist/adapters/doc-fetchers/index.d.ts +1 -0
- package/dist/adapters/doc-fetchers/index.js +1 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
- package/dist/adapters/eval-runners/index.d.ts +1 -0
- package/dist/adapters/eval-runners/index.js +1 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
- package/dist/adapters/index.d.ts +12 -0
- package/dist/adapters/index.js +12 -0
- package/dist/adapters/loggers/console-logger.d.ts +22 -0
- package/dist/adapters/loggers/console-logger.js +54 -0
- package/dist/adapters/loggers/index.d.ts +9 -0
- package/dist/adapters/loggers/index.js +9 -0
- package/dist/adapters/loggers/json-logger.d.ts +18 -0
- package/dist/adapters/loggers/json-logger.js +33 -0
- package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
- package/dist/adapters/loggers/quiet-logger.js +30 -0
- package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/composite-task-source.js +59 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
- package/dist/adapters/task-sources/index.d.ts +7 -0
- package/dist/adapters/task-sources/index.js +7 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
- package/dist/adapters/task-sources/repo-schemas.js +234 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
- package/dist/adapters/task-sources/repo-task-source.js +104 -0
- package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
- package/dist/adapters/task-sources/repo-trigger.js +153 -0
- package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
- package/dist/adapters/task-sources/repo-validation.js +164 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
- package/dist/adapters/task-sources/yaml-task-source.js +136 -0
- package/dist/agent-observer/agentic-provider.d.ts +132 -0
- package/dist/agent-observer/agentic-provider.js +983 -0
- package/dist/agent-observer/classifier.d.ts +62 -0
- package/dist/agent-observer/classifier.js +269 -0
- package/dist/agent-observer/index.d.ts +7 -0
- package/dist/agent-observer/index.js +4 -0
- package/dist/agent-observer/pricing.d.ts +35 -0
- package/dist/agent-observer/pricing.js +82 -0
- package/dist/agent-observer/provider.d.ts +77 -0
- package/dist/agent-observer/provider.js +151 -0
- package/dist/agent-observer/proxy.d.ts +91 -0
- package/dist/agent-observer/proxy.js +321 -0
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/agent-observer/types.d.ts +137 -0
- package/dist/agent-observer/types.js +16 -0
- package/dist/assertions/source-isolation.d.ts +72 -0
- package/dist/assertions/source-isolation.js +117 -0
- package/dist/cli.d.ts +24 -0
- package/dist/cli.js +199 -0
- package/dist/commands/agent-report.d.ts +5 -0
- package/dist/commands/agent-report.js +69 -0
- package/dist/commands/baseline.d.ts +9 -0
- package/dist/commands/baseline.js +141 -0
- package/dist/commands/cache.d.ts +13 -0
- package/dist/commands/cache.js +135 -0
- package/dist/commands/calculate-scores.d.ts +8 -0
- package/dist/commands/calculate-scores.js +48 -0
- package/dist/commands/compare.d.ts +8 -0
- package/dist/commands/compare.js +120 -0
- package/dist/commands/completion.d.ts +18 -0
- package/dist/commands/completion.js +260 -0
- package/dist/commands/coverage-audit.d.ts +7 -0
- package/dist/commands/coverage-audit.js +40 -0
- package/dist/commands/discovery-report.d.ts +10 -0
- package/dist/commands/discovery-report.js +44 -0
- package/dist/commands/eval.d.ts +9 -0
- package/dist/commands/eval.js +35 -0
- package/dist/commands/explain-handler.d.ts +34 -0
- package/dist/commands/explain-handler.js +719 -0
- package/dist/commands/fetch-docs.d.ts +8 -0
- package/dist/commands/fetch-docs.js +128 -0
- package/dist/commands/generate-configs.d.ts +8 -0
- package/dist/commands/generate-configs.js +46 -0
- package/dist/commands/grader/index.d.ts +11 -0
- package/dist/commands/grader/index.js +118 -0
- package/dist/commands/init.d.ts +19 -0
- package/dist/commands/init.js +150 -0
- package/dist/commands/interactive.d.ts +12 -0
- package/dist/commands/interactive.js +238 -0
- package/dist/commands/lookup-doc.d.ts +15 -0
- package/dist/commands/lookup-doc.js +84 -0
- package/dist/commands/measure-retrieval.d.ts +5 -0
- package/dist/commands/measure-retrieval.js +65 -0
- package/dist/commands/pipeline-action.d.ts +71 -0
- package/dist/commands/pipeline-action.js +305 -0
- package/dist/commands/pipeline.d.ts +62 -0
- package/dist/commands/pipeline.js +53 -0
- package/dist/commands/pr-comment.d.ts +8 -0
- package/dist/commands/pr-comment.js +47 -0
- package/dist/commands/publish.d.ts +26 -0
- package/dist/commands/publish.js +253 -0
- package/dist/commands/readiness-report.d.ts +10 -0
- package/dist/commands/readiness-report.js +104 -0
- package/dist/commands/shared/options.d.ts +29 -0
- package/dist/commands/shared/options.js +57 -0
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +16 -0
- package/dist/commands/validate-tasks.js +93 -0
- package/dist/commands/validate.d.ts +9 -0
- package/dist/commands/validate.js +73 -0
- package/dist/commands/webhook-server.d.ts +5 -0
- package/dist/commands/webhook-server.js +30 -0
- package/dist/commands/weekly-digest.d.ts +10 -0
- package/dist/commands/weekly-digest.js +104 -0
- package/dist/composition-root.d.ts +26 -0
- package/dist/composition-root.js +107 -0
- package/dist/interpolate.d.ts +26 -0
- package/dist/interpolate.js +70 -0
- package/dist/job-store.d.ts +104 -0
- package/dist/job-store.js +188 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.d.ts +27 -0
- package/dist/orchestration/build-app-context.js +81 -0
- package/dist/orchestration/build-step-sequence.d.ts +15 -0
- package/dist/orchestration/build-step-sequence.js +84 -0
- package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
- package/dist/orchestration/config-to-source-overrides.js +28 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/index.d.ts +11 -0
- package/dist/orchestration/index.js +11 -0
- package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
- package/dist/orchestration/pipeline-orchestrator.js +153 -0
- package/dist/orchestration/step-runner.d.ts +20 -0
- package/dist/orchestration/step-runner.js +88 -0
- package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
- package/dist/orchestration/steps/calculate-scores-step.js +95 -0
- package/dist/orchestration/steps/callback-step.d.ts +24 -0
- package/dist/orchestration/steps/callback-step.js +76 -0
- package/dist/orchestration/steps/compare-step.d.ts +14 -0
- package/dist/orchestration/steps/compare-step.js +92 -0
- package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
- package/dist/orchestration/steps/discovery-report-step.js +55 -0
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
- package/dist/orchestration/steps/fetch-docs-step.js +135 -0
- package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
- package/dist/orchestration/steps/gap-analysis-step.js +136 -0
- package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
- package/dist/orchestration/steps/generate-configs-step.js +85 -0
- package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
- package/dist/orchestration/steps/grader-consistency-step.js +64 -0
- package/dist/orchestration/steps/index.d.ts +19 -0
- package/dist/orchestration/steps/index.js +19 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
- package/dist/orchestration/steps/publish-report-step.js +216 -0
- package/dist/orchestration/steps/readiness-step.d.ts +13 -0
- package/dist/orchestration/steps/readiness-step.js +91 -0
- package/dist/orchestration/steps/report-step.d.ts +12 -0
- package/dist/orchestration/steps/report-step.js +49 -0
- package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
- package/dist/orchestration/steps/run-eval-step.js +195 -0
- package/dist/orchestration/steps/validate-step.d.ts +12 -0
- package/dist/orchestration/steps/validate-step.js +41 -0
- package/dist/pipeline/agent-behavior-report.d.ts +53 -0
- package/dist/pipeline/agent-behavior-report.js +132 -0
- package/dist/pipeline/attribution.d.ts +47 -0
- package/dist/pipeline/attribution.js +226 -0
- package/dist/pipeline/baseline.d.ts +37 -0
- package/dist/pipeline/baseline.js +141 -0
- package/dist/pipeline/cache.d.ts +101 -0
- package/dist/pipeline/cache.js +283 -0
- package/dist/pipeline/calculate-scores.d.ts +102 -0
- package/dist/pipeline/calculate-scores.js +1128 -0
- package/dist/pipeline/callback-delivery.d.ts +50 -0
- package/dist/pipeline/callback-delivery.js +89 -0
- package/dist/pipeline/checks.d.ts +39 -0
- package/dist/pipeline/checks.js +280 -0
- package/dist/pipeline/classify-url.d.ts +61 -0
- package/dist/pipeline/classify-url.js +93 -0
- package/dist/pipeline/compare.d.ts +31 -0
- package/dist/pipeline/compare.js +208 -0
- package/dist/pipeline/coverage-audit.d.ts +39 -0
- package/dist/pipeline/coverage-audit.js +165 -0
- package/dist/pipeline/degradations.d.ts +85 -0
- package/dist/pipeline/degradations.js +242 -0
- package/dist/pipeline/discovery-report.d.ts +55 -0
- package/dist/pipeline/discovery-report.js +178 -0
- package/dist/pipeline/eval-constants.d.ts +68 -0
- package/dist/pipeline/eval-constants.js +111 -0
- package/dist/pipeline/eval-fingerprint.d.ts +66 -0
- package/dist/pipeline/eval-fingerprint.js +175 -0
- package/dist/pipeline/expand-tasks.d.ts +220 -0
- package/dist/pipeline/expand-tasks.js +421 -0
- package/dist/pipeline/failure-modes.d.ts +46 -0
- package/dist/pipeline/failure-modes.js +348 -0
- package/dist/pipeline/fetch-url-content.d.ts +44 -0
- package/dist/pipeline/fetch-url-content.js +93 -0
- package/dist/pipeline/gap-analysis.d.ts +48 -0
- package/dist/pipeline/gap-analysis.js +231 -0
- package/dist/pipeline/generate-configs.d.ts +72 -0
- package/dist/pipeline/generate-configs.js +395 -0
- package/dist/pipeline/grader-api.d.ts +49 -0
- package/dist/pipeline/grader-api.js +200 -0
- package/dist/pipeline/grader-compare-runner.d.ts +44 -0
- package/dist/pipeline/grader-compare-runner.js +301 -0
- package/dist/pipeline/grader-comparison.d.ts +111 -0
- package/dist/pipeline/grader-comparison.js +161 -0
- package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
- package/dist/pipeline/grader-consistency-runner.js +270 -0
- package/dist/pipeline/grader-consistency.d.ts +103 -0
- package/dist/pipeline/grader-consistency.js +146 -0
- package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
- package/dist/pipeline/grader-sensitivity-runner.js +282 -0
- package/dist/pipeline/grader-sensitivity.d.ts +94 -0
- package/dist/pipeline/grader-sensitivity.js +144 -0
- package/dist/pipeline/grader-validate-runner.d.ts +38 -0
- package/dist/pipeline/grader-validate-runner.js +229 -0
- package/dist/pipeline/grader-validation.d.ts +107 -0
- package/dist/pipeline/grader-validation.js +169 -0
- package/dist/pipeline/map-request-to-config.d.ts +19 -0
- package/dist/pipeline/map-request-to-config.js +80 -0
- package/dist/pipeline/measure-retrieval.d.ts +59 -0
- package/dist/pipeline/measure-retrieval.js +111 -0
- package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
- package/dist/pipeline/mirror-repo-tasks.js +350 -0
- package/dist/pipeline/plan-format.d.ts +33 -0
- package/dist/pipeline/plan-format.js +202 -0
- package/dist/pipeline/plan.d.ts +169 -0
- package/dist/pipeline/plan.js +708 -0
- package/dist/pipeline/pr-comment.d.ts +19 -0
- package/dist/pipeline/pr-comment.js +502 -0
- package/dist/pipeline/probe.d.ts +52 -0
- package/dist/pipeline/probe.js +390 -0
- package/dist/pipeline/provenance.d.ts +47 -0
- package/dist/pipeline/provenance.js +146 -0
- package/dist/pipeline/readiness-report.d.ts +87 -0
- package/dist/pipeline/readiness-report.js +205 -0
- package/dist/pipeline/release-classification.d.ts +54 -0
- package/dist/pipeline/release-classification.js +238 -0
- package/dist/pipeline/release-report.d.ts +37 -0
- package/dist/pipeline/release-report.js +222 -0
- package/dist/pipeline/repo-eval-comment.d.ts +37 -0
- package/dist/pipeline/repo-eval-comment.js +165 -0
- package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
- package/dist/pipeline/repo-threshold-evaluator.js +162 -0
- package/dist/pipeline/resolve-mappings.d.ts +35 -0
- package/dist/pipeline/resolve-mappings.js +72 -0
- package/dist/pipeline/retrieval-metrics.d.ts +39 -0
- package/dist/pipeline/retrieval-metrics.js +136 -0
- package/dist/pipeline/reverse-mapping.d.ts +67 -0
- package/dist/pipeline/reverse-mapping.js +88 -0
- package/dist/pipeline/schemas.d.ts +9 -0
- package/dist/pipeline/schemas.js +9 -0
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +88 -0
- package/dist/pipeline/targeted-loo.js +203 -0
- package/dist/pipeline/thresholds.d.ts +27 -0
- package/dist/pipeline/thresholds.js +245 -0
- package/dist/pipeline/types.d.ts +10 -0
- package/dist/pipeline/types.js +10 -0
- package/dist/pipeline/validate.d.ts +67 -0
- package/dist/pipeline/validate.js +406 -0
- package/dist/pipeline/webhook-server.d.ts +37 -0
- package/dist/pipeline/webhook-server.js +133 -0
- package/dist/report-store.d.ts +84 -0
- package/dist/report-store.js +208 -0
- package/dist/sanity/client.d.ts +38 -0
- package/dist/sanity/client.js +86 -0
- package/dist/sanity/portable-text.d.ts +11 -0
- package/dist/sanity/portable-text.js +211 -0
- package/dist/sanity/queries.d.ts +133 -0
- package/dist/sanity/queries.js +300 -0
- package/dist/schedules/digest.d.ts +116 -0
- package/dist/schedules/digest.js +156 -0
- package/dist/schedules/index.d.ts +12 -0
- package/dist/schedules/index.js +10 -0
- package/dist/schedules/loader.d.ts +31 -0
- package/dist/schedules/loader.js +73 -0
- package/dist/schedules/schema.d.ts +9 -0
- package/dist/schedules/schema.js +9 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +21 -0
- package/dist/scripts/validate-task-sources.js +210 -0
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/bigquery/index.d.ts +131 -0
- package/dist/sinks/bigquery/index.js +222 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/index.d.ts +23 -0
- package/dist/sinks/index.js +18 -0
- package/dist/sinks/loader.d.ts +18 -0
- package/dist/sinks/loader.js +82 -0
- package/dist/sinks/retry.d.ts +24 -0
- package/dist/sinks/retry.js +52 -0
- package/dist/sinks/schema.d.ts +9 -0
- package/dist/sinks/schema.js +9 -0
- package/dist/sinks/slack/format.d.ts +65 -0
- package/dist/sinks/slack/format.js +327 -0
- package/dist/sinks/slack/index.d.ts +27 -0
- package/dist/sinks/slack/index.js +78 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +59 -0
- package/dist/sinks/types.js +44 -0
- package/dist/sinks/webhook/index.d.ts +19 -0
- package/dist/sinks/webhook/index.js +50 -0
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/sources.d.ts +104 -0
- package/dist/sources.js +292 -0
- package/dist/webhook/budget.d.ts +42 -0
- package/dist/webhook/budget.js +60 -0
- package/dist/webhook/debounce.d.ts +67 -0
- package/dist/webhook/debounce.js +76 -0
- package/dist/webhook/dispatch.d.ts +45 -0
- package/dist/webhook/dispatch.js +84 -0
- package/dist/webhook/eval-request-handler.d.ts +87 -0
- package/dist/webhook/eval-request-handler.js +181 -0
- package/dist/webhook/handler.d.ts +88 -0
- package/dist/webhook/handler.js +203 -0
- package/dist/webhook/index.d.ts +17 -0
- package/dist/webhook/index.js +12 -0
- package/dist/webhook/types.d.ts +109 -0
- package/dist/webhook/types.js +10 -0
- package/package.json +72 -0
- package/tasks/.expanded.agentic.yaml +51 -0
- package/tasks/.expanded.yaml +66 -0
- package/tasks/frameworks.yaml +98 -0
- package/tasks/functions.yaml +51 -0
- package/tasks/groq.yaml +216 -0
- package/tasks/nextjs-live.yaml +62 -0
- package/tasks/studio-setup.yaml +111 -0
- package/tasks/visual-editing.yaml +120 -0
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/grader-sensitivity-runner.ts
|
|
3
|
+
*
|
|
4
|
+
* Orchestration module for grader sensitivity (discrimination power) testing
|
|
5
|
+
* (Phase 4).
|
|
6
|
+
*
|
|
7
|
+
* Discovers canonical reference solutions, applies programmatic degradations,
|
|
8
|
+
* grades each original/degraded pair, and calls `analyzeSensitivity()` from
|
|
9
|
+
* the pure computation module.
|
|
10
|
+
*
|
|
11
|
+
* Migrated from lib/grader-sensitivity.ts — no process.argv, no process.exit(),
|
|
12
|
+
* no module-level constants. Accepts rootDir as parameter.
|
|
13
|
+
*
|
|
14
|
+
* @see docs/exec-plans/completed/grader-reliability.md — Phase 4
|
|
15
|
+
*/
|
|
16
|
+
import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync, } from "fs";
|
|
17
|
+
import { basename, join } from "path";
|
|
18
|
+
import { DEGRADATION_STRATEGIES } from "./degradations.js";
|
|
19
|
+
import { gradeOnce, loadGraderModel } from "./grader-api.js";
|
|
20
|
+
import { analyzeSensitivity, } from "./grader-sensitivity.js";
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// Internal helpers
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
/**
 * Discover canonical reference solutions on the local filesystem.
 *
 * Scans `<rootDir>/canonical/reference-solutions/<area>/` for source files
 * (`.ts`, `.tsx`, `.js`, `.jsx`) and reads each one as UTF-8. Areas and the
 * files inside each area are visited in sorted order.
 *
 * @param rootDir - Directory that contains the `canonical/` folder.
 * @param areaFilter - Optional area (sub-directory) name; when truthy, only
 *   that area is scanned.
 * @returns An array of `{ area, content, sourcePath }` records, where
 *   `sourcePath` is the forward-slash path
 *   `canonical/reference-solutions/<area>/<file>`. Returns an empty array
 *   when the reference-solutions directory does not exist.
 * @throws Error when the reference-solutions directory exists but contains no
 *   area directory (or none matching `areaFilter`).
 */
function discoverReferenceSolutions(rootDir, areaFilter) {
    const refsDir = join(rootDir, "canonical", "reference-solutions");
    if (!existsSync(refsDir)) {
        // Reference solutions now live in the Content Lake (ailf.referenceSolution).
        // Return empty array when local files don't exist.
        return [];
    }
    const areas = readdirSync(refsDir, { withFileTypes: true })
        .filter((entry) => entry.isDirectory())
        .map((entry) => entry.name)
        .filter((name) => !areaFilter || name === areaFilter)
        .sort();
    if (areas.length === 0) {
        throw new Error(areaFilter
            ? `No reference solutions found for area "${areaFilter}".`
            : "No reference solution directories found.");
    }
    const sourceExtensions = [".ts", ".tsx", ".js", ".jsx"];
    const solutions = [];
    for (const area of areas) {
        const areaDir = join(refsDir, area);
        const files = readdirSync(areaDir, { withFileTypes: true })
            // A directory whose name happens to end in a source extension would
            // make readFileSync throw EISDIR and abort discovery; skip those.
            .filter((entry) => !entry.isDirectory())
            .map((entry) => entry.name)
            .filter((name) => sourceExtensions.some((ext) => name.endsWith(ext)))
            .sort();
        for (const file of files) {
            const content = readFileSync(join(areaDir, file), "utf-8");
            solutions.push({
                area,
                content,
                sourcePath: `canonical/reference-solutions/${area}/${file}`,
            });
        }
    }
    return solutions;
}
|
|
66
|
+
/**
 * Build the original/degraded pairs for a set of reference solutions.
 *
 * Every solution is run through every entry of DEGRADATION_STRATEGIES; a pair
 * is emitted only when the strategy's output differs from the input, so
 * strategies that leave a given solution untouched contribute nothing.
 *
 * @param solutions - Records carrying at least `content` and `sourcePath`.
 * @returns Pairs of `{ degradation, degraded, original, sourcePath }`, ordered
 *   by solution and then by strategy.
 */
function generateDegradedPairs(solutions) {
    const result = [];
    for (const ref of solutions) {
        for (const strategy of DEGRADATION_STRATEGIES) {
            const mutated = strategy.apply(ref.content);
            if (mutated === ref.content) {
                // The strategy was a no-op for this solution — nothing to grade.
                continue;
            }
            result.push({
                degradation: strategy,
                degraded: mutated,
                original: ref.content,
                sourcePath: ref.sourcePath,
            });
        }
    }
    return result;
}
|
|
88
|
+
// ---------------------------------------------------------------------------
|
|
89
|
+
// Report formatting (pure)
|
|
90
|
+
// ---------------------------------------------------------------------------
|
|
91
|
+
/**
|
|
92
|
+
* Format a GraderSensitivityResult as a human-readable table report.
|
|
93
|
+
* Returns a string — does NOT print to console.
|
|
94
|
+
*/
|
|
95
|
+
export function formatSensitivityReport(result) {
|
|
96
|
+
const lines = [];
|
|
97
|
+
lines.push("=".repeat(80));
|
|
98
|
+
lines.push(" GRADER SENSITIVITY REPORT");
|
|
99
|
+
lines.push("=".repeat(80));
|
|
100
|
+
lines.push("");
|
|
101
|
+
lines.push(` Grader: ${result.graderModel}`);
|
|
102
|
+
lines.push(` Total pairs: ${result.totalPairs}`);
|
|
103
|
+
lines.push("");
|
|
104
|
+
// Overall metrics
|
|
105
|
+
lines.push("-".repeat(80));
|
|
106
|
+
lines.push("OVERALL");
|
|
107
|
+
lines.push("-".repeat(80));
|
|
108
|
+
lines.push("");
|
|
109
|
+
lines.push(` Concordance: ${result.concordanceRate}%`);
|
|
110
|
+
lines.push(` Avg separation: ${result.avgSeparation} points`);
|
|
111
|
+
lines.push("");
|
|
112
|
+
// Per-dimension table
|
|
113
|
+
lines.push("-".repeat(80));
|
|
114
|
+
lines.push("PER-DIMENSION SENSITIVITY");
|
|
115
|
+
lines.push("-".repeat(80));
|
|
116
|
+
lines.push("");
|
|
117
|
+
const h = "| Dimension | Concordance | Avg Sep | Tied | Pairs |";
|
|
118
|
+
const sep = "|------------------|-------------|---------|-------|-------|";
|
|
119
|
+
lines.push(h);
|
|
120
|
+
lines.push(sep);
|
|
121
|
+
const dims = [
|
|
122
|
+
{ data: result.perDimension.taskCompletion, name: "Task Completion" },
|
|
123
|
+
{ data: result.perDimension.codeCorrectness, name: "Code Correctness" },
|
|
124
|
+
{ data: result.perDimension.docCoverage, name: "Doc Coverage" },
|
|
125
|
+
];
|
|
126
|
+
for (const { data, name } of dims) {
|
|
127
|
+
lines.push(`| ${name.padEnd(16)} | ${String(data.concordanceRate + "%").padStart(11)} | ${String(data.avgSeparation).padStart(7)} | ${String(data.tiedRate + "%").padStart(5)} | ${String(data.pairCount).padStart(5)} |`);
|
|
128
|
+
}
|
|
129
|
+
lines.push("");
|
|
130
|
+
// Cross-dimension analysis
|
|
131
|
+
lines.push("-".repeat(80));
|
|
132
|
+
lines.push("CROSS-DIMENSION SENSITIVITY");
|
|
133
|
+
lines.push("-".repeat(80));
|
|
134
|
+
lines.push("");
|
|
135
|
+
lines.push(` On-target: ${result.crossDimension.onTarget.concordanceRate}% concordance, ${result.crossDimension.onTarget.avgSeparation} avg sep (${result.crossDimension.onTarget.pairCount} pairs)`);
|
|
136
|
+
lines.push(` Off-target: ${result.crossDimension.offTarget.concordanceRate}% concordance, ${result.crossDimension.offTarget.avgSeparation} avg sep (${result.crossDimension.offTarget.pairCount} pairs)`);
|
|
137
|
+
lines.push("");
|
|
138
|
+
// Per-degradation breakdown
|
|
139
|
+
if (result.byDegradation.length > 0) {
|
|
140
|
+
lines.push("-".repeat(80));
|
|
141
|
+
lines.push("PER-DEGRADATION TYPE");
|
|
142
|
+
lines.push("-".repeat(80));
|
|
143
|
+
lines.push("");
|
|
144
|
+
const dh = "| Degradation | Concordance | Avg Sep | Pairs |";
|
|
145
|
+
const ds = "|----------------------------------------------|-------------|---------|-------|";
|
|
146
|
+
lines.push(dh);
|
|
147
|
+
lines.push(ds);
|
|
148
|
+
for (const deg of result.byDegradation) {
|
|
149
|
+
const desc = deg.description.slice(0, 44).padEnd(44);
|
|
150
|
+
lines.push(`| ${desc} | ${String(deg.concordanceRate + "%").padStart(11)} | ${String(deg.avgSeparation).padStart(7)} | ${String(deg.pairCount).padStart(5)} |`);
|
|
151
|
+
}
|
|
152
|
+
lines.push("");
|
|
153
|
+
}
|
|
154
|
+
// Failed pairs (worst failures)
|
|
155
|
+
const topN = Math.min(5, result.failedPairs.length);
|
|
156
|
+
if (topN > 0) {
|
|
157
|
+
lines.push("-".repeat(80));
|
|
158
|
+
lines.push(`TOP ${topN} REVERSED PAIRS (grader ranked degraded higher)`);
|
|
159
|
+
lines.push("-".repeat(80));
|
|
160
|
+
lines.push("");
|
|
161
|
+
for (let i = 0; i < topN; i++) {
|
|
162
|
+
const p = result.failedPairs[i];
|
|
163
|
+
const delta = p.degradedScore - p.originalScore;
|
|
164
|
+
lines.push(` ${i + 1}. ${basename(p.sourcePath)} — ${p.dimension}`);
|
|
165
|
+
lines.push(` Original=${p.originalScore}, Degraded=${p.degradedScore} (Δ=+${delta})`);
|
|
166
|
+
lines.push(` Degradation: ${p.degradationDescription}`);
|
|
167
|
+
}
|
|
168
|
+
lines.push("");
|
|
169
|
+
}
|
|
170
|
+
return lines.join("\n");
|
|
171
|
+
}
|
|
172
|
+
// ---------------------------------------------------------------------------
|
|
173
|
+
// Main runner
|
|
174
|
+
// ---------------------------------------------------------------------------
|
|
175
|
+
/**
 * Run the grader sensitivity analysis end to end.
 *
 * Steps: resolve the grader model, discover reference solutions (optionally
 * restricted to one area), derive degraded variants, grade the original and
 * the degraded text once per dimension, analyze the collected pairs, print the
 * result (table or JSON) and write it to disk as JSON.
 *
 * Grading calls are awaited one at a time, original before degraded. A
 * comparison where either score comes back as `null` is counted as failed and
 * left out of the analysis.
 *
 * @param options `rootDir`, optional `areaFilter`, optional `format`
 *   (defaults to "table"; any other value prints JSON) and optional
 *   `outputPath` (defaults to `<rootDir>/results/latest/grader-sensitivity.json`)
 * @returns The sensitivity analysis result
 * @throws Error if no degraded pairs were generated or every comparison failed
 *   (errors thrown by the discovery/grading helpers propagate unchanged)
 */
export async function runGraderSensitivity(options) {
    const { rootDir, areaFilter, format = "table" } = options;
    console.log("=== Grader Sensitivity Analysis ===\n");

    // Resolve grader model
    const grader = loadGraderModel(rootDir);
    console.log(`  Grader:      ${grader.label} (${grader.id})`);

    // Discover reference solutions
    const solutions = discoverReferenceSolutions(rootDir, areaFilter);
    console.log(`  Solutions:   ${solutions.length} reference files`);
    if (areaFilter) {
        console.log(`  Area filter: ${areaFilter}`);
    }

    // Derive degraded variants
    const degradedPairs = generateDegradedPairs(solutions);
    console.log(`  Pairs:       ${degradedPairs.length} (solutions × degradations)`);
    if (degradedPairs.length === 0) {
        throw new Error("No degraded pairs generated. Check reference solutions.");
    }

    // Rubric prompt per grading dimension.
    const rubricByDimension = {
        codeCorrectness: "Evaluate code correctness: Does the code follow best practices, use correct APIs, and avoid anti-patterns? Score 0–100.",
        docCoverage: "Evaluate documentation coverage: Does the response demonstrate knowledge from official documentation? Are imports, API names, and configuration correct per the docs? Score 0–100.",
        taskCompletion: "Evaluate task completion: Does the response fully implement the requested feature? Are all requirements addressed? Score 0–100.",
    };
    const dimensions = [
        "taskCompletion",
        "codeCorrectness",
        "docCoverage",
    ];

    // One comparison = one pair on one dimension = two grading calls.
    const comparisons = degradedPairs.length * dimensions.length;
    const totalCalls = comparisons * 2;
    const estimatedCost = totalCalls * 0.005;
    console.log(`  API calls:   ${totalCalls}`);
    console.log(`  Est. cost:   ~$${estimatedCost.toFixed(2)}`);
    console.log();

    console.log("  Grading original and degraded pairs...");
    const sensitivityPairs = [];
    let completed = 0;
    let failed = 0;
    for (const pair of degradedPairs) {
        // The area is the path segment right after "reference-solutions".
        const segments = pair.sourcePath.split("/");
        const marker = segments.indexOf("reference-solutions");
        const area = marker !== -1 && marker + 1 < segments.length
            ? segments[marker + 1]
            : "unknown";
        for (const dimension of dimensions) {
            const rubric = rubricByDimension[dimension] ?? `Evaluate ${dimension}. Score 0–100.`;
            const originalScore = await gradeOnce(grader.id, pair.original, rubric);
            const degradedScore = await gradeOnce(grader.id, pair.degraded, rubric);
            completed += 1;
            // Refresh the progress line every 10 comparisons and on the last one.
            const atMilestone = completed % 10 === 0 || completed === comparisons;
            if (atMilestone) {
                const pct = Math.round((completed / comparisons) * 100);
                process.stdout.write(`\r  Progress: ${completed}/${comparisons} (${pct}%)`);
            }
            const bothGraded = originalScore !== null && degradedScore !== null;
            if (!bothGraded) {
                failed += 1;
                continue;
            }
            sensitivityPairs.push({
                area,
                degradationDescription: pair.degradation.description,
                degradedScore,
                dimension,
                originalScore,
                sourcePath: pair.sourcePath,
                targetDimension: pair.degradation.targetDimension,
            });
        }
    }
    console.log(); // newline after progress
    if (failed > 0) {
        console.log(`  ⚠ ${failed} grading pairs failed (excluded)`);
    }
    console.log();
    if (sensitivityPairs.length === 0) {
        throw new Error("No sensitivity pairs to analyze. All grading calls failed.");
    }

    // Analyze
    const result = analyzeSensitivity(sensitivityPairs, grader.id);

    // Print
    console.log(format === "table"
        ? formatSensitivityReport(result)
        : JSON.stringify(result, null, 2));

    // Persist as JSON, creating the parent directory if needed.
    const outPath = options.outputPath ??
        join(rootDir, "results", "latest", "grader-sensitivity.json");
    const outDir = join(outPath, "..");
    mkdirSync(outDir, { recursive: true });
    writeFileSync(outPath, JSON.stringify(result, null, 2));
    console.log(`\n  📄 Results written to ${outPath}`);
    return result;
}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/grader-sensitivity.ts
|
|
3
|
+
*
|
|
4
|
+
* Pure computation module for measuring grader sensitivity (discrimination power).
|
|
5
|
+
*
|
|
6
|
+
* Takes paired grading results — original (good) and degraded (bad) versions
|
|
7
|
+
* of the same response graded by the same grader — and computes:
|
|
8
|
+
* - Concordance rate: % of pairs where the grader ranked the original higher
|
|
9
|
+
* - Score separation: average score difference between good and bad
|
|
10
|
+
* - Per-dimension sensitivity
|
|
11
|
+
*
|
|
12
|
+
* This module has NO side effects — no file I/O, no API calls.
|
|
13
|
+
*
|
|
14
|
+
* @see docs/exec-plans/completed/grader-reliability.md — Phase 4
|
|
15
|
+
*/
|
|
16
|
+
/** Sensitivity broken down by degradation type */
|
|
17
|
+
export interface DegradationSensitivity {
|
|
18
|
+
/** Average score separation for this degradation type */
|
|
19
|
+
avgSeparation: number;
|
|
20
|
+
/** Concordance rate for this degradation type */
|
|
21
|
+
concordanceRate: number;
|
|
22
|
+
/** Description of the degradation */
|
|
23
|
+
description: string;
|
|
24
|
+
/** Number of pairs using this degradation */
|
|
25
|
+
pairCount: number;
|
|
26
|
+
/** Which dimension this degradation targeted */
|
|
27
|
+
targetDimension: "codeCorrectness" | "docCoverage" | "taskCompletion";
|
|
28
|
+
}
|
|
29
|
+
/** Per-dimension sensitivity metrics */
|
|
30
|
+
export interface DimensionSensitivity {
|
|
31
|
+
/** Average score separation (original - degraded) */
|
|
32
|
+
avgSeparation: number;
|
|
33
|
+
/** % of pairs where grader ranked original higher */
|
|
34
|
+
concordanceRate: number;
|
|
35
|
+
/** Number of paired comparisons */
|
|
36
|
+
pairCount: number;
|
|
37
|
+
/** % of pairs where scores were exactly equal */
|
|
38
|
+
tiedRate: number;
|
|
39
|
+
}
|
|
40
|
+
/** Full sensitivity analysis result */
|
|
41
|
+
export interface GraderSensitivityResult {
|
|
42
|
+
/** Average score separation across all pairs */
|
|
43
|
+
avgSeparation: number;
|
|
44
|
+
/** Per-degradation-type sensitivity */
|
|
45
|
+
byDegradation: DegradationSensitivity[];
|
|
46
|
+
/** Overall concordance rate (% of correctly ranked pairs) */
|
|
47
|
+
concordanceRate: number;
|
|
48
|
+
/** Cross-dimension sensitivity: does degradation in one dimension affect others? */
|
|
49
|
+
crossDimension: {
|
|
50
|
+
/** Pairs where the graded dimension equals the dimension the degradation targeted */
|
|
51
|
+
onTarget: DimensionSensitivity;
|
|
52
|
+
/** Pairs where the graded dimension differs from the targeted one (does the score still drop?) */
|
|
53
|
+
offTarget: DimensionSensitivity;
|
|
54
|
+
};
|
|
55
|
+
/** Pairs where the grader gave a HIGHER score to the degraded version, worst reversals first */
|
|
56
|
+
failedPairs: SensitivityPair[];
|
|
57
|
+
/** When this analysis was generated */
|
|
58
|
+
generatedAt: string;
|
|
59
|
+
/** Grader model used */
|
|
60
|
+
graderModel: string;
|
|
61
|
+
/** Per-dimension sensitivity metrics */
|
|
62
|
+
perDimension: {
|
|
63
|
+
taskCompletion: DimensionSensitivity;
|
|
64
|
+
codeCorrectness: DimensionSensitivity;
|
|
65
|
+
docCoverage: DimensionSensitivity;
|
|
66
|
+
};
|
|
67
|
+
/** Total paired comparisons analyzed */
|
|
68
|
+
totalPairs: number;
|
|
69
|
+
}
|
|
70
|
+
/** A single paired comparison: original vs. degraded response */
|
|
71
|
+
export interface SensitivityPair {
|
|
72
|
+
/** Feature area (e.g., "groq") */
|
|
73
|
+
area: string;
|
|
74
|
+
/** What degradation was applied */
|
|
75
|
+
degradationDescription: string;
|
|
76
|
+
/** Score assigned to the degraded version (0–100) */
|
|
77
|
+
degradedScore: number;
|
|
78
|
+
/** Which dimension this judgment measures */
|
|
79
|
+
dimension: "codeCorrectness" | "docCoverage" | "taskCompletion";
|
|
80
|
+
/** Score assigned to the original version (0–100) */
|
|
81
|
+
originalScore: number;
|
|
82
|
+
/** Source file of the reference solution */
|
|
83
|
+
sourcePath: string;
|
|
84
|
+
/** Which dimension the degradation targeted */
|
|
85
|
+
targetDimension: "codeCorrectness" | "docCoverage" | "taskCompletion";
|
|
86
|
+
}
|
|
87
|
+
/**
|
|
88
|
+
* Analyze sensitivity from a set of paired grading results.
|
|
89
|
+
*
|
|
90
|
+
* @param pairs Array of paired comparisons (original vs. degraded)
|
|
91
|
+
* @param graderModel Grader model ID for the report
|
|
92
|
+
* @returns Full sensitivity analysis with concordance, separation, and per-dimension metrics
|
|
93
|
+
*/
|
|
94
|
+
export declare function analyzeSensitivity(pairs: SensitivityPair[], graderModel: string): GraderSensitivityResult;
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/grader-sensitivity.ts
|
|
3
|
+
*
|
|
4
|
+
* Pure computation module for measuring grader sensitivity (discrimination power).
|
|
5
|
+
*
|
|
6
|
+
* Takes paired grading results — original (good) and degraded (bad) versions
|
|
7
|
+
* of the same response graded by the same grader — and computes:
|
|
8
|
+
* - Concordance rate: % of pairs where the grader ranked the original higher
|
|
9
|
+
* - Score separation: average score difference between good and bad
|
|
10
|
+
* - Per-dimension sensitivity
|
|
11
|
+
*
|
|
12
|
+
* This module has NO side effects — no file I/O, no API calls.
|
|
13
|
+
*
|
|
14
|
+
* @see docs/exec-plans/completed/grader-reliability.md — Phase 4
|
|
15
|
+
*/
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
// Pure computation
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
/**
 * Compute the full sensitivity analysis for a set of graded pairs.
 *
 * Produces overall concordance/separation, a per-dimension breakdown (keyed by
 * the dimension that was graded), an on-target vs. off-target split, a
 * per-degradation breakdown and the list of reversed pairs (degraded scored
 * higher than the original), largest reversal first. An empty input yields the
 * all-zero result.
 *
 * @param pairs Array of paired comparisons (original vs. degraded)
 * @param graderModel Grader model ID recorded in the result
 * @returns Full sensitivity analysis result
 */
export function analyzeSensitivity(pairs, graderModel) {
    if (pairs.length === 0) {
        return emptyResult(graderModel);
    }
    const metricsWhere = (predicate) => computeMetrics(pairs.filter(predicate));

    // Overall concordance and separation
    const overall = computeMetrics(pairs);

    // Grouped by the dimension being graded, not by the degradation's target.
    const perDimension = {
        codeCorrectness: metricsWhere((p) => p.dimension === "codeCorrectness"),
        docCoverage: metricsWhere((p) => p.dimension === "docCoverage"),
        taskCompletion: metricsWhere((p) => p.dimension === "taskCompletion"),
    };

    // On-target: the graded dimension is the one the degradation aimed at.
    const crossDimension = {
        offTarget: metricsWhere((p) => p.dimension !== p.targetDimension),
        onTarget: metricsWhere((p) => p.dimension === p.targetDimension),
    };

    const byDegradation = computeByDegradation(pairs);

    // Reversed pairs, ordered by how far the degraded score exceeds the original.
    const reversalSize = (p) => p.degradedScore - p.originalScore;
    const failedPairs = pairs
        .filter((p) => p.degradedScore > p.originalScore)
        .sort((a, b) => reversalSize(b) - reversalSize(a));

    return {
        avgSeparation: overall.avgSeparation,
        byDegradation,
        concordanceRate: overall.concordanceRate,
        crossDimension,
        failedPairs,
        generatedAt: new Date().toISOString(),
        graderModel,
        perDimension,
        totalPairs: pairs.length,
    };
}
|
|
67
|
+
// ---------------------------------------------------------------------------
|
|
68
|
+
// Internal helpers
|
|
69
|
+
// ---------------------------------------------------------------------------
|
|
70
|
+
/**
 * Bucket pairs by their degradation description and compute metrics per bucket.
 *
 * Each entry takes its `targetDimension` from the first pair in the bucket.
 * The result is sorted by concordance rate ascending, so the degradations the
 * grader discriminates worst come first.
 */
function computeByDegradation(pairs) {
    const buckets = new Map();
    for (const pair of pairs) {
        const bucket = buckets.get(pair.degradationDescription);
        if (bucket === undefined) {
            buckets.set(pair.degradationDescription, [pair]);
        }
        else {
            bucket.push(pair);
        }
    }
    const summaries = Array.from(buckets, ([description, members]) => {
        const { avgSeparation, concordanceRate, pairCount } = computeMetrics(members);
        return {
            avgSeparation,
            concordanceRate,
            description,
            pairCount,
            targetDimension: members[0].targetDimension,
        };
    });
    summaries.sort((left, right) => left.concordanceRate - right.concordanceRate);
    return summaries;
}
|
|
94
|
+
/**
 * Summarize a set of pairs: average separation (original minus degraded),
 * concordance rate (% with original strictly higher), tied rate (% with equal
 * scores) and the pair count. Rates and the average are rounded with `round`.
 * An empty set yields all zeros.
 */
function computeMetrics(pairs) {
    const count = pairs.length;
    if (count === 0) {
        return { avgSeparation: 0, concordanceRate: 0, pairCount: 0, tiedRate: 0 };
    }
    const tally = pairs.reduce((acc, { originalScore, degradedScore }) => {
        acc.separation += originalScore - degradedScore;
        if (originalScore > degradedScore) {
            acc.concordant += 1;
        }
        else if (originalScore === degradedScore) {
            acc.tied += 1;
        }
        return acc;
    }, { concordant: 0, separation: 0, tied: 0 });
    const asPercent = (hits) => round((hits / count) * 100);
    return {
        avgSeparation: round(tally.separation / count),
        concordanceRate: asPercent(tally.concordant),
        pairCount: count,
        tiedRate: asPercent(tally.tied),
    };
}
|
|
117
|
+
/** Empty result for when there are no pairs */
|
|
118
|
+
function emptyResult(graderModel) {
|
|
119
|
+
const emptyDim = {
|
|
120
|
+
avgSeparation: 0,
|
|
121
|
+
concordanceRate: 0,
|
|
122
|
+
pairCount: 0,
|
|
123
|
+
tiedRate: 0,
|
|
124
|
+
};
|
|
125
|
+
return {
|
|
126
|
+
avgSeparation: 0,
|
|
127
|
+
byDegradation: [],
|
|
128
|
+
concordanceRate: 0,
|
|
129
|
+
crossDimension: { offTarget: emptyDim, onTarget: emptyDim },
|
|
130
|
+
failedPairs: [],
|
|
131
|
+
generatedAt: new Date().toISOString(),
|
|
132
|
+
graderModel,
|
|
133
|
+
perDimension: {
|
|
134
|
+
codeCorrectness: emptyDim,
|
|
135
|
+
docCoverage: emptyDim,
|
|
136
|
+
taskCompletion: emptyDim,
|
|
137
|
+
},
|
|
138
|
+
totalPairs: 0,
|
|
139
|
+
};
|
|
140
|
+
}
|
|
141
|
+
/** Round `n` to one decimal place by rounding the value scaled to tenths. */
function round(n) {
    const tenths = Math.round(n * 10);
    return tenths / 10;
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/grader-validate-runner.ts
|
|
3
|
+
*
|
|
4
|
+
* Orchestration module for grader validation against human reference grades
|
|
5
|
+
* (Phase 2).
|
|
6
|
+
*
|
|
7
|
+
* Loads human-graded reference samples from canonical/grader-references/,
|
|
8
|
+
* runs the grader model on each sample, and compares against human scores
|
|
9
|
+
* using `validateGrader()` from the pure computation module.
|
|
10
|
+
*
|
|
11
|
+
* Migrated from lib/grader-validate.ts — no process.argv, no process.exit(),
|
|
12
|
+
* no module-level constants. Accepts rootDir as parameter.
|
|
13
|
+
*
|
|
14
|
+
* @see docs/exec-plans/completed/grader-reliability.md — Phase 2
|
|
15
|
+
*/
|
|
16
|
+
import { type GraderValidation } from "./grader-validation.js";
|
|
17
|
+
export interface GraderValidateRunnerOptions {
|
|
18
|
+
/** Grader model to validate (defaults to loadGraderModel(rootDir).id) */
|
|
19
|
+
graderModel?: string;
|
|
20
|
+
/** MAE threshold for pass/fail (default: 10) */
|
|
21
|
+
maeThreshold?: number;
|
|
22
|
+
/** Root directory of the eval package */
|
|
23
|
+
rootDir: string;
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Format a GraderValidation result as a human-readable table report.
|
|
27
|
+
* Returns a string — does NOT print to console.
|
|
28
|
+
*/
|
|
29
|
+
export declare function formatValidationReport(result: GraderValidation): string;
|
|
30
|
+
/**
|
|
31
|
+
* Run grader validation against human reference grades.
|
|
32
|
+
*
|
|
33
|
+
* Loads human-graded samples, grades each with the grader model,
|
|
34
|
+
* and computes validation metrics (MAE, correlation, bias).
|
|
35
|
+
*
|
|
36
|
+
* @throws Error if no reference grades found, or no grades to analyze
|
|
37
|
+
*/
|
|
38
|
+
export declare function runGraderValidate(options: GraderValidateRunnerOptions): Promise<GraderValidation>;
|