@sanity/ailf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -0
- package/bin/ailf.js +64 -0
- package/canonical/grader-references/README.md +88 -0
- package/canonical/grader-references/groq.yaml +234 -0
- package/canonical/grader-references/studio-setup.yaml +275 -0
- package/canonical/reference-solutions/.gitkeep +1 -0
- package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
- package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
- package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
- package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
- package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
- package/canonical/reference-solutions/groq/joins-references.ts +300 -0
- package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
- package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
- package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
- package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
- package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
- package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
- package/config/bigquery/README.md +74 -0
- package/config/bigquery/views/area_scores.sql +87 -0
- package/config/bigquery/views/reports.sql +49 -0
- package/config/features.yaml +116 -0
- package/config/models.yaml +115 -0
- package/config/prompts.yaml +75 -0
- package/config/rubrics.yaml +62 -0
- package/config/schedules.yaml +43 -0
- package/config/sinks.yaml +54 -0
- package/config/sources.yaml +51 -0
- package/config/thresholds.yaml +49 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
- package/dist/_vendor/ailf-core/examples/index.js +285 -0
- package/dist/_vendor/ailf-core/index.d.ts +17 -0
- package/dist/_vendor/ailf-core/index.js +17 -0
- package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
- package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
- package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
- package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
- package/dist/_vendor/ailf-core/ports/context.js +14 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
- package/dist/_vendor/ailf-core/ports/index.js +7 -0
- package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
- package/dist/_vendor/ailf-core/ports/logger.js +11 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
- package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/schemas/index.js +16 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
- package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
- package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
- package/dist/_vendor/ailf-core/services/index.js +12 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/scoring.js +222 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
- package/dist/_vendor/ailf-core/types/index.js +21 -0
- package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
- package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
- package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
- package/dist/_vendor/ailf-shared/document-ref.js +1 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -0
- package/dist/_vendor/ailf-shared/index.js +16 -0
- package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
- package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
- package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
- package/dist/_vendor/ailf-shared/score-grades.js +23 -0
- package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
- package/dist/adapters/cache/content-lake-cache.js +59 -0
- package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
- package/dist/adapters/cache/filesystem-cache.js +54 -0
- package/dist/adapters/cache/index.d.ts +2 -0
- package/dist/adapters/cache/index.js +2 -0
- package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
- package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
- package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
- package/dist/adapters/config-sources/file-config-adapter.js +96 -0
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +2 -0
- package/dist/adapters/doc-fetchers/index.d.ts +1 -0
- package/dist/adapters/doc-fetchers/index.js +1 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
- package/dist/adapters/eval-runners/index.d.ts +1 -0
- package/dist/adapters/eval-runners/index.js +1 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
- package/dist/adapters/index.d.ts +12 -0
- package/dist/adapters/index.js +12 -0
- package/dist/adapters/loggers/console-logger.d.ts +22 -0
- package/dist/adapters/loggers/console-logger.js +54 -0
- package/dist/adapters/loggers/index.d.ts +9 -0
- package/dist/adapters/loggers/index.js +9 -0
- package/dist/adapters/loggers/json-logger.d.ts +18 -0
- package/dist/adapters/loggers/json-logger.js +33 -0
- package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
- package/dist/adapters/loggers/quiet-logger.js +30 -0
- package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/composite-task-source.js +59 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
- package/dist/adapters/task-sources/index.d.ts +7 -0
- package/dist/adapters/task-sources/index.js +7 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
- package/dist/adapters/task-sources/repo-schemas.js +234 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
- package/dist/adapters/task-sources/repo-task-source.js +104 -0
- package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
- package/dist/adapters/task-sources/repo-trigger.js +153 -0
- package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
- package/dist/adapters/task-sources/repo-validation.js +164 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
- package/dist/adapters/task-sources/yaml-task-source.js +136 -0
- package/dist/agent-observer/agentic-provider.d.ts +132 -0
- package/dist/agent-observer/agentic-provider.js +983 -0
- package/dist/agent-observer/classifier.d.ts +62 -0
- package/dist/agent-observer/classifier.js +269 -0
- package/dist/agent-observer/index.d.ts +7 -0
- package/dist/agent-observer/index.js +4 -0
- package/dist/agent-observer/pricing.d.ts +35 -0
- package/dist/agent-observer/pricing.js +82 -0
- package/dist/agent-observer/provider.d.ts +77 -0
- package/dist/agent-observer/provider.js +151 -0
- package/dist/agent-observer/proxy.d.ts +91 -0
- package/dist/agent-observer/proxy.js +321 -0
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/agent-observer/types.d.ts +137 -0
- package/dist/agent-observer/types.js +16 -0
- package/dist/assertions/source-isolation.d.ts +72 -0
- package/dist/assertions/source-isolation.js +117 -0
- package/dist/cli.d.ts +24 -0
- package/dist/cli.js +199 -0
- package/dist/commands/agent-report.d.ts +5 -0
- package/dist/commands/agent-report.js +69 -0
- package/dist/commands/baseline.d.ts +9 -0
- package/dist/commands/baseline.js +141 -0
- package/dist/commands/cache.d.ts +13 -0
- package/dist/commands/cache.js +135 -0
- package/dist/commands/calculate-scores.d.ts +8 -0
- package/dist/commands/calculate-scores.js +48 -0
- package/dist/commands/compare.d.ts +8 -0
- package/dist/commands/compare.js +120 -0
- package/dist/commands/completion.d.ts +18 -0
- package/dist/commands/completion.js +260 -0
- package/dist/commands/coverage-audit.d.ts +7 -0
- package/dist/commands/coverage-audit.js +40 -0
- package/dist/commands/discovery-report.d.ts +10 -0
- package/dist/commands/discovery-report.js +44 -0
- package/dist/commands/eval.d.ts +9 -0
- package/dist/commands/eval.js +35 -0
- package/dist/commands/explain-handler.d.ts +34 -0
- package/dist/commands/explain-handler.js +719 -0
- package/dist/commands/fetch-docs.d.ts +8 -0
- package/dist/commands/fetch-docs.js +128 -0
- package/dist/commands/generate-configs.d.ts +8 -0
- package/dist/commands/generate-configs.js +46 -0
- package/dist/commands/grader/index.d.ts +11 -0
- package/dist/commands/grader/index.js +118 -0
- package/dist/commands/init.d.ts +19 -0
- package/dist/commands/init.js +150 -0
- package/dist/commands/interactive.d.ts +12 -0
- package/dist/commands/interactive.js +238 -0
- package/dist/commands/lookup-doc.d.ts +15 -0
- package/dist/commands/lookup-doc.js +84 -0
- package/dist/commands/measure-retrieval.d.ts +5 -0
- package/dist/commands/measure-retrieval.js +65 -0
- package/dist/commands/pipeline-action.d.ts +71 -0
- package/dist/commands/pipeline-action.js +305 -0
- package/dist/commands/pipeline.d.ts +62 -0
- package/dist/commands/pipeline.js +53 -0
- package/dist/commands/pr-comment.d.ts +8 -0
- package/dist/commands/pr-comment.js +47 -0
- package/dist/commands/publish.d.ts +26 -0
- package/dist/commands/publish.js +253 -0
- package/dist/commands/readiness-report.d.ts +10 -0
- package/dist/commands/readiness-report.js +104 -0
- package/dist/commands/shared/options.d.ts +29 -0
- package/dist/commands/shared/options.js +57 -0
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +16 -0
- package/dist/commands/validate-tasks.js +93 -0
- package/dist/commands/validate.d.ts +9 -0
- package/dist/commands/validate.js +73 -0
- package/dist/commands/webhook-server.d.ts +5 -0
- package/dist/commands/webhook-server.js +30 -0
- package/dist/commands/weekly-digest.d.ts +10 -0
- package/dist/commands/weekly-digest.js +104 -0
- package/dist/composition-root.d.ts +26 -0
- package/dist/composition-root.js +107 -0
- package/dist/interpolate.d.ts +26 -0
- package/dist/interpolate.js +70 -0
- package/dist/job-store.d.ts +104 -0
- package/dist/job-store.js +188 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.d.ts +27 -0
- package/dist/orchestration/build-app-context.js +81 -0
- package/dist/orchestration/build-step-sequence.d.ts +15 -0
- package/dist/orchestration/build-step-sequence.js +84 -0
- package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
- package/dist/orchestration/config-to-source-overrides.js +28 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/index.d.ts +11 -0
- package/dist/orchestration/index.js +11 -0
- package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
- package/dist/orchestration/pipeline-orchestrator.js +153 -0
- package/dist/orchestration/step-runner.d.ts +20 -0
- package/dist/orchestration/step-runner.js +88 -0
- package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
- package/dist/orchestration/steps/calculate-scores-step.js +95 -0
- package/dist/orchestration/steps/callback-step.d.ts +24 -0
- package/dist/orchestration/steps/callback-step.js +76 -0
- package/dist/orchestration/steps/compare-step.d.ts +14 -0
- package/dist/orchestration/steps/compare-step.js +92 -0
- package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
- package/dist/orchestration/steps/discovery-report-step.js +55 -0
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
- package/dist/orchestration/steps/fetch-docs-step.js +135 -0
- package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
- package/dist/orchestration/steps/gap-analysis-step.js +136 -0
- package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
- package/dist/orchestration/steps/generate-configs-step.js +85 -0
- package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
- package/dist/orchestration/steps/grader-consistency-step.js +64 -0
- package/dist/orchestration/steps/index.d.ts +19 -0
- package/dist/orchestration/steps/index.js +19 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
- package/dist/orchestration/steps/publish-report-step.js +216 -0
- package/dist/orchestration/steps/readiness-step.d.ts +13 -0
- package/dist/orchestration/steps/readiness-step.js +91 -0
- package/dist/orchestration/steps/report-step.d.ts +12 -0
- package/dist/orchestration/steps/report-step.js +49 -0
- package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
- package/dist/orchestration/steps/run-eval-step.js +195 -0
- package/dist/orchestration/steps/validate-step.d.ts +12 -0
- package/dist/orchestration/steps/validate-step.js +41 -0
- package/dist/pipeline/agent-behavior-report.d.ts +53 -0
- package/dist/pipeline/agent-behavior-report.js +132 -0
- package/dist/pipeline/attribution.d.ts +47 -0
- package/dist/pipeline/attribution.js +226 -0
- package/dist/pipeline/baseline.d.ts +37 -0
- package/dist/pipeline/baseline.js +141 -0
- package/dist/pipeline/cache.d.ts +101 -0
- package/dist/pipeline/cache.js +283 -0
- package/dist/pipeline/calculate-scores.d.ts +102 -0
- package/dist/pipeline/calculate-scores.js +1128 -0
- package/dist/pipeline/callback-delivery.d.ts +50 -0
- package/dist/pipeline/callback-delivery.js +89 -0
- package/dist/pipeline/checks.d.ts +39 -0
- package/dist/pipeline/checks.js +280 -0
- package/dist/pipeline/classify-url.d.ts +61 -0
- package/dist/pipeline/classify-url.js +93 -0
- package/dist/pipeline/compare.d.ts +31 -0
- package/dist/pipeline/compare.js +208 -0
- package/dist/pipeline/coverage-audit.d.ts +39 -0
- package/dist/pipeline/coverage-audit.js +165 -0
- package/dist/pipeline/degradations.d.ts +85 -0
- package/dist/pipeline/degradations.js +242 -0
- package/dist/pipeline/discovery-report.d.ts +55 -0
- package/dist/pipeline/discovery-report.js +178 -0
- package/dist/pipeline/eval-constants.d.ts +68 -0
- package/dist/pipeline/eval-constants.js +111 -0
- package/dist/pipeline/eval-fingerprint.d.ts +66 -0
- package/dist/pipeline/eval-fingerprint.js +175 -0
- package/dist/pipeline/expand-tasks.d.ts +220 -0
- package/dist/pipeline/expand-tasks.js +421 -0
- package/dist/pipeline/failure-modes.d.ts +46 -0
- package/dist/pipeline/failure-modes.js +348 -0
- package/dist/pipeline/fetch-url-content.d.ts +44 -0
- package/dist/pipeline/fetch-url-content.js +93 -0
- package/dist/pipeline/gap-analysis.d.ts +48 -0
- package/dist/pipeline/gap-analysis.js +231 -0
- package/dist/pipeline/generate-configs.d.ts +72 -0
- package/dist/pipeline/generate-configs.js +395 -0
- package/dist/pipeline/grader-api.d.ts +49 -0
- package/dist/pipeline/grader-api.js +200 -0
- package/dist/pipeline/grader-compare-runner.d.ts +44 -0
- package/dist/pipeline/grader-compare-runner.js +301 -0
- package/dist/pipeline/grader-comparison.d.ts +111 -0
- package/dist/pipeline/grader-comparison.js +161 -0
- package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
- package/dist/pipeline/grader-consistency-runner.js +270 -0
- package/dist/pipeline/grader-consistency.d.ts +103 -0
- package/dist/pipeline/grader-consistency.js +146 -0
- package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
- package/dist/pipeline/grader-sensitivity-runner.js +282 -0
- package/dist/pipeline/grader-sensitivity.d.ts +94 -0
- package/dist/pipeline/grader-sensitivity.js +144 -0
- package/dist/pipeline/grader-validate-runner.d.ts +38 -0
- package/dist/pipeline/grader-validate-runner.js +229 -0
- package/dist/pipeline/grader-validation.d.ts +107 -0
- package/dist/pipeline/grader-validation.js +169 -0
- package/dist/pipeline/map-request-to-config.d.ts +19 -0
- package/dist/pipeline/map-request-to-config.js +80 -0
- package/dist/pipeline/measure-retrieval.d.ts +59 -0
- package/dist/pipeline/measure-retrieval.js +111 -0
- package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
- package/dist/pipeline/mirror-repo-tasks.js +350 -0
- package/dist/pipeline/plan-format.d.ts +33 -0
- package/dist/pipeline/plan-format.js +202 -0
- package/dist/pipeline/plan.d.ts +169 -0
- package/dist/pipeline/plan.js +708 -0
- package/dist/pipeline/pr-comment.d.ts +19 -0
- package/dist/pipeline/pr-comment.js +502 -0
- package/dist/pipeline/probe.d.ts +52 -0
- package/dist/pipeline/probe.js +390 -0
- package/dist/pipeline/provenance.d.ts +47 -0
- package/dist/pipeline/provenance.js +146 -0
- package/dist/pipeline/readiness-report.d.ts +87 -0
- package/dist/pipeline/readiness-report.js +205 -0
- package/dist/pipeline/release-classification.d.ts +54 -0
- package/dist/pipeline/release-classification.js +238 -0
- package/dist/pipeline/release-report.d.ts +37 -0
- package/dist/pipeline/release-report.js +222 -0
- package/dist/pipeline/repo-eval-comment.d.ts +37 -0
- package/dist/pipeline/repo-eval-comment.js +165 -0
- package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
- package/dist/pipeline/repo-threshold-evaluator.js +162 -0
- package/dist/pipeline/resolve-mappings.d.ts +35 -0
- package/dist/pipeline/resolve-mappings.js +72 -0
- package/dist/pipeline/retrieval-metrics.d.ts +39 -0
- package/dist/pipeline/retrieval-metrics.js +136 -0
- package/dist/pipeline/reverse-mapping.d.ts +67 -0
- package/dist/pipeline/reverse-mapping.js +88 -0
- package/dist/pipeline/schemas.d.ts +9 -0
- package/dist/pipeline/schemas.js +9 -0
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +88 -0
- package/dist/pipeline/targeted-loo.js +203 -0
- package/dist/pipeline/thresholds.d.ts +27 -0
- package/dist/pipeline/thresholds.js +245 -0
- package/dist/pipeline/types.d.ts +10 -0
- package/dist/pipeline/types.js +10 -0
- package/dist/pipeline/validate.d.ts +67 -0
- package/dist/pipeline/validate.js +406 -0
- package/dist/pipeline/webhook-server.d.ts +37 -0
- package/dist/pipeline/webhook-server.js +133 -0
- package/dist/report-store.d.ts +84 -0
- package/dist/report-store.js +208 -0
- package/dist/sanity/client.d.ts +38 -0
- package/dist/sanity/client.js +86 -0
- package/dist/sanity/portable-text.d.ts +11 -0
- package/dist/sanity/portable-text.js +211 -0
- package/dist/sanity/queries.d.ts +133 -0
- package/dist/sanity/queries.js +300 -0
- package/dist/schedules/digest.d.ts +116 -0
- package/dist/schedules/digest.js +156 -0
- package/dist/schedules/index.d.ts +12 -0
- package/dist/schedules/index.js +10 -0
- package/dist/schedules/loader.d.ts +31 -0
- package/dist/schedules/loader.js +73 -0
- package/dist/schedules/schema.d.ts +9 -0
- package/dist/schedules/schema.js +9 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +21 -0
- package/dist/scripts/validate-task-sources.js +210 -0
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/bigquery/index.d.ts +131 -0
- package/dist/sinks/bigquery/index.js +222 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/index.d.ts +23 -0
- package/dist/sinks/index.js +18 -0
- package/dist/sinks/loader.d.ts +18 -0
- package/dist/sinks/loader.js +82 -0
- package/dist/sinks/retry.d.ts +24 -0
- package/dist/sinks/retry.js +52 -0
- package/dist/sinks/schema.d.ts +9 -0
- package/dist/sinks/schema.js +9 -0
- package/dist/sinks/slack/format.d.ts +65 -0
- package/dist/sinks/slack/format.js +327 -0
- package/dist/sinks/slack/index.d.ts +27 -0
- package/dist/sinks/slack/index.js +78 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +59 -0
- package/dist/sinks/types.js +44 -0
- package/dist/sinks/webhook/index.d.ts +19 -0
- package/dist/sinks/webhook/index.js +50 -0
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/sources.d.ts +104 -0
- package/dist/sources.js +292 -0
- package/dist/webhook/budget.d.ts +42 -0
- package/dist/webhook/budget.js +60 -0
- package/dist/webhook/debounce.d.ts +67 -0
- package/dist/webhook/debounce.js +76 -0
- package/dist/webhook/dispatch.d.ts +45 -0
- package/dist/webhook/dispatch.js +84 -0
- package/dist/webhook/eval-request-handler.d.ts +87 -0
- package/dist/webhook/eval-request-handler.js +181 -0
- package/dist/webhook/handler.d.ts +88 -0
- package/dist/webhook/handler.js +203 -0
- package/dist/webhook/index.d.ts +17 -0
- package/dist/webhook/index.js +12 -0
- package/dist/webhook/types.d.ts +109 -0
- package/dist/webhook/types.js +10 -0
- package/package.json +72 -0
- package/tasks/.expanded.agentic.yaml +51 -0
- package/tasks/.expanded.yaml +66 -0
- package/tasks/frameworks.yaml +98 -0
- package/tasks/functions.yaml +51 -0
- package/tasks/groq.yaml +216 -0
- package/tasks/nextjs-live.yaml +62 -0
- package/tasks/studio-setup.yaml +111 -0
- package/tasks/visual-editing.yaml +120 -0
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/grader-validate-runner.ts
|
|
3
|
+
*
|
|
4
|
+
* Orchestration module for grader validation against human reference grades
|
|
5
|
+
* (Phase 2).
|
|
6
|
+
*
|
|
7
|
+
* Loads human-graded reference samples from canonical/grader-references/,
|
|
8
|
+
* runs the grader model on each sample, and compares against human scores
|
|
9
|
+
* using `validateGrader()` from the pure computation module.
|
|
10
|
+
*
|
|
11
|
+
* Migrated from lib/grader-validate.ts — no process.argv, no process.exit(),
|
|
12
|
+
* no module-level constants. Accepts rootDir as parameter.
|
|
13
|
+
*
|
|
14
|
+
* @see docs/exec-plans/completed/grader-reliability.md — Phase 2
|
|
15
|
+
*/
|
|
16
|
+
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
|
|
17
|
+
import { join } from "path";
|
|
18
|
+
import { load } from "js-yaml";
|
|
19
|
+
import { gradeOnce, loadGraderModel } from "./grader-api.js";
|
|
20
|
+
import { classifyCorrelation, validateGrader, } from "./grader-validation.js";
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// Internal helpers
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
/**
 * Translate a kebab-case dimension id into the camelCase key used on
 * validation results.
 *
 * @param dim - dimension id, e.g. "code-correctness"
 * @returns the matching camelCase key ("codeCorrectness", "docCoverage",
 *   "taskCompletion"), or `null` when `dim` is not one of the three known ids
 */
function mapDimension(dim) {
    // [kebab-case id, camelCase key] pairs; built locally so the module keeps
    // no top-level constants.
    const pairs = [
        ["code-correctness", "codeCorrectness"],
        ["doc-coverage", "docCoverage"],
        ["task-completion", "taskCompletion"],
    ];
    // Strict equality mirrors `switch` matching semantics.
    const hit = pairs.find(([kebab]) => kebab === dim);
    return hit ? hit[1] : null;
}
|
|
36
|
+
/**
 * Gather every human reference grade stored under
 * `<rootDir>/canonical/grader-references/`.
 *
 * Entries in that directory whose names end in `.yaml` or `.yml` are read in
 * sorted name order and parsed with js-yaml's `load`. A file whose top-level
 * value is an array contributes each of its elements; a file whose top-level
 * value is a single non-null object contributes that object; any other
 * top-level value contributes nothing.
 *
 * @param rootDir - directory that contains the `canonical/` folder
 * @returns flat array of the parsed entries, in file order
 * @throws Error if the directory does not exist or holds no YAML files
 */
function loadReferenceGrades(rootDir) {
    const refsDir = join(rootDir, "canonical", "grader-references");
    if (!existsSync(refsDir)) {
        const message = `Reference grades directory not found: ${refsDir}. ` +
            "Create canonical/grader-references/ with YAML reference files. " +
            "See docs/exec-plans/completed/grader-reliability.md — Phase 2.";
        throw new Error(message);
    }
    const yamlExtensions = [".yaml", ".yml"];
    const yamlNames = readdirSync(refsDir).filter((name) => yamlExtensions.some((ext) => name.endsWith(ext)));
    // Sort so the result order does not depend on directory listing order.
    yamlNames.sort();
    if (yamlNames.length === 0) {
        throw new Error(`No YAML files found in ${refsDir}`);
    }
    const collected = [];
    for (const name of yamlNames) {
        const doc = load(readFileSync(join(refsDir, name), "utf-8"));
        if (Array.isArray(doc)) {
            // A list document: append each element individually.
            for (const entry of doc) {
                collected.push(entry);
            }
            continue;
        }
        // A single mapping document is kept as one entry; scalars and
        // null/undefined documents are ignored.
        if (doc !== null && typeof doc === "object") {
            collected.push(doc);
        }
    }
    return collected;
}
|
|
67
|
+
// ---------------------------------------------------------------------------
|
|
68
|
+
// Report formatting (pure)
|
|
69
|
+
// ---------------------------------------------------------------------------
|
|
70
|
+
/**
 * Render a grader-validation result as a plain-text report.
 *
 * Sections, in order: banner, overall metrics, per-dimension table,
 * pass/fail verdict and — when there are any — up to five of the largest
 * grader-vs-human disagreements. Nothing is printed here; the text is
 * returned so the caller decides where it goes.
 *
 * @param result Validation result to render
 * @returns The whole report as one newline-joined string
 */
export function formatValidationReport(result) {
    const heavyRule = "=".repeat(80);
    const lightRule = "-".repeat(80);
    // Each titled section is framed by two light rules and followed by a blank line.
    const sectionHeader = (title) => [lightRule, title, lightRule, ""];
    // Positive values get an explicit "+"; zero and negatives print as-is.
    const withSign = (value) => (value > 0 ? `+${value}` : `${value}`);
    let biasMeaning = "no bias";
    if (result.overallBias > 0) {
        biasMeaning = "grader scores higher";
    }
    else if (result.overallBias < 0) {
        biasMeaning = "grader scores lower";
    }
    const report = [
        heavyRule,
        " GRADER VALIDATION REPORT",
        heavyRule,
        "",
        ` Grader: ${result.graderModel}`,
        ` Observations: ${result.totalObservations}`,
        "",
        ...sectionHeader("OVERALL METRICS"),
        ` MAE: ${result.overallMae} points`,
        ` Correlation: r=${result.overallCorrelation} (${classifyCorrelation(result.overallCorrelation)})`,
        ` Bias: ${withSign(result.overallBias)} (${biasMeaning})`,
        "",
        ...sectionHeader("PER-DIMENSION VALIDITY"),
        "| Dimension | MAE | Correlation | Quality | Bias | Count |",
        "|------------------|-------|-------------|-----------|--------|-------|",
    ];
    // One table row per scoring dimension, in fixed display order.
    const dimensionRows = [
        { label: "Task Completion", metrics: result.perDimension.taskCompletion },
        { label: "Code Correctness", metrics: result.perDimension.codeCorrectness },
        { label: "Doc Coverage", metrics: result.perDimension.docCoverage },
    ];
    for (const { label, metrics } of dimensionRows) {
        const cells = [
            label.padEnd(16),
            String(metrics.mae).padStart(5),
            `r=${String(metrics.correlation).padStart(9)}`,
            classifyCorrelation(metrics.correlation).padEnd(9),
            withSign(metrics.bias).padStart(6),
            String(metrics.count).padStart(5),
        ];
        report.push(`| ${cells.join(" | ")} |`);
    }
    report.push("", ...sectionHeader("VERDICT"));
    report.push(result.passesThreshold
        ? ` ✅ PASSED: MAE ${result.overallMae} < threshold ${result.maeThreshold}`
        : ` ❌ FAILED: MAE ${result.overallMae} >= threshold ${result.maeThreshold}`);
    report.push("");
    // The result may carry more entries; the report shows at most five.
    const worst = result.largestDisagreements.slice(0, 5);
    if (worst.length > 0) {
        report.push(...sectionHeader(`TOP ${worst.length} LARGEST DISAGREEMENTS`));
        worst.forEach((entry, position) => {
            report.push(` ${position + 1}. ${entry.taskId} — ${entry.dimension}`);
            report.push(` Human=${entry.humanScore}, Grader=${entry.graderScore} (${withSign(entry.signedError)})`);
            if (entry.notes) {
                report.push(` Note: ${entry.notes}`);
            }
        });
        report.push("");
    }
    return report.join("\n");
}
|
|
144
|
+
// ---------------------------------------------------------------------------
|
|
145
|
+
// Main runner
|
|
146
|
+
// ---------------------------------------------------------------------------
|
|
147
|
+
/**
 * Run grader validation against human reference grades.
 *
 * Loads the human-graded samples, scores every (response, rubric) pair with
 * the grader model one call at a time, computes validation metrics (MAE,
 * correlation, bias), prints the report and writes the result to
 * `<rootDir>/results/latest/grader-validation.json`.
 *
 * @param options `rootDir` (required) plus optional `graderModel`
 *   (falls back to `loadGraderModel(rootDir).id`) and `maeThreshold`
 *   (default 10)
 * @returns The validation result — only when the MAE threshold is met
 * @throws Error if no reference grades found, if no grades could be
 *   collected, or — after the report and JSON file have been written — if
 *   the result does not pass the MAE threshold
 */
export async function runGraderValidate(options) {
    const { rootDir } = options;
    const maeThreshold = options.maeThreshold ?? 10;
    console.log("=== Grader Validation ===\n");
    // Resolve grader model
    const graderModel = options.graderModel ?? loadGraderModel(rootDir).id;
    console.log(` Grader: ${graderModel}`);
    console.log(` Threshold: MAE < ${maeThreshold}`);
    // Load reference grades
    const rawGrades = loadReferenceGrades(rootDir);
    console.log(` Samples: ${rawGrades.length} reference-graded responses`);
    // Count total rubric judgments
    let totalJudgments = 0;
    for (const rg of rawGrades) {
        totalJudgments += rg.rubrics.length;
    }
    console.log(` Judgments: ${totalJudgments} (response × rubric pairs)`);
    // Rough estimate: a flat $0.005 per judgment.
    const estimatedCost = totalJudgments * 0.005;
    console.log(` Est. cost: ~$${estimatedCost.toFixed(2)}`);
    console.log();
    // Grade each reference sample
    console.log(" Running grader on reference samples...");
    const grades = [];
    let processed = 0;
    let failed = 0;
    // Progress counts every rubric visited — skipped ones included — so the
    // counter always ends on `totalJudgments` and the final line is printed.
    const reportProgress = () => {
        processed++;
        if (processed % 5 === 0 || processed === totalJudgments) {
            process.stdout.write(`\r Progress: ${processed}/${totalJudgments}`);
        }
    };
    for (const ref of rawGrades) {
        for (const rubric of ref.rubrics) {
            const dimension = mapDimension(rubric.dimension);
            if (!dimension) {
                console.error(` ⚠ Unknown dimension '${rubric.dimension}' — skipping`);
                reportProgress();
                continue;
            }
            const graderScore = await gradeOnce(graderModel, ref.response, rubric.rubricText);
            reportProgress();
            // A null score marks a failed grading call; it is counted and left out of the analysis.
            if (graderScore === null) {
                failed++;
                continue;
            }
            grades.push({
                area: ref.area,
                dimension,
                graderScore,
                humanScore: rubric.humanScore,
                taskId: ref.taskId,
                ...(rubric.notes && { notes: rubric.notes }),
            });
        }
    }
    console.log(); // newline after progress
    if (failed > 0) {
        console.log(` ⚠ ${failed} grading calls failed (excluded from analysis)`);
    }
    console.log();
    if (grades.length === 0) {
        throw new Error("No grades to analyze. All grading calls failed.");
    }
    // Validate
    const result = validateGrader(grades, graderModel, { maeThreshold });
    // Print report
    console.log(formatValidationReport(result));
    // Write output
    const outDir = join(rootDir, "results", "latest");
    mkdirSync(outDir, { recursive: true });
    const outPath = join(outDir, "grader-validation.json");
    writeFileSync(outPath, JSON.stringify(result, null, 2));
    console.log(`\n 📄 Results written to ${outPath}`);
    // Throw if threshold not met (instead of process.exit)
    if (!result.passesThreshold) {
        throw new Error(`VALIDATION FAILED: MAE ${result.overallMae} exceeds threshold ${maeThreshold}`);
    }
    return result;
}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/grader-validation.ts
|
|
3
|
+
*
|
|
4
|
+
* Pure computation module for grader validation against human reference grades.
|
|
5
|
+
*
|
|
6
|
+
* Takes paired (graderScore, humanScore) observations and computes:
|
|
7
|
+
* - Mean Absolute Error (MAE)
|
|
8
|
+
* - Pearson correlation
|
|
9
|
+
* - Per-dimension breakdown
|
|
10
|
+
* - Largest disagreements (for diagnosis)
|
|
11
|
+
*
|
|
12
|
+
* This module has NO side effects — no file I/O, no API calls.
|
|
13
|
+
*
|
|
14
|
+
* @see docs/exec-plans/completed/grader-reliability.md — Phase 2
|
|
15
|
+
*/
|
|
16
|
+
/**
 * Quality label attached to a correlation value, listed from the strongest
 * band to the weakest.
 */
export type CorrelationQuality =
    | "excellent"
    | "good"
    | "moderate"
    | "poor"
    | "very-poor";
|
|
18
|
+
/** Agreement metrics between grader and human scores for one scoring dimension. */
export interface DimensionValidity {
    /** Mean signed error, grader minus human — positive means the grader scores higher than the human. */
    bias: number;
    /** Pearson correlation coefficient between grader and human scores (-1 to 1). */
    correlation: number;
    /** How many observations these metrics were computed from. */
    count: number;
    /** Mean Absolute Error within this dimension. */
    mae: number;
}
|
|
29
|
+
/** One grader-vs-human score pair reported as a large disagreement. */
export interface Disagreement {
    /** Size of the gap between the two scores, ignoring direction. */
    absoluteError: number;
    /** Feature area the judgment belongs to. */
    area: string;
    /** Scoring dimension that was judged. */
    dimension:
        | "codeCorrectness"
        | "docCoverage"
        | "taskCompletion";
    /** Score given by the grader. */
    graderScore: number;
    /** Score given by the human. */
    humanScore: number;
    /** Notes from the human grader, when any were recorded. */
    notes?: string;
    /** Gap with direction: grader score minus human score. */
    signedError: number;
    /** Task the judgment belongs to. */
    taskId: string;
}
|
|
48
|
+
/** Complete outcome of validating one grader model against human reference grades. */
export interface GraderValidation {
    /** Timestamp of when this validation was generated. */
    generatedAt: string;
    /** Identifier of the grader model that was validated. */
    graderModel: string;
    /** The largest disagreements, ordered by absolute error from largest to smallest. */
    largestDisagreements: Disagreement[];
    /** MAE threshold the verdict was taken against. */
    maeThreshold: number;
    /** Bias over all observations — positive means the grader grades higher than humans on average. */
    overallBias: number;
    /** Pearson correlation over all observations. */
    overallCorrelation: number;
    /** Mean Absolute Error over all observations, every dimension pooled. */
    overallMae: number;
    /** Verdict: whether the grader passes the MAE threshold (default: MAE < 10). */
    passesThreshold: boolean;
    /** The same metrics broken down by scoring dimension. */
    perDimension: {
        taskCompletion: DimensionValidity;
        codeCorrectness: DimensionValidity;
        docCoverage: DimensionValidity;
    };
    /** Number of (grader, human) score pairs that were analyzed. */
    totalObservations: number;
}
|
|
75
|
+
/** One (response, rubric) judgment: the human expert's score paired with the grader's. */
export interface HumanReferenceGrade {
    /** Feature area the judgment belongs to. */
    area: string;
    /** Scoring dimension being judged. */
    dimension:
        | "codeCorrectness"
        | "docCoverage"
        | "taskCompletion";
    /** Grader's score (0–100) — filled in once the grader has run. */
    graderScore: number;
    /** Human expert's score (0–100). */
    humanScore: number;
    /** Notes from the human grader, when any were recorded. */
    notes?: string;
    /** Task this judgment belongs to. */
    taskId: string;
}
|
|
90
|
+
/**
 * Turn a correlation value into a human-readable quality label.
 *
 * @param r Correlation value to classify
 * @returns The matching quality label
 */
export declare function classifyCorrelation(r: number): CorrelationQuality;
|
|
92
|
+
/**
 * Pearson correlation coefficient of two number arrays.
 *
 * @param x First series
 * @param y Second series — expected to have the same length as `x`
 * @returns The correlation coefficient
 */
export declare function pearsonCorrelation(x: number[], y: number[]): number;
|
|
94
|
+
/**
 * Compare grader scores with human reference grades and summarise how well
 * they agree.
 *
 * Main entry point of the module; a pure function with no side effects.
 *
 * @param grades Paired (grader, human) observations
 * @param graderModel Identifier of the grader model being validated
 * @param options Optional settings: the MAE threshold for the verdict and
 *   the maximum number of disagreements to include
 * @returns The GraderValidation report
 */
export declare function validateGrader(
    grades: HumanReferenceGrade[],
    graderModel: string,
    options?: {
        maeThreshold?: number;
        maxDisagreements?: number;
    },
): GraderValidation;
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/grader-validation.ts
|
|
3
|
+
*
|
|
4
|
+
* Pure computation module for grader validation against human reference grades.
|
|
5
|
+
*
|
|
6
|
+
* Takes paired (graderScore, humanScore) observations and computes:
|
|
7
|
+
* - Mean Absolute Error (MAE)
|
|
8
|
+
* - Pearson correlation
|
|
9
|
+
* - Per-dimension breakdown
|
|
10
|
+
* - Largest disagreements (for diagnosis)
|
|
11
|
+
*
|
|
12
|
+
* This module has NO side effects — no file I/O, no API calls.
|
|
13
|
+
*
|
|
14
|
+
* @see docs/exec-plans/completed/grader-reliability.md — Phase 2
|
|
15
|
+
*/
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
// Pure computation
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
/**
 * Turn a correlation value into a human-readable quality label.
 *
 * Only the magnitude matters: the sign of `r` is dropped before banding.
 * Anything below the lowest band — including NaN — is "very-poor".
 */
export function classifyCorrelation(r) {
    const strength = Math.abs(r);
    // Lower bound of each band, strongest first; the first match wins.
    const bands = [
        { floor: 0.9, label: "excellent" },
        { floor: 0.7, label: "good" },
        { floor: 0.5, label: "moderate" },
        { floor: 0.3, label: "poor" },
    ];
    for (const band of bands) {
        if (strength >= band.floor) {
            return band.label;
        }
    }
    return "very-poor";
}
|
|
32
|
+
/**
 * Compute Pearson correlation coefficient between two arrays of equal length.
 *
 * @param x First series
 * @param y Second series — must have the same length as `x`
 * @returns r in [-1, 1]; 0 when fewer than two pairs are supplied or when
 *   either series has zero variance (the coefficient is undefined there)
 * @throws Error if the arrays differ in length
 */
export function pearsonCorrelation(x, y) {
    // Check the shapes first so a length mismatch is never hidden by the
    // short-input shortcut below.
    if (x.length !== y.length) {
        throw new Error(`Arrays must have equal length (got ${x.length} and ${y.length})`);
    }
    const n = x.length;
    if (n < 2)
        return 0;
    const meanX = x.reduce((s, v) => s + v, 0) / n;
    const meanY = y.reduce((s, v) => s + v, 0) / n;
    let sumXY = 0;
    let sumX2 = 0;
    let sumY2 = 0;
    for (let i = 0; i < n; i++) {
        const dx = x[i] - meanX;
        const dy = y[i] - meanY;
        sumXY += dx * dy;
        sumX2 += dx * dx;
        sumY2 += dy * dy;
    }
    const denom = Math.sqrt(sumX2 * sumY2);
    // A constant series has no variance; report 0 rather than dividing by zero.
    if (denom === 0)
        return 0;
    // Floating-point rounding can push the ratio a hair outside [-1, 1]; clamp it.
    return Math.max(-1, Math.min(1, sumXY / denom));
}
|
|
57
|
+
/**
 * Compare grader scores with human reference grades and summarise how well
 * they agree.
 *
 * Main entry point of the module; a pure function with no side effects.
 * With no observations at all it returns an all-zero report whose verdict
 * is a pass.
 *
 * @param grades Paired (grader, human) observations
 * @param graderModel Identifier of the grader model being validated
 * @param options Optional settings: `maeThreshold` (default 10) and
 *   `maxDisagreements` (default 10)
 * @returns The GraderValidation report
 */
export function validateGrader(grades, graderModel, options) {
    const maeThreshold = options?.maeThreshold ?? 10;
    const maxDisagreements = options?.maxDisagreements ?? 10;
    const toPair = (g) => ({ grader: g.graderScore, human: g.humanScore });
    const zeroMetrics = () => ({ bias: 0, correlation: 0, count: 0, mae: 0 });
    if (grades.length === 0) {
        return {
            generatedAt: new Date().toISOString(),
            graderModel,
            largestDisagreements: [],
            maeThreshold,
            overallBias: 0,
            overallCorrelation: 0,
            overallMae: 0,
            passesThreshold: true,
            perDimension: {
                codeCorrectness: zeroMetrics(),
                docCoverage: zeroMetrics(),
                taskCompletion: zeroMetrics(),
            },
            totalObservations: 0,
        };
    }
    // Metrics over every observation, all dimensions pooled.
    const pooled = grades.map((g) => toPair(g));
    const overallMae = computeMae(pooled);
    const rawCorrelation = pearsonCorrelation(pooled.map((pair) => pair.grader), pooled.map((pair) => pair.human));
    const overallCorrelation = Math.round(rawCorrelation * 100) / 100;
    const overallBias = computeBias(pooled);
    // The same metrics again, restricted to one dimension at a time.
    const metricsFor = (dimension) => computeDimensionValidity(grades.filter((g) => g.dimension === dimension).map((g) => toPair(g)));
    const perDimension = {
        codeCorrectness: metricsFor("codeCorrectness"),
        docCoverage: metricsFor("docCoverage"),
        taskCompletion: metricsFor("taskCompletion"),
    };
    // Rank every observation by how far apart the two scores are.
    const ranked = grades.map((g) => {
        const signedError = g.graderScore - g.humanScore;
        const entry = {
            absoluteError: Math.abs(signedError),
            area: g.area,
            dimension: g.dimension,
            graderScore: g.graderScore,
            humanScore: g.humanScore,
            signedError,
            taskId: g.taskId,
        };
        // `notes` is carried over only when it has content.
        if (g.notes) {
            entry.notes = g.notes;
        }
        return entry;
    });
    ranked.sort((a, b) => b.absoluteError - a.absoluteError);
    const largestDisagreements = ranked.slice(0, maxDisagreements);
    return {
        generatedAt: new Date().toISOString(),
        graderModel,
        largestDisagreements,
        maeThreshold,
        overallBias,
        overallCorrelation,
        overallMae,
        passesThreshold: overallMae < maeThreshold,
        perDimension,
        totalObservations: grades.length,
    };
}
|
|
142
|
+
/**
 * Mean signed error (grader minus human) over the given pairs, rounded to
 * one decimal place. An empty list yields 0.
 */
function computeBias(pairs) {
    const count = pairs.length;
    if (count === 0) {
        return 0;
    }
    let signedTotal = 0;
    for (const pair of pairs) {
        signedTotal += pair.grader - pair.human;
    }
    const mean = signedTotal / count;
    return Math.round(mean * 10) / 10;
}
|
|
149
|
+
/**
 * Bias, correlation (rounded to two decimals), count and MAE for one set of
 * paired observations. An empty set yields all zeros.
 */
function computeDimensionValidity(pairs) {
    const count = pairs.length;
    if (count === 0) {
        return { bias: 0, correlation: 0, count: 0, mae: 0 };
    }
    const rawCorrelation = pearsonCorrelation(pairs.map((pair) => pair.grader), pairs.map((pair) => pair.human));
    return {
        bias: computeBias(pairs),
        correlation: Math.round(rawCorrelation * 100) / 100,
        count,
        mae: computeMae(pairs),
    };
}
|
|
163
|
+
/**
 * Mean Absolute Error between grader and human scores over the given pairs,
 * rounded to one decimal place. An empty list yields 0.
 */
function computeMae(pairs) {
    const count = pairs.length;
    if (count === 0) {
        return 0;
    }
    let absoluteTotal = 0;
    for (const pair of pairs) {
        absoluteTotal += Math.abs(pair.grader - pair.human);
    }
    const mean = absoluteTotal / count;
    return Math.round(mean * 10) / 10;
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { PipelineRequest, ResolvedConfig } from "../_vendor/ailf-core/index.d.ts";
|
|
2
|
+
/**
 * Map a PipelineRequest to a ResolvedConfig.
 *
 * One mapping function serves every entry point:
 * - API gateway (POST /v1/pipeline → PipelineRequest → ResolvedConfig)
 * - CLI --config (JSON file → PipelineRequest → ResolvedConfig)
 * - GitHub Actions (repository_dispatch payload → PipelineRequest → ResolvedConfig)
 *
 * Fields missing from the PipelineRequest get sensible defaults. Fields
 * that exist only in ResolvedConfig (rootDir, promptfooUrl, etc.) must be
 * provided by the caller via the rootDir parameter.
 *
 * **Publish default:** when `jobId` is present (API-triggered evaluation),
 * `publish` defaults to `true` so the report is persisted to the Content
 * Lake and the job document gets a `reportId`. Callers can still override
 * with `publish: false`.
 *
 * @param request The incoming pipeline request
 * @param rootDir Root directory to place on the resulting config
 * @returns The resolved configuration
 */
export declare function mapRequestToConfig(
    request: PipelineRequest,
    rootDir: string,
): ResolvedConfig;
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
/**
 * Map a PipelineRequest to a ResolvedConfig.
 *
 * One mapping function serves every entry point:
 * - API gateway (POST /v1/pipeline → PipelineRequest → ResolvedConfig)
 * - CLI --config (JSON file → PipelineRequest → ResolvedConfig)
 * - GitHub Actions (repository_dispatch payload → PipelineRequest → ResolvedConfig)
 *
 * Fields missing from the PipelineRequest get sensible defaults. Fields
 * that exist only in ResolvedConfig (rootDir, promptfooUrl, etc.) must be
 * provided by the caller via the rootDir parameter.
 *
 * **Publish default:** when `jobId` is present (API-triggered evaluation),
 * `publish` defaults to `true` so the report is persisted to the Content
 * Lake and the job document gets a `reportId`. Callers can still override
 * with `publish: false`.
 */
export function mapRequestToConfig(request, rootDir) {
    // Fields copied across unchanged.
    const {
        areas,
        tasks,
        changedDocs,
        source,
        compareThreshold,
        compareBaseline,
        publishTag,
        graderReplications,
        urls,
        headers,
        allowedOrigins,
        concurrency,
        callback,
        jobId,
    } = request;
    // A jobId marks an API-triggered evaluation. Those publish by default;
    // otherwise the job's reportId stays null and GET /v1/reports/:id has
    // nothing to return.
    const publishByDefault = Boolean(jobId);
    return {
        rootDir,
        mode: request.mode ?? "full",
        debug: mapDebug(request.debug),
        areas,
        tasks,
        changedDocs,
        source,
        skipFetch: false,
        skipEval: false,
        compareEnabled: request.compare ?? false,
        compareThreshold,
        compareBaseline,
        gapAnalysisEnabled: request.gapAnalysis ?? true,
        readinessEnabled: request.readiness ?? false,
        discoveryReportEnabled: request.discoveryReport ?? false,
        publishEnabled: request.publish ?? publishByDefault,
        publishTag,
        noCache: request.noCache ?? false,
        noRemoteCache: request.noRemoteCache ?? false,
        graderReplications,
        urls,
        headers,
        allowedOrigins,
        searchMode: request.searchMode ?? "open",
        concurrency,
        datasetOverride: request.dataset,
        projectIdOverride: request.projectId,
        perspectiveOverride: request.perspective,
        taskSourceType: mapTaskSourceType(request.taskMode),
        // Not taken from the request: always left unset here.
        outputPath: undefined,
        promptfooUrl: undefined,
        studioOriginOverride: undefined,
        sanityDocumentArgs: undefined,
        beforeOption: undefined,
        repoTasksPath: undefined,
        callback,
        jobId,
    };
}
|
|
63
|
+
/**
 * Normalise the request's `debug` setting.
 *
 * @param debug `undefined`, `null` or `false` (debugging off), `true`
 *   (debugging on with no further options), or an options object
 * @returns `undefined` when debugging is off; otherwise an object with
 *   `enabled` plus, for the object form, `firstN`, `pattern` and `sample`
 *   copied across (`enabled` defaults to `true` when omitted)
 */
function mapDebug(debug) {
    // `== null` covers both undefined and an explicit null (e.g. `"debug": null`
    // in a JSON payload), which would otherwise crash on `debug.enabled` below.
    if (debug == null || debug === false)
        return undefined;
    if (debug === true)
        return { enabled: true };
    return {
        enabled: debug.enabled ?? true,
        firstN: debug.firstN,
        pattern: debug.pattern,
        sample: debug.sample,
    };
}
|
|
75
|
+
/**
 * Translate the request's `taskMode` into a task source type.
 *
 * "content-lake" and "yaml" pass through unchanged; every other value —
 * including "inline", which is handled separately by the caller — maps to
 * undefined.
 */
function mapTaskSourceType(taskMode) {
    switch (taskMode) {
        case "content-lake":
        case "yaml":
            return taskMode;
        default:
            return undefined;
    }
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* measure-retrieval.ts
|
|
3
|
+
*
|
|
4
|
+
* Pure retrieval quality measurement functions.
|
|
5
|
+
*
|
|
6
|
+
* Evaluates retrieval quality by comparing what Sanity's text search
|
|
7
|
+
* returns against the manually-annotated canonical documents for each
|
|
8
|
+
* evaluation task. Produces Recall@K and NDCG@K metrics.
|
|
9
|
+
*
|
|
10
|
+
* This answers: "Can a retriever find the docs an LLM actually needs?"
|
|
11
|
+
*
|
|
12
|
+
* Migrated from lib/measure-retrieval.ts — no process.argv/process.env,
|
|
13
|
+
* accepts rootDir and retriever function as parameters.
|
|
14
|
+
*/
|
|
15
|
+
/**
 * Retrieval metrics for a single evaluation task.
 *
 * NOTE(review): field meanings below are read off the field names and the
 * module header; the code that fills them in is not visible here — confirm.
 */
export interface RetrievalResult {
    /** Canonical documents annotated for the task. */
    canonical_docs: string[];
    /** Feature area the task belongs to. */
    feature_area: string;
    /** NDCG@10 for the task. */
    ndcg_at_10: number;
    /** Recall@5 for the task. */
    recall_at_5: number;
    /** Recall@10 for the task. */
    recall_at_10: number;
    /** Documents the retriever returned for the task. */
    retrieved_docs: string[];
    /** Identifier of the task. */
    task_id: string;
}
|
|
24
|
+
/**
 * Retrieval metrics at three levels: per area, overall, and per task.
 *
 * NOTE(review): `by_area` is presumably keyed by feature area and the
 * `avg_*` fields are presumably means over the tasks concerned — the
 * aggregation code is not visible here; confirm.
 */
export interface RetrievalSummary {
    /** Aggregated metrics for each area, with the number of tasks behind them. */
    by_area: Record<
        string,
        {
            avg_recall_at_5: number;
            avg_recall_at_10: number;
            avg_ndcg_at_10: number;
            task_count: number;
        }
    >;
    /** Aggregated metrics across everything measured. */
    overall: {
        avg_recall_at_5: number;
        avg_recall_at_10: number;
        avg_ndcg_at_10: number;
    };
    /** The individual per-task results. */
    results: RetrievalResult[];
}
|
|
38
|
+
/**
 * A retriever function: given a query and k, resolves to k document slugs.
 */
export type RetrieverFn = (
    query: string,
    k: number,
) => Promise<string[]>;
|
|
40
|
+
/**
 * Recall@K for one task, computed from the canonical and the retrieved
 * document lists.
 *
 * NOTE(review): the exact formula and the handling of an empty canonical
 * list live in the implementation, which is not visible here — confirm.
 *
 * @param canonical Canonical documents for the task
 * @param retrieved Documents returned by the retriever
 * @param k Cut-off K
 */
export declare function calculateRecall(
    canonical: string[],
    retrieved: string[],
    k: number,
): number;
|
|
41
|
+
/**
 * NDCG@K for one task, computed from the canonical and the retrieved
 * document lists.
 *
 * NOTE(review): the gain and discount scheme lives in the implementation,
 * which is not visible here — confirm.
 *
 * @param canonical Canonical documents for the task
 * @param retrieved Documents returned by the retriever
 * @param k Cut-off K
 */
export declare function calculateNDCG(
    canonical: string[],
    retrieved: string[],
    k: number,
): number;
|
|
42
|
+
/** Inputs accepted by `measureRetrieval`. */
export interface MeasureRetrievalOptions {
    /** Root directory of the eval package. */
    rootDir: string;
    /** Retriever to measure — injected by the caller, which keeps the function testable. */
    retriever: RetrieverFn;
    /** Progress callback, if the caller wants one; receives an area, a task id and a result. */
    onProgress?: (
        area: string,
        taskId: string,
        result: RetrievalResult,
    ) => void;
}
|
|
50
|
+
/**
 * Measure retrieval quality for all tasks.
 *
 * @param options Root directory, the retriever to measure and an optional
 *   progress callback
 * @returns A promise of the full RetrievalSummary, with per-task, per-area
 *   and overall metrics
 */
export declare function measureRetrieval(
    options: MeasureRetrievalOptions,
): Promise<RetrievalSummary>;
|
|
56
|
+
/**
 * Format a retrieval summary for console display.
 *
 * @param summary The summary to format
 * @returns The formatted text
 */
export declare function formatRetrievalTable(
    summary: RetrievalSummary,
): string;
|