@sanity/ailf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -0
- package/bin/ailf.js +64 -0
- package/canonical/grader-references/README.md +88 -0
- package/canonical/grader-references/groq.yaml +234 -0
- package/canonical/grader-references/studio-setup.yaml +275 -0
- package/canonical/reference-solutions/.gitkeep +1 -0
- package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
- package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
- package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
- package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
- package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
- package/canonical/reference-solutions/groq/joins-references.ts +300 -0
- package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
- package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
- package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
- package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
- package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
- package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
- package/config/bigquery/README.md +74 -0
- package/config/bigquery/views/area_scores.sql +87 -0
- package/config/bigquery/views/reports.sql +49 -0
- package/config/features.yaml +116 -0
- package/config/models.yaml +115 -0
- package/config/prompts.yaml +75 -0
- package/config/rubrics.yaml +62 -0
- package/config/schedules.yaml +43 -0
- package/config/sinks.yaml +54 -0
- package/config/sources.yaml +51 -0
- package/config/thresholds.yaml +49 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
- package/dist/_vendor/ailf-core/examples/index.js +285 -0
- package/dist/_vendor/ailf-core/index.d.ts +17 -0
- package/dist/_vendor/ailf-core/index.js +17 -0
- package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
- package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
- package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
- package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
- package/dist/_vendor/ailf-core/ports/context.js +14 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
- package/dist/_vendor/ailf-core/ports/index.js +7 -0
- package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
- package/dist/_vendor/ailf-core/ports/logger.js +11 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
- package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/schemas/index.js +16 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
- package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
- package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
- package/dist/_vendor/ailf-core/services/index.js +12 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/scoring.js +222 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
- package/dist/_vendor/ailf-core/types/index.js +21 -0
- package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
- package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
- package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
- package/dist/_vendor/ailf-shared/document-ref.js +1 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -0
- package/dist/_vendor/ailf-shared/index.js +16 -0
- package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
- package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
- package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
- package/dist/_vendor/ailf-shared/score-grades.js +23 -0
- package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
- package/dist/adapters/cache/content-lake-cache.js +59 -0
- package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
- package/dist/adapters/cache/filesystem-cache.js +54 -0
- package/dist/adapters/cache/index.d.ts +2 -0
- package/dist/adapters/cache/index.js +2 -0
- package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
- package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
- package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
- package/dist/adapters/config-sources/file-config-adapter.js +96 -0
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +2 -0
- package/dist/adapters/doc-fetchers/index.d.ts +1 -0
- package/dist/adapters/doc-fetchers/index.js +1 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
- package/dist/adapters/eval-runners/index.d.ts +1 -0
- package/dist/adapters/eval-runners/index.js +1 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
- package/dist/adapters/index.d.ts +12 -0
- package/dist/adapters/index.js +12 -0
- package/dist/adapters/loggers/console-logger.d.ts +22 -0
- package/dist/adapters/loggers/console-logger.js +54 -0
- package/dist/adapters/loggers/index.d.ts +9 -0
- package/dist/adapters/loggers/index.js +9 -0
- package/dist/adapters/loggers/json-logger.d.ts +18 -0
- package/dist/adapters/loggers/json-logger.js +33 -0
- package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
- package/dist/adapters/loggers/quiet-logger.js +30 -0
- package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/composite-task-source.js +59 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
- package/dist/adapters/task-sources/index.d.ts +7 -0
- package/dist/adapters/task-sources/index.js +7 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
- package/dist/adapters/task-sources/repo-schemas.js +234 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
- package/dist/adapters/task-sources/repo-task-source.js +104 -0
- package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
- package/dist/adapters/task-sources/repo-trigger.js +153 -0
- package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
- package/dist/adapters/task-sources/repo-validation.js +164 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
- package/dist/adapters/task-sources/yaml-task-source.js +136 -0
- package/dist/agent-observer/agentic-provider.d.ts +132 -0
- package/dist/agent-observer/agentic-provider.js +983 -0
- package/dist/agent-observer/classifier.d.ts +62 -0
- package/dist/agent-observer/classifier.js +269 -0
- package/dist/agent-observer/index.d.ts +7 -0
- package/dist/agent-observer/index.js +4 -0
- package/dist/agent-observer/pricing.d.ts +35 -0
- package/dist/agent-observer/pricing.js +82 -0
- package/dist/agent-observer/provider.d.ts +77 -0
- package/dist/agent-observer/provider.js +151 -0
- package/dist/agent-observer/proxy.d.ts +91 -0
- package/dist/agent-observer/proxy.js +321 -0
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/agent-observer/types.d.ts +137 -0
- package/dist/agent-observer/types.js +16 -0
- package/dist/assertions/source-isolation.d.ts +72 -0
- package/dist/assertions/source-isolation.js +117 -0
- package/dist/cli.d.ts +24 -0
- package/dist/cli.js +199 -0
- package/dist/commands/agent-report.d.ts +5 -0
- package/dist/commands/agent-report.js +69 -0
- package/dist/commands/baseline.d.ts +9 -0
- package/dist/commands/baseline.js +141 -0
- package/dist/commands/cache.d.ts +13 -0
- package/dist/commands/cache.js +135 -0
- package/dist/commands/calculate-scores.d.ts +8 -0
- package/dist/commands/calculate-scores.js +48 -0
- package/dist/commands/compare.d.ts +8 -0
- package/dist/commands/compare.js +120 -0
- package/dist/commands/completion.d.ts +18 -0
- package/dist/commands/completion.js +260 -0
- package/dist/commands/coverage-audit.d.ts +7 -0
- package/dist/commands/coverage-audit.js +40 -0
- package/dist/commands/discovery-report.d.ts +10 -0
- package/dist/commands/discovery-report.js +44 -0
- package/dist/commands/eval.d.ts +9 -0
- package/dist/commands/eval.js +35 -0
- package/dist/commands/explain-handler.d.ts +34 -0
- package/dist/commands/explain-handler.js +719 -0
- package/dist/commands/fetch-docs.d.ts +8 -0
- package/dist/commands/fetch-docs.js +128 -0
- package/dist/commands/generate-configs.d.ts +8 -0
- package/dist/commands/generate-configs.js +46 -0
- package/dist/commands/grader/index.d.ts +11 -0
- package/dist/commands/grader/index.js +118 -0
- package/dist/commands/init.d.ts +19 -0
- package/dist/commands/init.js +150 -0
- package/dist/commands/interactive.d.ts +12 -0
- package/dist/commands/interactive.js +238 -0
- package/dist/commands/lookup-doc.d.ts +15 -0
- package/dist/commands/lookup-doc.js +84 -0
- package/dist/commands/measure-retrieval.d.ts +5 -0
- package/dist/commands/measure-retrieval.js +65 -0
- package/dist/commands/pipeline-action.d.ts +71 -0
- package/dist/commands/pipeline-action.js +305 -0
- package/dist/commands/pipeline.d.ts +62 -0
- package/dist/commands/pipeline.js +53 -0
- package/dist/commands/pr-comment.d.ts +8 -0
- package/dist/commands/pr-comment.js +47 -0
- package/dist/commands/publish.d.ts +26 -0
- package/dist/commands/publish.js +253 -0
- package/dist/commands/readiness-report.d.ts +10 -0
- package/dist/commands/readiness-report.js +104 -0
- package/dist/commands/shared/options.d.ts +29 -0
- package/dist/commands/shared/options.js +57 -0
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +16 -0
- package/dist/commands/validate-tasks.js +93 -0
- package/dist/commands/validate.d.ts +9 -0
- package/dist/commands/validate.js +73 -0
- package/dist/commands/webhook-server.d.ts +5 -0
- package/dist/commands/webhook-server.js +30 -0
- package/dist/commands/weekly-digest.d.ts +10 -0
- package/dist/commands/weekly-digest.js +104 -0
- package/dist/composition-root.d.ts +26 -0
- package/dist/composition-root.js +107 -0
- package/dist/interpolate.d.ts +26 -0
- package/dist/interpolate.js +70 -0
- package/dist/job-store.d.ts +104 -0
- package/dist/job-store.js +188 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.d.ts +27 -0
- package/dist/orchestration/build-app-context.js +81 -0
- package/dist/orchestration/build-step-sequence.d.ts +15 -0
- package/dist/orchestration/build-step-sequence.js +84 -0
- package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
- package/dist/orchestration/config-to-source-overrides.js +28 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/index.d.ts +11 -0
- package/dist/orchestration/index.js +11 -0
- package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
- package/dist/orchestration/pipeline-orchestrator.js +153 -0
- package/dist/orchestration/step-runner.d.ts +20 -0
- package/dist/orchestration/step-runner.js +88 -0
- package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
- package/dist/orchestration/steps/calculate-scores-step.js +95 -0
- package/dist/orchestration/steps/callback-step.d.ts +24 -0
- package/dist/orchestration/steps/callback-step.js +76 -0
- package/dist/orchestration/steps/compare-step.d.ts +14 -0
- package/dist/orchestration/steps/compare-step.js +92 -0
- package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
- package/dist/orchestration/steps/discovery-report-step.js +55 -0
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
- package/dist/orchestration/steps/fetch-docs-step.js +135 -0
- package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
- package/dist/orchestration/steps/gap-analysis-step.js +136 -0
- package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
- package/dist/orchestration/steps/generate-configs-step.js +85 -0
- package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
- package/dist/orchestration/steps/grader-consistency-step.js +64 -0
- package/dist/orchestration/steps/index.d.ts +19 -0
- package/dist/orchestration/steps/index.js +19 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
- package/dist/orchestration/steps/publish-report-step.js +216 -0
- package/dist/orchestration/steps/readiness-step.d.ts +13 -0
- package/dist/orchestration/steps/readiness-step.js +91 -0
- package/dist/orchestration/steps/report-step.d.ts +12 -0
- package/dist/orchestration/steps/report-step.js +49 -0
- package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
- package/dist/orchestration/steps/run-eval-step.js +195 -0
- package/dist/orchestration/steps/validate-step.d.ts +12 -0
- package/dist/orchestration/steps/validate-step.js +41 -0
- package/dist/pipeline/agent-behavior-report.d.ts +53 -0
- package/dist/pipeline/agent-behavior-report.js +132 -0
- package/dist/pipeline/attribution.d.ts +47 -0
- package/dist/pipeline/attribution.js +226 -0
- package/dist/pipeline/baseline.d.ts +37 -0
- package/dist/pipeline/baseline.js +141 -0
- package/dist/pipeline/cache.d.ts +101 -0
- package/dist/pipeline/cache.js +283 -0
- package/dist/pipeline/calculate-scores.d.ts +102 -0
- package/dist/pipeline/calculate-scores.js +1128 -0
- package/dist/pipeline/callback-delivery.d.ts +50 -0
- package/dist/pipeline/callback-delivery.js +89 -0
- package/dist/pipeline/checks.d.ts +39 -0
- package/dist/pipeline/checks.js +280 -0
- package/dist/pipeline/classify-url.d.ts +61 -0
- package/dist/pipeline/classify-url.js +93 -0
- package/dist/pipeline/compare.d.ts +31 -0
- package/dist/pipeline/compare.js +208 -0
- package/dist/pipeline/coverage-audit.d.ts +39 -0
- package/dist/pipeline/coverage-audit.js +165 -0
- package/dist/pipeline/degradations.d.ts +85 -0
- package/dist/pipeline/degradations.js +242 -0
- package/dist/pipeline/discovery-report.d.ts +55 -0
- package/dist/pipeline/discovery-report.js +178 -0
- package/dist/pipeline/eval-constants.d.ts +68 -0
- package/dist/pipeline/eval-constants.js +111 -0
- package/dist/pipeline/eval-fingerprint.d.ts +66 -0
- package/dist/pipeline/eval-fingerprint.js +175 -0
- package/dist/pipeline/expand-tasks.d.ts +220 -0
- package/dist/pipeline/expand-tasks.js +421 -0
- package/dist/pipeline/failure-modes.d.ts +46 -0
- package/dist/pipeline/failure-modes.js +348 -0
- package/dist/pipeline/fetch-url-content.d.ts +44 -0
- package/dist/pipeline/fetch-url-content.js +93 -0
- package/dist/pipeline/gap-analysis.d.ts +48 -0
- package/dist/pipeline/gap-analysis.js +231 -0
- package/dist/pipeline/generate-configs.d.ts +72 -0
- package/dist/pipeline/generate-configs.js +395 -0
- package/dist/pipeline/grader-api.d.ts +49 -0
- package/dist/pipeline/grader-api.js +200 -0
- package/dist/pipeline/grader-compare-runner.d.ts +44 -0
- package/dist/pipeline/grader-compare-runner.js +301 -0
- package/dist/pipeline/grader-comparison.d.ts +111 -0
- package/dist/pipeline/grader-comparison.js +161 -0
- package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
- package/dist/pipeline/grader-consistency-runner.js +270 -0
- package/dist/pipeline/grader-consistency.d.ts +103 -0
- package/dist/pipeline/grader-consistency.js +146 -0
- package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
- package/dist/pipeline/grader-sensitivity-runner.js +282 -0
- package/dist/pipeline/grader-sensitivity.d.ts +94 -0
- package/dist/pipeline/grader-sensitivity.js +144 -0
- package/dist/pipeline/grader-validate-runner.d.ts +38 -0
- package/dist/pipeline/grader-validate-runner.js +229 -0
- package/dist/pipeline/grader-validation.d.ts +107 -0
- package/dist/pipeline/grader-validation.js +169 -0
- package/dist/pipeline/map-request-to-config.d.ts +19 -0
- package/dist/pipeline/map-request-to-config.js +80 -0
- package/dist/pipeline/measure-retrieval.d.ts +59 -0
- package/dist/pipeline/measure-retrieval.js +111 -0
- package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
- package/dist/pipeline/mirror-repo-tasks.js +350 -0
- package/dist/pipeline/plan-format.d.ts +33 -0
- package/dist/pipeline/plan-format.js +202 -0
- package/dist/pipeline/plan.d.ts +169 -0
- package/dist/pipeline/plan.js +708 -0
- package/dist/pipeline/pr-comment.d.ts +19 -0
- package/dist/pipeline/pr-comment.js +502 -0
- package/dist/pipeline/probe.d.ts +52 -0
- package/dist/pipeline/probe.js +390 -0
- package/dist/pipeline/provenance.d.ts +47 -0
- package/dist/pipeline/provenance.js +146 -0
- package/dist/pipeline/readiness-report.d.ts +87 -0
- package/dist/pipeline/readiness-report.js +205 -0
- package/dist/pipeline/release-classification.d.ts +54 -0
- package/dist/pipeline/release-classification.js +238 -0
- package/dist/pipeline/release-report.d.ts +37 -0
- package/dist/pipeline/release-report.js +222 -0
- package/dist/pipeline/repo-eval-comment.d.ts +37 -0
- package/dist/pipeline/repo-eval-comment.js +165 -0
- package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
- package/dist/pipeline/repo-threshold-evaluator.js +162 -0
- package/dist/pipeline/resolve-mappings.d.ts +35 -0
- package/dist/pipeline/resolve-mappings.js +72 -0
- package/dist/pipeline/retrieval-metrics.d.ts +39 -0
- package/dist/pipeline/retrieval-metrics.js +136 -0
- package/dist/pipeline/reverse-mapping.d.ts +67 -0
- package/dist/pipeline/reverse-mapping.js +88 -0
- package/dist/pipeline/schemas.d.ts +9 -0
- package/dist/pipeline/schemas.js +9 -0
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +88 -0
- package/dist/pipeline/targeted-loo.js +203 -0
- package/dist/pipeline/thresholds.d.ts +27 -0
- package/dist/pipeline/thresholds.js +245 -0
- package/dist/pipeline/types.d.ts +10 -0
- package/dist/pipeline/types.js +10 -0
- package/dist/pipeline/validate.d.ts +67 -0
- package/dist/pipeline/validate.js +406 -0
- package/dist/pipeline/webhook-server.d.ts +37 -0
- package/dist/pipeline/webhook-server.js +133 -0
- package/dist/report-store.d.ts +84 -0
- package/dist/report-store.js +208 -0
- package/dist/sanity/client.d.ts +38 -0
- package/dist/sanity/client.js +86 -0
- package/dist/sanity/portable-text.d.ts +11 -0
- package/dist/sanity/portable-text.js +211 -0
- package/dist/sanity/queries.d.ts +133 -0
- package/dist/sanity/queries.js +300 -0
- package/dist/schedules/digest.d.ts +116 -0
- package/dist/schedules/digest.js +156 -0
- package/dist/schedules/index.d.ts +12 -0
- package/dist/schedules/index.js +10 -0
- package/dist/schedules/loader.d.ts +31 -0
- package/dist/schedules/loader.js +73 -0
- package/dist/schedules/schema.d.ts +9 -0
- package/dist/schedules/schema.js +9 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +21 -0
- package/dist/scripts/validate-task-sources.js +210 -0
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/bigquery/index.d.ts +131 -0
- package/dist/sinks/bigquery/index.js +222 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/index.d.ts +23 -0
- package/dist/sinks/index.js +18 -0
- package/dist/sinks/loader.d.ts +18 -0
- package/dist/sinks/loader.js +82 -0
- package/dist/sinks/retry.d.ts +24 -0
- package/dist/sinks/retry.js +52 -0
- package/dist/sinks/schema.d.ts +9 -0
- package/dist/sinks/schema.js +9 -0
- package/dist/sinks/slack/format.d.ts +65 -0
- package/dist/sinks/slack/format.js +327 -0
- package/dist/sinks/slack/index.d.ts +27 -0
- package/dist/sinks/slack/index.js +78 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +59 -0
- package/dist/sinks/types.js +44 -0
- package/dist/sinks/webhook/index.d.ts +19 -0
- package/dist/sinks/webhook/index.js +50 -0
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/sources.d.ts +104 -0
- package/dist/sources.js +292 -0
- package/dist/webhook/budget.d.ts +42 -0
- package/dist/webhook/budget.js +60 -0
- package/dist/webhook/debounce.d.ts +67 -0
- package/dist/webhook/debounce.js +76 -0
- package/dist/webhook/dispatch.d.ts +45 -0
- package/dist/webhook/dispatch.js +84 -0
- package/dist/webhook/eval-request-handler.d.ts +87 -0
- package/dist/webhook/eval-request-handler.js +181 -0
- package/dist/webhook/handler.d.ts +88 -0
- package/dist/webhook/handler.js +203 -0
- package/dist/webhook/index.d.ts +17 -0
- package/dist/webhook/index.js +12 -0
- package/dist/webhook/types.d.ts +109 -0
- package/dist/webhook/types.js +10 -0
- package/package.json +72 -0
- package/tasks/.expanded.agentic.yaml +51 -0
- package/tasks/.expanded.yaml +66 -0
- package/tasks/frameworks.yaml +98 -0
- package/tasks/functions.yaml +51 -0
- package/tasks/groq.yaml +216 -0
- package/tasks/nextjs-live.yaml +62 -0
- package/tasks/studio-setup.yaml +111 -0
- package/tasks/visual-editing.yaml +120 -0
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/targeted-loo.ts
|
|
3
|
+
*
|
|
4
|
+
* Targeted leave-one-out (LOO) attribution for ambiguous cases.
|
|
5
|
+
*
|
|
6
|
+
* Phase 4d of the Scenario Matrix implementation.
|
|
7
|
+
*
|
|
8
|
+
* When correlation-based attribution (Phase 2c) identifies ambiguous tasks
|
|
9
|
+
* (2+ changed docs map to the same task), targeted LOO resolves the ambiguity
|
|
10
|
+
* by running per-document mini-evaluations to measure each document's
|
|
11
|
+
* marginal contribution.
|
|
12
|
+
*
|
|
13
|
+
* This module handles:
|
|
14
|
+
* - Cost estimation before execution (to enable user confirmation)
|
|
15
|
+
* - Result analysis after LOO runs complete
|
|
16
|
+
* - Integration with the existing AttributionReport
|
|
17
|
+
*
|
|
18
|
+
* The actual evaluation execution is handled by the pipeline orchestrator —
|
|
19
|
+
* this module is pure computation on inputs and outputs.
|
|
20
|
+
*
|
|
21
|
+
* @see docs/exec-plans/completed/scenario-matrix-implementation/phase-4-content-release-integration.md
|
|
22
|
+
* @see docs/design-docs/scenario-matrix/per-document-attribution.md
|
|
23
|
+
*/
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
// Constants
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
/** Default estimated cost of a single test in USD (provider and grader calls combined). */
|
|
28
|
+
const DEFAULT_COST_PER_TEST = 0.08;
|
|
29
|
+
/** Relative tolerance (±15% of the total delta) within which the marginal contributions are expected to sum to the total delta. */
|
|
30
|
+
const SUM_TOLERANCE = 0.15;
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
// Public API
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
/**
|
|
35
|
+
* Analyze LOO evaluation results to compute marginal contributions.
|
|
36
|
+
*
|
|
37
|
+
* Given the full-release score and per-document revert scores,
|
|
38
|
+
* calculates each document's marginal contribution as:
|
|
39
|
+
* marginal(doc) = fullReleaseScore - revertedScore(doc)
|
|
40
|
+
*
|
|
41
|
+
* @param taskId - The task being analyzed
|
|
42
|
+
* @param fullReleaseDelta - The total task delta from the full release
|
|
43
|
+
* @param revertResults - Per-document scores when that document is reverted
|
|
44
|
+
* @param noiseThreshold - Threshold for marking contributions as noise
|
|
45
|
+
* @param additionalCost - Actual cost incurred for the LOO evaluations
|
|
46
|
+
* @returns LOO result with per-document marginal contributions
|
|
47
|
+
*/
|
|
48
|
+
export function analyzeLOOResults(taskId, fullReleaseDelta, revertResults, noiseThreshold, additionalCost) {
|
|
49
|
+
const contributions = revertResults.map(({ revertedDelta, slug }) => {
|
|
50
|
+
// Marginal contribution = full delta - delta when this doc is reverted
|
|
51
|
+
const marginalContribution = fullReleaseDelta - revertedDelta;
|
|
52
|
+
return {
|
|
53
|
+
marginalContribution: Math.round(marginalContribution * 10) / 10,
|
|
54
|
+
slug,
|
|
55
|
+
withinNoiseFloor: Math.abs(marginalContribution) <= noiseThreshold,
|
|
56
|
+
};
|
|
57
|
+
});
|
|
58
|
+
// Check if contributions sum to approximately the total delta
|
|
59
|
+
const contributionSum = contributions.reduce((sum, c) => sum + c.marginalContribution, 0);
|
|
60
|
+
const sumMatchesTotal = fullReleaseDelta === 0 ||
|
|
61
|
+
Math.abs(contributionSum - fullReleaseDelta) / Math.abs(fullReleaseDelta) <=
|
|
62
|
+
SUM_TOLERANCE;
|
|
63
|
+
return {
|
|
64
|
+
additionalCost,
|
|
65
|
+
contributions: contributions.sort((a, b) => Math.abs(b.marginalContribution) - Math.abs(a.marginalContribution)),
|
|
66
|
+
sumMatchesTotal,
|
|
67
|
+
taskId,
|
|
68
|
+
};
|
|
69
|
+
}
|
|
70
|
+
/**
 * Attach LOO results to an attribution report.
 *
 * Returns a shallow copy of `attribution` with a `looResults` field set to the
 * given results (overriding any `looResults` already present). The input
 * report is not mutated, and the entries inside the report are copied by
 * reference.
 *
 * NOTE(review): this function does not rewrite any per-task "ambiguous"
 * classification; consumers are expected to read `looResults` alongside the
 * original attribution data — confirm that this matches the intended design.
 *
 * @param attribution - Original attribution report
 * @param looResults - LOO results for ambiguous tasks
 * @returns New attribution report with `looResults` attached
 */
export function enrichAttributionWithLOO(attribution, looResults) {
    return {
        ...attribution,
        looResults,
    };
}
|
|
90
|
+
/**
 * Estimate what a targeted LOO run would cost for a set of ambiguous tasks.
 *
 * Each task is priced as `documents × tests × costPerTest`, where the
 * document count is the length of the task's `attributedDocs` and the test
 * count is looked up in `testsPerTask` by task id (falling back to 6 when the
 * task has no entry). Per-task costs and the grand total are rounded to two
 * decimals; the total is the sum of the already-rounded per-task costs.
 *
 * @param ambiguousTasks - Tasks identified as ambiguous
 * @param testsPerTask - Number of tests per task, keyed by task id
 * @param costPerTest - Estimated cost per test (defaults to DEFAULT_COST_PER_TEST)
 * @returns Cost estimate with per-task breakdown
 */
export function estimateLOOCost(ambiguousTasks, testsPerTask, costPerTest = DEFAULT_COST_PER_TEST) {
    const toCents = (amount) => Math.round(amount * 100) / 100;
    const perTask = [];
    let runningTotal = 0;
    for (const { attributedDocs, taskId } of ambiguousTasks) {
        const numDocuments = attributedDocs.length;
        // Fallback of 6 tests is a default estimate for tasks without an entry.
        const numTests = testsPerTask[taskId] ?? 6;
        const estimatedCost = toCents(numDocuments * numTests * costPerTest);
        perTask.push({
            estimatedCost,
            numDocuments,
            numTests,
            taskId,
        });
        runningTotal += estimatedCost;
    }
    return {
        perTask,
        totalEstimatedCost: toCents(runningTotal),
    };
}
|
|
117
|
+
/**
 * Select the attributions that are candidates for targeted LOO.
 *
 * Keeps every entry of `attribution.attributions` whose `classification` is
 * exactly "ambiguous" and whose `withinNoiseFloor` flag is falsy.
 *
 * @param attribution - Attribution report from Phase 2c
 * @returns Ambiguous task attributions suitable for LOO
 */
export function findAmbiguousTasks(attribution) {
    const needsLOO = (entry) => {
        if (entry.classification !== "ambiguous") {
            return false;
        }
        return !entry.withinNoiseFloor;
    };
    return attribution.attributions.filter((entry) => needsLOO(entry));
}
|
|
129
|
+
// ---------------------------------------------------------------------------
|
|
130
|
+
// Formatting
|
|
131
|
+
// ---------------------------------------------------------------------------
|
|
132
|
+
/**
 * Format a LOO cost estimate for console output (for user confirmation).
 *
 * Produces a header, one line per task showing
 * `documents × tests × price/test = cost`, and the total additional cost.
 *
 * @param estimate - Cost estimate with `perTask` entries and `totalEstimatedCost`
 * @param costPerTest - Per-test price to display. Pass the same value that was
 *   used to compute `estimate`; otherwise the printed formula would not match
 *   the printed per-task cost. Defaults to DEFAULT_COST_PER_TEST, which keeps
 *   the output unchanged for callers that omit it.
 * @returns Multi-line string ending with a newline
 */
export function formatLOOCostEstimate(estimate, costPerTest = DEFAULT_COST_PER_TEST) {
    const lines = [];
    lines.push("💰 TARGETED LOO COST ESTIMATE");
    lines.push("");
    lines.push(` Targeted LOO for ${estimate.perTask.length} ambiguous task(s):`);
    lines.push("");
    for (const task of estimate.perTask) {
        lines.push(` ${task.taskId}: ${task.numDocuments} documents × ~${task.numTests} tests × $${costPerTest}/test = ~$${task.estimatedCost.toFixed(2)}`);
    }
    lines.push("");
    lines.push(` Total additional cost: ~$${estimate.totalEstimatedCost.toFixed(2)}`);
    lines.push("");
    return lines.join("\n");
}
|
|
149
|
+
/**
 * Render LOO results as plain text for the console.
 *
 * For each result: the task id, one line per contribution (signed value with
 * one decimal, tagged when it is within the noise floor), an optional warning
 * when `sumMatchesTotal` is false, the additional cost, and a blank separator.
 *
 * @param results - LOO results to render
 * @returns Multi-line string
 */
export function formatLOOResultsConsole(results) {
    const describeContribution = (entry) => {
        const prefix = entry.marginalContribution >= 0 ? "+" : "";
        const suffix = entry.withinNoiseFloor ? " (within noise)" : "";
        return ` ${entry.slug}: ${prefix}${entry.marginalContribution.toFixed(1)}${suffix}`;
    };
    const out = ["🔬 TARGETED LOO RESULTS", ""];
    for (const result of results) {
        out.push(` ${result.taskId}:`);
        for (const entry of result.contributions) {
            out.push(describeContribution(entry));
        }
        if (!result.sumMatchesTotal) {
            out.push(" ⚠️ Contributions don't sum to total delta (interaction effects likely)");
        }
        out.push(` Additional cost: $${result.additionalCost.toFixed(2)}`, "");
    }
    return out.join("\n");
}
|
|
171
|
+
/**
 * Format LOO results as markdown.
 *
 * Emits a heading, then for each result a bold task id, one bullet per
 * contribution (signed value, share of the task's total absolute
 * contribution as a percentage, and a warning marker when within the noise
 * floor), an optional blockquote when `sumMatchesTotal` is false, and the
 * additional cost. When `results` is empty, a short "nothing to resolve"
 * message is emitted instead.
 *
 * @param results - LOO results to render
 * @returns Markdown string
 */
export function formatLOOResultsMarkdown(results) {
    const lines = [];
    lines.push("#### 🔬 Targeted LOO Attribution");
    lines.push("");
    if (results.length === 0) {
        lines.push("No ambiguous tasks required LOO resolution.");
        lines.push("");
        return lines.join("\n");
    }
    for (const result of results) {
        lines.push(`**${result.taskId}:**`);
        lines.push("");
        // Denominator for the percentage shares; computed once per task.
        const totalMagnitude = result.contributions.reduce((sum, c) => sum + Math.abs(c.marginalContribution), 0);
        for (const c of result.contributions) {
            const sign = c.marginalContribution >= 0 ? "+" : "";
            // When every contribution is zero the share is undefined (0/0);
            // report 0% rather than printing "NaN%".
            const pct = totalMagnitude > 0
                ? Math.round((Math.abs(c.marginalContribution) / totalMagnitude) * 100)
                : 0;
            const noise = c.withinNoiseFloor ? " ⚠️" : "";
            lines.push(`- \`${c.slug}\`: ${sign}${c.marginalContribution.toFixed(1)} (${pct}%)${noise}`);
        }
        if (!result.sumMatchesTotal) {
            lines.push("> ⚠️ Marginal contributions don't sum to total delta due to interaction effects.");
        }
        lines.push(`- Additional cost: $${result.additionalCost.toFixed(2)}`);
        lines.push("");
    }
    return lines.join("\n");
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/thresholds.ts
|
|
3
|
+
*
|
|
4
|
+
* Threshold evaluation engine — compares a ScoreSummary against configurable
|
|
5
|
+
* quality thresholds and produces a set of typed violations.
|
|
6
|
+
*
|
|
7
|
+
* Pure functions only: no I/O, no side effects. The caller (pipeline.ts,
|
|
8
|
+
* publish-report-step.ts) is responsible for loading the threshold config
|
|
9
|
+
* and score summary.
|
|
10
|
+
*
|
|
11
|
+
* @see docs/exec-plans/completed/scenario-matrix-implementation/phase-5-readiness-thresholds.md
|
|
12
|
+
*/
|
|
13
|
+
import type { ThresholdConfig } from "./schemas.js";
|
|
14
|
+
import type { ComparisonReport, ScoreSummary, ThresholdEvaluation, ThresholdViolation } from "./types.js";
|
|
15
|
+
/**
|
|
16
|
+
* Evaluate regression thresholds against a comparison report.
|
|
17
|
+
*
|
|
18
|
+
* Returns violations for regressions that exceed configured thresholds.
|
|
19
|
+
*/
|
|
20
|
+
export declare function evaluateRegressionThresholds(comparison: ComparisonReport, config: ThresholdConfig): ThresholdViolation[];
|
|
21
|
+
/**
|
|
22
|
+
* Evaluate a score summary against threshold configuration.
|
|
23
|
+
*
|
|
24
|
+
* Returns all violations sorted by severity (critical first), then by
|
|
25
|
+
* the magnitude of the threshold breach (largest gap first).
|
|
26
|
+
*/
|
|
27
|
+
export declare function evaluateThresholds(scores: ScoreSummary, config: ThresholdConfig): ThresholdEvaluation;
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/thresholds.ts
|
|
3
|
+
*
|
|
4
|
+
* Threshold evaluation engine — compares a ScoreSummary against configurable
|
|
5
|
+
* quality thresholds and produces a set of typed violations.
|
|
6
|
+
*
|
|
7
|
+
* Pure functions only: no I/O, no side effects. The caller (pipeline.ts,
|
|
8
|
+
* publish-report-step.ts) is responsible for loading the threshold config
|
|
9
|
+
* and score summary.
|
|
10
|
+
*
|
|
11
|
+
* @see docs/exec-plans/completed/scenario-matrix-implementation/phase-5-readiness-thresholds.md
|
|
12
|
+
*/
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
// Severity priority for sorting (higher = more severe)
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// Sort weight for each violation severity. The sort comparators in this file
// subtract these ranks so that higher-ranked severities come first.
const SEVERITY_RANK = {
critical: 3,
info: 1,
warning: 2,
};
|
|
21
|
+
/**
 * Collect violations for score regressions found in a comparison report.
 *
 * Three kinds of delta are compared against `config.regression`: the overall
 * delta (`composite`), each area's delta (`per-area`), and each dimension
 * delta inside an area (`per-dimension`). A delta strictly below its limit
 * yields a violation. Per-dimension violations are always "warning"; the
 * other two are classified via `classifyRegressionSeverity`.
 *
 * The result is sorted by severity rank (highest first), then by the delta
 * itself (lowest first). Returns an empty array when no regression config is
 * present.
 */
export function evaluateRegressionThresholds(comparison, config) {
    const limits = config.regression;
    if (!limits) {
        return [];
    }
    const found = [];
    // Overall composite regression
    const overallDelta = comparison.deltas.overall;
    if (overallDelta < limits.composite) {
        found.push({
            actual: overallDelta,
            area: "overall",
            description: `Overall score dropped by ${fmt(Math.abs(overallDelta))} (threshold: ${limits.composite})`,
            metric: "regression-composite",
            severity: classifyRegressionSeverity(overallDelta, config),
            threshold: limits.composite,
        });
    }
    for (const areaDelta of comparison.areas) {
        const areaName = areaDelta.area;
        // Per-area regression
        if (areaDelta.delta < limits["per-area"]) {
            found.push({
                actual: areaDelta.delta,
                area: areaName,
                description: `${areaName} dropped by ${fmt(Math.abs(areaDelta.delta))} (threshold: ${limits["per-area"]})`,
                metric: "regression-area",
                severity: classifyRegressionSeverity(areaDelta.delta, config),
                threshold: limits["per-area"],
            });
        }
        // Per-dimension regressions
        Object.entries(areaDelta.dimensions).forEach(([dimKey, dimData]) => {
            if (dimData.delta < limits["per-dimension"]) {
                found.push({
                    actual: dimData.delta,
                    area: areaName,
                    description: `${areaName} ${dimKey} dropped by ${fmt(Math.abs(dimData.delta))} (threshold: ${limits["per-dimension"]})`,
                    metric: `regression-${dimKey}`,
                    severity: "warning",
                    threshold: limits["per-dimension"],
                });
            }
        });
    }
    // Most severe first; within one severity, the lowest delta first.
    found.sort((left, right) => {
        const rankGap = SEVERITY_RANK[right.severity] - SEVERITY_RANK[left.severity];
        return rankGap !== 0 ? rankGap : left.actual - right.actual;
    });
    return found;
}
|
|
78
|
+
// ---------------------------------------------------------------------------
|
|
79
|
+
// Per-area evaluation
|
|
80
|
+
// ---------------------------------------------------------------------------
|
|
81
|
+
/**
 * Check a score summary against the threshold configuration.
 *
 * The overall average score is compared with `config.defaults.composite`,
 * then every entry in `scores.scores` is evaluated against the defaults
 * merged with that feature's overrides from `config.areas` (if any).
 *
 * Violations are sorted by severity rank (highest first) and then by
 * `actual - threshold` ascending, so the largest shortfall comes first.
 *
 * @returns `{ maxSeverity, pass, violations }` — `maxSeverity` is the
 *   severity of the first sorted violation or "none" when there are none;
 *   `pass` is true only when there are no violations.
 */
export function evaluateThresholds(scores, config) {
    const violations = [];
    // Overall composite score against the defaults
    const overallScore = scores.overall.avgScore;
    const overallFloor = config.defaults.composite;
    if (overallScore < overallFloor) {
        violations.push({
            actual: overallScore,
            area: "overall",
            description: `Overall composite score ${fmt(overallScore)} is below threshold ${overallFloor}`,
            metric: "composite",
            severity: classifySeverity(overallScore, config),
            threshold: overallFloor,
        });
    }
    // Per-area scores, each against defaults + that area's overrides
    for (const areaScore of scores.scores) {
        const effective = mergeDefaults(config.defaults, config.areas?.[areaScore.feature]);
        evaluateArea(areaScore, effective, config, violations);
    }
    violations.sort((left, right) => {
        const rankGap = SEVERITY_RANK[right.severity] - SEVERITY_RANK[left.severity];
        if (rankGap !== 0) {
            return rankGap;
        }
        // A larger breach has a more negative (actual - threshold).
        const leftGap = left.actual - left.threshold;
        const rightGap = right.actual - right.threshold;
        return leftGap - rightGap;
    });
    const passed = violations.length === 0;
    return {
        maxSeverity: passed ? "none" : violations[0].severity,
        pass: passed,
        violations,
    };
}
|
|
122
|
+
/**
 * Pick a severity for a regression delta.
 *
 * Without a `config.severity` section the result is "warning". Otherwise the
 * result is "warning" when `severity.warning["regression-exceeds"]` is set
 * and the delta is strictly below it, and "info" in every other case.
 */
function classifyRegressionSeverity(delta, config) {
    const { severity } = config;
    if (!severity) {
        return "warning";
    }
    const warningLimit = severity.warning?.["regression-exceeds"];
    const breachesWarning = warningLimit !== undefined && delta < warningLimit;
    return breachesWarning ? "warning" : "info";
}
|
|
135
|
+
// ---------------------------------------------------------------------------
|
|
136
|
+
// Severity classification
|
|
137
|
+
// ---------------------------------------------------------------------------
|
|
138
|
+
/**
 * Pick a severity for a score-based violation.
 *
 * Levels are tried from most to least severe (critical, warning, info); the
 * first level whose `"composite-below"` value is set and strictly greater
 * than the score wins. Falls back to "warning" when there is no
 * `config.severity` section or when no level matches.
 */
function classifySeverity(score, config) {
    const levels = config.severity;
    if (!levels) {
        return "warning";
    }
    for (const level of ["critical", "warning", "info"]) {
        const floor = levels[level]?.["composite-below"];
        if (floor !== undefined && score < floor) {
            return level;
        }
    }
    return "warning";
}
|
|
160
|
+
/**
 * Check one area's scores against its thresholds and append any breaches to
 * `violations` (the array is mutated in place; nothing is returned).
 *
 * Checks, in order: the composite score, the three dimension scores (only
 * when `thresholds.dimensions` is present and the individual limit is set),
 * Doc Lift, and the ceiling score. A value strictly below its limit produces
 * a violation.
 *
 * @param score - Scores for a single area; `score.feature` names the area
 * @param thresholds - Limits to apply to this area
 * @param config - Full threshold config, consulted for severity settings
 * @param violations - Output array receiving the violations
 */
function evaluateArea(score, thresholds, config, violations) {
    const area = score.feature;
    // Every violation for this area has the same shape; only these fields vary.
    const report = (metric, actual, threshold, description, severity) => {
        violations.push({ actual, area, description, metric, severity, threshold });
    };
    // Composite score
    const compositeLimit = thresholds.composite;
    if (score.totalScore < compositeLimit) {
        report("composite", score.totalScore, compositeLimit, `${area} composite score ${fmt(score.totalScore)} is below threshold ${compositeLimit}`, classifySeverity(score.totalScore, config));
    }
    // Per-dimension thresholds
    const dimensionLimits = thresholds.dimensions;
    if (dimensionLimits) {
        // Threshold key -> property on `score` holding the matching value.
        const scoreFieldByDimension = {
            "task-completion": "taskCompletion",
            "code-correctness": "codeCorrectness",
            "doc-coverage": "docCoverage",
        };
        for (const [dimKey, field] of Object.entries(scoreFieldByDimension)) {
            const threshold = dimensionLimits[dimKey];
            const actual = score[field];
            if (threshold !== undefined && actual < threshold) {
                report(dimKey, actual, threshold, `${area} ${dimKey} score ${fmt(actual)} is below threshold ${threshold}`, classifySeverity(actual, config));
            }
        }
    }
    // Doc Lift threshold (docs must not hurt)
    const docLiftLimit = thresholds["doc-lift"];
    if (docLiftLimit !== undefined && score.docLift < docLiftLimit) {
        const docsHurt = Boolean(score.negativeDocLift);
        // Escalate to critical only when the lift is flagged negative AND the
        // config enables the "negative-doc-lift" critical rule.
        const escalate = docsHurt && Boolean(config.severity?.critical?.["negative-doc-lift"]);
        const hurtNote = docsHurt ? " — docs are hurting performance" : "";
        report("doc-lift", score.docLift, docLiftLimit, `${area} Doc Lift ${fmt(score.docLift)} is below threshold ${docLiftLimit}${hurtNote}`, escalate ? "critical" : "warning");
    }
    // Ceiling threshold (doc quality floor)
    const ceilingLimit = thresholds.ceiling;
    if (ceilingLimit !== undefined && score.ceilingScore < ceilingLimit) {
        report("ceiling", score.ceilingScore, ceilingLimit, `${area} ceiling score ${fmt(score.ceilingScore)} is below threshold ${ceilingLimit} — documentation quality is insufficient`, "warning");
    }
}
|
|
222
|
+
// ---------------------------------------------------------------------------
|
|
223
|
+
// Helpers
|
|
224
|
+
// ---------------------------------------------------------------------------
|
|
225
|
+
/** Render a score for display as a string, rounded to the nearest integer. */
function fmt(n) {
    const rounded = Math.round(n);
    return `${rounded}`;
}
|
|
229
|
+
/**
 * Combine default thresholds with optional per-area overrides.
 *
 * With no overrides, a shallow copy of `defaults` is returned. Otherwise the
 * result has exactly four keys — `ceiling`, `composite`, `dimensions`,
 * `doc-lift` — where each scalar takes the override unless it is null or
 * undefined, and `dimensions` is the defaults' dimensions overlaid with the
 * overrides' dimensions.
 */
function mergeDefaults(defaults, overrides) {
    if (overrides) {
        // Override wins unless it is null/undefined.
        const pick = (key) => overrides[key] ?? defaults[key];
        return {
            ceiling: pick("ceiling"),
            composite: pick("composite"),
            dimensions: { ...defaults.dimensions, ...overrides.dimensions },
            "doc-lift": pick("doc-lift"),
        };
    }
    return { ...defaults };
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/types.ts — Re-export barrel
|
|
3
|
+
*
|
|
4
|
+
* All domain types now live in @sanity/ailf-core. This file re-exports
|
|
5
|
+
* them for backward compatibility — existing imports throughout
|
|
6
|
+
* packages/eval continue to work unchanged.
|
|
7
|
+
*
|
|
8
|
+
* @see packages/core/src/types/index.ts (canonical source)
|
|
9
|
+
*/
|
|
10
|
+
export * from "../_vendor/ailf-core/index.d.ts";
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/types.ts — Re-export barrel
|
|
3
|
+
*
|
|
4
|
+
* All domain types now live in @sanity/ailf-core. This file re-exports
|
|
5
|
+
* them for backward compatibility — existing imports throughout
|
|
6
|
+
* packages/eval continue to work unchanged.
|
|
7
|
+
*
|
|
8
|
+
* @see packages/core/src/types/index.ts (canonical source)
|
|
9
|
+
*/
|
|
10
|
+
export * from "../_vendor/ailf-core/index.js";
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/validate.ts
|
|
3
|
+
*
|
|
4
|
+
* Configuration validation for the evaluation pipeline. Checks that all YAML
|
|
5
|
+
* config files are consistent: every task has a canonical mapping, every
|
|
6
|
+
* mapping has a reference solution file, required files exist, etc.
|
|
7
|
+
*
|
|
8
|
+
* All individual validators are exported so they can be tested independently.
|
|
9
|
+
*/
|
|
10
|
+
import type { ValidationIssue, ValidationResult } from "./types.js";
|
|
11
|
+
/**
|
|
12
|
+
* Run all validation checks and return a combined result.
|
|
13
|
+
* `valid` is true only if there are zero error-severity issues.
|
|
14
|
+
*/
|
|
15
|
+
export declare function validateConfiguration(rootDir: string): ValidationResult;
|
|
16
|
+
/**
|
|
17
|
+
* Check that canonical context files exist. These are the per-task
|
|
18
|
+
* gold-retrieval contexts actually referenced by task definitions.
|
|
19
|
+
*
|
|
20
|
+
* Contexts are generated by fetch-docs and may not exist yet —
|
|
21
|
+
* returns warnings, not errors.
|
|
22
|
+
*/
|
|
23
|
+
export declare function validateContexts(rootDir: string): ValidationIssue[];
|
|
24
|
+
/**
|
|
25
|
+
* Check that config/features.yaml exists, parses, and conforms to the Zod schema.
|
|
26
|
+
* Also cross-references covered features against actual task files for consistency.
|
|
27
|
+
*
|
|
28
|
+
* Returns warnings (not errors) if the file is missing — the feature registry
|
|
29
|
+
* is optional and doesn't block evaluation.
|
|
30
|
+
*/
|
|
31
|
+
export declare function validateFeaturesYaml(rootDir: string): ValidationIssue[];
|
|
32
|
+
/**
|
|
33
|
+
* Check that config/models.yaml exists, parses, has at least one model with an id
|
|
34
|
+
* and label, and has a grader defined.
|
|
35
|
+
*/
|
|
36
|
+
export declare function validateModelsYaml(rootDir: string): ValidationIssue[];
|
|
37
|
+
/**
|
|
38
|
+
* Check that reference solution files exist on disk for every task
|
|
39
|
+
* that declares a reference_solution path. Reads from inline task
|
|
40
|
+
* definitions via resolveMappings.
|
|
41
|
+
*
|
|
42
|
+
* NOTE: When tasks come from the Content Lake, reference solutions also
|
|
43
|
+
* live there (as ailf.referenceSolution documents). This validator only
|
|
44
|
+
* applies to YAML-based tasks with local file paths.
|
|
45
|
+
*/
|
|
46
|
+
export declare function validateReferenceSolutions(rootDir: string): ValidationIssue[];
|
|
47
|
+
/**
|
|
48
|
+
* Check that config/rubrics.yaml exists, parses, and conforms to the Zod schema.
|
|
49
|
+
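* Returns the validation issues found (`ValidationIssue[]`, per the signature).
* NOTE(review): an earlier description said this returns the set of valid template keys used for cross-referencing by task validation — confirm against the implementation.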
|
|
51
|
+
*/
|
|
52
|
+
export declare function validateRubricsYaml(rootDir: string): ValidationIssue[];
|
|
53
|
+
/**
|
|
54
|
+
* Check that tasks/*.yaml files exist, parse, and conform to the Zod schema.
|
|
55
|
+
* Validates both the new single-definition format (with `id`) and the legacy
|
|
56
|
+
* paired format. Uses `TaskFileSchema` from schemas.ts for structural
|
|
57
|
+
* validation, plus cross-entry checks (duplicate IDs, docs path consistency).
|
|
58
|
+
*/
|
|
59
|
+
export declare function validateTaskFiles(rootDir: string): ValidationIssue[];
|
|
60
|
+
/**
|
|
61
|
+
* Check that config/thresholds.yaml exists, parses, and conforms to the Zod schema.
|
|
62
|
+
*
|
|
63
|
+
* Returns warnings (not errors) if the file is missing — thresholds are
|
|
64
|
+
* optional and don't block evaluation. They only activate when
|
|
65
|
+
* `--readiness` or severity-aware sink routing is used.
|
|
66
|
+
*/
|
|
67
|
+
export declare function validateThresholdsYaml(rootDir: string): ValidationIssue[];
|