@sanity/ailf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -0
- package/bin/ailf.js +64 -0
- package/canonical/grader-references/README.md +88 -0
- package/canonical/grader-references/groq.yaml +234 -0
- package/canonical/grader-references/studio-setup.yaml +275 -0
- package/canonical/reference-solutions/.gitkeep +1 -0
- package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
- package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
- package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
- package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
- package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
- package/canonical/reference-solutions/groq/joins-references.ts +300 -0
- package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
- package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
- package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
- package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
- package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
- package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
- package/config/bigquery/README.md +74 -0
- package/config/bigquery/views/area_scores.sql +87 -0
- package/config/bigquery/views/reports.sql +49 -0
- package/config/features.yaml +116 -0
- package/config/models.yaml +115 -0
- package/config/prompts.yaml +75 -0
- package/config/rubrics.yaml +62 -0
- package/config/schedules.yaml +43 -0
- package/config/sinks.yaml +54 -0
- package/config/sources.yaml +51 -0
- package/config/thresholds.yaml +49 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
- package/dist/_vendor/ailf-core/examples/index.js +285 -0
- package/dist/_vendor/ailf-core/index.d.ts +17 -0
- package/dist/_vendor/ailf-core/index.js +17 -0
- package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
- package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
- package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
- package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
- package/dist/_vendor/ailf-core/ports/context.js +14 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
- package/dist/_vendor/ailf-core/ports/index.js +7 -0
- package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
- package/dist/_vendor/ailf-core/ports/logger.js +11 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
- package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/schemas/index.js +16 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
- package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
- package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
- package/dist/_vendor/ailf-core/services/index.js +12 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/scoring.js +222 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
- package/dist/_vendor/ailf-core/types/index.js +21 -0
- package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
- package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
- package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
- package/dist/_vendor/ailf-shared/document-ref.js +1 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -0
- package/dist/_vendor/ailf-shared/index.js +16 -0
- package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
- package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
- package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
- package/dist/_vendor/ailf-shared/score-grades.js +23 -0
- package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
- package/dist/adapters/cache/content-lake-cache.js +59 -0
- package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
- package/dist/adapters/cache/filesystem-cache.js +54 -0
- package/dist/adapters/cache/index.d.ts +2 -0
- package/dist/adapters/cache/index.js +2 -0
- package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
- package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
- package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
- package/dist/adapters/config-sources/file-config-adapter.js +96 -0
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +2 -0
- package/dist/adapters/doc-fetchers/index.d.ts +1 -0
- package/dist/adapters/doc-fetchers/index.js +1 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
- package/dist/adapters/eval-runners/index.d.ts +1 -0
- package/dist/adapters/eval-runners/index.js +1 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
- package/dist/adapters/index.d.ts +12 -0
- package/dist/adapters/index.js +12 -0
- package/dist/adapters/loggers/console-logger.d.ts +22 -0
- package/dist/adapters/loggers/console-logger.js +54 -0
- package/dist/adapters/loggers/index.d.ts +9 -0
- package/dist/adapters/loggers/index.js +9 -0
- package/dist/adapters/loggers/json-logger.d.ts +18 -0
- package/dist/adapters/loggers/json-logger.js +33 -0
- package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
- package/dist/adapters/loggers/quiet-logger.js +30 -0
- package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/composite-task-source.js +59 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
- package/dist/adapters/task-sources/index.d.ts +7 -0
- package/dist/adapters/task-sources/index.js +7 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
- package/dist/adapters/task-sources/repo-schemas.js +234 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
- package/dist/adapters/task-sources/repo-task-source.js +104 -0
- package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
- package/dist/adapters/task-sources/repo-trigger.js +153 -0
- package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
- package/dist/adapters/task-sources/repo-validation.js +164 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
- package/dist/adapters/task-sources/yaml-task-source.js +136 -0
- package/dist/agent-observer/agentic-provider.d.ts +132 -0
- package/dist/agent-observer/agentic-provider.js +983 -0
- package/dist/agent-observer/classifier.d.ts +62 -0
- package/dist/agent-observer/classifier.js +269 -0
- package/dist/agent-observer/index.d.ts +7 -0
- package/dist/agent-observer/index.js +4 -0
- package/dist/agent-observer/pricing.d.ts +35 -0
- package/dist/agent-observer/pricing.js +82 -0
- package/dist/agent-observer/provider.d.ts +77 -0
- package/dist/agent-observer/provider.js +151 -0
- package/dist/agent-observer/proxy.d.ts +91 -0
- package/dist/agent-observer/proxy.js +321 -0
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/agent-observer/types.d.ts +137 -0
- package/dist/agent-observer/types.js +16 -0
- package/dist/assertions/source-isolation.d.ts +72 -0
- package/dist/assertions/source-isolation.js +117 -0
- package/dist/cli.d.ts +24 -0
- package/dist/cli.js +199 -0
- package/dist/commands/agent-report.d.ts +5 -0
- package/dist/commands/agent-report.js +69 -0
- package/dist/commands/baseline.d.ts +9 -0
- package/dist/commands/baseline.js +141 -0
- package/dist/commands/cache.d.ts +13 -0
- package/dist/commands/cache.js +135 -0
- package/dist/commands/calculate-scores.d.ts +8 -0
- package/dist/commands/calculate-scores.js +48 -0
- package/dist/commands/compare.d.ts +8 -0
- package/dist/commands/compare.js +120 -0
- package/dist/commands/completion.d.ts +18 -0
- package/dist/commands/completion.js +260 -0
- package/dist/commands/coverage-audit.d.ts +7 -0
- package/dist/commands/coverage-audit.js +40 -0
- package/dist/commands/discovery-report.d.ts +10 -0
- package/dist/commands/discovery-report.js +44 -0
- package/dist/commands/eval.d.ts +9 -0
- package/dist/commands/eval.js +35 -0
- package/dist/commands/explain-handler.d.ts +34 -0
- package/dist/commands/explain-handler.js +719 -0
- package/dist/commands/fetch-docs.d.ts +8 -0
- package/dist/commands/fetch-docs.js +128 -0
- package/dist/commands/generate-configs.d.ts +8 -0
- package/dist/commands/generate-configs.js +46 -0
- package/dist/commands/grader/index.d.ts +11 -0
- package/dist/commands/grader/index.js +118 -0
- package/dist/commands/init.d.ts +19 -0
- package/dist/commands/init.js +150 -0
- package/dist/commands/interactive.d.ts +12 -0
- package/dist/commands/interactive.js +238 -0
- package/dist/commands/lookup-doc.d.ts +15 -0
- package/dist/commands/lookup-doc.js +84 -0
- package/dist/commands/measure-retrieval.d.ts +5 -0
- package/dist/commands/measure-retrieval.js +65 -0
- package/dist/commands/pipeline-action.d.ts +71 -0
- package/dist/commands/pipeline-action.js +305 -0
- package/dist/commands/pipeline.d.ts +62 -0
- package/dist/commands/pipeline.js +53 -0
- package/dist/commands/pr-comment.d.ts +8 -0
- package/dist/commands/pr-comment.js +47 -0
- package/dist/commands/publish.d.ts +26 -0
- package/dist/commands/publish.js +253 -0
- package/dist/commands/readiness-report.d.ts +10 -0
- package/dist/commands/readiness-report.js +104 -0
- package/dist/commands/shared/options.d.ts +29 -0
- package/dist/commands/shared/options.js +57 -0
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +16 -0
- package/dist/commands/validate-tasks.js +93 -0
- package/dist/commands/validate.d.ts +9 -0
- package/dist/commands/validate.js +73 -0
- package/dist/commands/webhook-server.d.ts +5 -0
- package/dist/commands/webhook-server.js +30 -0
- package/dist/commands/weekly-digest.d.ts +10 -0
- package/dist/commands/weekly-digest.js +104 -0
- package/dist/composition-root.d.ts +26 -0
- package/dist/composition-root.js +107 -0
- package/dist/interpolate.d.ts +26 -0
- package/dist/interpolate.js +70 -0
- package/dist/job-store.d.ts +104 -0
- package/dist/job-store.js +188 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.d.ts +27 -0
- package/dist/orchestration/build-app-context.js +81 -0
- package/dist/orchestration/build-step-sequence.d.ts +15 -0
- package/dist/orchestration/build-step-sequence.js +84 -0
- package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
- package/dist/orchestration/config-to-source-overrides.js +28 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/index.d.ts +11 -0
- package/dist/orchestration/index.js +11 -0
- package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
- package/dist/orchestration/pipeline-orchestrator.js +153 -0
- package/dist/orchestration/step-runner.d.ts +20 -0
- package/dist/orchestration/step-runner.js +88 -0
- package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
- package/dist/orchestration/steps/calculate-scores-step.js +95 -0
- package/dist/orchestration/steps/callback-step.d.ts +24 -0
- package/dist/orchestration/steps/callback-step.js +76 -0
- package/dist/orchestration/steps/compare-step.d.ts +14 -0
- package/dist/orchestration/steps/compare-step.js +92 -0
- package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
- package/dist/orchestration/steps/discovery-report-step.js +55 -0
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
- package/dist/orchestration/steps/fetch-docs-step.js +135 -0
- package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
- package/dist/orchestration/steps/gap-analysis-step.js +136 -0
- package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
- package/dist/orchestration/steps/generate-configs-step.js +85 -0
- package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
- package/dist/orchestration/steps/grader-consistency-step.js +64 -0
- package/dist/orchestration/steps/index.d.ts +19 -0
- package/dist/orchestration/steps/index.js +19 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
- package/dist/orchestration/steps/publish-report-step.js +216 -0
- package/dist/orchestration/steps/readiness-step.d.ts +13 -0
- package/dist/orchestration/steps/readiness-step.js +91 -0
- package/dist/orchestration/steps/report-step.d.ts +12 -0
- package/dist/orchestration/steps/report-step.js +49 -0
- package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
- package/dist/orchestration/steps/run-eval-step.js +195 -0
- package/dist/orchestration/steps/validate-step.d.ts +12 -0
- package/dist/orchestration/steps/validate-step.js +41 -0
- package/dist/pipeline/agent-behavior-report.d.ts +53 -0
- package/dist/pipeline/agent-behavior-report.js +132 -0
- package/dist/pipeline/attribution.d.ts +47 -0
- package/dist/pipeline/attribution.js +226 -0
- package/dist/pipeline/baseline.d.ts +37 -0
- package/dist/pipeline/baseline.js +141 -0
- package/dist/pipeline/cache.d.ts +101 -0
- package/dist/pipeline/cache.js +283 -0
- package/dist/pipeline/calculate-scores.d.ts +102 -0
- package/dist/pipeline/calculate-scores.js +1128 -0
- package/dist/pipeline/callback-delivery.d.ts +50 -0
- package/dist/pipeline/callback-delivery.js +89 -0
- package/dist/pipeline/checks.d.ts +39 -0
- package/dist/pipeline/checks.js +280 -0
- package/dist/pipeline/classify-url.d.ts +61 -0
- package/dist/pipeline/classify-url.js +93 -0
- package/dist/pipeline/compare.d.ts +31 -0
- package/dist/pipeline/compare.js +208 -0
- package/dist/pipeline/coverage-audit.d.ts +39 -0
- package/dist/pipeline/coverage-audit.js +165 -0
- package/dist/pipeline/degradations.d.ts +85 -0
- package/dist/pipeline/degradations.js +242 -0
- package/dist/pipeline/discovery-report.d.ts +55 -0
- package/dist/pipeline/discovery-report.js +178 -0
- package/dist/pipeline/eval-constants.d.ts +68 -0
- package/dist/pipeline/eval-constants.js +111 -0
- package/dist/pipeline/eval-fingerprint.d.ts +66 -0
- package/dist/pipeline/eval-fingerprint.js +175 -0
- package/dist/pipeline/expand-tasks.d.ts +220 -0
- package/dist/pipeline/expand-tasks.js +421 -0
- package/dist/pipeline/failure-modes.d.ts +46 -0
- package/dist/pipeline/failure-modes.js +348 -0
- package/dist/pipeline/fetch-url-content.d.ts +44 -0
- package/dist/pipeline/fetch-url-content.js +93 -0
- package/dist/pipeline/gap-analysis.d.ts +48 -0
- package/dist/pipeline/gap-analysis.js +231 -0
- package/dist/pipeline/generate-configs.d.ts +72 -0
- package/dist/pipeline/generate-configs.js +395 -0
- package/dist/pipeline/grader-api.d.ts +49 -0
- package/dist/pipeline/grader-api.js +200 -0
- package/dist/pipeline/grader-compare-runner.d.ts +44 -0
- package/dist/pipeline/grader-compare-runner.js +301 -0
- package/dist/pipeline/grader-comparison.d.ts +111 -0
- package/dist/pipeline/grader-comparison.js +161 -0
- package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
- package/dist/pipeline/grader-consistency-runner.js +270 -0
- package/dist/pipeline/grader-consistency.d.ts +103 -0
- package/dist/pipeline/grader-consistency.js +146 -0
- package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
- package/dist/pipeline/grader-sensitivity-runner.js +282 -0
- package/dist/pipeline/grader-sensitivity.d.ts +94 -0
- package/dist/pipeline/grader-sensitivity.js +144 -0
- package/dist/pipeline/grader-validate-runner.d.ts +38 -0
- package/dist/pipeline/grader-validate-runner.js +229 -0
- package/dist/pipeline/grader-validation.d.ts +107 -0
- package/dist/pipeline/grader-validation.js +169 -0
- package/dist/pipeline/map-request-to-config.d.ts +19 -0
- package/dist/pipeline/map-request-to-config.js +80 -0
- package/dist/pipeline/measure-retrieval.d.ts +59 -0
- package/dist/pipeline/measure-retrieval.js +111 -0
- package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
- package/dist/pipeline/mirror-repo-tasks.js +350 -0
- package/dist/pipeline/plan-format.d.ts +33 -0
- package/dist/pipeline/plan-format.js +202 -0
- package/dist/pipeline/plan.d.ts +169 -0
- package/dist/pipeline/plan.js +708 -0
- package/dist/pipeline/pr-comment.d.ts +19 -0
- package/dist/pipeline/pr-comment.js +502 -0
- package/dist/pipeline/probe.d.ts +52 -0
- package/dist/pipeline/probe.js +390 -0
- package/dist/pipeline/provenance.d.ts +47 -0
- package/dist/pipeline/provenance.js +146 -0
- package/dist/pipeline/readiness-report.d.ts +87 -0
- package/dist/pipeline/readiness-report.js +205 -0
- package/dist/pipeline/release-classification.d.ts +54 -0
- package/dist/pipeline/release-classification.js +238 -0
- package/dist/pipeline/release-report.d.ts +37 -0
- package/dist/pipeline/release-report.js +222 -0
- package/dist/pipeline/repo-eval-comment.d.ts +37 -0
- package/dist/pipeline/repo-eval-comment.js +165 -0
- package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
- package/dist/pipeline/repo-threshold-evaluator.js +162 -0
- package/dist/pipeline/resolve-mappings.d.ts +35 -0
- package/dist/pipeline/resolve-mappings.js +72 -0
- package/dist/pipeline/retrieval-metrics.d.ts +39 -0
- package/dist/pipeline/retrieval-metrics.js +136 -0
- package/dist/pipeline/reverse-mapping.d.ts +67 -0
- package/dist/pipeline/reverse-mapping.js +88 -0
- package/dist/pipeline/schemas.d.ts +9 -0
- package/dist/pipeline/schemas.js +9 -0
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +88 -0
- package/dist/pipeline/targeted-loo.js +203 -0
- package/dist/pipeline/thresholds.d.ts +27 -0
- package/dist/pipeline/thresholds.js +245 -0
- package/dist/pipeline/types.d.ts +10 -0
- package/dist/pipeline/types.js +10 -0
- package/dist/pipeline/validate.d.ts +67 -0
- package/dist/pipeline/validate.js +406 -0
- package/dist/pipeline/webhook-server.d.ts +37 -0
- package/dist/pipeline/webhook-server.js +133 -0
- package/dist/report-store.d.ts +84 -0
- package/dist/report-store.js +208 -0
- package/dist/sanity/client.d.ts +38 -0
- package/dist/sanity/client.js +86 -0
- package/dist/sanity/portable-text.d.ts +11 -0
- package/dist/sanity/portable-text.js +211 -0
- package/dist/sanity/queries.d.ts +133 -0
- package/dist/sanity/queries.js +300 -0
- package/dist/schedules/digest.d.ts +116 -0
- package/dist/schedules/digest.js +156 -0
- package/dist/schedules/index.d.ts +12 -0
- package/dist/schedules/index.js +10 -0
- package/dist/schedules/loader.d.ts +31 -0
- package/dist/schedules/loader.js +73 -0
- package/dist/schedules/schema.d.ts +9 -0
- package/dist/schedules/schema.js +9 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +21 -0
- package/dist/scripts/validate-task-sources.js +210 -0
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/bigquery/index.d.ts +131 -0
- package/dist/sinks/bigquery/index.js +222 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/index.d.ts +23 -0
- package/dist/sinks/index.js +18 -0
- package/dist/sinks/loader.d.ts +18 -0
- package/dist/sinks/loader.js +82 -0
- package/dist/sinks/retry.d.ts +24 -0
- package/dist/sinks/retry.js +52 -0
- package/dist/sinks/schema.d.ts +9 -0
- package/dist/sinks/schema.js +9 -0
- package/dist/sinks/slack/format.d.ts +65 -0
- package/dist/sinks/slack/format.js +327 -0
- package/dist/sinks/slack/index.d.ts +27 -0
- package/dist/sinks/slack/index.js +78 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +59 -0
- package/dist/sinks/types.js +44 -0
- package/dist/sinks/webhook/index.d.ts +19 -0
- package/dist/sinks/webhook/index.js +50 -0
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/sources.d.ts +104 -0
- package/dist/sources.js +292 -0
- package/dist/webhook/budget.d.ts +42 -0
- package/dist/webhook/budget.js +60 -0
- package/dist/webhook/debounce.d.ts +67 -0
- package/dist/webhook/debounce.js +76 -0
- package/dist/webhook/dispatch.d.ts +45 -0
- package/dist/webhook/dispatch.js +84 -0
- package/dist/webhook/eval-request-handler.d.ts +87 -0
- package/dist/webhook/eval-request-handler.js +181 -0
- package/dist/webhook/handler.d.ts +88 -0
- package/dist/webhook/handler.js +203 -0
- package/dist/webhook/index.d.ts +17 -0
- package/dist/webhook/index.js +12 -0
- package/dist/webhook/types.d.ts +109 -0
- package/dist/webhook/types.js +10 -0
- package/package.json +72 -0
- package/tasks/.expanded.agentic.yaml +51 -0
- package/tasks/.expanded.yaml +66 -0
- package/tasks/frameworks.yaml +98 -0
- package/tasks/functions.yaml +51 -0
- package/tasks/groq.yaml +216 -0
- package/tasks/nextjs-live.yaml +62 -0
- package/tasks/studio-setup.yaml +111 -0
- package/tasks/visual-editing.yaml +120 -0
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/compare.ts
|
|
3
|
+
*
|
|
4
|
+
* Core comparison primitive for the evaluation framework.
|
|
5
|
+
*
|
|
6
|
+
* Takes two ScoreSummary objects (baseline and experiment) and produces a
|
|
7
|
+
* structured ComparisonReport with overall, per-area, and per-dimension
|
|
8
|
+
* deltas, plus improved/regressed/unchanged classification.
|
|
9
|
+
*
|
|
10
|
+
* This is the single function that backs all comparison scenarios:
|
|
11
|
+
* doc improvement, model comparison, branch validation, etc.
|
|
12
|
+
* What varies is what produced each ScoreSummary — the comparison
|
|
13
|
+
* logic is always the same.
|
|
14
|
+
*
|
|
15
|
+
* @see docs/ideas/evaluation-roadmap.md — BP5: Make comparison a primitive
|
|
16
|
+
* @see docs/ideas/metrics-design.md — Tier 4: Comparison results
|
|
17
|
+
*/
|
|
18
|
+
import { DEFAULT_NOISE_THRESHOLD, } from "./types.js";
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
// Helpers
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
/**
 * Bucket a score delta into one of three change labels.
 *
 * The comparison is strict on both sides: a delta exactly equal to
 * `threshold` or `-threshold` is reported as "unchanged". A NaN delta fails
 * both comparisons and therefore also yields "unchanged".
 *
 * @param {number} delta      Signed difference to classify
 * @param {number} threshold  Magnitude the delta must exceed to count as a change
 * @returns {"improved" | "regressed" | "unchanged"}
 */
export function classifyChange(delta, threshold) {
    return delta > threshold
        ? "improved"
        : delta < -threshold
            ? "regressed"
            : "unchanged";
}
|
|
30
|
+
/**
|
|
31
|
+
* Compare two evaluation score summaries and produce a structured report.
|
|
32
|
+
*
|
|
33
|
+
* This is a pure function — no side effects, no file I/O.
|
|
34
|
+
*
|
|
35
|
+
* @param baseline The "before" or "control" score summary
|
|
36
|
+
* @param experiment The "after" or "treatment" score summary
|
|
37
|
+
* @param options Optional configuration (noise threshold, etc.)
|
|
38
|
+
* @returns A ComparisonReport with deltas, classifications, and breakdowns
|
|
39
|
+
*/
|
|
40
|
+
/**
 * Compare a baseline evaluation summary against an experiment summary.
 *
 * Builds one delta record per feature area (union of both summaries, sorted by
 * name), lists which areas improved / regressed / stayed unchanged, and
 * averages the per-dimension and doc-lift deltas over the areas present in
 * BOTH summaries. Cost and per-model deltas are included only when both
 * summaries carry the corresponding data.
 *
 * @param baseline - Summary used as the reference run.
 * @param experiment - Summary being measured against the baseline.
 * @param options - Optional settings. `graderConsistency.recommendedThreshold`
 *   is used as the noise threshold when it is a finite number; otherwise
 *   `noiseThreshold` (or DEFAULT_NOISE_THRESHOLD) applies.
 * @returns The comparison report, which also embeds both input summaries.
 */
export function compare(baseline, experiment, options) {
    // Use the empirical threshold only when it is actually usable. A null
    // graderConsistency or a missing/non-finite recommendedThreshold must not
    // crash or yield an undefined threshold flagged as "empirical".
    const graderConsistency = options?.graderConsistency;
    const empirical = Number.isFinite(graderConsistency?.recommendedThreshold);
    const threshold = empirical
        ? graderConsistency.recommendedThreshold
        : (options?.noiseThreshold ?? DEFAULT_NOISE_THRESHOLD);
    // Area names seen in each summary, and their union
    const baselineAreas = new Set(baseline.scores.map((s) => s.feature));
    const experimentAreas = new Set(experiment.scores.map((s) => s.feature));
    const allAreas = new Set([...baselineAreas, ...experimentAreas]);
    // Areas that exist on one side only
    const onlyInBaseline = [...baselineAreas].filter((a) => !experimentAreas.has(a));
    const onlyInExperiment = [...experimentAreas].filter((a) => !baselineAreas.has(a));
    // Per-area deltas, in alphabetical order
    const areas = [...allAreas]
        .sort()
        .map((area) => buildAreaDelta(area, findScore(baseline.scores, area), findScore(experiment.scores, area), threshold));
    // Area names grouped by classification
    const namesWithChange = (change) => areas.filter((a) => a.change === change).map((a) => a.area);
    const improved = namesWithChange("improved");
    const regressed = namesWithChange("regressed");
    const unchanged = namesWithChange("unchanged");
    // Per-area deltas as a record (fromEntries defines own properties, so an
    // area literally named "__proto__" cannot touch the prototype)
    const perArea = Object.fromEntries(areas.map((a) => [a.area, a.delta]));
    // Averages are taken over areas present in both summaries only; the
    // divisor falls back to 1 so an empty intersection yields 0, not NaN.
    const commonAreas = areas.filter((a) => baselineAreas.has(a.area) && experimentAreas.has(a.area));
    const commonCount = commonAreas.length || 1;
    const averageOver = (pick) => commonAreas.reduce((sum, a) => sum + pick(a), 0) / commonCount;
    const perDimension = {
        codeCorrectness: averageOver((a) => a.dimensions.codeCorrectness.delta),
        docCoverage: averageOver((a) => a.dimensions.docCoverage.delta),
        taskCompletion: averageOver((a) => a.dimensions.taskCompletion.delta),
    };
    const docLift = averageOver((a) => a.docLiftDelta);
    // Cost delta (only when both summaries have a cost total; null counts as
    // "no data", the same as undefined)
    const baselineCost = baseline.overall.cost?.total;
    const experimentCost = experiment.overall.cost?.total;
    const costDelta = baselineCost != null && experimentCost != null
        ? experimentCost - baselineCost
        : undefined;
    // Overall score delta
    const overallDelta = experiment.overall.avgScore - baseline.overall.avgScore;
    // Per-model deltas (when both summaries have per-model data).
    // Guard: older reports may have perModel as a Record instead of an array.
    let perModelDeltas;
    if (Array.isArray(baseline.perModel) && Array.isArray(experiment.perModel)) {
        // A model missing from one side, or an entry without `overall`,
        // contributes a score of 0 rather than throwing.
        const scoreFor = (models, modelId) => models.find((m) => m.modelId === modelId)?.overall?.avgScore ?? 0;
        const allModelIds = new Set([
            ...baseline.perModel.map((m) => m.modelId),
            ...experiment.perModel.map((m) => m.modelId),
        ]);
        perModelDeltas = [...allModelIds].map((modelId) => ({
            delta: scoreFor(experiment.perModel, modelId) - scoreFor(baseline.perModel, modelId),
            modelId,
        }));
    }
    return {
        areas,
        baseline,
        deltas: {
            docLift,
            overall: overallDelta,
            perArea,
            perDimension,
            ...(costDelta !== undefined && { cost: costDelta }),
            ...(perModelDeltas && { perModel: perModelDeltas }),
        },
        experiment,
        generatedAt: new Date().toISOString(),
        improved,
        mismatched: {
            onlyInBaseline,
            onlyInExperiment,
        },
        noiseThreshold: threshold,
        noiseThresholdEmpirical: empirical,
        regressed,
        unchanged,
    };
}
|
|
135
|
+
/**
 * Build an AreaDelta from baseline and experiment scores for a single area.
 *
 * Either score may be undefined (the area exists in only one summary); its
 * numeric fields are then read as 0. Optional deltas (actual score, retrieval
 * gap, infrastructure efficiency) are emitted only when BOTH runs carry a
 * non-null value; the cost delta is emitted when either run has a positive
 * total cost.
 *
 * @param area - Feature area name.
 * @param baselineScore - Baseline score entry for the area, if any.
 * @param experimentScore - Experiment score entry for the area, if any.
 * @param threshold - Noise threshold forwarded to classifyChange.
 * @returns The delta record for the area.
 */
function buildAreaDelta(area, baselineScore, experimentScore, threshold) {
    // Read a numeric field, optionally falling back to its legacy name
    // (old baselines / Sanity docs), and finally to 0.
    const read = (score, key, legacyKey) => score?.[key] ?? (legacyKey === undefined ? undefined : score?.[legacyKey]) ?? 0;
    // Baseline value, experiment value and their difference for one field
    const pair = (key, legacyKey) => {
        const baseline = read(baselineScore, key, legacyKey);
        const experiment = read(experimentScore, key, legacyKey);
        return { baseline, delta: experiment - baseline, experiment };
    };
    // Difference for an optional metric, or undefined unless both runs have
    // it. `!= null` is used for every optional metric so that a null coming
    // from stored JSON is treated as "no data" instead of being coerced to 0.
    const optionalDelta = (key) => {
        const before = baselineScore?.[key];
        const after = experimentScore?.[key];
        return before != null && after != null ? after - before : undefined;
    };
    const total = pair("totalScore");
    const lift = pair("docLift", "liftFromDocs");
    const ceiling = pair("ceilingScore", "withDocsScore");
    const floor = pair("floorScore", "withoutDocsScore");
    const cost = pair("totalCost");
    const hasCost = cost.baseline > 0 || cost.experiment > 0;
    const actualDelta = optionalDelta("actualScore");
    const retrievalGapDelta = optionalDelta("retrievalGap");
    const infrastructureEfficiencyDelta = optionalDelta("infrastructureEfficiency");
    return {
        ...(actualDelta !== undefined && { actualDelta }),
        area,
        baseline: total.baseline,
        ceilingDelta: ceiling.delta,
        change: classifyChange(total.delta, threshold),
        delta: total.delta,
        dimensions: {
            codeCorrectness: pair("codeCorrectness"),
            docCoverage: pair("docCoverage"),
            taskCompletion: pair("taskCompletion"),
        },
        docLiftDelta: lift.delta,
        experiment: total.experiment,
        floorDelta: floor.delta,
        ...(infrastructureEfficiencyDelta !== undefined && {
            infrastructureEfficiencyDelta,
        }),
        ...(retrievalGapDelta !== undefined && { retrievalGapDelta }),
        ...(hasCost && { costDelta: cost.delta }),
    };
}
|
|
202
|
+
// ---------------------------------------------------------------------------
|
|
203
|
+
// Score lookup helper
|
|
204
|
+
// ---------------------------------------------------------------------------
|
|
205
|
+
/**
 * Safely get a feature score from a scores array.
 *
 * @param scores - Score entries, each expected to carry a `feature` name.
 * @param area - Feature area name to look up.
 * @returns The first entry whose `feature` equals `area`, or undefined when
 *   there is no match or `scores` is not an array.
 */
function findScore(scores, area) {
    // A missing or non-array `scores` means "no score", not a crash
    if (!Array.isArray(scores)) {
        return undefined;
    }
    // Optional chaining skips null/undefined entries instead of throwing
    return scores.find((s) => s?.feature === area);
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* coverage-audit.ts
|
|
3
|
+
*
|
|
4
|
+
 * Functions (file-reading loaders plus pure formatters) for cross-referencing the product feature registry
|
|
5
|
+
* (config/features.yaml) against actual task files (tasks/*.yaml)
|
|
6
|
+
* to produce a documentation coverage audit.
|
|
7
|
+
*
|
|
8
|
+
* Phase 3c of the Scenario Matrix implementation.
|
|
9
|
+
*
|
|
10
|
+
* @see docs/exec-plans/completed/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
11
|
+
*/
|
|
12
|
+
import type { CoverageAuditReport, ProductFeature } from "./types.js";
|
|
13
|
+
/**
|
|
14
|
+
* Count unique document slugs referenced across all tasks.
|
|
15
|
+
*/
|
|
16
|
+
export declare function countReferencedDocs(rootDir: string): {
|
|
17
|
+
slugs: string[];
|
|
18
|
+
total: number;
|
|
19
|
+
};
|
|
20
|
+
/**
|
|
21
|
+
* Count actual tasks per area from task YAML files.
|
|
22
|
+
*/
|
|
23
|
+
export declare function countTasksByArea(rootDir: string): Record<string, number>;
|
|
24
|
+
/**
|
|
25
|
+
* Format coverage audit for console output.
|
|
26
|
+
*/
|
|
27
|
+
export declare function formatCoverageConsole(report: CoverageAuditReport): string;
|
|
28
|
+
/**
|
|
29
|
+
* Format coverage audit as markdown.
|
|
30
|
+
*/
|
|
31
|
+
export declare function formatCoverageMarkdown(report: CoverageAuditReport): string;
|
|
32
|
+
/**
|
|
33
|
+
 * Load and validate the feature registry from config/features.yaml.
 * Returns null when the file does not exist or fails schema validation.
|
|
34
|
+
*/
|
|
35
|
+
export declare function loadFeatureRegistry(rootDir: string): null | ProductFeature[];
|
|
36
|
+
/**
|
|
37
|
+
 * Run the coverage audit and produce a structured report.
 * Returns null when the feature registry cannot be loaded.
|
|
38
|
+
*/
|
|
39
|
+
export declare function runCoverageAudit(rootDir: string): CoverageAuditReport | null;
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* coverage-audit.ts
|
|
3
|
+
*
|
|
4
|
+
 * Functions (file-reading loaders plus pure formatters) for cross-referencing the product feature registry
|
|
5
|
+
* (config/features.yaml) against actual task files (tasks/*.yaml)
|
|
6
|
+
* to produce a documentation coverage audit.
|
|
7
|
+
*
|
|
8
|
+
* Phase 3c of the Scenario Matrix implementation.
|
|
9
|
+
*
|
|
10
|
+
* @see docs/exec-plans/completed/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
11
|
+
*/
|
|
12
|
+
import { existsSync, readFileSync } from "fs";
|
|
13
|
+
import { join } from "path";
|
|
14
|
+
import { load } from "js-yaml";
|
|
15
|
+
import { FeatureRegistrySchema } from "./schemas.js";
|
|
16
|
+
import { resolveMappings } from "./resolve-mappings.js";
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
// Core logic (exported for testing)
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
/**
 * Collect the distinct canonical-doc slugs referenced by any task.
 *
 * @param rootDir - Project root handed to resolveMappings.
 * @returns The unique slugs in sorted order, plus how many there are.
 */
export function countReferencedDocs(rootDir) {
    const { feature_areas: featureAreas } = resolveMappings(rootDir);
    // Flatten areas -> tasks -> canonical docs, then keep only the slug
    const referenced = Object.values(featureAreas)
        .flatMap((areaConfig) => areaConfig.tasks)
        .flatMap((task) => task.canonical_docs)
        .map((doc) => doc.slug);
    // The Set removes duplicates before sorting
    const slugs = Array.from(new Set(referenced)).sort();
    return { slugs, total: slugs.length };
}
|
|
36
|
+
/**
 * Tally how many tasks each feature area contains.
 *
 * @param rootDir - Project root handed to resolveMappings.
 * @returns A record mapping each area name to its task count.
 */
export function countTasksByArea(rootDir) {
    const { feature_areas: featureAreas } = resolveMappings(rootDir);
    const countPairs = Object.entries(featureAreas).map(([areaName, areaConfig]) => [areaName, areaConfig.tasks.length]);
    return Object.fromEntries(countPairs);
}
|
|
47
|
+
/**
 * Render a coverage audit report as plain text for terminal output.
 *
 * @param report - Audit report with `covered`, `uncovered`, `totalFeatures`
 *   and `coveragePercent`.
 * @returns A newline-joined string: banner, summary line, then one section
 *   per non-empty feature list.
 */
export function formatCoverageConsole(report) {
    const rule = "═══════════════════════════════════════════════════════════════";
    // One aligned row per covered feature: id, task count, priority, sections
    const coveredRow = (feature) => {
        const taskLabel = feature.actualTaskCount === 1 ? "1 task" : `${feature.actualTaskCount} tasks`;
        return ` ✅ ${feature.id.padEnd(20)} ${taskLabel.padEnd(10)} ${feature.priority.padEnd(10)} ${feature.sections.join(", ")}`;
    };
    // One aligned row per uncovered feature: id, priority, sections
    const uncoveredRow = (feature) => ` ❌ ${feature.id.padEnd(20)} ${feature.priority.padEnd(10)} ${feature.sections.join(", ")}`;
    const output = [
        rule,
        " DOCUMENTATION COVERAGE AUDIT",
        rule,
        "",
        `Coverage: ${report.covered.length}/${report.totalFeatures} features (${report.coveragePercent}%)`,
        "",
    ];
    // Each section is followed by a blank line and omitted entirely when empty
    if (report.covered.length > 0) {
        output.push("COVERED FEATURES:", ...report.covered.map((feature) => coveredRow(feature)), "");
    }
    if (report.uncovered.length > 0) {
        output.push("UNCOVERED FEATURES (by priority):", ...report.uncovered.map((feature) => uncoveredRow(feature)), "");
    }
    return output.join("\n");
}
|
|
79
|
+
/**
 * Format coverage audit as markdown.
 *
 * Emits a heading, a bold summary line, and one table per non-empty feature
 * list. Text cells are escaped so a literal "|" in a feature name, priority or
 * section cannot split the table row into extra columns.
 *
 * @param report - Audit report with `covered`, `uncovered`, `totalFeatures`
 *   and `coveragePercent`.
 * @returns The markdown document as a newline-joined string.
 */
export function formatCoverageMarkdown(report) {
    // Markdown tables treat an unescaped "|" as a column separator
    const cell = (value) => String(value).replace(/\|/g, "\\|");
    const lines = [
        "### 📊 Documentation Coverage Audit",
        "",
        `**Coverage: ${report.covered.length}/${report.totalFeatures} features (${report.coveragePercent}%)**`,
        "",
    ];
    if (report.covered.length > 0) {
        lines.push("#### Covered Features", "", "| Feature | Tasks | Priority | Sections |", "|---------|-------|----------|----------|");
        for (const f of report.covered) {
            lines.push(`| ✅ ${cell(f.name)} | ${f.actualTaskCount} | ${cell(f.priority)} | ${cell(f.sections.join(", "))} |`);
        }
        lines.push("");
    }
    if (report.uncovered.length > 0) {
        lines.push("#### Uncovered Features", "", "| Feature | Priority | Sections |", "|---------|----------|----------|");
        for (const f of report.uncovered) {
            lines.push(`| ❌ ${cell(f.name)} | ${cell(f.priority)} | ${cell(f.sections.join(", "))} |`);
        }
        lines.push("");
    }
    return lines.join("\n");
}
|
|
110
|
+
// ---------------------------------------------------------------------------
|
|
111
|
+
// Registry loading and audit
|
|
112
|
+
// ---------------------------------------------------------------------------
|
|
113
|
+
/**
 * Load and validate the feature registry from config/features.yaml.
 *
 * Every failure mode other than an unreadable file is reported the same way:
 * details are logged with console.error and null is returned.
 *
 * @param rootDir - Project root containing the `config` directory.
 * @returns The validated feature list, or null when the file is missing,
 *   is not parseable YAML, or does not satisfy FeatureRegistrySchema.
 */
export function loadFeatureRegistry(rootDir) {
    const filePath = join(rootDir, "config", "features.yaml");
    if (!existsSync(filePath)) {
        return null;
    }
    const raw = readFileSync(filePath, "utf-8");
    let parsed;
    try {
        parsed = load(raw);
    }
    catch (err) {
        // Malformed YAML makes load() throw; report it like a validation
        // failure so callers only ever have to handle the null result.
        console.error("❌ config/features.yaml could not be parsed:");
        console.error(` ${err instanceof Error ? err.message : String(err)}`);
        return null;
    }
    const result = FeatureRegistrySchema.safeParse(parsed);
    if (!result.success) {
        console.error("❌ config/features.yaml validation failed:");
        for (const issue of result.error.issues) {
            console.error(` ${issue.path.join(".")}: ${issue.message}`);
        }
        return null;
    }
    return result.data.features;
}
|
|
133
|
+
/**
 * Cross-reference the feature registry with the task counts per area and
 * assemble the coverage audit report.
 *
 * @param rootDir - Project root used to locate the registry and the tasks.
 * @returns The audit report, or null when the feature registry is unavailable.
 */
export function runCoverageAudit(rootDir) {
    const registry = loadFeatureRegistry(rootDir);
    if (!registry) {
        return null;
    }
    const tasksPerArea = countTasksByArea(rootDir);
    // Covered: marked "covered" AND tied to an area; annotate with the real task count
    const covered = registry
        .filter((feature) => feature.status === "covered" && feature.area)
        .map((feature) => ({ ...feature, actualTaskCount: tasksPerArea[feature.area] ?? 0 }));
    // Uncovered: anything marked "uncovered" or "planned", most urgent first
    const priorityRank = { critical: 0, high: 1, low: 3, medium: 2 };
    const uncovered = registry
        .filter((feature) => feature.status === "uncovered" || feature.status === "planned")
        .sort((left, right) => priorityRank[left.priority] - priorityRank[right.priority]);
    const totalFeatures = registry.length;
    // Percentage of all registry features that are covered, kept to one decimal
    const rawPercent = totalFeatures > 0 ? (covered.length / totalFeatures) * 100 : 0;
    return {
        coveragePercent: Math.round(rawPercent * 10) / 10,
        covered,
        generatedAt: new Date().toISOString(),
        totalFeatures,
        uncovered,
    };
}
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/degradations.ts
|
|
3
|
+
*
|
|
4
|
+
* Programmatic code degradation strategies for sensitivity testing.
|
|
5
|
+
*
|
|
6
|
+
* Each strategy takes a "good" reference solution (string) and returns a
|
|
7
|
+
* "bad" version that should score lower on a specific dimension:
|
|
8
|
+
*
|
|
9
|
+
* - Task Completion: remove key functional sections
|
|
10
|
+
* - Code Correctness: introduce anti-patterns and deprecated APIs
|
|
11
|
+
* - Doc Coverage: strip documentation references, add hallucinated details
|
|
12
|
+
*
|
|
13
|
+
* These are deterministic, pure functions — no randomness, no side effects.
|
|
14
|
+
*
|
|
15
|
+
* @see docs/exec-plans/completed/grader-reliability.md — Phase 4
|
|
16
|
+
*/
|
|
17
|
+
/**
 * One degradation strategy, aimed at a single scoring dimension.
 */
export interface Degradation {
    /** Scoring dimension expected to drop once the degradation is applied. */
    targetDimension: "taskCompletion" | "docCoverage" | "codeCorrectness";
    /** Plain-language summary of what the degradation changes. */
    description: string;
    /** Takes source code and returns its degraded form. */
    apply: (source: string) => string;
}
|
|
26
|
+
/**
 * A reference solution paired with the output of one degradation applied to it.
 */
export interface DegradedPair {
    /** Relative path of the reference solution file. */
    sourcePath: string;
    /** Unmodified ("good") reference solution. */
    original: string;
    /** Degraded ("bad") version of the solution. */
    degraded: string;
    /** Degradation that was applied. */
    degradation: Degradation;
}
|
|
37
|
+
/**
|
|
38
|
+
* Remove the bottom half of the code (functions, exports, etc).
|
|
39
|
+
* A response missing half its functionality should score lower on Task Completion.
|
|
40
|
+
*/
|
|
41
|
+
export declare const removeBottomHalf: Degradation;
|
|
42
|
+
/**
|
|
43
|
+
* Remove all export statements and exported functions.
|
|
44
|
+
* Missing exports = incomplete API surface → lower Task Completion.
|
|
45
|
+
*/
|
|
46
|
+
export declare const removeExports: Degradation;
|
|
47
|
+
/**
|
|
48
|
+
* Replace function bodies with TODO comments.
|
|
49
|
+
* Skeleton code that doesn't actually implement anything.
|
|
50
|
+
*/
|
|
51
|
+
export declare const stubFunctions: Degradation;
|
|
52
|
+
/**
|
|
53
|
+
* Replace modern API calls with deprecated/incorrect patterns.
|
|
54
|
+
* Targets Sanity-specific patterns that the grader should catch.
|
|
55
|
+
*/
|
|
56
|
+
export declare const introduceDeprecatedAPIs: Degradation;
|
|
57
|
+
/**
|
|
58
|
+
* Introduce common GROQ syntax errors.
|
|
59
|
+
* Invalid queries that look plausible but wouldn't work.
|
|
60
|
+
*/
|
|
61
|
+
export declare const introduceGroqErrors: Degradation;
|
|
62
|
+
/**
|
|
63
|
+
* Remove all TypeScript types and use any everywhere.
|
|
64
|
+
* Technically works but is an anti-pattern.
|
|
65
|
+
*/
|
|
66
|
+
export declare const removeTypes: Degradation;
|
|
67
|
+
/**
|
|
68
|
+
* Remove all comments and documentation.
|
|
69
|
+
* The response shows no evidence of using documentation.
|
|
70
|
+
*/
|
|
71
|
+
export declare const stripComments: Degradation;
|
|
72
|
+
/**
|
|
73
|
+
* Add hallucinated API calls and made-up configuration options.
|
|
74
|
+
* Response looks confident but uses APIs that don't exist.
|
|
75
|
+
*/
|
|
76
|
+
export declare const addHallucinations: Degradation;
|
|
77
|
+
/**
|
|
78
|
+
* Replace correct import paths with plausible but wrong ones.
|
|
79
|
+
* Shows the response was guessing at the API surface.
|
|
80
|
+
*/
|
|
81
|
+
export declare const wrongImports: Degradation;
|
|
82
|
+
/** All available degradation strategies, grouped by target dimension */
|
|
83
|
+
export declare const DEGRADATION_STRATEGIES: Degradation[];
|
|
84
|
+
/** Get degradation strategies targeting a specific dimension */
|
|
85
|
+
export declare function getStrategiesForDimension(dimension: "codeCorrectness" | "docCoverage" | "taskCompletion"): Degradation[];
|