@sanity/ailf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -0
- package/bin/ailf.js +64 -0
- package/canonical/grader-references/README.md +88 -0
- package/canonical/grader-references/groq.yaml +234 -0
- package/canonical/grader-references/studio-setup.yaml +275 -0
- package/canonical/reference-solutions/.gitkeep +1 -0
- package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
- package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
- package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
- package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
- package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
- package/canonical/reference-solutions/groq/joins-references.ts +300 -0
- package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
- package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
- package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
- package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
- package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
- package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
- package/config/bigquery/README.md +74 -0
- package/config/bigquery/views/area_scores.sql +87 -0
- package/config/bigquery/views/reports.sql +49 -0
- package/config/features.yaml +116 -0
- package/config/models.yaml +115 -0
- package/config/prompts.yaml +75 -0
- package/config/rubrics.yaml +62 -0
- package/config/schedules.yaml +43 -0
- package/config/sinks.yaml +54 -0
- package/config/sources.yaml +51 -0
- package/config/thresholds.yaml +49 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
- package/dist/_vendor/ailf-core/examples/index.js +285 -0
- package/dist/_vendor/ailf-core/index.d.ts +17 -0
- package/dist/_vendor/ailf-core/index.js +17 -0
- package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
- package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
- package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
- package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
- package/dist/_vendor/ailf-core/ports/context.js +14 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
- package/dist/_vendor/ailf-core/ports/index.js +7 -0
- package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
- package/dist/_vendor/ailf-core/ports/logger.js +11 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
- package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/schemas/index.js +16 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
- package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
- package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
- package/dist/_vendor/ailf-core/services/index.js +12 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/scoring.js +222 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
- package/dist/_vendor/ailf-core/types/index.js +21 -0
- package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
- package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
- package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
- package/dist/_vendor/ailf-shared/document-ref.js +1 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -0
- package/dist/_vendor/ailf-shared/index.js +16 -0
- package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
- package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
- package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
- package/dist/_vendor/ailf-shared/score-grades.js +23 -0
- package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
- package/dist/adapters/cache/content-lake-cache.js +59 -0
- package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
- package/dist/adapters/cache/filesystem-cache.js +54 -0
- package/dist/adapters/cache/index.d.ts +2 -0
- package/dist/adapters/cache/index.js +2 -0
- package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
- package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
- package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
- package/dist/adapters/config-sources/file-config-adapter.js +96 -0
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +2 -0
- package/dist/adapters/doc-fetchers/index.d.ts +1 -0
- package/dist/adapters/doc-fetchers/index.js +1 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
- package/dist/adapters/eval-runners/index.d.ts +1 -0
- package/dist/adapters/eval-runners/index.js +1 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
- package/dist/adapters/index.d.ts +12 -0
- package/dist/adapters/index.js +12 -0
- package/dist/adapters/loggers/console-logger.d.ts +22 -0
- package/dist/adapters/loggers/console-logger.js +54 -0
- package/dist/adapters/loggers/index.d.ts +9 -0
- package/dist/adapters/loggers/index.js +9 -0
- package/dist/adapters/loggers/json-logger.d.ts +18 -0
- package/dist/adapters/loggers/json-logger.js +33 -0
- package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
- package/dist/adapters/loggers/quiet-logger.js +30 -0
- package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/composite-task-source.js +59 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
- package/dist/adapters/task-sources/index.d.ts +7 -0
- package/dist/adapters/task-sources/index.js +7 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
- package/dist/adapters/task-sources/repo-schemas.js +234 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
- package/dist/adapters/task-sources/repo-task-source.js +104 -0
- package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
- package/dist/adapters/task-sources/repo-trigger.js +153 -0
- package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
- package/dist/adapters/task-sources/repo-validation.js +164 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
- package/dist/adapters/task-sources/yaml-task-source.js +136 -0
- package/dist/agent-observer/agentic-provider.d.ts +132 -0
- package/dist/agent-observer/agentic-provider.js +983 -0
- package/dist/agent-observer/classifier.d.ts +62 -0
- package/dist/agent-observer/classifier.js +269 -0
- package/dist/agent-observer/index.d.ts +7 -0
- package/dist/agent-observer/index.js +4 -0
- package/dist/agent-observer/pricing.d.ts +35 -0
- package/dist/agent-observer/pricing.js +82 -0
- package/dist/agent-observer/provider.d.ts +77 -0
- package/dist/agent-observer/provider.js +151 -0
- package/dist/agent-observer/proxy.d.ts +91 -0
- package/dist/agent-observer/proxy.js +321 -0
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/agent-observer/types.d.ts +137 -0
- package/dist/agent-observer/types.js +16 -0
- package/dist/assertions/source-isolation.d.ts +72 -0
- package/dist/assertions/source-isolation.js +117 -0
- package/dist/cli.d.ts +24 -0
- package/dist/cli.js +199 -0
- package/dist/commands/agent-report.d.ts +5 -0
- package/dist/commands/agent-report.js +69 -0
- package/dist/commands/baseline.d.ts +9 -0
- package/dist/commands/baseline.js +141 -0
- package/dist/commands/cache.d.ts +13 -0
- package/dist/commands/cache.js +135 -0
- package/dist/commands/calculate-scores.d.ts +8 -0
- package/dist/commands/calculate-scores.js +48 -0
- package/dist/commands/compare.d.ts +8 -0
- package/dist/commands/compare.js +120 -0
- package/dist/commands/completion.d.ts +18 -0
- package/dist/commands/completion.js +260 -0
- package/dist/commands/coverage-audit.d.ts +7 -0
- package/dist/commands/coverage-audit.js +40 -0
- package/dist/commands/discovery-report.d.ts +10 -0
- package/dist/commands/discovery-report.js +44 -0
- package/dist/commands/eval.d.ts +9 -0
- package/dist/commands/eval.js +35 -0
- package/dist/commands/explain-handler.d.ts +34 -0
- package/dist/commands/explain-handler.js +719 -0
- package/dist/commands/fetch-docs.d.ts +8 -0
- package/dist/commands/fetch-docs.js +128 -0
- package/dist/commands/generate-configs.d.ts +8 -0
- package/dist/commands/generate-configs.js +46 -0
- package/dist/commands/grader/index.d.ts +11 -0
- package/dist/commands/grader/index.js +118 -0
- package/dist/commands/init.d.ts +19 -0
- package/dist/commands/init.js +150 -0
- package/dist/commands/interactive.d.ts +12 -0
- package/dist/commands/interactive.js +238 -0
- package/dist/commands/lookup-doc.d.ts +15 -0
- package/dist/commands/lookup-doc.js +84 -0
- package/dist/commands/measure-retrieval.d.ts +5 -0
- package/dist/commands/measure-retrieval.js +65 -0
- package/dist/commands/pipeline-action.d.ts +71 -0
- package/dist/commands/pipeline-action.js +305 -0
- package/dist/commands/pipeline.d.ts +62 -0
- package/dist/commands/pipeline.js +53 -0
- package/dist/commands/pr-comment.d.ts +8 -0
- package/dist/commands/pr-comment.js +47 -0
- package/dist/commands/publish.d.ts +26 -0
- package/dist/commands/publish.js +253 -0
- package/dist/commands/readiness-report.d.ts +10 -0
- package/dist/commands/readiness-report.js +104 -0
- package/dist/commands/shared/options.d.ts +29 -0
- package/dist/commands/shared/options.js +57 -0
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +16 -0
- package/dist/commands/validate-tasks.js +93 -0
- package/dist/commands/validate.d.ts +9 -0
- package/dist/commands/validate.js +73 -0
- package/dist/commands/webhook-server.d.ts +5 -0
- package/dist/commands/webhook-server.js +30 -0
- package/dist/commands/weekly-digest.d.ts +10 -0
- package/dist/commands/weekly-digest.js +104 -0
- package/dist/composition-root.d.ts +26 -0
- package/dist/composition-root.js +107 -0
- package/dist/interpolate.d.ts +26 -0
- package/dist/interpolate.js +70 -0
- package/dist/job-store.d.ts +104 -0
- package/dist/job-store.js +188 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.d.ts +27 -0
- package/dist/orchestration/build-app-context.js +81 -0
- package/dist/orchestration/build-step-sequence.d.ts +15 -0
- package/dist/orchestration/build-step-sequence.js +84 -0
- package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
- package/dist/orchestration/config-to-source-overrides.js +28 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/index.d.ts +11 -0
- package/dist/orchestration/index.js +11 -0
- package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
- package/dist/orchestration/pipeline-orchestrator.js +153 -0
- package/dist/orchestration/step-runner.d.ts +20 -0
- package/dist/orchestration/step-runner.js +88 -0
- package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
- package/dist/orchestration/steps/calculate-scores-step.js +95 -0
- package/dist/orchestration/steps/callback-step.d.ts +24 -0
- package/dist/orchestration/steps/callback-step.js +76 -0
- package/dist/orchestration/steps/compare-step.d.ts +14 -0
- package/dist/orchestration/steps/compare-step.js +92 -0
- package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
- package/dist/orchestration/steps/discovery-report-step.js +55 -0
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
- package/dist/orchestration/steps/fetch-docs-step.js +135 -0
- package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
- package/dist/orchestration/steps/gap-analysis-step.js +136 -0
- package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
- package/dist/orchestration/steps/generate-configs-step.js +85 -0
- package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
- package/dist/orchestration/steps/grader-consistency-step.js +64 -0
- package/dist/orchestration/steps/index.d.ts +19 -0
- package/dist/orchestration/steps/index.js +19 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
- package/dist/orchestration/steps/publish-report-step.js +216 -0
- package/dist/orchestration/steps/readiness-step.d.ts +13 -0
- package/dist/orchestration/steps/readiness-step.js +91 -0
- package/dist/orchestration/steps/report-step.d.ts +12 -0
- package/dist/orchestration/steps/report-step.js +49 -0
- package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
- package/dist/orchestration/steps/run-eval-step.js +195 -0
- package/dist/orchestration/steps/validate-step.d.ts +12 -0
- package/dist/orchestration/steps/validate-step.js +41 -0
- package/dist/pipeline/agent-behavior-report.d.ts +53 -0
- package/dist/pipeline/agent-behavior-report.js +132 -0
- package/dist/pipeline/attribution.d.ts +47 -0
- package/dist/pipeline/attribution.js +226 -0
- package/dist/pipeline/baseline.d.ts +37 -0
- package/dist/pipeline/baseline.js +141 -0
- package/dist/pipeline/cache.d.ts +101 -0
- package/dist/pipeline/cache.js +283 -0
- package/dist/pipeline/calculate-scores.d.ts +102 -0
- package/dist/pipeline/calculate-scores.js +1128 -0
- package/dist/pipeline/callback-delivery.d.ts +50 -0
- package/dist/pipeline/callback-delivery.js +89 -0
- package/dist/pipeline/checks.d.ts +39 -0
- package/dist/pipeline/checks.js +280 -0
- package/dist/pipeline/classify-url.d.ts +61 -0
- package/dist/pipeline/classify-url.js +93 -0
- package/dist/pipeline/compare.d.ts +31 -0
- package/dist/pipeline/compare.js +208 -0
- package/dist/pipeline/coverage-audit.d.ts +39 -0
- package/dist/pipeline/coverage-audit.js +165 -0
- package/dist/pipeline/degradations.d.ts +85 -0
- package/dist/pipeline/degradations.js +242 -0
- package/dist/pipeline/discovery-report.d.ts +55 -0
- package/dist/pipeline/discovery-report.js +178 -0
- package/dist/pipeline/eval-constants.d.ts +68 -0
- package/dist/pipeline/eval-constants.js +111 -0
- package/dist/pipeline/eval-fingerprint.d.ts +66 -0
- package/dist/pipeline/eval-fingerprint.js +175 -0
- package/dist/pipeline/expand-tasks.d.ts +220 -0
- package/dist/pipeline/expand-tasks.js +421 -0
- package/dist/pipeline/failure-modes.d.ts +46 -0
- package/dist/pipeline/failure-modes.js +348 -0
- package/dist/pipeline/fetch-url-content.d.ts +44 -0
- package/dist/pipeline/fetch-url-content.js +93 -0
- package/dist/pipeline/gap-analysis.d.ts +48 -0
- package/dist/pipeline/gap-analysis.js +231 -0
- package/dist/pipeline/generate-configs.d.ts +72 -0
- package/dist/pipeline/generate-configs.js +395 -0
- package/dist/pipeline/grader-api.d.ts +49 -0
- package/dist/pipeline/grader-api.js +200 -0
- package/dist/pipeline/grader-compare-runner.d.ts +44 -0
- package/dist/pipeline/grader-compare-runner.js +301 -0
- package/dist/pipeline/grader-comparison.d.ts +111 -0
- package/dist/pipeline/grader-comparison.js +161 -0
- package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
- package/dist/pipeline/grader-consistency-runner.js +270 -0
- package/dist/pipeline/grader-consistency.d.ts +103 -0
- package/dist/pipeline/grader-consistency.js +146 -0
- package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
- package/dist/pipeline/grader-sensitivity-runner.js +282 -0
- package/dist/pipeline/grader-sensitivity.d.ts +94 -0
- package/dist/pipeline/grader-sensitivity.js +144 -0
- package/dist/pipeline/grader-validate-runner.d.ts +38 -0
- package/dist/pipeline/grader-validate-runner.js +229 -0
- package/dist/pipeline/grader-validation.d.ts +107 -0
- package/dist/pipeline/grader-validation.js +169 -0
- package/dist/pipeline/map-request-to-config.d.ts +19 -0
- package/dist/pipeline/map-request-to-config.js +80 -0
- package/dist/pipeline/measure-retrieval.d.ts +59 -0
- package/dist/pipeline/measure-retrieval.js +111 -0
- package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
- package/dist/pipeline/mirror-repo-tasks.js +350 -0
- package/dist/pipeline/plan-format.d.ts +33 -0
- package/dist/pipeline/plan-format.js +202 -0
- package/dist/pipeline/plan.d.ts +169 -0
- package/dist/pipeline/plan.js +708 -0
- package/dist/pipeline/pr-comment.d.ts +19 -0
- package/dist/pipeline/pr-comment.js +502 -0
- package/dist/pipeline/probe.d.ts +52 -0
- package/dist/pipeline/probe.js +390 -0
- package/dist/pipeline/provenance.d.ts +47 -0
- package/dist/pipeline/provenance.js +146 -0
- package/dist/pipeline/readiness-report.d.ts +87 -0
- package/dist/pipeline/readiness-report.js +205 -0
- package/dist/pipeline/release-classification.d.ts +54 -0
- package/dist/pipeline/release-classification.js +238 -0
- package/dist/pipeline/release-report.d.ts +37 -0
- package/dist/pipeline/release-report.js +222 -0
- package/dist/pipeline/repo-eval-comment.d.ts +37 -0
- package/dist/pipeline/repo-eval-comment.js +165 -0
- package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
- package/dist/pipeline/repo-threshold-evaluator.js +162 -0
- package/dist/pipeline/resolve-mappings.d.ts +35 -0
- package/dist/pipeline/resolve-mappings.js +72 -0
- package/dist/pipeline/retrieval-metrics.d.ts +39 -0
- package/dist/pipeline/retrieval-metrics.js +136 -0
- package/dist/pipeline/reverse-mapping.d.ts +67 -0
- package/dist/pipeline/reverse-mapping.js +88 -0
- package/dist/pipeline/schemas.d.ts +9 -0
- package/dist/pipeline/schemas.js +9 -0
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +88 -0
- package/dist/pipeline/targeted-loo.js +203 -0
- package/dist/pipeline/thresholds.d.ts +27 -0
- package/dist/pipeline/thresholds.js +245 -0
- package/dist/pipeline/types.d.ts +10 -0
- package/dist/pipeline/types.js +10 -0
- package/dist/pipeline/validate.d.ts +67 -0
- package/dist/pipeline/validate.js +406 -0
- package/dist/pipeline/webhook-server.d.ts +37 -0
- package/dist/pipeline/webhook-server.js +133 -0
- package/dist/report-store.d.ts +84 -0
- package/dist/report-store.js +208 -0
- package/dist/sanity/client.d.ts +38 -0
- package/dist/sanity/client.js +86 -0
- package/dist/sanity/portable-text.d.ts +11 -0
- package/dist/sanity/portable-text.js +211 -0
- package/dist/sanity/queries.d.ts +133 -0
- package/dist/sanity/queries.js +300 -0
- package/dist/schedules/digest.d.ts +116 -0
- package/dist/schedules/digest.js +156 -0
- package/dist/schedules/index.d.ts +12 -0
- package/dist/schedules/index.js +10 -0
- package/dist/schedules/loader.d.ts +31 -0
- package/dist/schedules/loader.js +73 -0
- package/dist/schedules/schema.d.ts +9 -0
- package/dist/schedules/schema.js +9 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +21 -0
- package/dist/scripts/validate-task-sources.js +210 -0
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/bigquery/index.d.ts +131 -0
- package/dist/sinks/bigquery/index.js +222 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/index.d.ts +23 -0
- package/dist/sinks/index.js +18 -0
- package/dist/sinks/loader.d.ts +18 -0
- package/dist/sinks/loader.js +82 -0
- package/dist/sinks/retry.d.ts +24 -0
- package/dist/sinks/retry.js +52 -0
- package/dist/sinks/schema.d.ts +9 -0
- package/dist/sinks/schema.js +9 -0
- package/dist/sinks/slack/format.d.ts +65 -0
- package/dist/sinks/slack/format.js +327 -0
- package/dist/sinks/slack/index.d.ts +27 -0
- package/dist/sinks/slack/index.js +78 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +59 -0
- package/dist/sinks/types.js +44 -0
- package/dist/sinks/webhook/index.d.ts +19 -0
- package/dist/sinks/webhook/index.js +50 -0
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/sources.d.ts +104 -0
- package/dist/sources.js +292 -0
- package/dist/webhook/budget.d.ts +42 -0
- package/dist/webhook/budget.js +60 -0
- package/dist/webhook/debounce.d.ts +67 -0
- package/dist/webhook/debounce.js +76 -0
- package/dist/webhook/dispatch.d.ts +45 -0
- package/dist/webhook/dispatch.js +84 -0
- package/dist/webhook/eval-request-handler.d.ts +87 -0
- package/dist/webhook/eval-request-handler.js +181 -0
- package/dist/webhook/handler.d.ts +88 -0
- package/dist/webhook/handler.js +203 -0
- package/dist/webhook/index.d.ts +17 -0
- package/dist/webhook/index.js +12 -0
- package/dist/webhook/types.d.ts +109 -0
- package/dist/webhook/types.js +10 -0
- package/package.json +72 -0
- package/tasks/.expanded.agentic.yaml +51 -0
- package/tasks/.expanded.yaml +66 -0
- package/tasks/frameworks.yaml +98 -0
- package/tasks/functions.yaml +51 -0
- package/tasks/groq.yaml +216 -0
- package/tasks/nextjs-live.yaml +62 -0
- package/tasks/studio-setup.yaml +111 -0
- package/tasks/visual-editing.yaml +120 -0
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/failure-modes.ts
|
|
3
|
+
*
|
|
4
|
+
* Keyword-based failure mode classifier for grader reasoning text,
|
|
5
|
+
* cross-referenced with ceiling decomposition data.
|
|
6
|
+
*
|
|
7
|
+
* Phase 3a of the Scenario Matrix implementation.
|
|
8
|
+
*
|
|
9
|
+
* The classifier uses two signal sources:
|
|
10
|
+
* 1. Keyword matching on grader reason text (primary)
|
|
11
|
+
* 2. Ceiling decomposition structural signals (supplementary)
|
|
12
|
+
*
|
|
13
|
+
* When both sources agree, confidence is boosted. When only ceiling
|
|
14
|
+
* signals are available, they serve as a fallback for unclassified cases.
|
|
15
|
+
*
|
|
16
|
+
* @see docs/exec-plans/completed/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
17
|
+
*/
|
|
18
|
+
import { detectFeatureArea } from "../_vendor/ailf-core/index.js";
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
// Constants
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
/**
 * Score cutoff for failure-mode classification. Judgments scoring at or
 * above this value are treated as passing and reported as "unclassified";
 * only scores strictly below it go through keyword/ceiling analysis.
 */
|
|
23
|
+
const CLASSIFICATION_THRESHOLD = 60;
|
|
24
|
+
/**
 * All failure mode types (including "unclassified"), used for
 * initializing empty counts.
 */
|
|
25
|
+
const ALL_MODES = [
|
|
26
|
+
"incorrect-docs",
|
|
27
|
+
"missing-docs",
|
|
28
|
+
"model-limitation",
|
|
29
|
+
"outdated-docs",
|
|
30
|
+
"poor-structure",
|
|
31
|
+
"unclassified",
|
|
32
|
+
];
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
// Keyword patterns
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
// Phrases hinting that the docs describe deprecated or superseded APIs.
// Unanchored, case-insensitive substring match.
// NOTE(review): presumably feeds the "outdated-docs" mode — confirm in classifyByKeyword.
const OUTDATED_PATTERN = /deprecated|old api|v[0-9]+ syntax|no longer supported|legacy|previous version|outdated|superseded|replaced by/i;
|
|
37
|
+
// Phrases hinting that the needed documentation was absent.
// Unanchored, case-insensitive substring match; `.*` stays within one line.
// NOTE(review): `no.*information` has no word boundaries, so "no" can match
// inside longer words (e.g. "know ... information") — confirm this is acceptable.
const MISSING_PATTERN = /no documentation|not covered|had to guess|not found|missing.*doc|no.*information|undocumented|couldn't find|without.*documentation/i;
|
|
38
|
+
// Phrases hinting that the documentation itself is factually wrong.
// Several alternatives are listed in both word orders (e.g. `wrong.*doc`
// and `doc.*wrong`). Unanchored, case-insensitive substring match.
const INCORRECT_PATTERN = /contradicts|incorrect.*doc|doc.*incorrect|wrong.*doc|doc.*wrong|documentation says.*but|factual error|inaccurate|misleading.*doc/i;
|
|
39
|
+
// Phrases hinting that the documentation is unclear, conflicting or hard to
// navigate. Unanchored, case-insensitive substring match.
const POOR_STRUCTURE_PATTERN = /unclear|ambiguous|couldn't determine|conflicting|confusing|hard to follow|poorly organized|scattered|fragmented/i;
|
|
40
|
+
// ---------------------------------------------------------------------------
|
|
41
|
+
// Public API
|
|
42
|
+
// ---------------------------------------------------------------------------
|
|
43
|
+
/**
 * Produce the failure-mode report for a set of grader judgments.
 *
 * Every judgment is resolved to a feature area, classified through
 * classifyFailureMode() using that area's ceiling/floor scores, and tallied
 * both globally and per area.
 *
 * @param judgments - Grader judgments to classify; each one's `taskId` is
 *   used to resolve its area
 * @param scores - Per-area score entries, indexed here by their `feature`
 *   field; `ceilingScore` and `floorScore` are passed to the classifier
 * @returns Report object with `byArea` (per-area mode counts, top mode and
 *   judgment total), `classificationRate` (percentage of all judgments whose
 *   mode is not "unclassified", 0 when there are no judgments),
 *   `classifiedJudgments`, `summary` (global mode counts) and `totalJudgments`
 */
export function buildFailureModeReport(judgments, scores) {
    // Index the score entries by feature name.
    const scoreByArea = new Map();
    for (const entry of scores) {
        scoreByArea.set(entry.feature, entry);
    }

    const classifiedJudgments = [];
    const summary = initModeCounts();
    const byArea = {};

    for (const judgment of judgments) {
        // Derive the area from the task id (e.g. "GROQ - Blog queries..." → "groq").
        const area = resolveArea(judgment.taskId, scoreByArea);

        // With no resolved area, or no score entry for it, fall back to the
        // widest band: ceiling 100, floor 0.
        let ceilingScore = 100;
        let floorScore = 0;
        if (area) {
            const areaScore = scoreByArea.get(area);
            ceilingScore = areaScore?.ceilingScore ?? 100;
            floorScore = areaScore?.floorScore ?? 0;
        }

        const classification = classifyFailureMode(judgment, ceilingScore, floorScore);
        classifiedJudgments.push({ classification, judgment });
        summary[classification.mode]++;

        // Judgments without an area only contribute to the global summary.
        if (!area) {
            continue;
        }

        let bucket = byArea[area];
        if (!bucket) {
            bucket = {
                area,
                modes: initModeCounts(),
                topMode: "unclassified",
                totalJudgments: 0,
            };
            byArea[area] = bucket;
        }
        bucket.modes[classification.mode]++;
        bucket.totalJudgments++;
    }

    // Now that all counts are in, pick each area's top mode.
    Object.values(byArea).forEach((stats) => {
        stats.topMode = findTopMode(stats.modes);
    });

    const totalJudgments = judgments.length;
    const unclassifiedCount = summary.unclassified ?? 0;
    let classificationRate = 0;
    if (totalJudgments > 0) {
        // The denominator is every judgment, including those left "unclassified".
        classificationRate = ((totalJudgments - unclassifiedCount) / totalJudgments) * 100;
    }

    return {
        byArea,
        classificationRate,
        classifiedJudgments,
        summary,
        totalJudgments,
    };
}
|
|
97
|
+
/**
 * Classify the failure mode of a low-scoring grader judgment.
 *
 * Judgments at or above `CLASSIFICATION_THRESHOLD` are returned as
 * "unclassified" without further analysis. Otherwise the reason text is
 * matched against the keyword patterns, the scores are checked against the
 * ceiling decomposition rules, and both signals are merged by
 * `combineClassifications`.
 *
 * A judgment without a string `reason` no longer throws: it is treated as
 * having an empty reason, so only the ceiling signal can classify it.
 *
 * @param judgment - The grader judgment to classify (`score`, `reason`)
 * @param ceilingScore - The area's ceiling score (with-docs best case)
 * @param floorScore - The area's floor score (no-docs baseline)
 * @returns Classified failure mode with confidence level and signal source
 */
export function classifyFailureMode(judgment, ceilingScore, floorScore) {
    // Passing scores don't need failure mode analysis
    if (judgment.score >= CLASSIFICATION_THRESHOLD) {
        return { confidence: "low", mode: "unclassified", source: "keyword" };
    }
    // Graders may omit the reason; fall back to an empty string instead of
    // crashing on `undefined.toLowerCase()`.
    const reason = typeof judgment.reason === "string" ? judgment.reason.toLowerCase() : "";
    // Step 1: Keyword-based classification
    const keywordMode = classifyByKeyword(reason);
    // Step 2: Ceiling-based structural classification
    const ceilingMode = classifyByCeiling(judgment.score, ceilingScore, floorScore);
    // Step 3: Combine signals (keyword wins on disagreement)
    return combineClassifications(keywordMode, ceilingMode);
}
|
|
121
|
+
// ---------------------------------------------------------------------------
|
|
122
|
+
// Formatting
|
|
123
|
+
// ---------------------------------------------------------------------------
|
|
124
|
+
/**
 * Render a failure mode report as plain text for console output.
 *
 * Emits a header, a count row for every mode in `ALL_MODES` with a positive
 * count, and (when any area was tracked) the top mode per area, sorted by
 * area name.
 *
 * @param report - Report produced by `buildFailureModeReport`
 * @returns The rendered lines joined with "\n"
 */
export function formatFailureModesConsole(report) {
    const out = [
        "🔍 FAILURE MODE ANALYSIS",
        "",
        ` ${report.totalJudgments} judgments analyzed, ${report.classificationRate.toFixed(0)}% classified`,
        "",
        // Summary table header
        " Mode Count",
        " ────────────────── ─────",
    ];
    for (const mode of ALL_MODES) {
        const count = report.summary[mode] ?? 0;
        // Modes that never occurred are left out of the table.
        if (!(count > 0)) {
            continue;
        }
        out.push(` ${modeIcon(mode)} ${mode.padEnd(18)} ${count}`);
    }
    out.push("");
    // Per-area breakdown, only when at least one area was resolved.
    const areaEntries = Object.entries(report.byArea);
    if (areaEntries.length > 0) {
        out.push(" Per-area top failure modes:");
        areaEntries.sort(([left], [right]) => left.localeCompare(right));
        for (const [area, data] of areaEntries) {
            out.push(` ${area}: ${modeIcon(data.topMode)} ${data.topMode} (${data.totalJudgments} judgments)`);
        }
        out.push("");
    }
    return out.join("\n");
}
|
|
155
|
+
/**
 * Render a failure mode report as markdown for PR comments.
 *
 * An empty report yields only the heading and "No judgments analyzed.".
 * Otherwise the output has a mode / count / percentage table for every mode
 * in `ALL_MODES` with a positive count, followed by a collapsible per-area
 * table (sorted by area name) when any area was tracked.
 *
 * @param report - Report produced by `buildFailureModeReport`
 * @returns The rendered markdown joined with "\n"
 */
export function formatFailureModesMarkdown(report) {
    const out = ["### 🔍 Failure Mode Analysis", ""];
    if (report.totalJudgments === 0) {
        out.push("No judgments analyzed.");
        return out.join("\n");
    }
    out.push(`**${report.totalJudgments} judgments** analyzed, **${report.classificationRate.toFixed(0)}%** classified`, "");
    // Summary table
    out.push("| Mode | Count | % |", "|------|-------|---|");
    for (const mode of ALL_MODES) {
        const count = report.summary[mode] ?? 0;
        if (!(count > 0)) {
            continue;
        }
        // Share of all judgments, rounded to a whole percent.
        let pct = "0";
        if (report.totalJudgments > 0) {
            pct = ((count / report.totalJudgments) * 100).toFixed(0);
        }
        out.push(`| ${modeIcon(mode)} ${mode} | ${count} | ${pct}% |`);
    }
    out.push("");
    // Per-area table, wrapped in <details> so it stays collapsed by default.
    const areaEntries = Object.entries(report.byArea);
    if (areaEntries.length > 0) {
        out.push("<details>", "<summary>Per-area breakdown</summary>", "", "| Area | Top Mode | Judgments |", "|------|----------|----------|");
        areaEntries.sort(([left], [right]) => left.localeCompare(right));
        for (const [area, data] of areaEntries) {
            out.push(`| ${area} | ${modeIcon(data.topMode)} ${data.topMode} | ${data.totalJudgments} |`);
        }
        out.push("", "</details>", "");
    }
    return out.join("\n");
}
|
|
199
|
+
// ---------------------------------------------------------------------------
|
|
200
|
+
// Internal helpers
|
|
201
|
+
// ---------------------------------------------------------------------------
|
|
202
|
+
/**
 * Derive a failure mode from the ceiling decomposition alone.
 *
 * Rules are checked in order and the first match wins:
 *  1. ceiling below floor (negative doc lift)     → outdated-docs (medium)
 *  2. ceiling above 70 while the score is below 40 → model-limitation (medium)
 *  3. floor above 50 and ceiling less than 10 points above it → outdated-docs (medium)
 *  4. ceiling below 40 and floor below 30         → missing-docs (low)
 *
 * @param score - The judgment's own score
 * @param ceilingScore - The area's ceiling score (with-docs best case)
 * @param floorScore - The area's floor score (no-docs baseline)
 * @returns A classification with source "ceiling", or null when no rule applies
 */
function classifyByCeiling(score, ceilingScore, floorScore) {
    const fromCeiling = (mode, confidence) => ({ confidence, mode, source: "ceiling" });
    // Docs make things worse than having no docs at all.
    const docsAreHarmful = ceilingScore - floorScore < 0;
    if (docsAreHarmful) {
        return fromCeiling("outdated-docs", "medium");
    }
    // Good docs are available, yet this attempt still scored poorly.
    const goodDocsPoorResult = ceilingScore > 70 && score < 40;
    if (goodDocsPoorResult) {
        return fromCeiling("model-limitation", "medium");
    }
    // The model already does well unaided and docs add almost nothing.
    const docsAddLittle = floorScore > 50 && ceilingScore < floorScore + 10;
    if (docsAddLittle) {
        return fromCeiling("outdated-docs", "medium");
    }
    // Neither the docs nor the model's own knowledge is sufficient.
    const nothingHelps = ceilingScore < 40 && floorScore < 30;
    if (nothingHelps) {
        return fromCeiling("missing-docs", "low");
    }
    return null;
}
|
|
227
|
+
/**
 * Derive a failure mode from the grader's reason text.
 *
 * Patterns are tried in a fixed order (outdated, missing, incorrect, poor
 * structure) and the first one that matches decides the result.
 *
 * @param reason - The judgment's reason text
 * @returns A classification with source "keyword", or null when nothing matches
 */
function classifyByKeyword(reason) {
    // [pattern, confidence, mode]: order is significant, first match wins.
    const rules = [
        [OUTDATED_PATTERN, "high", "outdated-docs"],
        [MISSING_PATTERN, "high", "missing-docs"],
        [INCORRECT_PATTERN, "medium", "incorrect-docs"],
        [POOR_STRUCTURE_PATTERN, "medium", "poor-structure"],
    ];
    for (const [pattern, confidence, mode] of rules) {
        if (pattern.test(reason)) {
            return { confidence, mode, source: "keyword" };
        }
    }
    return null;
}
|
|
243
|
+
/**
 * Merge the keyword and ceiling signals into one classification.
 *
 * - both present and agreeing on the mode → that mode, confidence "high",
 *   source "keyword+ceiling"
 * - keyword present (alone, or disagreeing with ceiling) → the keyword result
 * - only ceiling present → the ceiling result
 * - neither present → "unclassified" with low confidence
 *
 * @param keyword - Result of keyword classification, or null
 * @param ceiling - Result of ceiling classification, or null
 * @returns The combined classification
 */
function combineClassifications(keyword, ceiling) {
    if (!keyword) {
        // Without a keyword signal the ceiling result stands on its own.
        return ceiling
            ? ceiling
            : { confidence: "low", mode: "unclassified", source: "keyword" };
    }
    const signalsAgree = Boolean(ceiling) && keyword.mode === ceiling.mode;
    if (signalsAgree) {
        // Two independent signals pointing the same way: boost confidence.
        return {
            confidence: "high",
            mode: keyword.mode,
            source: "keyword+ceiling",
        };
    }
    // Keyword alone, or a disagreement where the keyword is preferred.
    return keyword;
}
|
|
271
|
+
/**
 * Resolve area name from a task ID or description.
 *
 * Task IDs in grader judgments use human-readable descriptions
 * (e.g., "GROQ - Blog queries with filtering and pagination (gold)")
 * while score areas use kebab-case slugs (e.g., "groq").
 *
 * Strategy:
 * 1. Use detectFeatureArea() which handles human-readable descriptions;
 *    its result is accepted only when it is not "other" and is a scored area
 * 2. Otherwise pick the LONGEST scored area that is a prefix of the task ID,
 *    so "groq-advanced-filtering" resolves to "groq-advanced" rather than
 *    "groq" regardless of the order the scores were supplied in
 * 3. Return undefined if no match is found
 *
 * @param taskId - Task ID or description from a grader judgment
 * @param scoreByArea - Map keyed by area slug
 * @returns The matching area slug, or undefined
 */
function resolveArea(taskId, scoreByArea) {
    // Strategy 1: Use the shared feature area detector (handles descriptions)
    const detected = detectFeatureArea(taskId);
    if (detected !== "other" && scoreByArea.has(detected))
        return detected;
    // Strategy 2: Most specific prefix match (kebab-case task IDs like
    // "groq-blog-queries"). Every hyphen-boundary prefix of the task ID is
    // also a plain prefix, so this covers progressive hyphen matching too.
    let bestMatch;
    for (const area of scoreByArea.keys()) {
        if (!taskId.startsWith(area))
            continue;
        if (bestMatch === undefined || area.length > bestMatch.length) {
            bestMatch = area;
        }
    }
    return bestMatch;
}
|
|
304
|
+
/**
 * Pick the most frequent classified failure mode.
 *
 * "unclassified" never competes; it is returned only when no other mode has
 * a positive count. On ties, the mode that appears first in `ALL_MODES` wins
 * because later modes must be strictly greater to replace it.
 *
 * @param modes - Mode → count mapping
 * @returns The winning mode name, or "unclassified"
 */
function findTopMode(modes) {
    let winner = "unclassified";
    let winnerCount = 0;
    for (const candidate of ALL_MODES) {
        // Only classified modes are eligible, and only with a strictly higher count.
        const eligible = candidate !== "unclassified" && (modes[candidate] ?? 0) > winnerCount;
        if (eligible) {
            winner = candidate;
            winnerCount = modes[candidate];
        }
    }
    // `winner` is still "unclassified" when nothing had a positive count.
    return winner;
}
|
|
321
|
+
/**
 * Create a fresh counter object with every failure mode set to zero.
 *
 * @returns A new mode → count object (never shared between callers)
 */
function initModeCounts() {
    const modeNames = [
        "incorrect-docs",
        "missing-docs",
        "model-limitation",
        "outdated-docs",
        "poor-structure",
        "unclassified",
    ];
    return Object.fromEntries(modeNames.map((name) => [name, 0]));
}
|
|
332
|
+
/** Icon shown next to each failure mode in console and markdown output. */
const MODE_ICON_BY_MODE = new Map([
    ["incorrect-docs", "❌"],
    ["missing-docs", "📭"],
    ["model-limitation", "🤖"],
    ["outdated-docs", "📅"],
    ["poor-structure", "🏗️"],
    ["unclassified", "❓"],
]);
/**
 * Look up the icon for a failure mode.
 *
 * @param mode - Failure mode name
 * @returns The icon string, or undefined for an unknown mode
 */
function modeIcon(mode) {
    return MODE_ICON_BY_MODE.get(mode);
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* fetch-url-content.ts
|
|
3
|
+
*
|
|
4
|
+
* Fetches documentation content from a URL. Tries the .md endpoint first
|
|
5
|
+
* (Sanity's agent-friendly format), then content-type negotiation.
|
|
6
|
+
* Does NOT attempt HTML-to-Markdown conversion — if the endpoint
|
|
7
|
+
* doesn't serve markdown, the fetch fails cleanly.
|
|
8
|
+
*
|
|
9
|
+
* Part of Phase 4: Modular doc fetching for baseline mode.
|
|
10
|
+
*/
|
|
11
|
+
/**
 * JSON-serializable summary of a single URL fetch.
 *
 * Shares `url`, `method`, `status` and `error` with `UrlFetchResult`; it has
 * a `contentLength` field and no `content` field.
 */
export interface UrlFetchMetadata {
    /** The URL that was fetched */
    url: string;
    /** How the content was obtained (same union as `UrlFetchResult.method`) */
    method: UrlFetchResult["method"];
    /** HTTP status code, when a response was received */
    status?: number;
    /** Size of the fetched content. NOTE(review): units (characters vs bytes) not visible here; confirm against the producer */
    contentLength?: number;
    /** Error message if the fetch failed */
    error?: string;
}
|
|
19
|
+
/** Outcome of fetching documentation content from a URL. */
export interface UrlFetchResult {
    /** The URL that was requested by the caller */
    url: string;
    /**
     * Which strategy produced the content: the `.md` endpoint, content-type
     * negotiation on the original URL, or "failed" when neither worked.
     */
    method: "md-endpoint" | "content-negotiation" | "failed";
    /** HTTP status code of the last response, when one was received */
    status?: number;
    /** The markdown content; undefined when the fetch failed */
    content?: string;
    /** Human-readable error message; set when the fetch failed */
    error?: string;
}
|
|
32
|
+
/**
 * Fetch documentation content from a URL as markdown.
 *
 * Strategy:
 * 1. Request the `.md` variant of the URL, asking for markdown / plain text
 * 2. Request the URL itself with the same `Accept` header (content negotiation)
 * 3. Report a failure if only HTML is available; no HTML conversion is attempted
 *
 * HTTP and network failures are reported through the result
 * (`method: "failed"` plus `error`).
 *
 * @param url - The documentation URL to fetch
 * @param headers - Optional custom headers to merge with defaults
 * @returns A `UrlFetchResult` with success or failure metadata
 */
export declare function fetchUrlContent(
    url: string,
    headers?: Record<string, string>,
): Promise<UrlFetchResult>;
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* fetch-url-content.ts
|
|
3
|
+
*
|
|
4
|
+
* Fetches documentation content from a URL. Tries the .md endpoint first
|
|
5
|
+
* (Sanity's agent-friendly format), then content-type negotiation.
|
|
6
|
+
* Does NOT attempt HTML-to-Markdown conversion — if the endpoint
|
|
7
|
+
* doesn't serve markdown, the fetch fails cleanly.
|
|
8
|
+
*
|
|
9
|
+
* Part of Phase 4: Modular doc fetching for baseline mode.
|
|
10
|
+
*/
|
|
11
|
+
/**
 * Heuristic check for an HTML document body.
 *
 * Matches a leading doctype or `<html>` tag, ignoring case and leading
 * whitespace, so `<!doctype html>` and doctype-less pages are recognised.
 */
function looksLikeHtml(text) {
    return /^\s*(?:<!doctype\b|<html[\s>])/i.test(text);
}
/**
 * Fetch documentation content from a URL.
 *
 * Strategy:
 * 1. Try `.md` endpoint with `Accept: text/markdown, text/plain`. The body is
 *    accepted when the content-type mentions markdown or the body does not
 *    look like an HTML document.
 * 2. Try content-type negotiation on the URL itself (ask for markdown); the
 *    body is accepted only for a markdown or text/plain content-type.
 * 3. Fail cleanly if only HTML is available (no conversion)
 *
 * Fetch failures never reject: a failed `.md` attempt falls through to
 * strategy 2, and a failed strategy 2 is reported as `method: "failed"`.
 *
 * @param url - The documentation URL to fetch (one trailing slash is ignored)
 * @param headers - Optional custom headers to merge with defaults; the
 *   `User-Agent` and `Accept` headers are always set by this function
 * @returns A `UrlFetchResult` with success or failure metadata
 */
export async function fetchUrlContent(url, headers) {
    const cleanUrl = url.replace(/\/$/, "");
    const mergedHeaders = {
        ...headers,
        "User-Agent": "SanityEvalBot/1.0",
    };
    const requestHeaders = { ...mergedHeaders, Accept: "text/markdown, text/plain" };
    // Strategy 1: Try .md endpoint
    const mdUrl = cleanUrl.endsWith(".md") ? cleanUrl : `${cleanUrl}.md`;
    try {
        const response = await fetch(mdUrl, { headers: requestHeaders });
        if (response.ok) {
            const contentType = response.headers.get("content-type") ?? "";
            const text = await response.text();
            // Accept if content-type says markdown OR the content doesn't look
            // like HTML. Sites often answer unknown paths with an HTML page and
            // a 200 status, so the body has to be sniffed case-insensitively.
            if (contentType.includes("markdown") || !looksLikeHtml(text)) {
                return {
                    content: text,
                    method: "md-endpoint",
                    status: response.status,
                    url,
                };
            }
        }
    }
    catch {
        // .md endpoint unavailable — try next strategy
    }
    // Strategy 2: Content-type negotiation (ask for markdown)
    try {
        const response = await fetch(cleanUrl, { headers: requestHeaders });
        if (response.ok) {
            const contentType = response.headers.get("content-type") ?? "";
            const text = await response.text();
            if (contentType.includes("markdown") ||
                contentType.includes("text/plain")) {
                return {
                    content: text,
                    method: "content-negotiation",
                    status: response.status,
                    url,
                };
            }
            // Got HTML or something else — don't try to convert it
            return {
                error: `Endpoint returned ${contentType} instead of markdown. ` +
                    "The URL does not appear to support markdown content negotiation.",
                method: "failed",
                status: response.status,
                url,
            };
        }
        return {
            error: `HTTP ${response.status}`,
            method: "failed",
            status: response.status,
            url,
        };
    }
    catch (err) {
        // Anything can be thrown; only Error instances carry a `.message`.
        const detail = err instanceof Error ? err.message : String(err);
        return {
            error: `Network error: ${detail}`,
            method: "failed",
            url,
        };
    }
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/gap-analysis.ts
|
|
3
|
+
*
|
|
4
|
+
* Expected impact estimation for identified gaps.
|
|
5
|
+
*
|
|
6
|
+
* Phase 3b of the Scenario Matrix implementation.
|
|
7
|
+
*
|
|
8
|
+
* Given failure modes and scores, estimates the score lift that fixing
|
|
9
|
+
* each gap would produce. Gaps are prioritized by estimated lift × task count
|
|
10
|
+
* to produce an actionable remediation plan.
|
|
11
|
+
*
|
|
12
|
+
* The estimation model is conservative: it assumes fixing a gap raises the
|
|
13
|
+
* bottleneck dimension to the median of non-bottlenecked dimensions (not 100).
|
|
14
|
+
* This produces realistic estimates rather than theoretical maximums.
|
|
15
|
+
*
|
|
16
|
+
* @see docs/exec-plans/completed/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
17
|
+
*/
|
|
18
|
+
import type { FailureModeReport, FeatureScore, GapAnalysisReport, GapEstimate } from "./types.js";
|
|
19
|
+
/**
 * Produce the full gap analysis report for an evaluation.
 *
 * @param failureModeReport - Classified failure modes from Phase 3a
 * @param scores - Per-area feature scores
 * @param weights - Optional dimension weights
 * @returns Gap analysis report with prioritized remediation plan
 */
export declare function buildGapAnalysisReport(
    failureModeReport: FailureModeReport,
    scores: FeatureScore[],
    weights?: Record<string, number>,
): GapAnalysisReport;
|
|
28
|
+
/**
 * Estimate how much score each identified gap would recover if fixed.
 *
 * For every area with failure modes, the potential lift is calculated with a
 * conservative model: the bottleneck dimension is assumed to rise to the
 * median of the non-bottlenecked dimensions, not to 100.
 *
 * @param failureModeReport - Classified failure modes from Phase 3a
 * @param scores - Per-area feature scores
 * @param weights - Dimension weights (defaults to rubrics.yaml weights)
 * @returns Gap estimates sorted by priority (highest first)
 */
export declare function estimateImpact(
    failureModeReport: FailureModeReport,
    scores: FeatureScore[],
    weights?: Record<string, number>,
): GapEstimate[];
|
|
41
|
+
/**
 * Render a gap analysis report as text for console output.
 *
 * @param report - The gap analysis report to render
 * @returns The formatted console text
 */
export declare function formatGapAnalysisConsole(
    report: GapAnalysisReport,
): string;
|
|
45
|
+
/**
 * Render a gap analysis report as markdown for PR comments.
 *
 * @param report - The gap analysis report to render
 * @returns The formatted markdown text
 */
export declare function formatGapAnalysisMarkdown(
    report: GapAnalysisReport,
): string;
|