@sanity/ailf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -0
- package/bin/ailf.js +64 -0
- package/canonical/grader-references/README.md +88 -0
- package/canonical/grader-references/groq.yaml +234 -0
- package/canonical/grader-references/studio-setup.yaml +275 -0
- package/canonical/reference-solutions/.gitkeep +1 -0
- package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
- package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
- package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
- package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
- package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
- package/canonical/reference-solutions/groq/joins-references.ts +300 -0
- package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
- package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
- package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
- package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
- package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
- package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
- package/config/bigquery/README.md +74 -0
- package/config/bigquery/views/area_scores.sql +87 -0
- package/config/bigquery/views/reports.sql +49 -0
- package/config/features.yaml +116 -0
- package/config/models.yaml +115 -0
- package/config/prompts.yaml +75 -0
- package/config/rubrics.yaml +62 -0
- package/config/schedules.yaml +43 -0
- package/config/sinks.yaml +54 -0
- package/config/sources.yaml +51 -0
- package/config/thresholds.yaml +49 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
- package/dist/_vendor/ailf-core/examples/index.js +285 -0
- package/dist/_vendor/ailf-core/index.d.ts +17 -0
- package/dist/_vendor/ailf-core/index.js +17 -0
- package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
- package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
- package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
- package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
- package/dist/_vendor/ailf-core/ports/context.js +14 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
- package/dist/_vendor/ailf-core/ports/index.js +7 -0
- package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
- package/dist/_vendor/ailf-core/ports/logger.js +11 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
- package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/schemas/index.js +16 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
- package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
- package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
- package/dist/_vendor/ailf-core/services/index.js +12 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/scoring.js +222 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
- package/dist/_vendor/ailf-core/types/index.js +21 -0
- package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
- package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
- package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
- package/dist/_vendor/ailf-shared/document-ref.js +1 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -0
- package/dist/_vendor/ailf-shared/index.js +16 -0
- package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
- package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
- package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
- package/dist/_vendor/ailf-shared/score-grades.js +23 -0
- package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
- package/dist/adapters/cache/content-lake-cache.js +59 -0
- package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
- package/dist/adapters/cache/filesystem-cache.js +54 -0
- package/dist/adapters/cache/index.d.ts +2 -0
- package/dist/adapters/cache/index.js +2 -0
- package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
- package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
- package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
- package/dist/adapters/config-sources/file-config-adapter.js +96 -0
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +2 -0
- package/dist/adapters/doc-fetchers/index.d.ts +1 -0
- package/dist/adapters/doc-fetchers/index.js +1 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
- package/dist/adapters/eval-runners/index.d.ts +1 -0
- package/dist/adapters/eval-runners/index.js +1 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
- package/dist/adapters/index.d.ts +12 -0
- package/dist/adapters/index.js +12 -0
- package/dist/adapters/loggers/console-logger.d.ts +22 -0
- package/dist/adapters/loggers/console-logger.js +54 -0
- package/dist/adapters/loggers/index.d.ts +9 -0
- package/dist/adapters/loggers/index.js +9 -0
- package/dist/adapters/loggers/json-logger.d.ts +18 -0
- package/dist/adapters/loggers/json-logger.js +33 -0
- package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
- package/dist/adapters/loggers/quiet-logger.js +30 -0
- package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/composite-task-source.js +59 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
- package/dist/adapters/task-sources/index.d.ts +7 -0
- package/dist/adapters/task-sources/index.js +7 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
- package/dist/adapters/task-sources/repo-schemas.js +234 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
- package/dist/adapters/task-sources/repo-task-source.js +104 -0
- package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
- package/dist/adapters/task-sources/repo-trigger.js +153 -0
- package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
- package/dist/adapters/task-sources/repo-validation.js +164 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
- package/dist/adapters/task-sources/yaml-task-source.js +136 -0
- package/dist/agent-observer/agentic-provider.d.ts +132 -0
- package/dist/agent-observer/agentic-provider.js +983 -0
- package/dist/agent-observer/classifier.d.ts +62 -0
- package/dist/agent-observer/classifier.js +269 -0
- package/dist/agent-observer/index.d.ts +7 -0
- package/dist/agent-observer/index.js +4 -0
- package/dist/agent-observer/pricing.d.ts +35 -0
- package/dist/agent-observer/pricing.js +82 -0
- package/dist/agent-observer/provider.d.ts +77 -0
- package/dist/agent-observer/provider.js +151 -0
- package/dist/agent-observer/proxy.d.ts +91 -0
- package/dist/agent-observer/proxy.js +321 -0
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/agent-observer/types.d.ts +137 -0
- package/dist/agent-observer/types.js +16 -0
- package/dist/assertions/source-isolation.d.ts +72 -0
- package/dist/assertions/source-isolation.js +117 -0
- package/dist/cli.d.ts +24 -0
- package/dist/cli.js +199 -0
- package/dist/commands/agent-report.d.ts +5 -0
- package/dist/commands/agent-report.js +69 -0
- package/dist/commands/baseline.d.ts +9 -0
- package/dist/commands/baseline.js +141 -0
- package/dist/commands/cache.d.ts +13 -0
- package/dist/commands/cache.js +135 -0
- package/dist/commands/calculate-scores.d.ts +8 -0
- package/dist/commands/calculate-scores.js +48 -0
- package/dist/commands/compare.d.ts +8 -0
- package/dist/commands/compare.js +120 -0
- package/dist/commands/completion.d.ts +18 -0
- package/dist/commands/completion.js +260 -0
- package/dist/commands/coverage-audit.d.ts +7 -0
- package/dist/commands/coverage-audit.js +40 -0
- package/dist/commands/discovery-report.d.ts +10 -0
- package/dist/commands/discovery-report.js +44 -0
- package/dist/commands/eval.d.ts +9 -0
- package/dist/commands/eval.js +35 -0
- package/dist/commands/explain-handler.d.ts +34 -0
- package/dist/commands/explain-handler.js +719 -0
- package/dist/commands/fetch-docs.d.ts +8 -0
- package/dist/commands/fetch-docs.js +128 -0
- package/dist/commands/generate-configs.d.ts +8 -0
- package/dist/commands/generate-configs.js +46 -0
- package/dist/commands/grader/index.d.ts +11 -0
- package/dist/commands/grader/index.js +118 -0
- package/dist/commands/init.d.ts +19 -0
- package/dist/commands/init.js +150 -0
- package/dist/commands/interactive.d.ts +12 -0
- package/dist/commands/interactive.js +238 -0
- package/dist/commands/lookup-doc.d.ts +15 -0
- package/dist/commands/lookup-doc.js +84 -0
- package/dist/commands/measure-retrieval.d.ts +5 -0
- package/dist/commands/measure-retrieval.js +65 -0
- package/dist/commands/pipeline-action.d.ts +71 -0
- package/dist/commands/pipeline-action.js +305 -0
- package/dist/commands/pipeline.d.ts +62 -0
- package/dist/commands/pipeline.js +53 -0
- package/dist/commands/pr-comment.d.ts +8 -0
- package/dist/commands/pr-comment.js +47 -0
- package/dist/commands/publish.d.ts +26 -0
- package/dist/commands/publish.js +253 -0
- package/dist/commands/readiness-report.d.ts +10 -0
- package/dist/commands/readiness-report.js +104 -0
- package/dist/commands/shared/options.d.ts +29 -0
- package/dist/commands/shared/options.js +57 -0
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +16 -0
- package/dist/commands/validate-tasks.js +93 -0
- package/dist/commands/validate.d.ts +9 -0
- package/dist/commands/validate.js +73 -0
- package/dist/commands/webhook-server.d.ts +5 -0
- package/dist/commands/webhook-server.js +30 -0
- package/dist/commands/weekly-digest.d.ts +10 -0
- package/dist/commands/weekly-digest.js +104 -0
- package/dist/composition-root.d.ts +26 -0
- package/dist/composition-root.js +107 -0
- package/dist/interpolate.d.ts +26 -0
- package/dist/interpolate.js +70 -0
- package/dist/job-store.d.ts +104 -0
- package/dist/job-store.js +188 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.d.ts +27 -0
- package/dist/orchestration/build-app-context.js +81 -0
- package/dist/orchestration/build-step-sequence.d.ts +15 -0
- package/dist/orchestration/build-step-sequence.js +84 -0
- package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
- package/dist/orchestration/config-to-source-overrides.js +28 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/index.d.ts +11 -0
- package/dist/orchestration/index.js +11 -0
- package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
- package/dist/orchestration/pipeline-orchestrator.js +153 -0
- package/dist/orchestration/step-runner.d.ts +20 -0
- package/dist/orchestration/step-runner.js +88 -0
- package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
- package/dist/orchestration/steps/calculate-scores-step.js +95 -0
- package/dist/orchestration/steps/callback-step.d.ts +24 -0
- package/dist/orchestration/steps/callback-step.js +76 -0
- package/dist/orchestration/steps/compare-step.d.ts +14 -0
- package/dist/orchestration/steps/compare-step.js +92 -0
- package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
- package/dist/orchestration/steps/discovery-report-step.js +55 -0
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
- package/dist/orchestration/steps/fetch-docs-step.js +135 -0
- package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
- package/dist/orchestration/steps/gap-analysis-step.js +136 -0
- package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
- package/dist/orchestration/steps/generate-configs-step.js +85 -0
- package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
- package/dist/orchestration/steps/grader-consistency-step.js +64 -0
- package/dist/orchestration/steps/index.d.ts +19 -0
- package/dist/orchestration/steps/index.js +19 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
- package/dist/orchestration/steps/publish-report-step.js +216 -0
- package/dist/orchestration/steps/readiness-step.d.ts +13 -0
- package/dist/orchestration/steps/readiness-step.js +91 -0
- package/dist/orchestration/steps/report-step.d.ts +12 -0
- package/dist/orchestration/steps/report-step.js +49 -0
- package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
- package/dist/orchestration/steps/run-eval-step.js +195 -0
- package/dist/orchestration/steps/validate-step.d.ts +12 -0
- package/dist/orchestration/steps/validate-step.js +41 -0
- package/dist/pipeline/agent-behavior-report.d.ts +53 -0
- package/dist/pipeline/agent-behavior-report.js +132 -0
- package/dist/pipeline/attribution.d.ts +47 -0
- package/dist/pipeline/attribution.js +226 -0
- package/dist/pipeline/baseline.d.ts +37 -0
- package/dist/pipeline/baseline.js +141 -0
- package/dist/pipeline/cache.d.ts +101 -0
- package/dist/pipeline/cache.js +283 -0
- package/dist/pipeline/calculate-scores.d.ts +102 -0
- package/dist/pipeline/calculate-scores.js +1128 -0
- package/dist/pipeline/callback-delivery.d.ts +50 -0
- package/dist/pipeline/callback-delivery.js +89 -0
- package/dist/pipeline/checks.d.ts +39 -0
- package/dist/pipeline/checks.js +280 -0
- package/dist/pipeline/classify-url.d.ts +61 -0
- package/dist/pipeline/classify-url.js +93 -0
- package/dist/pipeline/compare.d.ts +31 -0
- package/dist/pipeline/compare.js +208 -0
- package/dist/pipeline/coverage-audit.d.ts +39 -0
- package/dist/pipeline/coverage-audit.js +165 -0
- package/dist/pipeline/degradations.d.ts +85 -0
- package/dist/pipeline/degradations.js +242 -0
- package/dist/pipeline/discovery-report.d.ts +55 -0
- package/dist/pipeline/discovery-report.js +178 -0
- package/dist/pipeline/eval-constants.d.ts +68 -0
- package/dist/pipeline/eval-constants.js +111 -0
- package/dist/pipeline/eval-fingerprint.d.ts +66 -0
- package/dist/pipeline/eval-fingerprint.js +175 -0
- package/dist/pipeline/expand-tasks.d.ts +220 -0
- package/dist/pipeline/expand-tasks.js +421 -0
- package/dist/pipeline/failure-modes.d.ts +46 -0
- package/dist/pipeline/failure-modes.js +348 -0
- package/dist/pipeline/fetch-url-content.d.ts +44 -0
- package/dist/pipeline/fetch-url-content.js +93 -0
- package/dist/pipeline/gap-analysis.d.ts +48 -0
- package/dist/pipeline/gap-analysis.js +231 -0
- package/dist/pipeline/generate-configs.d.ts +72 -0
- package/dist/pipeline/generate-configs.js +395 -0
- package/dist/pipeline/grader-api.d.ts +49 -0
- package/dist/pipeline/grader-api.js +200 -0
- package/dist/pipeline/grader-compare-runner.d.ts +44 -0
- package/dist/pipeline/grader-compare-runner.js +301 -0
- package/dist/pipeline/grader-comparison.d.ts +111 -0
- package/dist/pipeline/grader-comparison.js +161 -0
- package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
- package/dist/pipeline/grader-consistency-runner.js +270 -0
- package/dist/pipeline/grader-consistency.d.ts +103 -0
- package/dist/pipeline/grader-consistency.js +146 -0
- package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
- package/dist/pipeline/grader-sensitivity-runner.js +282 -0
- package/dist/pipeline/grader-sensitivity.d.ts +94 -0
- package/dist/pipeline/grader-sensitivity.js +144 -0
- package/dist/pipeline/grader-validate-runner.d.ts +38 -0
- package/dist/pipeline/grader-validate-runner.js +229 -0
- package/dist/pipeline/grader-validation.d.ts +107 -0
- package/dist/pipeline/grader-validation.js +169 -0
- package/dist/pipeline/map-request-to-config.d.ts +19 -0
- package/dist/pipeline/map-request-to-config.js +80 -0
- package/dist/pipeline/measure-retrieval.d.ts +59 -0
- package/dist/pipeline/measure-retrieval.js +111 -0
- package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
- package/dist/pipeline/mirror-repo-tasks.js +350 -0
- package/dist/pipeline/plan-format.d.ts +33 -0
- package/dist/pipeline/plan-format.js +202 -0
- package/dist/pipeline/plan.d.ts +169 -0
- package/dist/pipeline/plan.js +708 -0
- package/dist/pipeline/pr-comment.d.ts +19 -0
- package/dist/pipeline/pr-comment.js +502 -0
- package/dist/pipeline/probe.d.ts +52 -0
- package/dist/pipeline/probe.js +390 -0
- package/dist/pipeline/provenance.d.ts +47 -0
- package/dist/pipeline/provenance.js +146 -0
- package/dist/pipeline/readiness-report.d.ts +87 -0
- package/dist/pipeline/readiness-report.js +205 -0
- package/dist/pipeline/release-classification.d.ts +54 -0
- package/dist/pipeline/release-classification.js +238 -0
- package/dist/pipeline/release-report.d.ts +37 -0
- package/dist/pipeline/release-report.js +222 -0
- package/dist/pipeline/repo-eval-comment.d.ts +37 -0
- package/dist/pipeline/repo-eval-comment.js +165 -0
- package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
- package/dist/pipeline/repo-threshold-evaluator.js +162 -0
- package/dist/pipeline/resolve-mappings.d.ts +35 -0
- package/dist/pipeline/resolve-mappings.js +72 -0
- package/dist/pipeline/retrieval-metrics.d.ts +39 -0
- package/dist/pipeline/retrieval-metrics.js +136 -0
- package/dist/pipeline/reverse-mapping.d.ts +67 -0
- package/dist/pipeline/reverse-mapping.js +88 -0
- package/dist/pipeline/schemas.d.ts +9 -0
- package/dist/pipeline/schemas.js +9 -0
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +88 -0
- package/dist/pipeline/targeted-loo.js +203 -0
- package/dist/pipeline/thresholds.d.ts +27 -0
- package/dist/pipeline/thresholds.js +245 -0
- package/dist/pipeline/types.d.ts +10 -0
- package/dist/pipeline/types.js +10 -0
- package/dist/pipeline/validate.d.ts +67 -0
- package/dist/pipeline/validate.js +406 -0
- package/dist/pipeline/webhook-server.d.ts +37 -0
- package/dist/pipeline/webhook-server.js +133 -0
- package/dist/report-store.d.ts +84 -0
- package/dist/report-store.js +208 -0
- package/dist/sanity/client.d.ts +38 -0
- package/dist/sanity/client.js +86 -0
- package/dist/sanity/portable-text.d.ts +11 -0
- package/dist/sanity/portable-text.js +211 -0
- package/dist/sanity/queries.d.ts +133 -0
- package/dist/sanity/queries.js +300 -0
- package/dist/schedules/digest.d.ts +116 -0
- package/dist/schedules/digest.js +156 -0
- package/dist/schedules/index.d.ts +12 -0
- package/dist/schedules/index.js +10 -0
- package/dist/schedules/loader.d.ts +31 -0
- package/dist/schedules/loader.js +73 -0
- package/dist/schedules/schema.d.ts +9 -0
- package/dist/schedules/schema.js +9 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +21 -0
- package/dist/scripts/validate-task-sources.js +210 -0
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/bigquery/index.d.ts +131 -0
- package/dist/sinks/bigquery/index.js +222 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/index.d.ts +23 -0
- package/dist/sinks/index.js +18 -0
- package/dist/sinks/loader.d.ts +18 -0
- package/dist/sinks/loader.js +82 -0
- package/dist/sinks/retry.d.ts +24 -0
- package/dist/sinks/retry.js +52 -0
- package/dist/sinks/schema.d.ts +9 -0
- package/dist/sinks/schema.js +9 -0
- package/dist/sinks/slack/format.d.ts +65 -0
- package/dist/sinks/slack/format.js +327 -0
- package/dist/sinks/slack/index.d.ts +27 -0
- package/dist/sinks/slack/index.js +78 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +59 -0
- package/dist/sinks/types.js +44 -0
- package/dist/sinks/webhook/index.d.ts +19 -0
- package/dist/sinks/webhook/index.js +50 -0
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/sources.d.ts +104 -0
- package/dist/sources.js +292 -0
- package/dist/webhook/budget.d.ts +42 -0
- package/dist/webhook/budget.js +60 -0
- package/dist/webhook/debounce.d.ts +67 -0
- package/dist/webhook/debounce.js +76 -0
- package/dist/webhook/dispatch.d.ts +45 -0
- package/dist/webhook/dispatch.js +84 -0
- package/dist/webhook/eval-request-handler.d.ts +87 -0
- package/dist/webhook/eval-request-handler.js +181 -0
- package/dist/webhook/handler.d.ts +88 -0
- package/dist/webhook/handler.js +203 -0
- package/dist/webhook/index.d.ts +17 -0
- package/dist/webhook/index.js +12 -0
- package/dist/webhook/types.d.ts +109 -0
- package/dist/webhook/types.js +10 -0
- package/package.json +72 -0
- package/tasks/.expanded.agentic.yaml +51 -0
- package/tasks/.expanded.yaml +66 -0
- package/tasks/frameworks.yaml +98 -0
- package/tasks/functions.yaml +51 -0
- package/tasks/groq.yaml +216 -0
- package/tasks/nextjs-live.yaml +62 -0
- package/tasks/studio-setup.yaml +111 -0
- package/tasks/visual-editing.yaml +120 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/agent-behavior-report.ts
|
|
3
|
+
*
|
|
4
|
+
* Pure analysis functions for agent behavior observation reports.
|
|
5
|
+
* No I/O, no process.env, no process.argv — all data is passed in.
|
|
6
|
+
*/
|
|
7
|
+
import type { AgentBehaviorSummary } from "../agent-observer/types.js";
|
|
8
|
+
/**
 * Flat results shape: the individual test results sit directly under
 * `results`.
 */
export interface PromptfooResults {
    results: Array<TestResult>;
}
|
|
11
|
+
/**
 * Nested results shape: the individual test results sit one level deeper,
 * under `results.results`.
 */
export interface PromptfooResultsEnvelope {
    results: { results: Array<TestResult> };
}
|
|
16
|
+
/**
 * One test result as consumed by `analyzeResults`.
 */
export interface TestResult {
    /** Task description; `analyzeResults` passes it to `detectFeatureArea`. */
    description: string;
    /**
     * Optional free-form metadata. `analyzeResults` skips any result whose
     * metadata has no `agentBehaviorSummary` entry.
     */
    metadata?: Record<string, unknown>;
    /** Response payload carrying the output text. */
    response: { output: string };
    /** Test variables; `analyzeResults` inspects `vars.docs` to set `hasDocs`. */
    vars: Record<string, string>;
}
|
|
24
|
+
/**
 * Per-task record built by `analyzeResults` for every test result that
 * carries an agent behavior summary.
 */
export interface TaskBehavior {
    /** The test result's description, copied through unchanged. */
    description: string;
    /** Feature area returned by `detectFeatureArea(description)`. */
    feature: string;
    /** True when the result's `vars.docs` is present and not blank. */
    hasDocs: boolean;
    /** The `agentBehaviorSummary` value taken from the result's metadata. */
    behavior: AgentBehaviorSummary;
}
|
|
30
|
+
/**
 * Aggregated view of all tasks that share one feature area.
 *
 * NOTE(review): the code that fills these fields is not visible from this
 * declaration; the grouping comments below are inferred from the field names
 * and should be confirmed against the implementation.
 */
export interface FeatureAnalysis {
    /** Feature area name shared by every entry in `tasks`. */
    feature: string;
    /** The task records grouped under this feature. */
    tasks: Array<TaskBehavior>;

    // Collected values — presumably gathered across `tasks`; TODO confirm.
    allDocSlugs: Array<string>;
    allExternalDomains: Array<string>;
    allSearchQueries: Array<string>;

    // Averages — presumably per task within this feature; TODO confirm units.
    avgDocPages: number;
    avgNetworkMs: number;
    avgSearches: number;

    // Canonical doc comparison — presumably relative to CANONICAL_DOC_MAP;
    // TODO confirm whether coverage is a fraction or a percentage.
    canonicalCoverage: number;
    canonicalSlugs: Array<string>;
}
|
|
42
|
+
/**
 * Return value of `analyzeResults`.
 */
export interface AnalysisResult {
    /**
     * False when no test result carried an agent behavior summary; in that
     * case `features` and `tasks` are both empty.
     */
    hasData: boolean;
    /** One entry per feature area found among the tasks. */
    features: Array<FeatureAnalysis>;
    /** Every task record that was analyzed. */
    tasks: Array<TaskBehavior>;
}
|
|
47
|
+
/** Expected documentation slugs, keyed by feature area name. */
export declare const CANONICAL_DOC_MAP: Record<string, string[]>;
|
|
48
|
+
/**
 * Classify a task description into a feature area using case-insensitive
 * keyword matching. Returns "other" when no keyword matches.
 */
export declare function detectFeatureArea(description: string): string;
|
|
49
|
+
/**
 * Analyze pre-parsed test results for agent behavior patterns.
 * Caller is responsible for reading/parsing the file — this function is pure.
 *
 * Results whose metadata has no `agentBehaviorSummary` are skipped. When no
 * result remains, the return value is
 * `{ features: [], hasData: false, tasks: [] }`.
 */
export declare function analyzeResults(results: TestResult[]): AnalysisResult;
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// Canonical doc mapping
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
// Canonical doc mapping: feature area -> expected doc slugs
|
|
5
|
+
// This maps what docs a well-informed agent *should* visit for each task
|
|
6
|
+
// Expected documentation slugs for each feature area. The keys are the
// feature-area names that detectFeatureArea() can return (except "other",
// which has no entry here).
export const CANONICAL_DOC_MAP = {
    "frameworks": ["remix", "nuxt", "svelte", "astro", "gatsby", "client-libraries"],
    "functions": ["functions", "webhooks", "groq-powered-webhooks", "event-driven", "automations"],
    "nextjs-live": ["next-js", "live-content-api", "content-source-maps", "app-router", "groq", "client-libraries"],
    "studio-setup": ["studio", "schema-types", "structure-builder", "configuration", "plugins"],
    "visual-editing": ["visual-editing", "presentation", "preview", "overlays", "loaders"],
};
|
|
45
|
+
// ---------------------------------------------------------------------------
|
|
46
|
+
// Feature area detection
|
|
47
|
+
// ---------------------------------------------------------------------------
|
|
48
|
+
/**
 * Classify a task description into a feature area.
 *
 * Matching is a case-insensitive substring search. Rules are tried from top
 * to bottom and the first rule with a matching keyword wins, so the order of
 * the table is significant (e.g. a description mentioning both "studio" and
 * "next" is classified as "studio-setup").
 *
 * @param {string} description - Free-text task description.
 * @returns {string} The feature area name, or "other" when nothing matches.
 */
export function detectFeatureArea(description) {
    const text = description.toLowerCase();
    // Ordered [area, keywords] pairs — earlier entries take precedence.
    const rules = [
        ["studio-setup", ["studio"]],
        ["visual-editing", ["visual", "presentation", "live preview"]],
        ["functions", ["function", "webhook"]],
        ["nextjs-live", ["next", "app router"]],
        ["frameworks", ["remix", "nuxt", "svelte"]],
    ];
    for (const [area, keywords] of rules) {
        if (keywords.some((keyword) => text.includes(keyword))) {
            return area;
        }
    }
    return "other";
}
|
|
66
|
+
// ---------------------------------------------------------------------------
|
|
67
|
+
// Analysis
|
|
68
|
+
// ---------------------------------------------------------------------------
|
|
69
|
+
/**
 * Analyze pre-parsed test results for agent behavior patterns.
 * Caller is responsible for reading/parsing the file — this function is pure.
 *
 * Only results carrying `metadata.agentBehaviorSummary` are considered. Each
 * one is assigned a feature area via `detectFeatureArea`, and per-area
 * aggregates are computed: de-duplicated doc slugs, search queries and
 * external domains, simple averages, and the share of the area's canonical
 * slugs that appear as a substring of at least one visited slug.
 *
 * @param results - Parsed test results.
 * @returns `{ features, hasData, tasks }`. `features` is sorted by area
 *   name; `hasData` is false (with empty arrays) when no result carried a
 *   behavior summary.
 */
export function analyzeResults(results) {
    const tasks = [];
    for (const result of results) {
        const behavior = result.metadata?.agentBehaviorSummary;
        if (!behavior) {
            continue;
        }
        // `vars` may be absent and `docs` may hold a non-string value; only a
        // non-blank string counts as "docs were provided".
        const docs = result.vars?.docs;
        tasks.push({
            behavior,
            description: result.description,
            // A result without a description falls into the "other" bucket.
            feature: detectFeatureArea(result.description ?? ""),
            hasDocs: typeof docs === "string" && docs.trim().length > 0,
        });
    }
    if (tasks.length === 0) {
        return { features: [], hasData: false, tasks: [] };
    }
    // Group by feature. A Map keeps arbitrary area names away from
    // Object.prototype keys.
    const byFeature = new Map();
    for (const task of tasks) {
        const group = byFeature.get(task.feature);
        if (group) {
            group.push(task);
        }
        else {
            byFeature.set(task.feature, [task]);
        }
    }
    // De-duplicated union of one list-valued behavior field across a group.
    const uniqueValues = (group, field) => [
        ...new Set(group.flatMap((t) => t.behavior[field])),
    ];
    // Arithmetic mean of one numeric behavior field; groups are never empty.
    const mean = (group, field) => group.reduce((sum, t) => sum + t.behavior[field], 0) / group.length;
    const features = [...byFeature]
        .map(([feature, featureTasks]) => {
        const allDocSlugs = uniqueValues(featureTasks, "docSlugsVisited");
        const canonicalSlugs = CANONICAL_DOC_MAP[feature] ?? [];
        // A canonical slug counts as covered when any visited slug contains it.
        const matchedCanonical = canonicalSlugs.filter((slug) => allDocSlugs.some((visited) => visited.includes(slug)));
        return {
            allDocSlugs,
            allExternalDomains: uniqueValues(featureTasks, "externalDomains"),
            allSearchQueries: uniqueValues(featureTasks, "uniqueSearchQueries"),
            avgDocPages: mean(featureTasks, "docPagesVisited"),
            avgNetworkMs: mean(featureTasks, "totalNetworkMs"),
            avgSearches: mean(featureTasks, "searchesPerformed"),
            // Areas without a canonical list (e.g. "other") report 0 coverage.
            canonicalCoverage: canonicalSlugs.length > 0
                ? matchedCanonical.length / canonicalSlugs.length
                : 0,
            canonicalSlugs,
            feature,
            tasks: featureTasks,
        };
    })
        .sort((a, b) => a.feature.localeCompare(b.feature));
    return { features, hasData: true, tasks };
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/attribution.ts
|
|
3
|
+
*
|
|
4
|
+
* Per-document attribution for score changes.
|
|
5
|
+
*
|
|
6
|
+
* Given a ComparisonReport, a list of changed document slugs, and the
|
|
7
|
+
* resolved task mappings, this module classifies each task's score delta
|
|
8
|
+
* as unambiguous (one changed doc), ambiguous (multiple changed docs),
|
|
9
|
+
* or uncorrelated (no changed docs in the task's canonical set).
|
|
10
|
+
*
|
|
11
|
+
* This is Phase 1 of the hybrid attribution approach — zero additional
|
|
12
|
+
* eval cost, purely correlation-based. Phase 2 (targeted LOO) can be
|
|
13
|
+
* layered on top for ambiguous cases when higher precision is needed.
|
|
14
|
+
*
|
|
15
|
+
* @see docs/design-docs/scenario-matrix/per-document-attribution.md
|
|
16
|
+
* @see docs/exec-plans/completed/scenario-matrix-implementation/phase-2-impact-scenarios.md
|
|
17
|
+
*/
|
|
18
|
+
import type { AttributionReport, ComparisonReport } from "./types.js";
|
|
19
|
+
import type { ResolvedMappings } from "./resolve-mappings.js";
|
|
20
|
+
/**
|
|
21
|
+
* Attribute score changes to individual documents.
|
|
22
|
+
*
|
|
23
|
+
* For each area in the comparison report, cross-references the task's
|
|
24
|
+
* canonical_docs with the changed slugs to classify the attribution:
|
|
25
|
+
*
|
|
26
|
+
* - **unambiguous**: exactly 1 changed doc in the task's canonical set
|
|
27
|
+
* - **ambiguous**: 2+ changed docs in the task's canonical set
|
|
28
|
+
* - **uncorrelated**: 0 changed docs in the task's canonical set
|
|
29
|
+
*
|
|
30
|
+
* Tasks with deltas within the noise floor are still attributed but
|
|
31
|
+
* flagged as `withinNoiseFloor: true`.
|
|
32
|
+
*
|
|
33
|
+
* @param comparison - The structured comparison between before/after runs
|
|
34
|
+
* @param changedSlugs - Document slugs that changed
|
|
35
|
+
* @param mappings - Resolved task-to-canonical-docs mappings
|
|
36
|
+
* @param noiseThreshold - Deltas within ±this value are marked as noise
|
|
37
|
+
* @returns Attribution report with per-task classifications
|
|
38
|
+
*/
|
|
39
|
+
export declare function attributeChanges(comparison: ComparisonReport, changedSlugs: string[], mappings: ResolvedMappings, noiseThreshold: number): AttributionReport;
|
|
40
|
+
/**
|
|
41
|
+
* Format an attribution report for console output.
|
|
42
|
+
*/
|
|
43
|
+
export declare function formatAttributionConsole(report: AttributionReport): string;
|
|
44
|
+
/**
|
|
45
|
+
* Format an attribution report as markdown for PR comments.
|
|
46
|
+
*/
|
|
47
|
+
export declare function formatAttributionMarkdown(report: AttributionReport): string;
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/attribution.ts
|
|
3
|
+
*
|
|
4
|
+
* Per-document attribution for score changes.
|
|
5
|
+
*
|
|
6
|
+
* Given a ComparisonReport, a list of changed document slugs, and the
|
|
7
|
+
* resolved task mappings, this module classifies each task's score delta
|
|
8
|
+
* as unambiguous (one changed doc), ambiguous (multiple changed docs),
|
|
9
|
+
* or uncorrelated (no changed docs in the task's canonical set).
|
|
10
|
+
*
|
|
11
|
+
* This is Phase 1 of the hybrid attribution approach — zero additional
|
|
12
|
+
* eval cost, purely correlation-based. Phase 2 (targeted LOO) can be
|
|
13
|
+
* layered on top for ambiguous cases when higher precision is needed.
|
|
14
|
+
*
|
|
15
|
+
* @see docs/design-docs/scenario-matrix/per-document-attribution.md
|
|
16
|
+
* @see docs/exec-plans/completed/scenario-matrix-implementation/phase-2-impact-scenarios.md
|
|
17
|
+
*/
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
// Public API
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
/**
 * Attribute score changes to individual documents.
 *
 * For each area in the comparison report, cross-references the task's
 * canonical_docs with the changed slugs to classify the attribution:
 *
 * - **unambiguous**: exactly 1 changed doc in the task's canonical set
 * - **ambiguous**: 2+ changed docs in the task's canonical set
 * - **uncorrelated**: 0 changed docs in the task's canonical set
 *
 * Tasks with deltas within the noise floor are still attributed but
 * flagged as `withinNoiseFloor: true`. Every task in an area receives that
 * area's delta. Areas present in the comparison but without tasks in the
 * mappings produce no attributions.
 *
 * Slugs are treated as sets: a slug listed twice in a task's canonical_docs
 * or twice in `changedSlugs` is counted once, so a single changed document
 * is never classified as "ambiguous" and never repeated in `untrackedDocs`.
 *
 * @param comparison - The structured comparison between before/after runs
 * @param changedSlugs - Document slugs that changed
 * @param mappings - Resolved task-to-canonical-docs mappings
 * @param noiseThreshold - Deltas within ±this value are marked as noise
 * @returns Attribution report with per-task classifications, summary counts,
 *   and the sorted list of changed slugs no task tracks
 */
export function attributeChanges(comparison, changedSlugs, mappings, noiseThreshold) {
    const changedSet = new Set(changedSlugs);
    // Index every task by id. If an id occurs more than once, the last
    // occurrence wins.
    const taskCanonicalDocs = new Map();
    for (const [area, config] of Object.entries(mappings.feature_areas)) {
        for (const task of config.tasks) {
            taskCanonicalDocs.set(task.id, {
                area,
                // De-duplicate so a repeated slug cannot inflate the match count.
                slugs: [...new Set(task.canonical_docs.map((d) => d.slug))],
            });
        }
    }
    // Bucket tasks by area once instead of rescanning all tasks per area.
    const tasksByArea = new Map();
    for (const entry of taskCanonicalDocs) {
        const area = entry[1].area;
        const bucket = tasksByArea.get(area);
        if (bucket) {
            bucket.push(entry);
        }
        else {
            tasksByArea.set(area, [entry]);
        }
    }
    // For each area in the comparison, attribute the delta to its tasks.
    const attributions = [];
    for (const areaDelta of comparison.areas) {
        const areaTasks = tasksByArea.get(areaDelta.area) || [];
        const withinNoiseFloor = Math.abs(areaDelta.delta) <= noiseThreshold;
        for (const [taskId, taskInfo] of areaTasks) {
            const attributedDocs = taskInfo.slugs.filter((s) => changedSet.has(s));
            attributions.push({
                area: taskInfo.area,
                attributedDocs,
                classification: classifyAttribution(attributedDocs.length),
                delta: areaDelta.delta,
                taskId,
                withinNoiseFloor,
            });
        }
    }
    // Untracked documents: changed slugs not in ANY task's canonical docs.
    const allTrackedSlugs = new Set([...taskCanonicalDocs.values()].flatMap((info) => info.slugs));
    const untrackedDocs = [...changedSet]
        .filter((s) => !allTrackedSlugs.has(s))
        .sort();
    // Summary counts.
    const countWhere = (predicate) => attributions.filter(predicate).length;
    const summary = {
        ambiguous: countWhere((a) => a.classification === "ambiguous"),
        unambiguous: countWhere((a) => a.classification === "unambiguous"),
        uncorrelated: countWhere((a) => a.classification === "uncorrelated"),
        withinNoise: countWhere((a) => a.withinNoiseFloor),
    };
    return {
        attributions,
        summary,
        untrackedDocs,
    };
}
|
|
104
|
+
// ---------------------------------------------------------------------------
|
|
105
|
+
// Formatting
|
|
106
|
+
// ---------------------------------------------------------------------------
|
|
107
|
+
/**
 * Render an attribution report as plain text for the console.
 *
 * Emits a title followed by one section per non-empty group: unambiguous,
 * ambiguous, uncorrelated (only entries outside the noise floor), and
 * untracked documents. Each section ends with a blank line.
 *
 * @param report - Attribution report to render.
 * @returns The report as newline-joined text.
 */
export function formatAttributionConsole(report) {
    // Rounded delta with an explicit "+" for positive values.
    const signed = (delta) => {
        const rounded = Math.round(delta);
        return delta > 0 ? `+${rounded}` : String(rounded);
    };
    const noiseSuffix = (entry) => (entry.withinNoiseFloor ? " ⚠️ within noise" : "");
    const out = ["📋 PER-DOCUMENT ATTRIBUTION", ""];
    // Append a heading, one rendered line per entry, and a trailing blank
    // line; empty groups are skipped.
    const emitSection = (heading, entries, render) => {
        if (entries.length === 0) {
            return;
        }
        out.push(heading);
        entries.forEach((entry) => out.push(render(entry)));
        out.push("");
    };
    const unambiguous = report.attributions.filter((a) => a.classification === "unambiguous");
    emitSection(`Unambiguous (${unambiguous.length} tasks):`, unambiguous, (a) => ` ${a.taskId}: ${signed(a.delta)} → ${a.attributedDocs[0]}${noiseSuffix(a)}`);
    const ambiguous = report.attributions.filter((a) => a.classification === "ambiguous");
    emitSection(`Ambiguous (${ambiguous.length} tasks):`, ambiguous, (a) => ` ${a.taskId}: ${signed(a.delta)} → {${a.attributedDocs.join(", ")}}${noiseSuffix(a)}`);
    // Uncorrelated entries inside the noise floor are left out.
    const uncorrelated = report.attributions.filter((a) => a.classification === "uncorrelated" && !a.withinNoiseFloor);
    emitSection(`Uncorrelated (${uncorrelated.length} tasks):`, uncorrelated, (a) => ` ${a.taskId}: ${signed(a.delta)} → likely grader variance (no changed docs in canonical set)`);
    emitSection("Untracked documents:", report.untrackedDocs, (doc) => ` ${doc} — not in any task's canonical_docs`);
    return out.join("\n");
}
|
|
156
|
+
/**
 * Render an attribution report as markdown for PR comments.
 *
 * Output is a heading, then either a "no data" notice or: a bold summary of
 * the non-zero counts, a table of attributions (only when at least one delta
 * is non-zero; zero-delta uncorrelated rows are omitted), and a bullet list
 * of untracked documents.
 *
 * @param report - Attribution report to render.
 * @returns The report as newline-joined markdown.
 */
export function formatAttributionMarkdown(report) {
    const out = ["### 📋 Per-Document Attribution", ""];
    if (report.attributions.length === 0 && report.untrackedDocs.length === 0) {
        out.push("No attribution data available.", "");
        return out.join("\n");
    }
    // Summary line: only counts above zero, in a fixed order.
    const counts = report.summary;
    const summaryParts = [
        [counts.unambiguous, "unambiguous"],
        [counts.ambiguous, "ambiguous"],
        [counts.uncorrelated, "uncorrelated"],
        [counts.withinNoise, "within noise"],
    ]
        .filter(([count]) => count > 0)
        .map(([count, label]) => `${count} ${label}`);
    if (summaryParts.length > 0) {
        out.push(`**${summaryParts.join(" · ")}**`, "");
    }
    // Attribution table.
    const iconFor = (classification) => {
        switch (classification) {
            case "unambiguous":
                return "✅";
            case "ambiguous":
                return "🟡";
            default:
                return "❓";
        }
    };
    const toRow = (a) => {
        const rounded = Math.round(a.delta);
        const deltaCell = (a.delta > 0 ? `+${rounded}` : String(rounded)) +
            (a.withinNoiseFloor ? " ⚠️" : "");
        const docsCell = a.attributedDocs.length > 0
            ? a.attributedDocs.map((d) => `\`${d}\``).join(", ")
            : "—";
        return `| ${a.taskId} | ${a.area} | ${deltaCell} | ${iconFor(a.classification)} ${a.classification} | ${docsCell} |`;
    };
    if (report.attributions.some((a) => a.delta !== 0)) {
        out.push("| Task | Area | Delta | Attribution | Documents |");
        out.push("|------|------|-------|-------------|-----------|");
        report.attributions
            .filter((a) => !(a.delta === 0 && a.classification === "uncorrelated"))
            .forEach((a) => out.push(toRow(a)));
        out.push("");
    }
    // Untracked documents.
    if (report.untrackedDocs.length > 0) {
        out.push("**Untracked documents** (not in any task's canonical_docs):", "");
        report.untrackedDocs.forEach((doc) => out.push(`- \`${doc}\``));
        out.push("");
    }
    return out.join("\n");
}
|
|
216
|
+
// ---------------------------------------------------------------------------
|
|
217
|
+
// Helpers
|
|
218
|
+
// ---------------------------------------------------------------------------
|
|
219
|
+
/**
 * Map the number of changed documents found in a task's canonical set to an
 * attribution class: 0 → "uncorrelated", 1 → "unambiguous", anything else →
 * "ambiguous".
 */
function classifyAttribution(matchCount) {
    switch (matchCount) {
        case 0:
            return "uncorrelated";
        case 1:
            return "unambiguous";
        default:
            return "ambiguous";
    }
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/baseline.ts
|
|
3
|
+
*
|
|
4
|
+
* Pure computation functions for managing historical baseline snapshots of
|
|
5
|
+
* evaluation scores. All functions accept `rootDir` as their first parameter
|
|
6
|
+
* and perform no process.env / process.argv access.
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Summary of one saved baseline file, as produced by `listBaselines`.
 *
 * - `areaCount`: number of entries in the file's `scores` array.
 * - `avgScore`: the file's `overall.avgScore`, rounded to an integer.
 * - `filename`: name of the file inside `results/baselines`.
 * - `graderCost` / `totalCost`: `overall.cost.graderTotal` / `overall.cost.total`, when recorded.
 * - `tag`: `baselineMeta.tag`, when the baseline was saved with one.
 * - `timestamp`: the file's `timestamp` field.
 */
export interface BaselineMetadata {
|
|
9
|
+
areaCount: number;
|
|
10
|
+
avgScore: number;
|
|
11
|
+
filename: string;
|
|
12
|
+
graderCost?: number;
|
|
13
|
+
tag?: string;
|
|
14
|
+
timestamp: string;
|
|
15
|
+
totalCost?: number;
|
|
16
|
+
}
|
|
17
|
+
/**
 * Outcome of `compareBaseline`.
 *
 * `success` is false, with an explanatory `message`, when the current score
 * summary, any saved baseline, or the requested baseline file is missing.
 * On success `comparisons` holds the per-feature deltas and `overallDelta`
 * is the difference between the rounded current and rounded baseline
 * average scores.
 */
export interface CompareResult {
|
|
18
|
+
comparisons?: ScoreComparison[];
|
|
19
|
+
message: string;
|
|
20
|
+
overallDelta?: number;
|
|
21
|
+
success: boolean;
|
|
22
|
+
}
|
|
23
|
+
/**
 * Per-feature comparison between the current run and a baseline.
 *
 * `delta` is `current - baseline`; a feature missing on one side is scored
 * as 0 on that side. The cost fields are only set when a positive cost was
 * recorded (`costDelta` when either side has one).
 */
export interface ScoreComparison {
|
|
24
|
+
baseline: number;
|
|
25
|
+
costBaseline?: number;
|
|
26
|
+
costCurrent?: number;
|
|
27
|
+
costDelta?: number;
|
|
28
|
+
current: number;
|
|
29
|
+
delta: number;
|
|
30
|
+
feature: string;
|
|
31
|
+
}
|
|
32
|
+
/**
 * Compare `<rootDir>/results/latest/score-summary.json` against a saved
 * baseline in `<rootDir>/results/baselines`.
 *
 * @param rootDir - Directory containing the `results` folder.
 * @param baselineFile - Baseline filename to compare against; defaults to
 *   the first entry returned by `listBaselines`.
 * @returns A result with `success: false` and a message when the summary,
 *   the baselines, or the target file is missing; otherwise the per-feature
 *   comparisons and overall delta.
 */
export declare function compareBaseline(rootDir: string, baselineFile?: string): CompareResult;
|
|
33
|
+
/**
 * List metadata for every `.json` file in `<rootDir>/results/baselines`,
 * ordered by filename descending.
 *
 * @param rootDir - Directory containing the `results` folder.
 * @returns One entry per baseline file, or an empty array when the
 *   baselines directory does not exist.
 */
export declare function listBaselines(rootDir: string): BaselineMetadata[];
|
|
34
|
+
/**
 * Copy `<rootDir>/results/latest/score-summary.json` into
 * `<rootDir>/results/baselines` under a timestamped filename, adding a
 * `baselineMeta` record.
 *
 * @param rootDir - Directory containing the `results` folder.
 * @param tag - Optional label appended (sanitized) to the filename and
 *   stored in `baselineMeta.tag`.
 * @returns `success: false` with a message when no score summary exists;
 *   otherwise `success: true` and a message naming the saved file.
 */
export declare function saveBaseline(rootDir: string, tag?: string): {
|
|
35
|
+
success: boolean;
|
|
36
|
+
message: string;
|
|
37
|
+
};
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/baseline.ts
|
|
3
|
+
*
|
|
4
|
+
* Pure computation functions for managing historical baseline snapshots of
|
|
5
|
+
* evaluation scores. All functions accept `rootDir` as their first parameter
|
|
6
|
+
* and perform no process.env / process.argv access.
|
|
7
|
+
*/
|
|
8
|
+
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
|
|
9
|
+
import { join } from "path";
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
// Compare
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
/**
 * Compare the latest score summary against a saved baseline.
 *
 * Reads `<rootDir>/results/latest/score-summary.json` and the chosen file
 * from `<rootDir>/results/baselines`, then reports a per-feature delta for
 * every feature present on either side. A feature missing on one side is
 * scored as 0 on that side.
 *
 * @param rootDir - Directory containing the `results` folder.
 * @param baselineFile - Baseline filename; defaults to the first entry
 *   returned by `listBaselines` (newest first).
 * @returns `{ success: false, message }` when the summary, the baselines, or
 *   the target file is missing; otherwise the comparisons sorted by delta
 *   descending, the overall delta, and a message naming the baseline used.
 */
export function compareBaseline(rootDir, baselineFile) {
    const failure = (message) => ({ message, success: false });
    const scoreSummaryPath = join(rootDir, "results", "latest", "score-summary.json");
    if (!existsSync(scoreSummaryPath)) {
        return failure("No current score-summary.json found.");
    }
    // Find baseline to compare against.
    const saved = listBaselines(rootDir);
    if (saved.length === 0) {
        return failure("No baselines saved yet. Run 'pnpm baseline:save' first.");
    }
    const targetFile = baselineFile ?? saved[0].filename;
    const baselinePath = join(rootDir, "results", "baselines", targetFile);
    if (!existsSync(baselinePath)) {
        return failure(`Baseline file not found: ${targetFile}`);
    }
    const current = JSON.parse(readFileSync(scoreSummaryPath, "utf-8"));
    const baseline = JSON.parse(readFileSync(baselinePath, "utf-8"));
    // Index baseline scores and costs by feature.
    const baselineScores = new Map();
    const baselineCosts = new Map();
    for (const entry of baseline.scores) {
        baselineScores.set(entry.feature, entry.totalScore);
        baselineCosts.set(entry.feature, entry.totalCost ?? 0);
    }
    const comparisons = [];
    const currentFeatures = new Set();
    for (const entry of current.scores) {
        const baseScore = baselineScores.get(entry.feature) ?? 0;
        const currentCost = entry.totalCost ?? 0;
        const baseCost = baselineCosts.get(entry.feature) ?? 0;
        currentFeatures.add(entry.feature);
        comparisons.push({
            baseline: baseScore,
            // Costs are only reported when a positive value was recorded.
            costBaseline: baseCost > 0 ? baseCost : undefined,
            costCurrent: currentCost > 0 ? currentCost : undefined,
            costDelta: currentCost > 0 || baseCost > 0 ? currentCost - baseCost : undefined,
            current: entry.totalScore,
            delta: entry.totalScore - baseScore,
            feature: entry.feature,
        });
    }
    // Areas in the baseline but not in the current run count as dropped to 0.
    for (const [feature, score] of baselineScores) {
        if (currentFeatures.has(feature)) {
            continue;
        }
        comparisons.push({
            baseline: score,
            current: 0,
            delta: -score,
            feature,
        });
    }
    comparisons.sort((a, b) => b.delta - a.delta);
    // Each average is rounded before subtracting.
    const overallDelta = Math.round(current.overall.avgScore) - Math.round(baseline.overall.avgScore);
    return {
        comparisons,
        message: `Compared against ${targetFile}`,
        overallDelta,
        success: true,
    };
}
|
|
76
|
+
// ---------------------------------------------------------------------------
|
|
77
|
+
// List
|
|
78
|
+
// ---------------------------------------------------------------------------
|
|
79
|
+
/**
 * List the baselines saved under `<rootDir>/results/baselines`.
 *
 * Every `.json` file in the directory is read and parsed. Entries are
 * ordered by filename descending, which puts the newest timestamped file
 * first.
 *
 * @param rootDir - Directory containing the `results` folder.
 * @returns One metadata record per baseline file, or an empty array when the
 *   directory does not exist.
 */
export function listBaselines(rootDir) {
    const baselinesDir = join(rootDir, "results", "baselines");
    if (!existsSync(baselinesDir)) {
        return [];
    }
    const newestFirst = readdirSync(baselinesDir)
        .filter((name) => name.endsWith(".json"))
        .sort()
        .reverse();
    const entries = [];
    for (const filename of newestFirst) {
        const data = JSON.parse(readFileSync(join(baselinesDir, filename), "utf-8"));
        const { overall } = data;
        entries.push({
            areaCount: data.scores.length,
            avgScore: Math.round(overall.avgScore),
            filename,
            graderCost: overall.cost?.graderTotal,
            tag: data.baselineMeta?.tag,
            timestamp: data.timestamp,
            totalCost: overall.cost?.total,
        });
    }
    return entries;
}
|
|
102
|
+
// ---------------------------------------------------------------------------
|
|
103
|
+
// Save
|
|
104
|
+
// ---------------------------------------------------------------------------
|
|
105
|
+
/**
 * Snapshot the latest score summary as a baseline.
 *
 * Reads `<rootDir>/results/latest/score-summary.json` and writes it, with an
 * added `baselineMeta` record, to `<rootDir>/results/baselines`. The
 * directory is created if needed.
 *
 * The filename is `YYYYMMDD_HH_mm_ss[_tag].json`, built from the current
 * time's ISO-8601 string. In the tag, every character outside `a-z`, `0-9`
 * and `-` becomes `-`, and the result is lower-cased.
 *
 * @param rootDir - Directory containing the `results` folder.
 * @param tag - Optional label; an empty string is treated as no tag.
 * @returns `{ success: false, message }` when no score summary exists;
 *   otherwise `{ success: true, message }` naming the saved file.
 */
export function saveBaseline(rootDir, tag) {
    const scoreSummaryPath = join(rootDir, "results", "latest", "score-summary.json");
    if (!existsSync(scoreSummaryPath)) {
        return {
            message: "No score-summary.json found. Run 'pnpm calculate-scores' first.",
            success: false,
        };
    }
    const summary = JSON.parse(readFileSync(scoreSummaryPath, "utf-8"));
    const baselinesDir = join(rootDir, "results", "baselines");
    mkdirSync(baselinesDir, { recursive: true });
    // One timestamp feeds both the filename and the stored metadata.
    const savedAt = new Date().toISOString();
    const stamp = savedAt
        .slice(0, 19)
        .replace(/[T:]/g, "_")
        .replace(/-/g, "");
    let suffix = "";
    if (tag) {
        suffix = `_${tag.replace(/[^a-z0-9-]/gi, "-").toLowerCase()}`;
    }
    const filename = `${stamp}${suffix}.json`;
    const snapshot = {
        ...summary,
        baselineMeta: {
            savedAt,
            // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string tag should be treated as no tag
            tag: tag || undefined,
        },
    };
    writeFileSync(join(baselinesDir, filename), JSON.stringify(snapshot, null, 2));
    const roundedAvg = Math.round(summary.overall.avgScore);
    return {
        message: `Saved baseline to results/baselines/${filename} (avg: ${roundedAvg}, ${summary.scores.length} areas)`,
        success: true,
    };
}
|