@sanity/ailf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -0
- package/bin/ailf.js +64 -0
- package/canonical/grader-references/README.md +88 -0
- package/canonical/grader-references/groq.yaml +234 -0
- package/canonical/grader-references/studio-setup.yaml +275 -0
- package/canonical/reference-solutions/.gitkeep +1 -0
- package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
- package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
- package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
- package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
- package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
- package/canonical/reference-solutions/groq/joins-references.ts +300 -0
- package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
- package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
- package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
- package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
- package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
- package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
- package/config/bigquery/README.md +74 -0
- package/config/bigquery/views/area_scores.sql +87 -0
- package/config/bigquery/views/reports.sql +49 -0
- package/config/features.yaml +116 -0
- package/config/models.yaml +115 -0
- package/config/prompts.yaml +75 -0
- package/config/rubrics.yaml +62 -0
- package/config/schedules.yaml +43 -0
- package/config/sinks.yaml +54 -0
- package/config/sources.yaml +51 -0
- package/config/thresholds.yaml +49 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
- package/dist/_vendor/ailf-core/examples/index.js +285 -0
- package/dist/_vendor/ailf-core/index.d.ts +17 -0
- package/dist/_vendor/ailf-core/index.js +17 -0
- package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
- package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
- package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
- package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
- package/dist/_vendor/ailf-core/ports/context.js +14 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
- package/dist/_vendor/ailf-core/ports/index.js +7 -0
- package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
- package/dist/_vendor/ailf-core/ports/logger.js +11 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
- package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/schemas/index.js +16 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
- package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
- package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
- package/dist/_vendor/ailf-core/services/index.js +12 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/scoring.js +222 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
- package/dist/_vendor/ailf-core/types/index.js +21 -0
- package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
- package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
- package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
- package/dist/_vendor/ailf-shared/document-ref.js +1 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -0
- package/dist/_vendor/ailf-shared/index.js +16 -0
- package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
- package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
- package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
- package/dist/_vendor/ailf-shared/score-grades.js +23 -0
- package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
- package/dist/adapters/cache/content-lake-cache.js +59 -0
- package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
- package/dist/adapters/cache/filesystem-cache.js +54 -0
- package/dist/adapters/cache/index.d.ts +2 -0
- package/dist/adapters/cache/index.js +2 -0
- package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
- package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
- package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
- package/dist/adapters/config-sources/file-config-adapter.js +96 -0
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +2 -0
- package/dist/adapters/doc-fetchers/index.d.ts +1 -0
- package/dist/adapters/doc-fetchers/index.js +1 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
- package/dist/adapters/eval-runners/index.d.ts +1 -0
- package/dist/adapters/eval-runners/index.js +1 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
- package/dist/adapters/index.d.ts +12 -0
- package/dist/adapters/index.js +12 -0
- package/dist/adapters/loggers/console-logger.d.ts +22 -0
- package/dist/adapters/loggers/console-logger.js +54 -0
- package/dist/adapters/loggers/index.d.ts +9 -0
- package/dist/adapters/loggers/index.js +9 -0
- package/dist/adapters/loggers/json-logger.d.ts +18 -0
- package/dist/adapters/loggers/json-logger.js +33 -0
- package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
- package/dist/adapters/loggers/quiet-logger.js +30 -0
- package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/composite-task-source.js +59 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
- package/dist/adapters/task-sources/index.d.ts +7 -0
- package/dist/adapters/task-sources/index.js +7 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
- package/dist/adapters/task-sources/repo-schemas.js +234 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
- package/dist/adapters/task-sources/repo-task-source.js +104 -0
- package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
- package/dist/adapters/task-sources/repo-trigger.js +153 -0
- package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
- package/dist/adapters/task-sources/repo-validation.js +164 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
- package/dist/adapters/task-sources/yaml-task-source.js +136 -0
- package/dist/agent-observer/agentic-provider.d.ts +132 -0
- package/dist/agent-observer/agentic-provider.js +983 -0
- package/dist/agent-observer/classifier.d.ts +62 -0
- package/dist/agent-observer/classifier.js +269 -0
- package/dist/agent-observer/index.d.ts +7 -0
- package/dist/agent-observer/index.js +4 -0
- package/dist/agent-observer/pricing.d.ts +35 -0
- package/dist/agent-observer/pricing.js +82 -0
- package/dist/agent-observer/provider.d.ts +77 -0
- package/dist/agent-observer/provider.js +151 -0
- package/dist/agent-observer/proxy.d.ts +91 -0
- package/dist/agent-observer/proxy.js +321 -0
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/agent-observer/types.d.ts +137 -0
- package/dist/agent-observer/types.js +16 -0
- package/dist/assertions/source-isolation.d.ts +72 -0
- package/dist/assertions/source-isolation.js +117 -0
- package/dist/cli.d.ts +24 -0
- package/dist/cli.js +199 -0
- package/dist/commands/agent-report.d.ts +5 -0
- package/dist/commands/agent-report.js +69 -0
- package/dist/commands/baseline.d.ts +9 -0
- package/dist/commands/baseline.js +141 -0
- package/dist/commands/cache.d.ts +13 -0
- package/dist/commands/cache.js +135 -0
- package/dist/commands/calculate-scores.d.ts +8 -0
- package/dist/commands/calculate-scores.js +48 -0
- package/dist/commands/compare.d.ts +8 -0
- package/dist/commands/compare.js +120 -0
- package/dist/commands/completion.d.ts +18 -0
- package/dist/commands/completion.js +260 -0
- package/dist/commands/coverage-audit.d.ts +7 -0
- package/dist/commands/coverage-audit.js +40 -0
- package/dist/commands/discovery-report.d.ts +10 -0
- package/dist/commands/discovery-report.js +44 -0
- package/dist/commands/eval.d.ts +9 -0
- package/dist/commands/eval.js +35 -0
- package/dist/commands/explain-handler.d.ts +34 -0
- package/dist/commands/explain-handler.js +719 -0
- package/dist/commands/fetch-docs.d.ts +8 -0
- package/dist/commands/fetch-docs.js +128 -0
- package/dist/commands/generate-configs.d.ts +8 -0
- package/dist/commands/generate-configs.js +46 -0
- package/dist/commands/grader/index.d.ts +11 -0
- package/dist/commands/grader/index.js +118 -0
- package/dist/commands/init.d.ts +19 -0
- package/dist/commands/init.js +150 -0
- package/dist/commands/interactive.d.ts +12 -0
- package/dist/commands/interactive.js +238 -0
- package/dist/commands/lookup-doc.d.ts +15 -0
- package/dist/commands/lookup-doc.js +84 -0
- package/dist/commands/measure-retrieval.d.ts +5 -0
- package/dist/commands/measure-retrieval.js +65 -0
- package/dist/commands/pipeline-action.d.ts +71 -0
- package/dist/commands/pipeline-action.js +305 -0
- package/dist/commands/pipeline.d.ts +62 -0
- package/dist/commands/pipeline.js +53 -0
- package/dist/commands/pr-comment.d.ts +8 -0
- package/dist/commands/pr-comment.js +47 -0
- package/dist/commands/publish.d.ts +26 -0
- package/dist/commands/publish.js +253 -0
- package/dist/commands/readiness-report.d.ts +10 -0
- package/dist/commands/readiness-report.js +104 -0
- package/dist/commands/shared/options.d.ts +29 -0
- package/dist/commands/shared/options.js +57 -0
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +16 -0
- package/dist/commands/validate-tasks.js +93 -0
- package/dist/commands/validate.d.ts +9 -0
- package/dist/commands/validate.js +73 -0
- package/dist/commands/webhook-server.d.ts +5 -0
- package/dist/commands/webhook-server.js +30 -0
- package/dist/commands/weekly-digest.d.ts +10 -0
- package/dist/commands/weekly-digest.js +104 -0
- package/dist/composition-root.d.ts +26 -0
- package/dist/composition-root.js +107 -0
- package/dist/interpolate.d.ts +26 -0
- package/dist/interpolate.js +70 -0
- package/dist/job-store.d.ts +104 -0
- package/dist/job-store.js +188 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.d.ts +27 -0
- package/dist/orchestration/build-app-context.js +81 -0
- package/dist/orchestration/build-step-sequence.d.ts +15 -0
- package/dist/orchestration/build-step-sequence.js +84 -0
- package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
- package/dist/orchestration/config-to-source-overrides.js +28 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/index.d.ts +11 -0
- package/dist/orchestration/index.js +11 -0
- package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
- package/dist/orchestration/pipeline-orchestrator.js +153 -0
- package/dist/orchestration/step-runner.d.ts +20 -0
- package/dist/orchestration/step-runner.js +88 -0
- package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
- package/dist/orchestration/steps/calculate-scores-step.js +95 -0
- package/dist/orchestration/steps/callback-step.d.ts +24 -0
- package/dist/orchestration/steps/callback-step.js +76 -0
- package/dist/orchestration/steps/compare-step.d.ts +14 -0
- package/dist/orchestration/steps/compare-step.js +92 -0
- package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
- package/dist/orchestration/steps/discovery-report-step.js +55 -0
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
- package/dist/orchestration/steps/fetch-docs-step.js +135 -0
- package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
- package/dist/orchestration/steps/gap-analysis-step.js +136 -0
- package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
- package/dist/orchestration/steps/generate-configs-step.js +85 -0
- package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
- package/dist/orchestration/steps/grader-consistency-step.js +64 -0
- package/dist/orchestration/steps/index.d.ts +19 -0
- package/dist/orchestration/steps/index.js +19 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
- package/dist/orchestration/steps/publish-report-step.js +216 -0
- package/dist/orchestration/steps/readiness-step.d.ts +13 -0
- package/dist/orchestration/steps/readiness-step.js +91 -0
- package/dist/orchestration/steps/report-step.d.ts +12 -0
- package/dist/orchestration/steps/report-step.js +49 -0
- package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
- package/dist/orchestration/steps/run-eval-step.js +195 -0
- package/dist/orchestration/steps/validate-step.d.ts +12 -0
- package/dist/orchestration/steps/validate-step.js +41 -0
- package/dist/pipeline/agent-behavior-report.d.ts +53 -0
- package/dist/pipeline/agent-behavior-report.js +132 -0
- package/dist/pipeline/attribution.d.ts +47 -0
- package/dist/pipeline/attribution.js +226 -0
- package/dist/pipeline/baseline.d.ts +37 -0
- package/dist/pipeline/baseline.js +141 -0
- package/dist/pipeline/cache.d.ts +101 -0
- package/dist/pipeline/cache.js +283 -0
- package/dist/pipeline/calculate-scores.d.ts +102 -0
- package/dist/pipeline/calculate-scores.js +1128 -0
- package/dist/pipeline/callback-delivery.d.ts +50 -0
- package/dist/pipeline/callback-delivery.js +89 -0
- package/dist/pipeline/checks.d.ts +39 -0
- package/dist/pipeline/checks.js +280 -0
- package/dist/pipeline/classify-url.d.ts +61 -0
- package/dist/pipeline/classify-url.js +93 -0
- package/dist/pipeline/compare.d.ts +31 -0
- package/dist/pipeline/compare.js +208 -0
- package/dist/pipeline/coverage-audit.d.ts +39 -0
- package/dist/pipeline/coverage-audit.js +165 -0
- package/dist/pipeline/degradations.d.ts +85 -0
- package/dist/pipeline/degradations.js +242 -0
- package/dist/pipeline/discovery-report.d.ts +55 -0
- package/dist/pipeline/discovery-report.js +178 -0
- package/dist/pipeline/eval-constants.d.ts +68 -0
- package/dist/pipeline/eval-constants.js +111 -0
- package/dist/pipeline/eval-fingerprint.d.ts +66 -0
- package/dist/pipeline/eval-fingerprint.js +175 -0
- package/dist/pipeline/expand-tasks.d.ts +220 -0
- package/dist/pipeline/expand-tasks.js +421 -0
- package/dist/pipeline/failure-modes.d.ts +46 -0
- package/dist/pipeline/failure-modes.js +348 -0
- package/dist/pipeline/fetch-url-content.d.ts +44 -0
- package/dist/pipeline/fetch-url-content.js +93 -0
- package/dist/pipeline/gap-analysis.d.ts +48 -0
- package/dist/pipeline/gap-analysis.js +231 -0
- package/dist/pipeline/generate-configs.d.ts +72 -0
- package/dist/pipeline/generate-configs.js +395 -0
- package/dist/pipeline/grader-api.d.ts +49 -0
- package/dist/pipeline/grader-api.js +200 -0
- package/dist/pipeline/grader-compare-runner.d.ts +44 -0
- package/dist/pipeline/grader-compare-runner.js +301 -0
- package/dist/pipeline/grader-comparison.d.ts +111 -0
- package/dist/pipeline/grader-comparison.js +161 -0
- package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
- package/dist/pipeline/grader-consistency-runner.js +270 -0
- package/dist/pipeline/grader-consistency.d.ts +103 -0
- package/dist/pipeline/grader-consistency.js +146 -0
- package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
- package/dist/pipeline/grader-sensitivity-runner.js +282 -0
- package/dist/pipeline/grader-sensitivity.d.ts +94 -0
- package/dist/pipeline/grader-sensitivity.js +144 -0
- package/dist/pipeline/grader-validate-runner.d.ts +38 -0
- package/dist/pipeline/grader-validate-runner.js +229 -0
- package/dist/pipeline/grader-validation.d.ts +107 -0
- package/dist/pipeline/grader-validation.js +169 -0
- package/dist/pipeline/map-request-to-config.d.ts +19 -0
- package/dist/pipeline/map-request-to-config.js +80 -0
- package/dist/pipeline/measure-retrieval.d.ts +59 -0
- package/dist/pipeline/measure-retrieval.js +111 -0
- package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
- package/dist/pipeline/mirror-repo-tasks.js +350 -0
- package/dist/pipeline/plan-format.d.ts +33 -0
- package/dist/pipeline/plan-format.js +202 -0
- package/dist/pipeline/plan.d.ts +169 -0
- package/dist/pipeline/plan.js +708 -0
- package/dist/pipeline/pr-comment.d.ts +19 -0
- package/dist/pipeline/pr-comment.js +502 -0
- package/dist/pipeline/probe.d.ts +52 -0
- package/dist/pipeline/probe.js +390 -0
- package/dist/pipeline/provenance.d.ts +47 -0
- package/dist/pipeline/provenance.js +146 -0
- package/dist/pipeline/readiness-report.d.ts +87 -0
- package/dist/pipeline/readiness-report.js +205 -0
- package/dist/pipeline/release-classification.d.ts +54 -0
- package/dist/pipeline/release-classification.js +238 -0
- package/dist/pipeline/release-report.d.ts +37 -0
- package/dist/pipeline/release-report.js +222 -0
- package/dist/pipeline/repo-eval-comment.d.ts +37 -0
- package/dist/pipeline/repo-eval-comment.js +165 -0
- package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
- package/dist/pipeline/repo-threshold-evaluator.js +162 -0
- package/dist/pipeline/resolve-mappings.d.ts +35 -0
- package/dist/pipeline/resolve-mappings.js +72 -0
- package/dist/pipeline/retrieval-metrics.d.ts +39 -0
- package/dist/pipeline/retrieval-metrics.js +136 -0
- package/dist/pipeline/reverse-mapping.d.ts +67 -0
- package/dist/pipeline/reverse-mapping.js +88 -0
- package/dist/pipeline/schemas.d.ts +9 -0
- package/dist/pipeline/schemas.js +9 -0
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +88 -0
- package/dist/pipeline/targeted-loo.js +203 -0
- package/dist/pipeline/thresholds.d.ts +27 -0
- package/dist/pipeline/thresholds.js +245 -0
- package/dist/pipeline/types.d.ts +10 -0
- package/dist/pipeline/types.js +10 -0
- package/dist/pipeline/validate.d.ts +67 -0
- package/dist/pipeline/validate.js +406 -0
- package/dist/pipeline/webhook-server.d.ts +37 -0
- package/dist/pipeline/webhook-server.js +133 -0
- package/dist/report-store.d.ts +84 -0
- package/dist/report-store.js +208 -0
- package/dist/sanity/client.d.ts +38 -0
- package/dist/sanity/client.js +86 -0
- package/dist/sanity/portable-text.d.ts +11 -0
- package/dist/sanity/portable-text.js +211 -0
- package/dist/sanity/queries.d.ts +133 -0
- package/dist/sanity/queries.js +300 -0
- package/dist/schedules/digest.d.ts +116 -0
- package/dist/schedules/digest.js +156 -0
- package/dist/schedules/index.d.ts +12 -0
- package/dist/schedules/index.js +10 -0
- package/dist/schedules/loader.d.ts +31 -0
- package/dist/schedules/loader.js +73 -0
- package/dist/schedules/schema.d.ts +9 -0
- package/dist/schedules/schema.js +9 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +21 -0
- package/dist/scripts/validate-task-sources.js +210 -0
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/bigquery/index.d.ts +131 -0
- package/dist/sinks/bigquery/index.js +222 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/index.d.ts +23 -0
- package/dist/sinks/index.js +18 -0
- package/dist/sinks/loader.d.ts +18 -0
- package/dist/sinks/loader.js +82 -0
- package/dist/sinks/retry.d.ts +24 -0
- package/dist/sinks/retry.js +52 -0
- package/dist/sinks/schema.d.ts +9 -0
- package/dist/sinks/schema.js +9 -0
- package/dist/sinks/slack/format.d.ts +65 -0
- package/dist/sinks/slack/format.js +327 -0
- package/dist/sinks/slack/index.d.ts +27 -0
- package/dist/sinks/slack/index.js +78 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +59 -0
- package/dist/sinks/types.js +44 -0
- package/dist/sinks/webhook/index.d.ts +19 -0
- package/dist/sinks/webhook/index.js +50 -0
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/sources.d.ts +104 -0
- package/dist/sources.js +292 -0
- package/dist/webhook/budget.d.ts +42 -0
- package/dist/webhook/budget.js +60 -0
- package/dist/webhook/debounce.d.ts +67 -0
- package/dist/webhook/debounce.js +76 -0
- package/dist/webhook/dispatch.d.ts +45 -0
- package/dist/webhook/dispatch.js +84 -0
- package/dist/webhook/eval-request-handler.d.ts +87 -0
- package/dist/webhook/eval-request-handler.js +181 -0
- package/dist/webhook/handler.d.ts +88 -0
- package/dist/webhook/handler.js +203 -0
- package/dist/webhook/index.d.ts +17 -0
- package/dist/webhook/index.js +12 -0
- package/dist/webhook/types.d.ts +109 -0
- package/dist/webhook/types.js +10 -0
- package/package.json +72 -0
- package/tasks/.expanded.agentic.yaml +51 -0
- package/tasks/.expanded.yaml +66 -0
- package/tasks/frameworks.yaml +98 -0
- package/tasks/functions.yaml +51 -0
- package/tasks/groq.yaml +216 -0
- package/tasks/nextjs-live.yaml +62 -0
- package/tasks/studio-setup.yaml +111 -0
- package/tasks/visual-editing.yaml +120 -0
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/release-report.ts
|
|
3
|
+
*
|
|
4
|
+
* Cross-area release impact reports.
|
|
5
|
+
*
|
|
6
|
+
* Phase 4c of the Scenario Matrix implementation.
|
|
7
|
+
*
|
|
8
|
+
* Consolidates impact data from multi-area content releases into a single
|
|
9
|
+
* report that combines document classification (4a), before/after deltas (2b),
|
|
10
|
+
* attribution (2c), and probe results (4b) into the document × area × task
|
|
11
|
+
* impact matrix specified by Scenario 2.4.
|
|
12
|
+
*
|
|
13
|
+
* @see docs/exec-plans/completed/scenario-matrix-implementation/phase-4-content-release-integration.md
|
|
14
|
+
*/
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// Public API
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
/**
 * Build a cross-area release impact report.
 *
 * Merges document classification, before/after comparison, per-task
 * attribution, and probe results into one consolidated report object.
 *
 * Classification rules for each area in `comparison.areas`:
 * - regressed: `delta < -threshold`
 * - confirmed unchanged: `|delta| <= threshold`
 *
 * The threshold is `noiseThreshold`, falling back to
 * `comparison.noiseThreshold`, falling back to 2.
 *
 * @param classification - Document classification; `classification.documents`
 *   is returned as-is and each document's `affectedAreas` (if present) is used
 *   to work out which areas were touched but never evaluated
 * @param comparison - Before/after comparison (optional); supplies `areas`
 *   (each with `area` and `delta`) and `deltas.overall`
 * @param attribution - Per-task attribution (optional); each entry in
 *   `attribution.attributions` carries `area`, `taskId`, `delta` and
 *   `attributedDocs`
 * @param probes - Probe results (optional); defaults to an empty array
 * @param noiseThreshold - Explicit threshold for the "unchanged" classification
 * @returns Report with `areas`, `confirmedUnchanged`, `documents`,
 *   `generatedAt` (ISO timestamp), `notEvaluated` (sorted), `overallDelta`,
 *   `probes` and `regressions`
 */
export function buildReleaseImpactReport(classification, comparison, attribution, probes, noiseThreshold) {
    const threshold = noiseThreshold ?? comparison?.noiseThreshold ?? 2;
    const areas = [];
    const regressions = [];
    const confirmedUnchanged = [];
    if (comparison) {
        // Index attributed tasks by area once, instead of re-filtering the
        // whole attribution list for every area in the comparison.
        const tasksByArea = new Map();
        for (const attr of attribution?.attributions ?? []) {
            const task = {
                // The console formatter reads `.length` / `.join` on this, so
                // never let a missing list leak through as `undefined`.
                attributedDocs: attr.attributedDocs ?? [],
                delta: attr.delta,
                taskId: attr.taskId,
            };
            const bucket = tasksByArea.get(attr.area);
            if (bucket) {
                bucket.push(task);
            }
            else {
                tasksByArea.set(attr.area, [task]);
            }
        }
        for (const areaDelta of comparison.areas) {
            const regressed = areaDelta.delta < -threshold;
            // Without attribution data for this area, fall back to a single
            // synthetic area-level entry so the area still shows in the matrix.
            const tasks = tasksByArea.get(areaDelta.area) ?? [
                {
                    attributedDocs: [],
                    delta: areaDelta.delta,
                    taskId: `${areaDelta.area} (area-level)`,
                },
            ];
            areas.push({
                area: areaDelta.area,
                delta: areaDelta.delta,
                regressed,
                tasks,
            });
            if (regressed) {
                regressions.push(areaDelta.area);
            }
            if (Math.abs(areaDelta.delta) <= threshold) {
                confirmedUnchanged.push(areaDelta.area);
            }
        }
    }
    // Areas touched by the release's documents that the comparison never covered.
    const evaluatedAreas = new Set(areas.map((a) => a.area));
    const allAffectedAreas = new Set(classification.documents.flatMap((d) => d.affectedAreas ?? []));
    const notEvaluated = [...allAffectedAreas]
        .filter((a) => !evaluatedAreas.has(a))
        .sort();
    // Always a number: the formatter calls `.toFixed()` on it.
    const overallDelta = comparison?.deltas?.overall ?? 0;
    return {
        areas,
        confirmedUnchanged,
        documents: classification.documents,
        generatedAt: new Date().toISOString(),
        notEvaluated,
        overallDelta,
        probes: probes ?? [],
        regressions,
    };
}
|
|
93
|
+
// ---------------------------------------------------------------------------
|
|
94
|
+
// Formatting
|
|
95
|
+
// ---------------------------------------------------------------------------
|
|
96
|
+
/**
|
|
97
|
+
* Format a release impact report for console output.
|
|
98
|
+
*/
|
|
99
|
+
/**
 * Render a release impact report as plain text for terminal output.
 *
 * Sections appear in a fixed order: banner, summary counts, regressions,
 * the document/area/task matrix, Tier B probes, then the unchanged and
 * not-evaluated area lists. A section with no entries is omitted entirely.
 *
 * @param report - Release impact report. Reads `overallDelta`, `documents`,
 *   `areas`, `regressions`, `probes`, `confirmedUnchanged` and `notEvaluated`.
 * @returns The rendered report as one newline-joined string.
 */
export function formatReleaseImpactConsole(report) {
    // Non-negative deltas get an explicit "+"; negative values already carry "-".
    const signed = (value) => {
        const prefix = value >= 0 ? "+" : "";
        return prefix + value.toFixed(1);
    };
    const banner = "═══════════════════════════════════════════════════════════════";
    const out = [
        banner,
        " CONTENT RELEASE IMPACT REPORT",
        banner,
        "",
        ` Overall impact: ${signed(report.overallDelta)} points`,
        ` Documents: ${report.documents.length}`,
        ` Areas affected: ${report.areas.length}`,
        "",
    ];
    // Regressions: names without a matching area entry are silently skipped.
    if (report.regressions.length > 0) {
        out.push(" ⚠️ REGRESSIONS DETECTED:");
        for (const name of report.regressions) {
            const match = report.areas.find((entry) => entry.area === name);
            if (!match) {
                continue;
            }
            out.push(` ${name}: ${match.delta.toFixed(1)}`);
        }
        out.push("");
    }
    // Impact matrix: one row per (area, task) pair.
    if (report.areas.length > 0) {
        out.push(" Document | Area | Task | Delta", " ────────────────────────────────┼──────────────────┼─────────────────────────┼──────");
        for (const entry of report.areas) {
            for (const task of entry.tasks) {
                const hasDocs = task.attributedDocs.length > 0;
                const cells = [
                    (hasDocs ? task.attributedDocs.join(", ") : "(unattributed)").padEnd(32),
                    entry.area.padEnd(16),
                    task.taskId.padEnd(23),
                    signed(task.delta),
                ];
                out.push(` ${cells.join(" | ")}`);
            }
        }
        out.push("");
    }
    // Probe results
    if (report.probes.length > 0) {
        out.push(" TIER B PROBES (directional only):");
        for (const { documentSlugs, usability } of report.probes) {
            out.push(` ${documentSlugs.join(", ")}: ${usability}`);
        }
        out.push("");
    }
    // Unchanged and not-evaluated areas share a single trailing blank line.
    const { confirmedUnchanged, notEvaluated } = report;
    if (confirmedUnchanged.length > 0) {
        out.push(` Areas with no score change: ${confirmedUnchanged.join(", ")}`);
    }
    if (notEvaluated.length > 0) {
        out.push(` Areas not evaluated: ${notEvaluated.join(", ")}`);
    }
    if (confirmedUnchanged.length > 0 || notEvaluated.length > 0) {
        out.push("");
    }
    return out.join("\n");
}
|
|
158
|
+
/**
|
|
159
|
+
* Format a release impact report as markdown.
|
|
160
|
+
*/
|
|
161
|
+
/**
 * Render a release impact report as markdown.
 *
 * Produces a heading, a one-line summary, and then (each only when non-empty)
 * a regressions list, an impact matrix table, a Tier B probes list, and a
 * combined "Unchanged / Not evaluated" status line.
 *
 * @param report - Release impact report. Reads `overallDelta`, `documents`,
 *   `areas`, `regressions`, `probes`, `confirmedUnchanged` and `notEvaluated`.
 * @returns The rendered markdown as one newline-joined string.
 */
export function formatReleaseImpactMarkdown(report) {
    // Non-negative deltas get an explicit "+"; negative values already carry "-".
    const signed = (value) => {
        const prefix = value >= 0 ? "+" : "";
        return prefix + value.toFixed(1);
    };
    const md = [
        "### 📋 Content Release Impact Report",
        "",
        `**Overall impact: ${signed(report.overallDelta)} points** · ${report.documents.length} documents · ${report.areas.length} areas`,
        "",
    ];
    // Regressions: names without a matching area entry are silently skipped.
    if (report.regressions.length > 0) {
        md.push("#### ⚠️ Regressions", "");
        for (const name of report.regressions) {
            const match = report.areas.find((entry) => entry.area === name);
            if (!match) {
                continue;
            }
            md.push(`- **${name}**: ${match.delta.toFixed(1)}`);
        }
        md.push("");
    }
    // Impact table: one row per (area, task) pair; regressed areas get a warning icon.
    if (report.areas.length > 0) {
        md.push("#### Impact Matrix", "", "| Document | Area | Task | Delta |", "|----------|------|------|-------|");
        for (const entry of report.areas) {
            const flag = entry.regressed ? " ⚠️" : "";
            for (const task of entry.tasks) {
                let docCell = "—";
                if (task.attributedDocs.length > 0) {
                    docCell = task.attributedDocs.map((slug) => `\`${slug}\``).join(", ");
                }
                md.push(`| ${docCell} | ${entry.area} | ${task.taskId} | ${signed(task.delta)}${flag} |`);
            }
        }
        md.push("");
    }
    // Probes
    if (report.probes.length > 0) {
        md.push("#### 🔍 Tier B Probes (directional only)", "");
        for (const { documentSlugs, usability } of report.probes) {
            md.push(`- **${documentSlugs.join(", ")}**: ${usability}`);
        }
        md.push("");
    }
    // Status areas: both lists share one line, separated by a middle dot.
    const status = [];
    if (report.confirmedUnchanged.length > 0) {
        status.push(`**Unchanged:** ${report.confirmedUnchanged.join(", ")}`);
    }
    if (report.notEvaluated.length > 0) {
        status.push(`**Not evaluated:** ${report.notEvaluated.join(", ")}`);
    }
    if (status.length > 0) {
        md.push(status.join(" · "), "");
    }
    return md.join("\n");
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/repo-eval-comment.ts
|
|
3
|
+
*
|
|
4
|
+
* Generates markdown PR comments for repo-based AILF evaluations.
|
|
5
|
+
*
|
|
6
|
+
* This is separate from the main pr-comment.ts (which generates the
|
|
7
|
+
* internal AILF repo's PR comment). Repo eval comments are designed
|
|
8
|
+
* for external teams and emphasize:
|
|
9
|
+
*
|
|
10
|
+
* - Documentation quality (not code quality)
|
|
11
|
+
* - Per-task threshold pass/fail status
|
|
12
|
+
* - Clear "what does this mean?" context
|
|
13
|
+
* - skip-ailf bypass instructions
|
|
14
|
+
*
|
|
15
|
+
* @see docs/exec-plans/completed/tasks-as-content/phase-6-pr-quality-gates.md
|
|
16
|
+
* @see packages/eval/src/pipeline/repo-threshold-evaluator.ts
|
|
17
|
+
*/
|
|
18
|
+
import type { ComparisonReport, ScoreSummary } from "./types.js";
|
|
19
|
+
import type { RepoThresholdEvaluation } from "./repo-threshold-evaluator.js";
|
|
20
|
+
export interface RepoCommentOptions {
|
|
21
|
+
/** Threshold evaluation results */
|
|
22
|
+
thresholdEval: RepoThresholdEvaluation;
|
|
23
|
+
/** Score summary from the pipeline */
|
|
24
|
+
scoreSummary: ScoreSummary;
|
|
25
|
+
/** Comparison report (if baseline exists) */
|
|
26
|
+
comparison?: ComparisonReport;
|
|
27
|
+
/** Link to the full report in Studio */
|
|
28
|
+
reportUrl?: string;
|
|
29
|
+
/** Link to the Promptfoo results */
|
|
30
|
+
promptfooUrl?: string;
|
|
31
|
+
/** Whether this is the first run (no baseline to compare against) */
|
|
32
|
+
firstRun?: boolean;
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Generate a markdown PR comment for a repo-based evaluation.
|
|
36
|
+
*/
|
|
37
|
+
export declare function generateRepoEvalComment(options: RepoCommentOptions): string;
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/repo-eval-comment.ts
|
|
3
|
+
*
|
|
4
|
+
* Generates markdown PR comments for repo-based AILF evaluations.
|
|
5
|
+
*
|
|
6
|
+
* This is separate from the main pr-comment.ts (which generates the
|
|
7
|
+
* internal AILF repo's PR comment). Repo eval comments are designed
|
|
8
|
+
* for external teams and emphasize:
|
|
9
|
+
*
|
|
10
|
+
* - Documentation quality (not code quality)
|
|
11
|
+
* - Per-task threshold pass/fail status
|
|
12
|
+
* - Clear "what does this mean?" context
|
|
13
|
+
* - skip-ailf bypass instructions
|
|
14
|
+
*
|
|
15
|
+
* @see docs/exec-plans/completed/tasks-as-content/phase-6-pr-quality-gates.md
|
|
16
|
+
* @see packages/eval/src/pipeline/repo-threshold-evaluator.ts
|
|
17
|
+
*/
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
// Public API
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
/**
|
|
22
|
+
* Generate a markdown PR comment for a repo-based evaluation.
|
|
23
|
+
*/
|
|
24
|
+
/**
 * Generate a markdown PR comment for a repo-based evaluation.
 *
 * The comment contains, in order: a hidden marker, a headline with the
 * overall score, a cost/doc-lift summary line, the per-task threshold table,
 * an optional collapsible per-dimension table, an optional comparison against
 * the previous run, a collapsible per-area score breakdown, footer links, the
 * `skip-ailf` bypass hint and a generation timestamp.
 *
 * @param options - Comment inputs:
 *   - `thresholdEval`: threshold evaluation results (required)
 *   - `scoreSummary`: score summary from the pipeline (required)
 *   - `comparison`: comparison report, rendered only when `firstRun` is falsy
 *   - `reportUrl` / `promptfooUrl`: optional footer links
 *   - `firstRun`: whether there is no baseline to compare against
 * @returns The full comment body as a newline-joined markdown string.
 */
export function generateRepoEvalComment(options) {
    const { thresholdEval, scoreSummary, comparison, reportUrl, promptfooUrl, firstRun, } = options;
    const lines = [];
    // Comment marker for update-in-place
    lines.push("<!-- ailf-repo-eval-report -->");
    // Header
    const statusEmoji = thresholdEval.checkPassed ? "✅" : "⚠️";
    lines.push(`## 📊 AI Literacy Evaluation`);
    lines.push("");
    // Summary line; the threshold is only mentioned when a gate is configured (> 0).
    lines.push(`${statusEmoji} Your team's documentation quality score is **${thresholdEval.overallScore}/100**` +
        (thresholdEval.defaultThreshold > 0
            ? ` (threshold: ${thresholdEval.defaultThreshold})`
            : "") +
        ". " +
        contextMessage(thresholdEval, comparison, firstRun));
    lines.push("");
    // Cost info: per-area run cost plus the grader cost, shown only when non-zero.
    const totalCost = scoreSummary.scores.reduce((sum, s) => sum + (s.totalCost ?? 0), 0);
    const graderCost = scoreSummary.overall.cost?.graderTotal ?? 0;
    const combinedCost = totalCost + graderCost;
    const testCount = scoreSummary.scores.reduce((sum, s) => sum + s.testCount, 0);
    const costStr = combinedCost > 0 ? ` · Cost: ${formatCost(combinedCost)}` : "";
    // Doc lift can be negative; only prefix "+" for non-negative values so the
    // output never reads "+-5 points".
    const docLift = Math.round(scoreSummary.overall.avgDocLift);
    const docLiftStr = docLift >= 0 ? `+${docLift}` : `${docLift}`;
    lines.push(`**Overall:** ${thresholdEval.overallScore}/100 · ` +
        `**Doc lift:** ${docLiftStr} points · ` +
        `${testCount} tests${costStr}`);
    lines.push("");
    // Per-task threshold table
    if (thresholdEval.results.length > 0) {
        lines.push("### Per-Task Results");
        lines.push("");
        lines.push("| Task | Score | Threshold | Status |");
        lines.push("|------|-------|-----------|--------|");
        for (const result of thresholdEval.results) {
            lines.push(`| ${result.taskId} | ${result.actualScore} | ${result.threshold} | ${statusIcon(result)} |`);
        }
        lines.push("");
    }
    // Dimension details (collapsible), only for tasks that have dimension results
    const withDimensions = thresholdEval.results.filter((r) => r.dimensionResults && r.dimensionResults.length > 0);
    if (withDimensions.length > 0) {
        lines.push("<details>");
        lines.push("<summary>📊 Per-dimension breakdown</summary>");
        lines.push("");
        lines.push("| Task | Dimension | Score | Threshold | Status |");
        lines.push("|------|-----------|-------|-----------|--------|");
        for (const result of withDimensions) {
            for (const dim of result.dimensionResults) {
                lines.push(`| ${result.taskId} | ${dim.dimension} | ${dim.actual} | ${dim.threshold} | ${dim.passed ? "✅" : "❌"} |`);
            }
        }
        lines.push("");
        lines.push("</details>");
        lines.push("");
    }
    // Comparison section (when baseline exists)
    if (comparison && !firstRun) {
        // Derive sign, direction and arrow from the rounded value that is
        // actually displayed, so a delta of 0.3 reads "0 points (unchanged)"
        // instead of the contradictory "+0 points (up)".
        const delta = Math.round(comparison.deltas.overall);
        const direction = delta > 0 ? "up" : delta < 0 ? "down" : "unchanged";
        const arrow = delta > 0 ? "📈" : delta < 0 ? "📉" : "➡️";
        lines.push(`${arrow} Compared to last run: overall **${delta > 0 ? "+" : ""}${delta}** points (${direction}).`);
        lines.push("");
        if (comparison.regressed.length > 0) {
            lines.push(`> ⚠️ **Regressions detected:** ${comparison.regressed.map((a) => `\`${a}\``).join(", ")}`);
            lines.push("");
        }
    }
    // Score breakdown (collapsible), highest total score first
    if (scoreSummary.scores.length > 0) {
        lines.push("<details>");
        lines.push("<summary>📋 Full score breakdown</summary>");
        lines.push("");
        lines.push("| Feature Area | Score | Task Completion | Code Correctness | Doc Coverage | Doc Lift |");
        lines.push("|-------------|-------|-----------------|------------------|-------------|----------|");
        // Copy before sorting so the caller's array is not reordered.
        const sorted = [...scoreSummary.scores].sort((a, b) => b.totalScore - a.totalScore);
        for (const s of sorted) {
            lines.push(`| ${s.feature} | **${Math.round(s.totalScore)}** | ${Math.round(s.taskCompletion)} | ${Math.round(s.codeCorrectness)} | ${Math.round(s.docCoverage)} | ${liftArrow(s.docLift)} |`);
        }
        lines.push("");
        lines.push("</details>");
        lines.push("");
    }
    // Footer with links and bypass instructions
    const links = [];
    if (reportUrl) {
        links.push(`[View full report in Studio](${reportUrl})`);
    }
    if (promptfooUrl) {
        links.push(`[Detailed results](${promptfooUrl})`);
    }
    if (links.length > 0) {
        lines.push(links.join(" · "));
        lines.push("");
    }
    lines.push(`> 💡 This check evaluates **documentation quality**, not code correctness. ` +
        `Add the \`skip-ailf\` label to bypass.`);
    lines.push("");
    // Timestamp
    const now = new Date();
    lines.push(`*Generated by [AI Literacy Framework](https://github.com/sanity-labs/ai-literacy-framework) · ${now.toUTCString()}*`);
    return lines.join("\n");
}
|
|
124
|
+
// ---------------------------------------------------------------------------
|
|
125
|
+
// Internal helpers
|
|
126
|
+
// ---------------------------------------------------------------------------
|
|
127
|
+
/**
 * Pick the explanatory sentence appended to the headline score.
 *
 * Precedence: first run, then a passing check, then (for a failing check)
 * whether a comparison report is available.
 *
 * @param eval_ - Threshold evaluation; only `checkPassed` is read, and only
 *   when `firstRun` is falsy.
 * @param comparison - Comparison report, or a falsy value when there is none.
 * @param firstRun - Truthy when this is the first evaluation.
 * @returns A single human-readable sentence (or pair of sentences).
 */
function contextMessage(eval_, comparison, firstRun) {
    if (firstRun) {
        return "This is the first evaluation for this task set. Future runs will show trends.";
    }
    if (eval_.checkPassed) {
        return "Documentation quality meets your team's bar.";
    }
    // Check failed: wording depends on whether a baseline comparison exists.
    return comparison
        ? "This reflects the current state of your product's documentation, not changes in this PR."
        : "Some tasks are below their configured thresholds. Review the documentation for the affected areas.";
}
|
|
141
|
+
/**
 * Map a task result's `status` to the label shown in the per-task table.
 *
 * @param result - Task threshold result; only `status` is read.
 * @returns The icon-plus-text label for "passed", "warning" or "failed";
 *   `undefined` for any other status value.
 */
function statusIcon(result) {
    const { status } = result;
    if (status === "passed") {
        return "✅ Pass";
    }
    if (status === "warning") {
        return "⚠️ Below threshold";
    }
    if (status === "failed") {
        return "❌ Blocked";
    }
    return undefined;
}
|
|
151
|
+
/**
 * Format a cost as a dollar string.
 *
 * Zero and values of at least 0.01 use two decimals; any other value below
 * 0.01 uses four decimals so sub-cent amounts stay visible.
 *
 * @param cost - Cost as a number.
 * @returns The cost prefixed with "$".
 */
function formatCost(cost) {
    const decimals = cost !== 0 && cost < 0.01 ? 4 : 2;
    return `$${cost.toFixed(decimals)}`;
}
|
|
158
|
+
/**
 * Format a doc-lift value as an arrow plus a rounded, signed integer.
 *
 * @param lift - Doc lift as a number; rounded to the nearest integer first.
 * @returns An up arrow with "+N" for positive values, a down arrow with the
 *   negative integer for negative values, and "→ 0" otherwise.
 */
function liftArrow(lift) {
    const whole = Math.round(lift);
    switch (Math.sign(whole)) {
        case 1:
            return `↑ +${whole}`;
        case -1:
            return `↓ ${whole}`;
        default:
            // Covers 0, -0 and NaN.
            return "→ 0";
    }
}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/repo-threshold-evaluator.ts
|
|
3
|
+
*
|
|
4
|
+
* Evaluates per-task scores against thresholds configured in repo task
|
|
5
|
+
* definitions (`.ailf/tasks/*.yaml` → `execution.threshold`).
|
|
6
|
+
*
|
|
7
|
+
* This is distinct from the readiness-gate threshold system in
|
|
8
|
+
* `config/thresholds.yaml`. Repo thresholds are per-task, defined by
|
|
9
|
+
* the product team, and drive PR check pass/fail status. Framework
|
|
10
|
+
* thresholds are per-area, defined by the AILF team, and drive
|
|
11
|
+
* readiness reports.
|
|
12
|
+
*
|
|
13
|
+
* @see docs/exec-plans/completed/tasks-as-content/phase-6-pr-quality-gates.md
|
|
14
|
+
* @see packages/eval/src/adapters/task-sources/repo-schemas.ts
|
|
15
|
+
*/
|
|
16
|
+
import type { ScoreSummary } from "./types.js";
|
|
17
|
+
/** Threshold configuration from a repo task's execution block */
|
|
18
|
+
export interface TaskThresholdConfig {
    /** Task ID */
    taskId: string;
    /** Whether failing this threshold should block the PR */
    blocking: boolean;
    /** Minimum overall score (0–100) */
    score?: number;
    /** Per-dimension minimums; a dimension that is omitted is not checked */
    dimensions?: {
        /** Minimum task-completion score */
        taskCompletion?: number;
        /** Minimum code-correctness score */
        codeCorrectness?: number;
        /** Minimum doc-coverage score */
        docCoverage?: number;
    };
}
|
|
32
|
+
/** Result of evaluating a single task against its threshold */
|
|
33
|
+
export interface TaskThresholdResult {
|
|
34
|
+
/** Task ID */
|
|
35
|
+
taskId: string;
|
|
36
|
+
/** Feature area */
|
|
37
|
+
area: string;
|
|
38
|
+
/** Actual overall score */
|
|
39
|
+
actualScore: number;
|
|
40
|
+
/** Configured threshold (or framework default) */
|
|
41
|
+
threshold: number;
|
|
42
|
+
/** Whether the score meets or exceeds the threshold */
|
|
43
|
+
passed: boolean;
|
|
44
|
+
/** Whether this result blocks the PR */
|
|
45
|
+
blocking: boolean;
|
|
46
|
+
/** Per-dimension results (when dimension thresholds are configured) */
|
|
47
|
+
dimensionResults?: DimensionThresholdResult[];
|
|
48
|
+
/** Status classification */
|
|
49
|
+
status: "passed" | "warning" | "failed";
|
|
50
|
+
}
|
|
51
|
+
/** Result of evaluating a single dimension against its threshold */
|
|
52
|
+
export interface DimensionThresholdResult {
    /** Dimension name, e.g. "task-completion", "code-correctness" or "doc-coverage" */
    dimension: string;
    /** Actual score for this dimension */
    actual: number;
    /** Configured minimum for this dimension */
    threshold: number;
    /** Whether the actual score meets or exceeds the threshold */
    passed: boolean;
}
|
|
58
|
+
/** Aggregate result of evaluating all tasks */
|
|
59
|
+
export interface RepoThresholdEvaluation {
|
|
60
|
+
/** Whether the overall check passes (no blocking failures) */
|
|
61
|
+
checkPassed: boolean;
|
|
62
|
+
/** Individual task results */
|
|
63
|
+
results: TaskThresholdResult[];
|
|
64
|
+
/** Summary counts */
|
|
65
|
+
summary: {
|
|
66
|
+
total: number;
|
|
67
|
+
passed: number;
|
|
68
|
+
warnings: number;
|
|
69
|
+
failed: number;
|
|
70
|
+
};
|
|
71
|
+
/** Overall score (average across all evaluated tasks) */
|
|
72
|
+
overallScore: number;
|
|
73
|
+
/** Default threshold used when tasks don't specify one */
|
|
74
|
+
defaultThreshold: number;
|
|
75
|
+
}
|
|
76
|
+
/**
|
|
77
|
+
* Evaluate task scores against repo-configured thresholds.
|
|
78
|
+
*
|
|
79
|
+
* For each task with a threshold configured in its `execution` block,
|
|
80
|
+
* compare the actual score against the threshold and determine pass/fail.
|
|
81
|
+
*
|
|
82
|
+
* Tasks without explicit thresholds use the `configDefaultThreshold`
|
|
83
|
+
* (from `.ailf/config.yaml`) or the framework default (0 = no gate).
|
|
84
|
+
*
|
|
85
|
+
* @param scoreSummary - The evaluation score summary
|
|
86
|
+
* @param thresholds - Per-task threshold configs (from repo task definitions)
|
|
87
|
+
* @param configDefaultThreshold - Default threshold from .ailf/config.yaml
|
|
88
|
+
*/
|
|
89
|
+
export declare function evaluateRepoThresholds(scoreSummary: ScoreSummary, thresholds: TaskThresholdConfig[], configDefaultThreshold?: number): RepoThresholdEvaluation;
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/repo-threshold-evaluator.ts
|
|
3
|
+
*
|
|
4
|
+
* Evaluates per-task scores against thresholds configured in repo task
|
|
5
|
+
* definitions (`.ailf/tasks/*.yaml` → `execution.threshold`).
|
|
6
|
+
*
|
|
7
|
+
* This is distinct from the readiness-gate threshold system in
|
|
8
|
+
* `config/thresholds.yaml`. Repo thresholds are per-task, defined by
|
|
9
|
+
* the product team, and drive PR check pass/fail status. Framework
|
|
10
|
+
* thresholds are per-area, defined by the AILF team, and drive
|
|
11
|
+
* readiness reports.
|
|
12
|
+
*
|
|
13
|
+
* @see docs/exec-plans/completed/tasks-as-content/phase-6-pr-quality-gates.md
|
|
14
|
+
* @see packages/eval/src/adapters/task-sources/repo-schemas.ts
|
|
15
|
+
*/
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
// Constants
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
/** Framework default threshold when neither task nor config specifies one */
|
|
20
|
+
const FRAMEWORK_DEFAULT_THRESHOLD = 0;
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// Public API
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
/**
|
|
25
|
+
* Evaluate task scores against repo-configured thresholds.
|
|
26
|
+
*
|
|
27
|
+
* For each task with a threshold configured in its `execution` block,
|
|
28
|
+
* compare the actual score against the threshold and determine pass/fail.
|
|
29
|
+
*
|
|
30
|
+
* Tasks without explicit thresholds use the `configDefaultThreshold`
|
|
31
|
+
* (from `.ailf/config.yaml`) or the framework default (0 = no gate).
|
|
32
|
+
*
|
|
33
|
+
* @param scoreSummary - The evaluation score summary
|
|
34
|
+
* @param thresholds - Per-task threshold configs (from repo task definitions)
|
|
35
|
+
* @param configDefaultThreshold - Default threshold from .ailf/config.yaml
|
|
36
|
+
*/
|
|
37
|
+
/**
 * Evaluate task scores against repo-configured thresholds.
 *
 * Every task that has a score is compared against its configured threshold
 * (or `configDefaultThreshold` when the task has none). A task passes only if
 * its rounded overall score meets the threshold AND every configured
 * dimension threshold is met. A non-passing task is "failed" when its config
 * marks it blocking, otherwise "warning".
 *
 * Tasks that have a threshold config but no score are reported as not passed
 * with a placeholder score of 0 and area "unknown"; they are excluded from
 * the overall average.
 *
 * @param scoreSummary - The evaluation score summary
 * @param thresholds - Per-task threshold configs (from repo task definitions)
 * @param configDefaultThreshold - Default threshold from .ailf/config.yaml
 * @returns Aggregate evaluation: `checkPassed` (no "failed" results), the
 *   sorted `results`, `summary` counts, the rounded `overallScore` averaged
 *   over evaluated tasks, and the `defaultThreshold` that was applied.
 */
export function evaluateRepoThresholds(scoreSummary, thresholds, configDefaultThreshold = FRAMEWORK_DEFAULT_THRESHOLD) {
    const thresholdMap = new Map(thresholds.map((t) => [t.taskId, t]));
    // Build a task-to-area mapping from the score summary
    const taskScoreMap = buildTaskScoreMap(scoreSummary);
    const results = [];
    // Rounded scores of tasks that actually produced a score. Tracked
    // separately so a genuine score of 0 still counts toward the average,
    // while the placeholder 0 of unevaluated tasks does not.
    const evaluatedScores = [];
    // Evaluate each task that has scores
    for (const [taskId, taskScore] of taskScoreMap) {
        const config = thresholdMap.get(taskId);
        const threshold = config?.score ?? configDefaultThreshold;
        const blocking = config?.blocking ?? false;
        const actualScore = Math.round(taskScore.score);
        const passed = actualScore >= threshold;
        // Evaluate dimension thresholds
        let dimensionResults;
        if (config?.dimensions) {
            dimensionResults = evaluateDimensions(taskScore, config.dimensions);
        }
        const dimensionsFailed = dimensionResults?.some((d) => !d.passed) ?? false;
        const overallPassed = passed && !dimensionsFailed;
        evaluatedScores.push(actualScore);
        results.push({
            taskId,
            area: taskScore.area,
            actualScore,
            threshold,
            passed: overallPassed,
            blocking,
            dimensionResults,
            status: overallPassed ? "passed" : blocking ? "failed" : "warning",
        });
    }
    // Also add entries for threshold-configured tasks that weren't evaluated
    for (const config of thresholds) {
        if (!taskScoreMap.has(config.taskId)) {
            results.push({
                taskId: config.taskId,
                area: "unknown",
                actualScore: 0,
                threshold: config.score ?? configDefaultThreshold,
                passed: false,
                blocking: config.blocking,
                status: config.blocking ? "failed" : "warning",
            });
        }
    }
    // Sort: failed first, then warnings, then passed
    const order = { failed: 0, warning: 1, passed: 2 };
    results.sort((a, b) => order[a.status] - order[b.status]);
    const passed = results.filter((r) => r.status === "passed").length;
    const warnings = results.filter((r) => r.status === "warning").length;
    const failed = results.filter((r) => r.status === "failed").length;
    // Check passes if no blocking failures
    const checkPassed = failed === 0;
    // Overall score: mean of all evaluated tasks, 0 when nothing was evaluated
    const overallScore = evaluatedScores.length > 0
        ? Math.round(evaluatedScores.reduce((sum, score) => sum + score, 0) /
            evaluatedScores.length)
        : 0;
    return {
        checkPassed,
        results,
        summary: {
            total: results.length,
            passed,
            warnings,
            failed,
        },
        overallScore,
        defaultThreshold: configDefaultThreshold,
    };
}
|
|
110
|
+
/**
|
|
111
|
+
* Build a map of task ID → score from the ScoreSummary.
|
|
112
|
+
*
|
|
113
|
+
* ScoreSummary groups scores by feature area, not by task. We derive
|
|
114
|
+
* per-task scores from the area scores. When multiple tasks share an
|
|
115
|
+
* area, they share the area's composite score (this is a simplification;
|
|
116
|
+
* per-task scoring requires individual test results not available in
|
|
117
|
+
* ScoreSummary).
|
|
118
|
+
*/
|
|
119
|
+
/**
 * Build a map of task ID → score record from a ScoreSummary.
 *
 * The summary is grouped by feature area rather than by task, so each area's
 * `feature` name is used as the task ID and the area's scores are used as the
 * task's scores. If the same feature appears more than once, the later entry
 * replaces the earlier one.
 *
 * @param summary - Score summary; `summary.scores` is iterated.
 * @returns A Map keyed by feature name whose values hold `area`, `score`
 *   (the area's `totalScore`), `taskCompletion`, `codeCorrectness` and
 *   `docCoverage`.
 */
function buildTaskScoreMap(summary) {
    const entries = [...summary.scores].map(({ feature, totalScore, taskCompletion, codeCorrectness, docCoverage }) => [
        feature,
        {
            area: feature,
            score: totalScore,
            taskCompletion,
            codeCorrectness,
            docCoverage,
        },
    ]);
    return new Map(entries);
}
|
|
135
|
+
/**
 * Compare a task's per-dimension scores against the configured minimums.
 *
 * Only dimensions whose threshold is not `undefined` are checked. Results are
 * always in the order task-completion, code-correctness, doc-coverage, and
 * each score is rounded to the nearest integer before comparison.
 *
 * @param score - Score record with `taskCompletion`, `codeCorrectness` and
 *   `docCoverage` values.
 * @param thresholds - Per-dimension minimums keyed by the same property names.
 * @returns One `{ dimension, actual, threshold, passed }` entry per checked
 *   dimension; an empty array when no dimension threshold is set.
 */
function evaluateDimensions(score, thresholds) {
    // [property name on score/thresholds, label used in the result]
    const dimensions = [
        ["taskCompletion", "task-completion"],
        ["codeCorrectness", "code-correctness"],
        ["docCoverage", "doc-coverage"],
    ];
    const checked = [];
    for (const [key, label] of dimensions) {
        const minimum = thresholds[key];
        if (minimum === undefined) {
            continue;
        }
        const actual = Math.round(score[key]);
        checked.push({
            dimension: label,
            actual,
            threshold: minimum,
            passed: actual >= minimum,
        });
    }
    return checked;
}
|