@sanity/ailf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -0
- package/bin/ailf.js +64 -0
- package/canonical/grader-references/README.md +88 -0
- package/canonical/grader-references/groq.yaml +234 -0
- package/canonical/grader-references/studio-setup.yaml +275 -0
- package/canonical/reference-solutions/.gitkeep +1 -0
- package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
- package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
- package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
- package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
- package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
- package/canonical/reference-solutions/groq/joins-references.ts +300 -0
- package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
- package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
- package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
- package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
- package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
- package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
- package/config/bigquery/README.md +74 -0
- package/config/bigquery/views/area_scores.sql +87 -0
- package/config/bigquery/views/reports.sql +49 -0
- package/config/features.yaml +116 -0
- package/config/models.yaml +115 -0
- package/config/prompts.yaml +75 -0
- package/config/rubrics.yaml +62 -0
- package/config/schedules.yaml +43 -0
- package/config/sinks.yaml +54 -0
- package/config/sources.yaml +51 -0
- package/config/thresholds.yaml +49 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
- package/dist/_vendor/ailf-core/examples/index.js +285 -0
- package/dist/_vendor/ailf-core/index.d.ts +17 -0
- package/dist/_vendor/ailf-core/index.js +17 -0
- package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
- package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
- package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
- package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
- package/dist/_vendor/ailf-core/ports/context.js +14 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
- package/dist/_vendor/ailf-core/ports/index.js +7 -0
- package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
- package/dist/_vendor/ailf-core/ports/logger.js +11 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
- package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/schemas/index.js +16 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
- package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
- package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
- package/dist/_vendor/ailf-core/services/index.js +12 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/scoring.js +222 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
- package/dist/_vendor/ailf-core/types/index.js +21 -0
- package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
- package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
- package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
- package/dist/_vendor/ailf-shared/document-ref.js +1 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -0
- package/dist/_vendor/ailf-shared/index.js +16 -0
- package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
- package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
- package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
- package/dist/_vendor/ailf-shared/score-grades.js +23 -0
- package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
- package/dist/adapters/cache/content-lake-cache.js +59 -0
- package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
- package/dist/adapters/cache/filesystem-cache.js +54 -0
- package/dist/adapters/cache/index.d.ts +2 -0
- package/dist/adapters/cache/index.js +2 -0
- package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
- package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
- package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
- package/dist/adapters/config-sources/file-config-adapter.js +96 -0
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +2 -0
- package/dist/adapters/doc-fetchers/index.d.ts +1 -0
- package/dist/adapters/doc-fetchers/index.js +1 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
- package/dist/adapters/eval-runners/index.d.ts +1 -0
- package/dist/adapters/eval-runners/index.js +1 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
- package/dist/adapters/index.d.ts +12 -0
- package/dist/adapters/index.js +12 -0
- package/dist/adapters/loggers/console-logger.d.ts +22 -0
- package/dist/adapters/loggers/console-logger.js +54 -0
- package/dist/adapters/loggers/index.d.ts +9 -0
- package/dist/adapters/loggers/index.js +9 -0
- package/dist/adapters/loggers/json-logger.d.ts +18 -0
- package/dist/adapters/loggers/json-logger.js +33 -0
- package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
- package/dist/adapters/loggers/quiet-logger.js +30 -0
- package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/composite-task-source.js +59 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
- package/dist/adapters/task-sources/index.d.ts +7 -0
- package/dist/adapters/task-sources/index.js +7 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
- package/dist/adapters/task-sources/repo-schemas.js +234 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
- package/dist/adapters/task-sources/repo-task-source.js +104 -0
- package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
- package/dist/adapters/task-sources/repo-trigger.js +153 -0
- package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
- package/dist/adapters/task-sources/repo-validation.js +164 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
- package/dist/adapters/task-sources/yaml-task-source.js +136 -0
- package/dist/agent-observer/agentic-provider.d.ts +132 -0
- package/dist/agent-observer/agentic-provider.js +983 -0
- package/dist/agent-observer/classifier.d.ts +62 -0
- package/dist/agent-observer/classifier.js +269 -0
- package/dist/agent-observer/index.d.ts +7 -0
- package/dist/agent-observer/index.js +4 -0
- package/dist/agent-observer/pricing.d.ts +35 -0
- package/dist/agent-observer/pricing.js +82 -0
- package/dist/agent-observer/provider.d.ts +77 -0
- package/dist/agent-observer/provider.js +151 -0
- package/dist/agent-observer/proxy.d.ts +91 -0
- package/dist/agent-observer/proxy.js +321 -0
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/agent-observer/types.d.ts +137 -0
- package/dist/agent-observer/types.js +16 -0
- package/dist/assertions/source-isolation.d.ts +72 -0
- package/dist/assertions/source-isolation.js +117 -0
- package/dist/cli.d.ts +24 -0
- package/dist/cli.js +199 -0
- package/dist/commands/agent-report.d.ts +5 -0
- package/dist/commands/agent-report.js +69 -0
- package/dist/commands/baseline.d.ts +9 -0
- package/dist/commands/baseline.js +141 -0
- package/dist/commands/cache.d.ts +13 -0
- package/dist/commands/cache.js +135 -0
- package/dist/commands/calculate-scores.d.ts +8 -0
- package/dist/commands/calculate-scores.js +48 -0
- package/dist/commands/compare.d.ts +8 -0
- package/dist/commands/compare.js +120 -0
- package/dist/commands/completion.d.ts +18 -0
- package/dist/commands/completion.js +260 -0
- package/dist/commands/coverage-audit.d.ts +7 -0
- package/dist/commands/coverage-audit.js +40 -0
- package/dist/commands/discovery-report.d.ts +10 -0
- package/dist/commands/discovery-report.js +44 -0
- package/dist/commands/eval.d.ts +9 -0
- package/dist/commands/eval.js +35 -0
- package/dist/commands/explain-handler.d.ts +34 -0
- package/dist/commands/explain-handler.js +719 -0
- package/dist/commands/fetch-docs.d.ts +8 -0
- package/dist/commands/fetch-docs.js +128 -0
- package/dist/commands/generate-configs.d.ts +8 -0
- package/dist/commands/generate-configs.js +46 -0
- package/dist/commands/grader/index.d.ts +11 -0
- package/dist/commands/grader/index.js +118 -0
- package/dist/commands/init.d.ts +19 -0
- package/dist/commands/init.js +150 -0
- package/dist/commands/interactive.d.ts +12 -0
- package/dist/commands/interactive.js +238 -0
- package/dist/commands/lookup-doc.d.ts +15 -0
- package/dist/commands/lookup-doc.js +84 -0
- package/dist/commands/measure-retrieval.d.ts +5 -0
- package/dist/commands/measure-retrieval.js +65 -0
- package/dist/commands/pipeline-action.d.ts +71 -0
- package/dist/commands/pipeline-action.js +305 -0
- package/dist/commands/pipeline.d.ts +62 -0
- package/dist/commands/pipeline.js +53 -0
- package/dist/commands/pr-comment.d.ts +8 -0
- package/dist/commands/pr-comment.js +47 -0
- package/dist/commands/publish.d.ts +26 -0
- package/dist/commands/publish.js +253 -0
- package/dist/commands/readiness-report.d.ts +10 -0
- package/dist/commands/readiness-report.js +104 -0
- package/dist/commands/shared/options.d.ts +29 -0
- package/dist/commands/shared/options.js +57 -0
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +16 -0
- package/dist/commands/validate-tasks.js +93 -0
- package/dist/commands/validate.d.ts +9 -0
- package/dist/commands/validate.js +73 -0
- package/dist/commands/webhook-server.d.ts +5 -0
- package/dist/commands/webhook-server.js +30 -0
- package/dist/commands/weekly-digest.d.ts +10 -0
- package/dist/commands/weekly-digest.js +104 -0
- package/dist/composition-root.d.ts +26 -0
- package/dist/composition-root.js +107 -0
- package/dist/interpolate.d.ts +26 -0
- package/dist/interpolate.js +70 -0
- package/dist/job-store.d.ts +104 -0
- package/dist/job-store.js +188 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.d.ts +27 -0
- package/dist/orchestration/build-app-context.js +81 -0
- package/dist/orchestration/build-step-sequence.d.ts +15 -0
- package/dist/orchestration/build-step-sequence.js +84 -0
- package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
- package/dist/orchestration/config-to-source-overrides.js +28 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/index.d.ts +11 -0
- package/dist/orchestration/index.js +11 -0
- package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
- package/dist/orchestration/pipeline-orchestrator.js +153 -0
- package/dist/orchestration/step-runner.d.ts +20 -0
- package/dist/orchestration/step-runner.js +88 -0
- package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
- package/dist/orchestration/steps/calculate-scores-step.js +95 -0
- package/dist/orchestration/steps/callback-step.d.ts +24 -0
- package/dist/orchestration/steps/callback-step.js +76 -0
- package/dist/orchestration/steps/compare-step.d.ts +14 -0
- package/dist/orchestration/steps/compare-step.js +92 -0
- package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
- package/dist/orchestration/steps/discovery-report-step.js +55 -0
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
- package/dist/orchestration/steps/fetch-docs-step.js +135 -0
- package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
- package/dist/orchestration/steps/gap-analysis-step.js +136 -0
- package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
- package/dist/orchestration/steps/generate-configs-step.js +85 -0
- package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
- package/dist/orchestration/steps/grader-consistency-step.js +64 -0
- package/dist/orchestration/steps/index.d.ts +19 -0
- package/dist/orchestration/steps/index.js +19 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
- package/dist/orchestration/steps/publish-report-step.js +216 -0
- package/dist/orchestration/steps/readiness-step.d.ts +13 -0
- package/dist/orchestration/steps/readiness-step.js +91 -0
- package/dist/orchestration/steps/report-step.d.ts +12 -0
- package/dist/orchestration/steps/report-step.js +49 -0
- package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
- package/dist/orchestration/steps/run-eval-step.js +195 -0
- package/dist/orchestration/steps/validate-step.d.ts +12 -0
- package/dist/orchestration/steps/validate-step.js +41 -0
- package/dist/pipeline/agent-behavior-report.d.ts +53 -0
- package/dist/pipeline/agent-behavior-report.js +132 -0
- package/dist/pipeline/attribution.d.ts +47 -0
- package/dist/pipeline/attribution.js +226 -0
- package/dist/pipeline/baseline.d.ts +37 -0
- package/dist/pipeline/baseline.js +141 -0
- package/dist/pipeline/cache.d.ts +101 -0
- package/dist/pipeline/cache.js +283 -0
- package/dist/pipeline/calculate-scores.d.ts +102 -0
- package/dist/pipeline/calculate-scores.js +1128 -0
- package/dist/pipeline/callback-delivery.d.ts +50 -0
- package/dist/pipeline/callback-delivery.js +89 -0
- package/dist/pipeline/checks.d.ts +39 -0
- package/dist/pipeline/checks.js +280 -0
- package/dist/pipeline/classify-url.d.ts +61 -0
- package/dist/pipeline/classify-url.js +93 -0
- package/dist/pipeline/compare.d.ts +31 -0
- package/dist/pipeline/compare.js +208 -0
- package/dist/pipeline/coverage-audit.d.ts +39 -0
- package/dist/pipeline/coverage-audit.js +165 -0
- package/dist/pipeline/degradations.d.ts +85 -0
- package/dist/pipeline/degradations.js +242 -0
- package/dist/pipeline/discovery-report.d.ts +55 -0
- package/dist/pipeline/discovery-report.js +178 -0
- package/dist/pipeline/eval-constants.d.ts +68 -0
- package/dist/pipeline/eval-constants.js +111 -0
- package/dist/pipeline/eval-fingerprint.d.ts +66 -0
- package/dist/pipeline/eval-fingerprint.js +175 -0
- package/dist/pipeline/expand-tasks.d.ts +220 -0
- package/dist/pipeline/expand-tasks.js +421 -0
- package/dist/pipeline/failure-modes.d.ts +46 -0
- package/dist/pipeline/failure-modes.js +348 -0
- package/dist/pipeline/fetch-url-content.d.ts +44 -0
- package/dist/pipeline/fetch-url-content.js +93 -0
- package/dist/pipeline/gap-analysis.d.ts +48 -0
- package/dist/pipeline/gap-analysis.js +231 -0
- package/dist/pipeline/generate-configs.d.ts +72 -0
- package/dist/pipeline/generate-configs.js +395 -0
- package/dist/pipeline/grader-api.d.ts +49 -0
- package/dist/pipeline/grader-api.js +200 -0
- package/dist/pipeline/grader-compare-runner.d.ts +44 -0
- package/dist/pipeline/grader-compare-runner.js +301 -0
- package/dist/pipeline/grader-comparison.d.ts +111 -0
- package/dist/pipeline/grader-comparison.js +161 -0
- package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
- package/dist/pipeline/grader-consistency-runner.js +270 -0
- package/dist/pipeline/grader-consistency.d.ts +103 -0
- package/dist/pipeline/grader-consistency.js +146 -0
- package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
- package/dist/pipeline/grader-sensitivity-runner.js +282 -0
- package/dist/pipeline/grader-sensitivity.d.ts +94 -0
- package/dist/pipeline/grader-sensitivity.js +144 -0
- package/dist/pipeline/grader-validate-runner.d.ts +38 -0
- package/dist/pipeline/grader-validate-runner.js +229 -0
- package/dist/pipeline/grader-validation.d.ts +107 -0
- package/dist/pipeline/grader-validation.js +169 -0
- package/dist/pipeline/map-request-to-config.d.ts +19 -0
- package/dist/pipeline/map-request-to-config.js +80 -0
- package/dist/pipeline/measure-retrieval.d.ts +59 -0
- package/dist/pipeline/measure-retrieval.js +111 -0
- package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
- package/dist/pipeline/mirror-repo-tasks.js +350 -0
- package/dist/pipeline/plan-format.d.ts +33 -0
- package/dist/pipeline/plan-format.js +202 -0
- package/dist/pipeline/plan.d.ts +169 -0
- package/dist/pipeline/plan.js +708 -0
- package/dist/pipeline/pr-comment.d.ts +19 -0
- package/dist/pipeline/pr-comment.js +502 -0
- package/dist/pipeline/probe.d.ts +52 -0
- package/dist/pipeline/probe.js +390 -0
- package/dist/pipeline/provenance.d.ts +47 -0
- package/dist/pipeline/provenance.js +146 -0
- package/dist/pipeline/readiness-report.d.ts +87 -0
- package/dist/pipeline/readiness-report.js +205 -0
- package/dist/pipeline/release-classification.d.ts +54 -0
- package/dist/pipeline/release-classification.js +238 -0
- package/dist/pipeline/release-report.d.ts +37 -0
- package/dist/pipeline/release-report.js +222 -0
- package/dist/pipeline/repo-eval-comment.d.ts +37 -0
- package/dist/pipeline/repo-eval-comment.js +165 -0
- package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
- package/dist/pipeline/repo-threshold-evaluator.js +162 -0
- package/dist/pipeline/resolve-mappings.d.ts +35 -0
- package/dist/pipeline/resolve-mappings.js +72 -0
- package/dist/pipeline/retrieval-metrics.d.ts +39 -0
- package/dist/pipeline/retrieval-metrics.js +136 -0
- package/dist/pipeline/reverse-mapping.d.ts +67 -0
- package/dist/pipeline/reverse-mapping.js +88 -0
- package/dist/pipeline/schemas.d.ts +9 -0
- package/dist/pipeline/schemas.js +9 -0
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +88 -0
- package/dist/pipeline/targeted-loo.js +203 -0
- package/dist/pipeline/thresholds.d.ts +27 -0
- package/dist/pipeline/thresholds.js +245 -0
- package/dist/pipeline/types.d.ts +10 -0
- package/dist/pipeline/types.js +10 -0
- package/dist/pipeline/validate.d.ts +67 -0
- package/dist/pipeline/validate.js +406 -0
- package/dist/pipeline/webhook-server.d.ts +37 -0
- package/dist/pipeline/webhook-server.js +133 -0
- package/dist/report-store.d.ts +84 -0
- package/dist/report-store.js +208 -0
- package/dist/sanity/client.d.ts +38 -0
- package/dist/sanity/client.js +86 -0
- package/dist/sanity/portable-text.d.ts +11 -0
- package/dist/sanity/portable-text.js +211 -0
- package/dist/sanity/queries.d.ts +133 -0
- package/dist/sanity/queries.js +300 -0
- package/dist/schedules/digest.d.ts +116 -0
- package/dist/schedules/digest.js +156 -0
- package/dist/schedules/index.d.ts +12 -0
- package/dist/schedules/index.js +10 -0
- package/dist/schedules/loader.d.ts +31 -0
- package/dist/schedules/loader.js +73 -0
- package/dist/schedules/schema.d.ts +9 -0
- package/dist/schedules/schema.js +9 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +21 -0
- package/dist/scripts/validate-task-sources.js +210 -0
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/bigquery/index.d.ts +131 -0
- package/dist/sinks/bigquery/index.js +222 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/index.d.ts +23 -0
- package/dist/sinks/index.js +18 -0
- package/dist/sinks/loader.d.ts +18 -0
- package/dist/sinks/loader.js +82 -0
- package/dist/sinks/retry.d.ts +24 -0
- package/dist/sinks/retry.js +52 -0
- package/dist/sinks/schema.d.ts +9 -0
- package/dist/sinks/schema.js +9 -0
- package/dist/sinks/slack/format.d.ts +65 -0
- package/dist/sinks/slack/format.js +327 -0
- package/dist/sinks/slack/index.d.ts +27 -0
- package/dist/sinks/slack/index.js +78 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +59 -0
- package/dist/sinks/types.js +44 -0
- package/dist/sinks/webhook/index.d.ts +19 -0
- package/dist/sinks/webhook/index.js +50 -0
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/sources.d.ts +104 -0
- package/dist/sources.js +292 -0
- package/dist/webhook/budget.d.ts +42 -0
- package/dist/webhook/budget.js +60 -0
- package/dist/webhook/debounce.d.ts +67 -0
- package/dist/webhook/debounce.js +76 -0
- package/dist/webhook/dispatch.d.ts +45 -0
- package/dist/webhook/dispatch.js +84 -0
- package/dist/webhook/eval-request-handler.d.ts +87 -0
- package/dist/webhook/eval-request-handler.js +181 -0
- package/dist/webhook/handler.d.ts +88 -0
- package/dist/webhook/handler.js +203 -0
- package/dist/webhook/index.d.ts +17 -0
- package/dist/webhook/index.js +12 -0
- package/dist/webhook/types.d.ts +109 -0
- package/dist/webhook/types.js +10 -0
- package/package.json +72 -0
- package/tasks/.expanded.agentic.yaml +51 -0
- package/tasks/.expanded.yaml +66 -0
- package/tasks/frameworks.yaml +98 -0
- package/tasks/functions.yaml +51 -0
- package/tasks/groq.yaml +216 -0
- package/tasks/nextjs-live.yaml +62 -0
- package/tasks/studio-setup.yaml +111 -0
- package/tasks/visual-editing.yaml +120 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/pr-comment.ts — Generates a markdown PR comment from eval score-summary.json.
|
|
3
|
+
*
|
|
4
|
+
* All functions accept rootDir as a parameter — no module-level constants.
|
|
5
|
+
* No process.argv parsing. No env var fallbacks.
|
|
6
|
+
*
|
|
7
|
+
* Reads: results/latest/score-summary.json
|
|
8
|
+
* Writes: markdown to stdout, or to the file given by the `outputPath` option
|
|
9
|
+
*/
|
|
10
|
+
/** Options for the generatePrComment() function. */
|
|
11
|
+
export interface PrCommentOptions {
|
|
12
|
+
/** Optional file path to write the generated markdown comment to; when omitted, the comment goes to stdout. */
|
|
13
|
+
outputPath?: string;
|
|
14
|
+
/** Optional Promptfoo share URL to include in the comment. */
|
|
15
|
+
promptfooUrl?: string;
|
|
16
|
+
/** Root directory of the eval package (required). NOTE(review): presumably results/latest/score-summary.json is resolved against this — confirm in the implementation. */
|
|
17
|
+
rootDir: string;
|
|
18
|
+
}
|
|
19
|
+
/**
 * Generates the markdown PR comment for an eval run.
 *
 * Per this module's header, the comment is built from
 * results/latest/score-summary.json and written as markdown either to stdout
 * or to an output file. The declared return type is `void`, so the result is
 * delivered through that output rather than returned to the caller.
 *
 * @param options - Settings for the run; only `rootDir` is required
 *   (see {@link PrCommentOptions}).
 */
export declare function generatePrComment(options: PrCommentOptions): void;
|
|
@@ -0,0 +1,502 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/pr-comment.ts — Generates a markdown PR comment from eval score-summary.json.
|
|
3
|
+
*
|
|
4
|
+
* All functions accept rootDir as a parameter — no module-level constants.
|
|
5
|
+
* No process.argv parsing. No env var fallbacks.
|
|
6
|
+
*
|
|
7
|
+
* Reads: results/latest/score-summary.json
|
|
8
|
+
* Writes: markdown to stdout, or to the file given by the `outputPath` option
|
|
9
|
+
*/
|
|
10
|
+
import { existsSync, readFileSync, writeFileSync } from "node:fs";
|
|
11
|
+
import { resolve } from "node:path";
|
|
12
|
+
/**
 * Formats a cost as a "$"-prefixed string for the PR comment.
 *
 * - Exactly zero renders as "$0.00".
 * - Magnitudes below 0.01 use four decimals so very small amounts stay visible.
 * - Everything else uses two decimals.
 * - Negative amounts render with the sign before the currency symbol
 *   ("-$5.00"). Previously any negative value fell into the "< 0.01" branch
 *   and came out as "$-5.0000".
 *
 * @param {number} cost - Amount to format.
 * @returns {string} The formatted amount.
 */
function formatCost(cost) {
    if (cost === 0) {
        return "$0.00";
    }
    // Choose precision from the magnitude so the sign cannot affect which
    // branch is taken.
    const sign = cost < 0 ? "-" : "";
    const magnitude = Math.abs(cost);
    const decimals = magnitude < 0.01 ? 4 : 2;
    return `${sign}$${magnitude.toFixed(decimals)}`;
}
|
|
21
|
+
function generateComment(summary, options = {}) {
|
|
22
|
+
const { belowCritical, overall, scores, source, timestamp } = summary;
|
|
23
|
+
const sorted = [...scores].sort((a, b) => b.totalScore - a.totalScore);
|
|
24
|
+
const lines = [];
|
|
25
|
+
// Header
|
|
26
|
+
lines.push(`## ${overallEmoji(overall.avgScore)} AI Literacy Score Report`);
|
|
27
|
+
lines.push("");
|
|
28
|
+
const totalCost = scores.reduce((sum, s) => sum + (s.totalCost ?? 0), 0);
|
|
29
|
+
const graderCostValue = overall.cost?.graderTotal ?? 0;
|
|
30
|
+
const combinedCostValue = totalCost + graderCostValue;
|
|
31
|
+
const costStr = combinedCostValue > 0 ? ` · Cost: ${formatCost(combinedCostValue)}` : "";
|
|
32
|
+
const actualStr = overall.avgActualScore !== undefined
|
|
33
|
+
? ` · Actual: ${Math.round(overall.avgActualScore)}/100`
|
|
34
|
+
: "";
|
|
35
|
+
const retGapStr = overall.avgRetrievalGap !== undefined
|
|
36
|
+
? ` · Ret. Gap: ${Math.round(overall.avgRetrievalGap)}`
|
|
37
|
+
: "";
|
|
38
|
+
lines.push(`**Overall: ${Math.round(overall.avgScore)}/100** · Doc Lift: +${Math.round(overall.avgDocLift)}${actualStr}${retGapStr} · ${scores.reduce((sum, s) => sum + s.testCount, 0)} tests across ${scores.length} areas${costStr}`);
|
|
39
|
+
lines.push("");
|
|
40
|
+
// Critical warnings
|
|
41
|
+
if (belowCritical.length > 0) {
|
|
42
|
+
lines.push(`> ⚠️ **Below critical threshold:** ${belowCritical.map((a) => `\`${a}\``).join(", ")}`);
|
|
43
|
+
lines.push("");
|
|
44
|
+
}
|
|
45
|
+
// Environment info
|
|
46
|
+
if (source) {
|
|
47
|
+
lines.push("<details>");
|
|
48
|
+
lines.push("<summary>🔧 Environment</summary>");
|
|
49
|
+
lines.push("");
|
|
50
|
+
lines.push("| Setting | Value |");
|
|
51
|
+
lines.push("|---------|-------|");
|
|
52
|
+
lines.push(`| **Source** | ${source.name} |`);
|
|
53
|
+
lines.push(`| **Docs URL** | ${source.baseUrl} |`);
|
|
54
|
+
if (source.dataset) {
|
|
55
|
+
lines.push(`| **Dataset** | ${source.dataset} |`);
|
|
56
|
+
}
|
|
57
|
+
lines.push(`| **Project** | ${source.projectId} |`);
|
|
58
|
+
lines.push("");
|
|
59
|
+
lines.push("</details>");
|
|
60
|
+
lines.push("");
|
|
61
|
+
}
|
|
62
|
+
// Source verification
|
|
63
|
+
const { sourceIsolation, sourceVerification } = summary;
|
|
64
|
+
if (sourceVerification || sourceIsolation) {
|
|
65
|
+
lines.push("<details>");
|
|
66
|
+
lines.push("<summary>🔍 Source verification</summary>");
|
|
67
|
+
lines.push("");
|
|
68
|
+
lines.push("| Setting | Value |");
|
|
69
|
+
lines.push("|---------|-------|");
|
|
70
|
+
if (sourceVerification) {
|
|
71
|
+
lines.push(`| **Source** | ${sourceVerification.source} |`);
|
|
72
|
+
lines.push(`| **Mode** | ${sourceVerification.mode} |`);
|
|
73
|
+
if (sourceVerification.allowedOrigins) {
|
|
74
|
+
lines.push(`| **Sandbox** | ${sourceVerification.allowedOrigins.join(", ")} |`);
|
|
75
|
+
}
|
|
76
|
+
if (sourceVerification.searchMode) {
|
|
77
|
+
lines.push(`| **Search** | ${sourceVerification.searchMode} |`);
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
if (sourceIsolation) {
|
|
81
|
+
const pct = Math.round(sourceIsolation.isolationScore * 100);
|
|
82
|
+
const icon = sourceIsolation.offOrigin === 0 ? "✅" : "⚠️";
|
|
83
|
+
lines.push(`| **Agent isolation** | ${icon} ${pct}% (${sourceIsolation.onOrigin}/${sourceIsolation.total} on-origin) |`);
|
|
84
|
+
if (sourceIsolation.offOrigin > 0) {
|
|
85
|
+
lines.push(`| **Off-origin fetches** | ${sourceIsolation.offOriginUrls.slice(0, 5).join(", ")} |`);
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
if (sourceVerification?.urlFetch) {
|
|
89
|
+
const uf = sourceVerification.urlFetch;
|
|
90
|
+
lines.push(`| **URL fetch** | ${uf.totalFetched} fetched, ${uf.totalFailed} failed |`);
|
|
91
|
+
for (const f of uf.fetchedUrls) {
|
|
92
|
+
lines.push(`| | ✅ ${f.url} (via ${f.method}) |`);
|
|
93
|
+
}
|
|
94
|
+
for (const f of uf.failures) {
|
|
95
|
+
// oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty error string should show "unknown"
|
|
96
|
+
lines.push(`| | ⚠️ ${f.url}: ${f.error || "unknown"} |`);
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
lines.push("");
|
|
100
|
+
lines.push("</details>");
|
|
101
|
+
lines.push("");
|
|
102
|
+
}
|
|
103
|
+
// Score table
|
|
104
|
+
lines.push("### Scores by Feature Area");
|
|
105
|
+
lines.push("");
|
|
106
|
+
lines.push("| Feature | Score | Grade | Task | Code | Docs | Doc Lift | Tests |");
|
|
107
|
+
lines.push("|---------|-------|-------|------|------|------|----------|-------|");
|
|
108
|
+
for (const s of sorted) {
|
|
109
|
+
lines.push(`| ${s.feature} | **${Math.round(s.totalScore)}** | ${gradeEmoji(s.totalScore)} ${gradeLetter(s.totalScore)} | ${Math.round(s.taskCompletion)} | ${Math.round(s.codeCorrectness)} | ${Math.round(s.docCoverage)} | ${liftArrow(s.docLift)} | ${s.testCount} |`);
|
|
110
|
+
}
|
|
111
|
+
lines.push("");
|
|
112
|
+
// Breakdown legend
|
|
113
|
+
lines.push("<details>");
|
|
114
|
+
lines.push("<summary>📊 Score breakdown legend</summary>");
|
|
115
|
+
lines.push("");
|
|
116
|
+
lines.push("| Column | Description |");
|
|
117
|
+
lines.push("|--------|-------------|");
|
|
118
|
+
lines.push("| **Score** | Overall AI literacy score (0–100) — weighted sum of Task + Code + Docs |");
|
|
119
|
+
lines.push("| **Task** | Task completion — does the LLM understand what to build? |");
|
|
120
|
+
lines.push("| **Code** | Code correctness — does the generated code actually work? |");
|
|
121
|
+
lines.push("| **Docs** | Documentation coverage — are the right APIs/patterns referenced? |");
|
|
122
|
+
lines.push("| **Doc Lift** | Score improvement when docs are provided vs baseline (no docs) |");
|
|
123
|
+
lines.push("| **Grade** | ✅ A (≥80) · 🟡 B (≥70) · 🟠 C (≥50) · 🔴 D (<50) |");
|
|
124
|
+
lines.push("");
|
|
125
|
+
lines.push("</details>");
|
|
126
|
+
lines.push("");
|
|
127
|
+
// Negative Doc Lift warning (always visible, not collapsible)
|
|
128
|
+
const negDocLiftAreas = sorted.filter((s) => s.negativeDocLift);
|
|
129
|
+
if (negDocLiftAreas.length > 0) {
|
|
130
|
+
for (const s of negDocLiftAreas) {
|
|
131
|
+
lines.push(`> 🚨 **Negative Doc Lift:** \`${s.feature}\` (${s.docLift}) — docs hurt performance. Floor: ${s.floorScore}, Ceiling: ${s.ceilingScore}`);
|
|
132
|
+
}
|
|
133
|
+
lines.push("");
|
|
134
|
+
}
|
|
135
|
+
// Three-layer decomposition (when actual scores are present)
|
|
136
|
+
const hasActualScores = sorted.some((s) => s.actualScore !== undefined);
|
|
137
|
+
if (hasActualScores) {
|
|
138
|
+
// Full decomposition — show all three layers prominently
|
|
139
|
+
lines.push("### 🔬 Three-Layer Decomposition");
|
|
140
|
+
lines.push("");
|
|
141
|
+
lines.push("| Feature | Floor | Ceiling | Actual | Doc Lift | Retr. Gap | Infra % |");
|
|
142
|
+
lines.push("|---------|-------|---------|--------|----------|-----------|---------|");
|
|
143
|
+
for (const s of sorted) {
|
|
144
|
+
const actualStr = s.actualScore !== undefined ? String(s.actualScore) : "—";
|
|
145
|
+
const gapStr = s.retrievalGap !== undefined
|
|
146
|
+
? s.retrievalGap >= 0
|
|
147
|
+
? `+${s.retrievalGap}`
|
|
148
|
+
: String(s.retrievalGap)
|
|
149
|
+
: "—";
|
|
150
|
+
const infraStr = s.infrastructureEfficiency != null
|
|
151
|
+
? `${Math.round(s.infrastructureEfficiency * 100)}%`
|
|
152
|
+
: "—";
|
|
153
|
+
const flag = s.invertedRetrievalGap ? " 🔄" : "";
|
|
154
|
+
lines.push(`| ${s.feature} | ${s.floorScore} | ${s.ceilingScore} | ${actualStr} | ${liftArrow(s.docLift)} | ${gapStr}${flag} | ${infraStr} |`);
|
|
155
|
+
}
|
|
156
|
+
lines.push("");
|
|
157
|
+
// Decomposition guide
|
|
158
|
+
lines.push("<details>");
|
|
159
|
+
lines.push("<summary>📖 What do these numbers mean?</summary>");
|
|
160
|
+
lines.push("");
|
|
161
|
+
lines.push("- **Floor:** How well models do without any documentation (just training data)");
|
|
162
|
+
lines.push("- **Ceiling:** How well models do with perfect documentation (hand-picked, injected)");
|
|
163
|
+
lines.push("- **Actual:** How well models do when they have to find docs themselves (like real users)");
|
|
164
|
+
lines.push("- **Doc Lift:** How much docs help (Ceiling − Floor). Negative = docs hurt");
|
|
165
|
+
lines.push("- **Retr. Gap:** How much quality is lost in discovery (Ceiling − Actual)");
|
|
166
|
+
lines.push("- **Infra %:** What fraction of doc quality reaches agents (Actual ÷ Ceiling)");
|
|
167
|
+
if (sorted.some((s) => s.invertedRetrievalGap)) {
|
|
168
|
+
lines.push("- **🔄:** Inverted retrieval gap — agents avoid bad docs, scoring higher than ceiling");
|
|
169
|
+
}
|
|
170
|
+
lines.push("");
|
|
171
|
+
lines.push("</details>");
|
|
172
|
+
lines.push("");
|
|
173
|
+
}
|
|
174
|
+
else {
|
|
175
|
+
// Baseline-only — show the existing ceiling decomposition
|
|
176
|
+
lines.push("<details>");
|
|
177
|
+
lines.push("<summary>📊 Ceiling decomposition</summary>");
|
|
178
|
+
lines.push("");
|
|
179
|
+
lines.push("| Feature | Floor | Ceiling | Doc Lift | Quality Gap |");
|
|
180
|
+
lines.push("|---------|-------|---------|----------|-------------|");
|
|
181
|
+
for (const s of sorted) {
|
|
182
|
+
lines.push(`| ${s.feature} | ${s.floorScore} | ${s.ceilingScore} | ${liftArrow(s.docLift)} | ${s.docQualityGap} |`);
|
|
183
|
+
}
|
|
184
|
+
lines.push("");
|
|
185
|
+
lines.push("</details>");
|
|
186
|
+
lines.push("");
|
|
187
|
+
// With vs without docs comparison (uses ceiling/floor model)
|
|
188
|
+
lines.push("<details>");
|
|
189
|
+
lines.push("<summary>📄 With docs vs without docs</summary>");
|
|
190
|
+
lines.push("");
|
|
191
|
+
lines.push("| Feature | With Docs | Without Docs | Lift |");
|
|
192
|
+
lines.push("|---------|-----------|--------------|------|");
|
|
193
|
+
for (const s of sorted) {
|
|
194
|
+
lines.push(`| ${s.feature} | ${s.ceilingScore} | ${s.floorScore} | ${liftArrow(s.docLift)} |`);
|
|
195
|
+
}
|
|
196
|
+
lines.push("");
|
|
197
|
+
lines.push("</details>");
|
|
198
|
+
lines.push("");
|
|
199
|
+
}
|
|
200
|
+
// Cost breakdown (only when cost data is available)
|
|
201
|
+
if (totalCost > 0 || overall.cost) {
|
|
202
|
+
const graderCost = overall.cost?.graderTotal ?? 0;
|
|
203
|
+
const combinedCost = totalCost + graderCost;
|
|
204
|
+
lines.push("<details>");
|
|
205
|
+
lines.push("<summary>💰 Eval cost breakdown</summary>");
|
|
206
|
+
lines.push("");
|
|
207
|
+
lines.push("| Category | Cost |");
|
|
208
|
+
lines.push("|----------|------|");
|
|
209
|
+
lines.push(`| Provider (model inference) | ${formatCost(totalCost)} |`);
|
|
210
|
+
if (graderCost > 0) {
|
|
211
|
+
const graderLabel = overall.cost?.graderModel ?? "unknown";
|
|
212
|
+
lines.push(`| Grader (${graderLabel}) | ${formatCost(graderCost)} |`);
|
|
213
|
+
}
|
|
214
|
+
lines.push(`| **Total** | **${formatCost(combinedCost)}** |`);
|
|
215
|
+
lines.push("");
|
|
216
|
+
// Per-feature provider cost breakdown
|
|
217
|
+
lines.push("**Provider cost by feature area:**");
|
|
218
|
+
lines.push("");
|
|
219
|
+
lines.push("| Feature | Tests | Cost | Avg/Test |");
|
|
220
|
+
lines.push("|---------|-------|------|----------|");
|
|
221
|
+
for (const s of sorted) {
|
|
222
|
+
const avgCost = s.testCount > 0 ? s.totalCost / s.testCount : 0;
|
|
223
|
+
lines.push(`| ${s.feature} | ${s.testCount} | ${formatCost(s.totalCost)} | ${formatCost(avgCost)} |`);
|
|
224
|
+
}
|
|
225
|
+
lines.push("");
|
|
226
|
+
lines.push("</details>");
|
|
227
|
+
lines.push("");
|
|
228
|
+
}
|
|
229
|
+
// Per-model breakdown (when multiple models were evaluated)
|
|
230
|
+
if (summary.perModel && summary.perModel.length > 1) {
|
|
231
|
+
const sorted = [...summary.perModel].sort((a, b) => b.overall.avgScore - a.overall.avgScore);
|
|
232
|
+
lines.push("<details>");
|
|
233
|
+
lines.push("<summary>🤖 Per-model scores</summary>");
|
|
234
|
+
lines.push("");
|
|
235
|
+
lines.push("| Model | Score | Doc Lift | Tests | Cost |");
|
|
236
|
+
lines.push("|-------|-------|----------|-------|------|");
|
|
237
|
+
for (const entry of sorted) {
|
|
238
|
+
const displayName = entry.label || entry.modelId;
|
|
239
|
+
const costStr = entry.overall.cost ? formatCost(entry.overall.cost) : "—";
|
|
240
|
+
const liftStr = entry.overall.avgDocLift >= 0
|
|
241
|
+
? `+${Math.round(entry.overall.avgDocLift)}`
|
|
242
|
+
: String(Math.round(entry.overall.avgDocLift));
|
|
243
|
+
lines.push(`| ${displayName} | **${Math.round(entry.overall.avgScore)}** | ${liftStr} | ${entry.overall.testCount} | ${costStr} |`);
|
|
244
|
+
}
|
|
245
|
+
lines.push("");
|
|
246
|
+
// Per-model × per-area table
|
|
247
|
+
for (const entry of sorted) {
|
|
248
|
+
const displayName = entry.label || entry.modelId;
|
|
249
|
+
lines.push(`**${displayName}** (${entry.modelId}):`);
|
|
250
|
+
lines.push("");
|
|
251
|
+
lines.push("| Feature | Score | Task | Code | Docs | Lift |");
|
|
252
|
+
lines.push("|---------|-------|------|------|------|------|");
|
|
253
|
+
for (const s of entry.scores) {
|
|
254
|
+
const lift = s.docLift >= 0 ? `+${s.docLift}` : String(s.docLift);
|
|
255
|
+
lines.push(`| ${s.feature} | **${s.totalScore}** | ${s.taskCompletion} | ${s.codeCorrectness} | ${s.docCoverage} | ${lift} |`);
|
|
256
|
+
}
|
|
257
|
+
lines.push("");
|
|
258
|
+
}
|
|
259
|
+
// Cost per quality point
|
|
260
|
+
const withCost = sorted.filter((e) => e.overall.cost && e.overall.cost > 0);
|
|
261
|
+
if (withCost.length > 0) {
|
|
262
|
+
lines.push("**Cost efficiency:**");
|
|
263
|
+
lines.push("");
|
|
264
|
+
lines.push("| Model | $/point | Score | Cost |");
|
|
265
|
+
lines.push("|-------|---------|-------|------|");
|
|
266
|
+
for (const entry of withCost) {
|
|
267
|
+
const costPerPoint = entry.overall.avgScore > 0
|
|
268
|
+
? (entry.overall.cost ?? 0) / entry.overall.avgScore
|
|
269
|
+
: 0;
|
|
270
|
+
lines.push(`| ${entry.label} | ${formatCost(costPerPoint)} | ${Math.round(entry.overall.avgScore)} | ${formatCost(entry.overall.cost ?? 0)} |`);
|
|
271
|
+
}
|
|
272
|
+
lines.push("");
|
|
273
|
+
}
|
|
274
|
+
lines.push("</details>");
|
|
275
|
+
lines.push("");
|
|
276
|
+
}
|
|
277
|
+
// Comparison section (when --compare was used)
|
|
278
|
+
if (options.comparisonReport) {
|
|
279
|
+
const report = options.comparisonReport;
|
|
280
|
+
const overallDelta = report.deltas.overall;
|
|
281
|
+
const overallDeltaStr = overallDelta > 0
|
|
282
|
+
? `+${Math.round(overallDelta)}`
|
|
283
|
+
: String(Math.round(overallDelta));
|
|
284
|
+
const overallChangeIcon = overallDelta > report.noiseThreshold
|
|
285
|
+
? "📈"
|
|
286
|
+
: overallDelta < -report.noiseThreshold
|
|
287
|
+
? "📉"
|
|
288
|
+
: "➡️";
|
|
289
|
+
lines.push("### 📊 Score Comparison");
|
|
290
|
+
lines.push("");
|
|
291
|
+
lines.push(`**Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)}** (${overallChangeIcon} ${overallDeltaStr})`);
|
|
292
|
+
lines.push("");
|
|
293
|
+
// Check if comparison data includes actual/retrieval gap deltas
|
|
294
|
+
const hasActualDeltas = report.areas.some((a) => a.actualDelta !== undefined);
|
|
295
|
+
if (hasActualDeltas) {
|
|
296
|
+
lines.push("| Feature | Baseline | Current | Delta | Actual Δ | Ret. Gap Δ | Infra Δ |");
|
|
297
|
+
lines.push("|---------|----------|---------|-------|----------|------------|---------|");
|
|
298
|
+
for (const a of report.areas) {
|
|
299
|
+
const icon = a.change === "improved"
|
|
300
|
+
? "📈"
|
|
301
|
+
: a.change === "regressed"
|
|
302
|
+
? "📉"
|
|
303
|
+
: "➡️";
|
|
304
|
+
const d = (n) => n > 0 ? `+${Math.round(n)}` : String(Math.round(n));
|
|
305
|
+
const actualStr = a.actualDelta !== undefined ? d(a.actualDelta) : "—";
|
|
306
|
+
const retGapStr = a.retrievalGapDelta !== undefined ? d(a.retrievalGapDelta) : "—";
|
|
307
|
+
const infraStr = a.infrastructureEfficiencyDelta !== undefined
|
|
308
|
+
? `${a.infrastructureEfficiencyDelta > 0 ? "+" : ""}${Math.round(a.infrastructureEfficiencyDelta * 100)}pp`
|
|
309
|
+
: "—";
|
|
310
|
+
lines.push(`| ${a.area} | ${a.baseline} | ${a.experiment} | ${icon} ${d(a.delta)} | ${actualStr} | ${retGapStr} | ${infraStr} |`);
|
|
311
|
+
}
|
|
312
|
+
}
|
|
313
|
+
else {
|
|
314
|
+
lines.push("| Feature | Baseline | Current | Delta | Task | Code | Docs |");
|
|
315
|
+
lines.push("|---------|----------|---------|-------|------|------|------|");
|
|
316
|
+
for (const a of report.areas) {
|
|
317
|
+
const icon = a.change === "improved"
|
|
318
|
+
? "📈"
|
|
319
|
+
: a.change === "regressed"
|
|
320
|
+
? "📉"
|
|
321
|
+
: "➡️";
|
|
322
|
+
const d = (n) => n > 0 ? `+${Math.round(n)}` : String(Math.round(n));
|
|
323
|
+
lines.push(`| ${a.area} | ${a.baseline} | ${a.experiment} | ${icon} ${d(a.delta)} | ${d(a.dimensions.taskCompletion.delta)} | ${d(a.dimensions.codeCorrectness.delta)} | ${d(a.dimensions.docCoverage.delta)} |`);
|
|
324
|
+
}
|
|
325
|
+
}
|
|
326
|
+
lines.push("");
|
|
327
|
+
const parts = [];
|
|
328
|
+
if (report.improved.length > 0) {
|
|
329
|
+
parts.push(`📈 ${report.improved.length} improved`);
|
|
330
|
+
}
|
|
331
|
+
if (report.regressed.length > 0) {
|
|
332
|
+
parts.push(`📉 ${report.regressed.length} regressed`);
|
|
333
|
+
}
|
|
334
|
+
if (report.unchanged.length > 0) {
|
|
335
|
+
parts.push(`➡️ ${report.unchanged.length} unchanged`);
|
|
336
|
+
}
|
|
337
|
+
if (parts.length > 0) {
|
|
338
|
+
const isEmpirical = "noiseThresholdEmpirical" in report &&
|
|
339
|
+
report.noiseThresholdEmpirical === true;
|
|
340
|
+
const thresholdNote = isEmpirical
|
|
341
|
+
? ` (empirical threshold: ±${report.noiseThreshold.toFixed(1)})`
|
|
342
|
+
: ` (threshold: ±${report.noiseThreshold})`;
|
|
343
|
+
lines.push(parts.join(" · ") + thresholdNote);
|
|
344
|
+
lines.push("");
|
|
345
|
+
}
|
|
346
|
+
// Dimension averages in collapsible
|
|
347
|
+
lines.push("<details>");
|
|
348
|
+
lines.push("<summary>Dimension averages</summary>");
|
|
349
|
+
lines.push("");
|
|
350
|
+
const dim = report.deltas.perDimension;
|
|
351
|
+
const dd = (n) => n > 0 ? `+${Math.round(n)}` : String(Math.round(n));
|
|
352
|
+
lines.push("| Dimension | Delta |");
|
|
353
|
+
lines.push("|-----------|-------|");
|
|
354
|
+
lines.push(`| Task Completion | ${dd(dim.taskCompletion)} |`);
|
|
355
|
+
lines.push(`| Code Correctness | ${dd(dim.codeCorrectness)} |`);
|
|
356
|
+
lines.push(`| Doc Coverage | ${dd(dim.docCoverage)} |`);
|
|
357
|
+
lines.push(`| Doc Lift | ${dd(report.deltas.docLift)} |`);
|
|
358
|
+
lines.push("");
|
|
359
|
+
lines.push("</details>");
|
|
360
|
+
lines.push("");
|
|
361
|
+
}
|
|
362
|
+
// Recommendations
|
|
363
|
+
if (belowCritical.length > 0 || sorted.some((s) => s.totalScore < 70)) {
|
|
364
|
+
lines.push("### 💡 Recommendations");
|
|
365
|
+
lines.push("");
|
|
366
|
+
for (const s of sorted) {
|
|
367
|
+
if (s.totalScore < 50) {
|
|
368
|
+
lines.push(`- 🔴 **${s.feature}** (score: ${Math.round(s.totalScore)}) — needs significant doc improvements. `);
|
|
369
|
+
if (s.codeCorrectness < 10) {
|
|
370
|
+
lines.push(` Code correctness is very low (${Math.round(s.codeCorrectness)}) — add more complete code examples.`);
|
|
371
|
+
}
|
|
372
|
+
if (s.docCoverage < 10) {
|
|
373
|
+
lines.push(` Doc coverage is very low (${Math.round(s.docCoverage)}) — key APIs/patterns may be missing from docs.`);
|
|
374
|
+
}
|
|
375
|
+
}
|
|
376
|
+
else if (s.totalScore < 70) {
|
|
377
|
+
lines.push(`- 🟠 **${s.feature}** (score: ${Math.round(s.totalScore)}) — has room for improvement.`);
|
|
378
|
+
if (s.codeCorrectness < 15) {
|
|
379
|
+
lines.push(` Code correctness (${Math.round(s.codeCorrectness)}) could improve with better code examples.`);
|
|
380
|
+
}
|
|
381
|
+
}
|
|
382
|
+
}
|
|
383
|
+
lines.push("");
|
|
384
|
+
}
|
|
385
|
+
// Footer
|
|
386
|
+
const date = new Date(timestamp).toLocaleString("en-US", {
|
|
387
|
+
day: "numeric",
|
|
388
|
+
hour: "numeric",
|
|
389
|
+
minute: "2-digit",
|
|
390
|
+
month: "short",
|
|
391
|
+
timeZone: "UTC",
|
|
392
|
+
timeZoneName: "short",
|
|
393
|
+
year: "numeric",
|
|
394
|
+
});
|
|
395
|
+
lines.push("---");
|
|
396
|
+
const runUrl = process.env.GITHUB_RUN_URL ?? "";
|
|
397
|
+
const runLink = runUrl ? ` · <a href="${runUrl}">view run</a>` : "";
|
|
398
|
+
const promptfooLink = options.promptfooUrl
|
|
399
|
+
? ` · <a href="${options.promptfooUrl}">view detailed results</a>`
|
|
400
|
+
: "";
|
|
401
|
+
lines.push(`<sub>🤖 Generated by AI Literacy Framework · ${date}${runLink}${promptfooLink} · <a href="https://github.com/sanity-labs/ai-literacy-framework">docs</a></sub>`);
|
|
402
|
+
return lines.join("\n");
|
|
403
|
+
}
|
|
404
|
+
/**
 * Pick the status emoji for a score.
 *
 * Bands are checked from highest to lowest: ≥80 → ✅, ≥70 → 🟡, ≥50 → 🟠.
 * Anything that matches no band (including NaN) gets 🔴.
 */
function gradeEmoji(score) {
    const bands = [
        [80, "✅"],
        [70, "🟡"],
        [50, "🟠"],
    ];
    for (const [minimum, emoji] of bands) {
        if (score >= minimum) {
            return emoji;
        }
    }
    return "🔴";
}
|
|
416
|
+
/**
 * Pick the letter grade for a score: A (≥80), B (≥70), C (≥50), otherwise D.
 */
function gradeLetter(score) {
    return score >= 80 ? "A"
        : score >= 70 ? "B"
            : score >= 50 ? "C"
                : "D";
}
|
|
428
|
+
// ── Helpers ────────────────────────────────────────────────────────────
|
|
429
|
+
/**
 * Render a lift value as a trend marker followed by the rounded number.
 *
 * The value is rounded first, so anything that rounds to zero (or is NaN)
 * is shown as the flat "➡️ 0" marker. Positive values get an explicit "+".
 */
function liftArrow(lift) {
    const whole = Math.round(lift);
    return whole > 0
        ? `📈 +${whole}`
        : whole < 0
            ? `📉 ${whole}`
            : "➡️ 0";
}
|
|
439
|
+
/**
 * Pick the headline emoji for the overall average score.
 *
 * 🟢 for ≥75, 🟡 for ≥60, 🟠 for ≥45, 🔴 otherwise.
 */
function overallEmoji(avg) {
    // Written as a negated >= so that NaN also lands in the lowest band.
    if (!(avg >= 45)) {
        return "🔴";
    }
    if (avg < 60) {
        return "🟠";
    }
    if (avg < 75) {
        return "🟡";
    }
    return "🟢";
}
|
|
451
|
+
/**
 * Return a copy of a score entry with the current field names filled in.
 *
 * Fallback chains (first non-nullish value wins):
 * - ceilingScore: ceilingScore → withDocsScore → totalScore → 0
 * - floorScore: floorScore → withoutDocsScore → 0
 * - docLift: docLift → liftFromDocs → ceilingScore − floorScore
 * - docQualityGap: docQualityGap → 100 − ceilingScore
 * - negativeDocLift: negativeDocLift → docLift < 0
 *
 * All other fields of the input are carried over unchanged.
 */
function normalizeScore(s) {
    const {
        ceilingScore,
        withDocsScore,
        totalScore,
        floorScore,
        withoutDocsScore,
        docLift,
        liftFromDocs,
        docQualityGap,
        negativeDocLift,
    } = s;
    const top = ceilingScore ?? withDocsScore ?? totalScore ?? 0;
    const base = floorScore ?? withoutDocsScore ?? 0;
    const gain = docLift ?? liftFromDocs ?? (top - base);
    const gap = docQualityGap ?? (100 - top);
    const hurts = negativeDocLift ?? (gain < 0);
    return {
        ...s,
        ceilingScore: top,
        docLift: gain,
        docQualityGap: gap,
        floorScore: base,
        negativeDocLift: hurts,
    };
}
|
|
472
|
+
// ── Main ──────────────────────────────────────────────────────────────
|
|
473
|
+
/**
 * Build the PR comment from the latest evaluation results and emit it.
 *
 * Reads `results/latest/score-summary.json` under `options.rootDir` and
 * normalizes every entry of its `scores` array. When
 * `results/latest/comparison-report.json` exists and parses, that report is
 * passed along to `generateComment`. The resulting text is written to
 * `options.outputPath` when one is given, otherwise to stdout.
 *
 * A missing or unparseable score summary throws; a comparison report that
 * cannot be read or parsed is ignored.
 */
export function generatePrComment(options) {
    const summaryFile = resolve(options.rootDir, "results/latest/score-summary.json");
    const comparisonFile = resolve(options.rootDir, "results/latest/comparison-report.json");

    const rawSummary = JSON.parse(readFileSync(summaryFile, "utf-8"));
    // Bring legacy field names in each score entry up to the current ones.
    const summary = {
        ...rawSummary,
        scores: rawSummary.scores.map((entry) => normalizeScore(entry)),
    };

    // The comparison report only exists when a comparison run produced it.
    let comparisonReport;
    if (existsSync(comparisonFile)) {
        try {
            comparisonReport = JSON.parse(readFileSync(comparisonFile, "utf-8"));
        }
        catch {
            // Non-fatal — comparison report is optional
        }
    }

    const markdown = generateComment(summary, {
        comparisonReport,
        promptfooUrl: options.promptfooUrl,
    });
    if (!options.outputPath) {
        process.stdout.write(markdown);
        return;
    }
    writeFileSync(options.outputPath, markdown);
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/probe.ts
|
|
3
|
+
*
|
|
4
|
+
* Tier B probes for new features without evaluation tasks.
|
|
5
|
+
*
|
|
6
|
+
* Phase 4b of the Scenario Matrix implementation.
|
|
7
|
+
*
|
|
8
|
+
* When a content release adds documents for a genuinely new feature (no
|
|
9
|
+
* evaluation tasks exist), a probe provides a directional "usability"
|
|
10
|
+
* signal by analyzing the document content and, optionally, evaluating
|
|
11
|
+
* a generic implementation prompt.
|
|
12
|
+
*
|
|
13
|
+
* Probes are NOT scored evaluations — they answer "are these docs usable?"
|
|
14
|
+
* not "are these docs good enough?" The output is always labeled as
|
|
15
|
+
* directional and never displayed on the same scale as scored evaluations.
|
|
16
|
+
*
|
|
17
|
+
* @see docs/exec-plans/completed/scenario-matrix-implementation/phase-4-content-release-integration.md
|
|
18
|
+
*/
|
|
19
|
+
import type { ProbeResult } from "./types.js";
|
|
20
|
+
/**
* Generic probe prompt template.
*
* The text asks for a TypeScript implementation that uses only APIs and
* patterns from the supplied documentation, and to say so explicitly when
* the documentation is insufficient. It ends with a `{{docs}}` placeholder
* (presumably replaced with the documentation text by the caller — confirm
* against usage).
*/
|
|
21
|
+
export declare const PROBE_PROMPT = "Given the following documentation about a Sanity feature, write a TypeScript\nimplementation that demonstrates the feature's core functionality.\n\nUse only APIs and patterns described in the documentation. If the documentation\nis insufficient to implement something, say so explicitly rather than guessing.\n\nDocumentation:\n{{docs}}";
|
|
22
|
+
/**
|
|
23
|
+
* Analyze a model's probe output against the source documentation.
|
|
24
|
+
*
|
|
25
|
+
* Extracts API names from both the documentation and the model output,
|
|
26
|
+
* identifies hallucinations (APIs used but not documented), and classifies
|
|
27
|
+
* the overall usability.
|
|
28
|
+
*
|
|
29
|
+
* This is a pure function — it doesn't call any LLM. The model output
|
|
30
|
+
* is provided as input (from a prior evaluation step or manual run).
|
|
31
|
+
*
|
|
32
|
+
* @param documentContent - The concatenated documentation text
|
|
33
|
+
* @param modelOutput - The model's response to the probe prompt
|
|
34
|
+
* @param documentSlugs - The slugs of the documents that were probed
|
|
35
|
+
* @returns Probe result with usability classification
|
* (the `ProbeResult` type is imported from `./types.js`)
|
|
36
|
+
*/
|
|
37
|
+
export declare function analyzeProbeOutput(documentContent: string, modelOutput: string, documentSlugs: string[]): ProbeResult;
|
|
38
|
+
/**
|
|
39
|
+
* Format a probe result for console output.
*
* @param probe - The probe result to format
* @returns The formatted text as a single string
|
|
40
|
+
*/
|
|
41
|
+
export declare function formatProbeConsole(probe: ProbeResult): string;
|
|
42
|
+
/**
|
|
43
|
+
* Format a probe result as markdown.
*
* @param probe - The probe result to format
* @returns The markdown text as a single string
|
|
44
|
+
*/
|
|
45
|
+
export declare function formatProbeMarkdown(probe: ProbeResult): string;
|
|
46
|
+
/**
|
|
47
|
+
* Generate task scaffolding suggestions from document content analysis.
|
|
48
|
+
*
|
|
49
|
+
* Extracts function names, configuration patterns, and common operations
|
|
50
|
+
* from code blocks in the documentation to suggest evaluation tasks.
*
* @param documentContent - The documentation text to analyze
* @param documentSlugs - Slugs of the documents the content came from
* (presumably the same meaning as in `analyzeProbeOutput` — confirm)
* @returns The suggestions as an array of strings
|
|
51
|
+
*/
|
|
52
|
+
export declare function generateTaskSuggestions(documentContent: string, documentSlugs: string[]): string[];
|