@sanity/ailf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -0
- package/bin/ailf.js +64 -0
- package/canonical/grader-references/README.md +88 -0
- package/canonical/grader-references/groq.yaml +234 -0
- package/canonical/grader-references/studio-setup.yaml +275 -0
- package/canonical/reference-solutions/.gitkeep +1 -0
- package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
- package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
- package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
- package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
- package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
- package/canonical/reference-solutions/groq/joins-references.ts +300 -0
- package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
- package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
- package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
- package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
- package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
- package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
- package/config/bigquery/README.md +74 -0
- package/config/bigquery/views/area_scores.sql +87 -0
- package/config/bigquery/views/reports.sql +49 -0
- package/config/features.yaml +116 -0
- package/config/models.yaml +115 -0
- package/config/prompts.yaml +75 -0
- package/config/rubrics.yaml +62 -0
- package/config/schedules.yaml +43 -0
- package/config/sinks.yaml +54 -0
- package/config/sources.yaml +51 -0
- package/config/thresholds.yaml +49 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
- package/dist/_vendor/ailf-core/examples/index.js +285 -0
- package/dist/_vendor/ailf-core/index.d.ts +17 -0
- package/dist/_vendor/ailf-core/index.js +17 -0
- package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
- package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
- package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
- package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
- package/dist/_vendor/ailf-core/ports/context.js +14 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
- package/dist/_vendor/ailf-core/ports/index.js +7 -0
- package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
- package/dist/_vendor/ailf-core/ports/logger.js +11 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
- package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/schemas/index.js +16 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
- package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
- package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
- package/dist/_vendor/ailf-core/services/index.js +12 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/scoring.js +222 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
- package/dist/_vendor/ailf-core/types/index.js +21 -0
- package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
- package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
- package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
- package/dist/_vendor/ailf-shared/document-ref.js +1 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -0
- package/dist/_vendor/ailf-shared/index.js +16 -0
- package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
- package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
- package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
- package/dist/_vendor/ailf-shared/score-grades.js +23 -0
- package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
- package/dist/adapters/cache/content-lake-cache.js +59 -0
- package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
- package/dist/adapters/cache/filesystem-cache.js +54 -0
- package/dist/adapters/cache/index.d.ts +2 -0
- package/dist/adapters/cache/index.js +2 -0
- package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
- package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
- package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
- package/dist/adapters/config-sources/file-config-adapter.js +96 -0
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +2 -0
- package/dist/adapters/doc-fetchers/index.d.ts +1 -0
- package/dist/adapters/doc-fetchers/index.js +1 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
- package/dist/adapters/eval-runners/index.d.ts +1 -0
- package/dist/adapters/eval-runners/index.js +1 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
- package/dist/adapters/index.d.ts +12 -0
- package/dist/adapters/index.js +12 -0
- package/dist/adapters/loggers/console-logger.d.ts +22 -0
- package/dist/adapters/loggers/console-logger.js +54 -0
- package/dist/adapters/loggers/index.d.ts +9 -0
- package/dist/adapters/loggers/index.js +9 -0
- package/dist/adapters/loggers/json-logger.d.ts +18 -0
- package/dist/adapters/loggers/json-logger.js +33 -0
- package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
- package/dist/adapters/loggers/quiet-logger.js +30 -0
- package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/composite-task-source.js +59 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
- package/dist/adapters/task-sources/index.d.ts +7 -0
- package/dist/adapters/task-sources/index.js +7 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
- package/dist/adapters/task-sources/repo-schemas.js +234 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
- package/dist/adapters/task-sources/repo-task-source.js +104 -0
- package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
- package/dist/adapters/task-sources/repo-trigger.js +153 -0
- package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
- package/dist/adapters/task-sources/repo-validation.js +164 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
- package/dist/adapters/task-sources/yaml-task-source.js +136 -0
- package/dist/agent-observer/agentic-provider.d.ts +132 -0
- package/dist/agent-observer/agentic-provider.js +983 -0
- package/dist/agent-observer/classifier.d.ts +62 -0
- package/dist/agent-observer/classifier.js +269 -0
- package/dist/agent-observer/index.d.ts +7 -0
- package/dist/agent-observer/index.js +4 -0
- package/dist/agent-observer/pricing.d.ts +35 -0
- package/dist/agent-observer/pricing.js +82 -0
- package/dist/agent-observer/provider.d.ts +77 -0
- package/dist/agent-observer/provider.js +151 -0
- package/dist/agent-observer/proxy.d.ts +91 -0
- package/dist/agent-observer/proxy.js +321 -0
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/agent-observer/types.d.ts +137 -0
- package/dist/agent-observer/types.js +16 -0
- package/dist/assertions/source-isolation.d.ts +72 -0
- package/dist/assertions/source-isolation.js +117 -0
- package/dist/cli.d.ts +24 -0
- package/dist/cli.js +199 -0
- package/dist/commands/agent-report.d.ts +5 -0
- package/dist/commands/agent-report.js +69 -0
- package/dist/commands/baseline.d.ts +9 -0
- package/dist/commands/baseline.js +141 -0
- package/dist/commands/cache.d.ts +13 -0
- package/dist/commands/cache.js +135 -0
- package/dist/commands/calculate-scores.d.ts +8 -0
- package/dist/commands/calculate-scores.js +48 -0
- package/dist/commands/compare.d.ts +8 -0
- package/dist/commands/compare.js +120 -0
- package/dist/commands/completion.d.ts +18 -0
- package/dist/commands/completion.js +260 -0
- package/dist/commands/coverage-audit.d.ts +7 -0
- package/dist/commands/coverage-audit.js +40 -0
- package/dist/commands/discovery-report.d.ts +10 -0
- package/dist/commands/discovery-report.js +44 -0
- package/dist/commands/eval.d.ts +9 -0
- package/dist/commands/eval.js +35 -0
- package/dist/commands/explain-handler.d.ts +34 -0
- package/dist/commands/explain-handler.js +719 -0
- package/dist/commands/fetch-docs.d.ts +8 -0
- package/dist/commands/fetch-docs.js +128 -0
- package/dist/commands/generate-configs.d.ts +8 -0
- package/dist/commands/generate-configs.js +46 -0
- package/dist/commands/grader/index.d.ts +11 -0
- package/dist/commands/grader/index.js +118 -0
- package/dist/commands/init.d.ts +19 -0
- package/dist/commands/init.js +150 -0
- package/dist/commands/interactive.d.ts +12 -0
- package/dist/commands/interactive.js +238 -0
- package/dist/commands/lookup-doc.d.ts +15 -0
- package/dist/commands/lookup-doc.js +84 -0
- package/dist/commands/measure-retrieval.d.ts +5 -0
- package/dist/commands/measure-retrieval.js +65 -0
- package/dist/commands/pipeline-action.d.ts +71 -0
- package/dist/commands/pipeline-action.js +305 -0
- package/dist/commands/pipeline.d.ts +62 -0
- package/dist/commands/pipeline.js +53 -0
- package/dist/commands/pr-comment.d.ts +8 -0
- package/dist/commands/pr-comment.js +47 -0
- package/dist/commands/publish.d.ts +26 -0
- package/dist/commands/publish.js +253 -0
- package/dist/commands/readiness-report.d.ts +10 -0
- package/dist/commands/readiness-report.js +104 -0
- package/dist/commands/shared/options.d.ts +29 -0
- package/dist/commands/shared/options.js +57 -0
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +16 -0
- package/dist/commands/validate-tasks.js +93 -0
- package/dist/commands/validate.d.ts +9 -0
- package/dist/commands/validate.js +73 -0
- package/dist/commands/webhook-server.d.ts +5 -0
- package/dist/commands/webhook-server.js +30 -0
- package/dist/commands/weekly-digest.d.ts +10 -0
- package/dist/commands/weekly-digest.js +104 -0
- package/dist/composition-root.d.ts +26 -0
- package/dist/composition-root.js +107 -0
- package/dist/interpolate.d.ts +26 -0
- package/dist/interpolate.js +70 -0
- package/dist/job-store.d.ts +104 -0
- package/dist/job-store.js +188 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.d.ts +27 -0
- package/dist/orchestration/build-app-context.js +81 -0
- package/dist/orchestration/build-step-sequence.d.ts +15 -0
- package/dist/orchestration/build-step-sequence.js +84 -0
- package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
- package/dist/orchestration/config-to-source-overrides.js +28 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/index.d.ts +11 -0
- package/dist/orchestration/index.js +11 -0
- package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
- package/dist/orchestration/pipeline-orchestrator.js +153 -0
- package/dist/orchestration/step-runner.d.ts +20 -0
- package/dist/orchestration/step-runner.js +88 -0
- package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
- package/dist/orchestration/steps/calculate-scores-step.js +95 -0
- package/dist/orchestration/steps/callback-step.d.ts +24 -0
- package/dist/orchestration/steps/callback-step.js +76 -0
- package/dist/orchestration/steps/compare-step.d.ts +14 -0
- package/dist/orchestration/steps/compare-step.js +92 -0
- package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
- package/dist/orchestration/steps/discovery-report-step.js +55 -0
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
- package/dist/orchestration/steps/fetch-docs-step.js +135 -0
- package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
- package/dist/orchestration/steps/gap-analysis-step.js +136 -0
- package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
- package/dist/orchestration/steps/generate-configs-step.js +85 -0
- package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
- package/dist/orchestration/steps/grader-consistency-step.js +64 -0
- package/dist/orchestration/steps/index.d.ts +19 -0
- package/dist/orchestration/steps/index.js +19 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
- package/dist/orchestration/steps/publish-report-step.js +216 -0
- package/dist/orchestration/steps/readiness-step.d.ts +13 -0
- package/dist/orchestration/steps/readiness-step.js +91 -0
- package/dist/orchestration/steps/report-step.d.ts +12 -0
- package/dist/orchestration/steps/report-step.js +49 -0
- package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
- package/dist/orchestration/steps/run-eval-step.js +195 -0
- package/dist/orchestration/steps/validate-step.d.ts +12 -0
- package/dist/orchestration/steps/validate-step.js +41 -0
- package/dist/pipeline/agent-behavior-report.d.ts +53 -0
- package/dist/pipeline/agent-behavior-report.js +132 -0
- package/dist/pipeline/attribution.d.ts +47 -0
- package/dist/pipeline/attribution.js +226 -0
- package/dist/pipeline/baseline.d.ts +37 -0
- package/dist/pipeline/baseline.js +141 -0
- package/dist/pipeline/cache.d.ts +101 -0
- package/dist/pipeline/cache.js +283 -0
- package/dist/pipeline/calculate-scores.d.ts +102 -0
- package/dist/pipeline/calculate-scores.js +1128 -0
- package/dist/pipeline/callback-delivery.d.ts +50 -0
- package/dist/pipeline/callback-delivery.js +89 -0
- package/dist/pipeline/checks.d.ts +39 -0
- package/dist/pipeline/checks.js +280 -0
- package/dist/pipeline/classify-url.d.ts +61 -0
- package/dist/pipeline/classify-url.js +93 -0
- package/dist/pipeline/compare.d.ts +31 -0
- package/dist/pipeline/compare.js +208 -0
- package/dist/pipeline/coverage-audit.d.ts +39 -0
- package/dist/pipeline/coverage-audit.js +165 -0
- package/dist/pipeline/degradations.d.ts +85 -0
- package/dist/pipeline/degradations.js +242 -0
- package/dist/pipeline/discovery-report.d.ts +55 -0
- package/dist/pipeline/discovery-report.js +178 -0
- package/dist/pipeline/eval-constants.d.ts +68 -0
- package/dist/pipeline/eval-constants.js +111 -0
- package/dist/pipeline/eval-fingerprint.d.ts +66 -0
- package/dist/pipeline/eval-fingerprint.js +175 -0
- package/dist/pipeline/expand-tasks.d.ts +220 -0
- package/dist/pipeline/expand-tasks.js +421 -0
- package/dist/pipeline/failure-modes.d.ts +46 -0
- package/dist/pipeline/failure-modes.js +348 -0
- package/dist/pipeline/fetch-url-content.d.ts +44 -0
- package/dist/pipeline/fetch-url-content.js +93 -0
- package/dist/pipeline/gap-analysis.d.ts +48 -0
- package/dist/pipeline/gap-analysis.js +231 -0
- package/dist/pipeline/generate-configs.d.ts +72 -0
- package/dist/pipeline/generate-configs.js +395 -0
- package/dist/pipeline/grader-api.d.ts +49 -0
- package/dist/pipeline/grader-api.js +200 -0
- package/dist/pipeline/grader-compare-runner.d.ts +44 -0
- package/dist/pipeline/grader-compare-runner.js +301 -0
- package/dist/pipeline/grader-comparison.d.ts +111 -0
- package/dist/pipeline/grader-comparison.js +161 -0
- package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
- package/dist/pipeline/grader-consistency-runner.js +270 -0
- package/dist/pipeline/grader-consistency.d.ts +103 -0
- package/dist/pipeline/grader-consistency.js +146 -0
- package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
- package/dist/pipeline/grader-sensitivity-runner.js +282 -0
- package/dist/pipeline/grader-sensitivity.d.ts +94 -0
- package/dist/pipeline/grader-sensitivity.js +144 -0
- package/dist/pipeline/grader-validate-runner.d.ts +38 -0
- package/dist/pipeline/grader-validate-runner.js +229 -0
- package/dist/pipeline/grader-validation.d.ts +107 -0
- package/dist/pipeline/grader-validation.js +169 -0
- package/dist/pipeline/map-request-to-config.d.ts +19 -0
- package/dist/pipeline/map-request-to-config.js +80 -0
- package/dist/pipeline/measure-retrieval.d.ts +59 -0
- package/dist/pipeline/measure-retrieval.js +111 -0
- package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
- package/dist/pipeline/mirror-repo-tasks.js +350 -0
- package/dist/pipeline/plan-format.d.ts +33 -0
- package/dist/pipeline/plan-format.js +202 -0
- package/dist/pipeline/plan.d.ts +169 -0
- package/dist/pipeline/plan.js +708 -0
- package/dist/pipeline/pr-comment.d.ts +19 -0
- package/dist/pipeline/pr-comment.js +502 -0
- package/dist/pipeline/probe.d.ts +52 -0
- package/dist/pipeline/probe.js +390 -0
- package/dist/pipeline/provenance.d.ts +47 -0
- package/dist/pipeline/provenance.js +146 -0
- package/dist/pipeline/readiness-report.d.ts +87 -0
- package/dist/pipeline/readiness-report.js +205 -0
- package/dist/pipeline/release-classification.d.ts +54 -0
- package/dist/pipeline/release-classification.js +238 -0
- package/dist/pipeline/release-report.d.ts +37 -0
- package/dist/pipeline/release-report.js +222 -0
- package/dist/pipeline/repo-eval-comment.d.ts +37 -0
- package/dist/pipeline/repo-eval-comment.js +165 -0
- package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
- package/dist/pipeline/repo-threshold-evaluator.js +162 -0
- package/dist/pipeline/resolve-mappings.d.ts +35 -0
- package/dist/pipeline/resolve-mappings.js +72 -0
- package/dist/pipeline/retrieval-metrics.d.ts +39 -0
- package/dist/pipeline/retrieval-metrics.js +136 -0
- package/dist/pipeline/reverse-mapping.d.ts +67 -0
- package/dist/pipeline/reverse-mapping.js +88 -0
- package/dist/pipeline/schemas.d.ts +9 -0
- package/dist/pipeline/schemas.js +9 -0
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +88 -0
- package/dist/pipeline/targeted-loo.js +203 -0
- package/dist/pipeline/thresholds.d.ts +27 -0
- package/dist/pipeline/thresholds.js +245 -0
- package/dist/pipeline/types.d.ts +10 -0
- package/dist/pipeline/types.js +10 -0
- package/dist/pipeline/validate.d.ts +67 -0
- package/dist/pipeline/validate.js +406 -0
- package/dist/pipeline/webhook-server.d.ts +37 -0
- package/dist/pipeline/webhook-server.js +133 -0
- package/dist/report-store.d.ts +84 -0
- package/dist/report-store.js +208 -0
- package/dist/sanity/client.d.ts +38 -0
- package/dist/sanity/client.js +86 -0
- package/dist/sanity/portable-text.d.ts +11 -0
- package/dist/sanity/portable-text.js +211 -0
- package/dist/sanity/queries.d.ts +133 -0
- package/dist/sanity/queries.js +300 -0
- package/dist/schedules/digest.d.ts +116 -0
- package/dist/schedules/digest.js +156 -0
- package/dist/schedules/index.d.ts +12 -0
- package/dist/schedules/index.js +10 -0
- package/dist/schedules/loader.d.ts +31 -0
- package/dist/schedules/loader.js +73 -0
- package/dist/schedules/schema.d.ts +9 -0
- package/dist/schedules/schema.js +9 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +21 -0
- package/dist/scripts/validate-task-sources.js +210 -0
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/bigquery/index.d.ts +131 -0
- package/dist/sinks/bigquery/index.js +222 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/index.d.ts +23 -0
- package/dist/sinks/index.js +18 -0
- package/dist/sinks/loader.d.ts +18 -0
- package/dist/sinks/loader.js +82 -0
- package/dist/sinks/retry.d.ts +24 -0
- package/dist/sinks/retry.js +52 -0
- package/dist/sinks/schema.d.ts +9 -0
- package/dist/sinks/schema.js +9 -0
- package/dist/sinks/slack/format.d.ts +65 -0
- package/dist/sinks/slack/format.js +327 -0
- package/dist/sinks/slack/index.d.ts +27 -0
- package/dist/sinks/slack/index.js +78 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +59 -0
- package/dist/sinks/types.js +44 -0
- package/dist/sinks/webhook/index.d.ts +19 -0
- package/dist/sinks/webhook/index.js +50 -0
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/sources.d.ts +104 -0
- package/dist/sources.js +292 -0
- package/dist/webhook/budget.d.ts +42 -0
- package/dist/webhook/budget.js +60 -0
- package/dist/webhook/debounce.d.ts +67 -0
- package/dist/webhook/debounce.js +76 -0
- package/dist/webhook/dispatch.d.ts +45 -0
- package/dist/webhook/dispatch.js +84 -0
- package/dist/webhook/eval-request-handler.d.ts +87 -0
- package/dist/webhook/eval-request-handler.js +181 -0
- package/dist/webhook/handler.d.ts +88 -0
- package/dist/webhook/handler.js +203 -0
- package/dist/webhook/index.d.ts +17 -0
- package/dist/webhook/index.js +12 -0
- package/dist/webhook/types.d.ts +109 -0
- package/dist/webhook/types.js +10 -0
- package/package.json +72 -0
- package/tasks/.expanded.agentic.yaml +51 -0
- package/tasks/.expanded.yaml +66 -0
- package/tasks/frameworks.yaml +98 -0
- package/tasks/functions.yaml +51 -0
- package/tasks/groq.yaml +216 -0
- package/tasks/nextjs-live.yaml +62 -0
- package/tasks/studio-setup.yaml +111 -0
- package/tasks/visual-editing.yaml +120 -0
|
@@ -0,0 +1,421 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/expand-tasks.ts
|
|
3
|
+
*
|
|
4
|
+
* Reads task YAML files in the single-definition format and expands each
|
|
5
|
+
* task into gold + baseline Promptfoo test entries. This eliminates the
|
|
6
|
+
* manual duplication where every task had to be written twice.
|
|
7
|
+
*
|
|
8
|
+
* Rubric templates from config/rubrics.yaml are resolved at expansion time:
|
|
9
|
+
* tasks specify `template` + `criteria`, and the expander assembles
|
|
10
|
+
* the full rubric text by injecting criteria into the template.
|
|
11
|
+
*
|
|
12
|
+
* Structured dimension metadata (Approach 5):
|
|
13
|
+
* When a rubric template has a `dimension` field, the resolved assertion
|
|
14
|
+
* includes `metadata.dimension` and `metadata.maxScore`. This flows through
|
|
15
|
+
* Promptfoo into component results, allowing the scoring engine to classify
|
|
16
|
+
* rubrics structurally instead of via heuristic string matching.
|
|
17
|
+
* See docs/design-docs/structured-dimensions.md.
|
|
18
|
+
*
|
|
19
|
+
* Single-definition format:
|
|
20
|
+
* - id: groq-blog-queries
|
|
21
|
+
* description: "GROQ - Blog queries with filtering and pagination"
|
|
22
|
+
* doc_coverage: true
|
|
23
|
+
* vars:
|
|
24
|
+
* task: |
|
|
25
|
+
* Write GROQ queries for a Sanity blog application: ...
|
|
26
|
+
* docs: file://contexts/canonical/groq-blog-queries.md
|
|
27
|
+
* assert:
|
|
28
|
+
* - type: llm-rubric
|
|
29
|
+
* template: task-completion
|
|
30
|
+
* criteria:
|
|
31
|
+
* - GROQ filter with _type == "post"
|
|
32
|
+
* - Projection with aliased slug field
|
|
33
|
+
* - type: contains-any
|
|
34
|
+
* value: ["client.fetch", "createClient"]
|
|
35
|
+
* baseline:
|
|
36
|
+
* enabled: true
|
|
37
|
+
* rubric: abbreviated
|
|
38
|
+
*
|
|
39
|
+
* Expands to:
|
|
40
|
+
* 1. Gold entry — uses vars.docs as-is, resolves templates, appends doc-coverage
|
|
41
|
+
* 2. Baseline entry — sets docs: "", adds transform, uses abbreviated rubric
|
|
42
|
+
*/
|
|
43
|
+
import { existsSync, readFileSync, readdirSync } from "fs";
|
|
44
|
+
import { resolve } from "path";
|
|
45
|
+
import { load } from "js-yaml";
|
|
46
|
+
import { RubricConfigSchema } from "./schemas.js";
|
|
47
|
+
// ---------------------------------------------------------------------------
|
|
48
|
+
// Rubric template loading and assembly
|
|
49
|
+
// ---------------------------------------------------------------------------
|
|
50
|
+
/** Cached rubric config — loaded once per process. */
|
|
51
|
+
let cachedRubricConfig = null;
|
|
52
|
+
/**
 * Assemble a full rubric text string from a template and criteria.
 *
 * Output format (lines joined with "\n"):
 *   {header}
 *   - {scale[0]}
 *   - {scale[1]}
 *   ...
 *
 *   {criteria_label}
 *   - {criteria[0]}
 *   - {criteria[1]}
 *   ...
 *
 *   {footer}
 *
 * The criteria section (blank line, label, bullet list) is emitted only when
 * the template defines a `criteria_label` AND at least one criterion is
 * supplied. The footer is read from `rubricConfig.footer`, not from the
 * individual template, and is always preceded by a blank line.
 *
 * @param templateKey  Key of the template inside `rubricConfig.templates`.
 * @param criteria     Criteria to list under the template's label. `null` or
 *                     `undefined` is treated the same as an empty list.
 * @param rubricConfig Object with a `templates` map and a `footer`.
 * @returns The assembled rubric text.
 * @throws Error if `templateKey` is not an own key of `rubricConfig.templates`.
 */
export function assembleRubric(templateKey, criteria, rubricConfig) {
    const templates = rubricConfig.templates;
    // Own-property lookup: a plain `templates[templateKey]` would resolve keys
    // such as "constructor" or "toString" to inherited Object.prototype members,
    // slip past the guard below, and fail later with an unrelated TypeError.
    const template = Object.prototype.hasOwnProperty.call(templates, templateKey)
        ? templates[templateKey]
        : undefined;
    if (!template) {
        throw new Error(`Unknown rubric template '${templateKey}'. Available: ${Object.keys(templates).join(", ")}`);
    }
    // A missing criteria argument means "no criteria" rather than a crash.
    const criteriaList = criteria == null ? [] : criteria;
    // Header followed by one bullet per scale entry.
    const parts = [template.header];
    for (const item of template.scale) {
        parts.push(`- ${item}`);
    }
    // Criteria section, separated from the scale by a blank line.
    if (template.criteria_label && criteriaList.length > 0) {
        parts.push("", template.criteria_label);
        for (const c of criteriaList) {
            parts.push(`- ${c}`);
        }
    }
    // Footer, separated by a blank line.
    parts.push("", rubricConfig.footer);
    return parts.join("\n");
}
|
|
93
|
+
/**
 * Derive the assertion list for a baseline entry from the gold assertions.
 *
 * - 'none': returns an empty list.
 * - 'full': returns a shallow copy of `goldAsserts`.
 * - any other value (the 'abbreviated' mode): returns a single shortened
 *   `llm-rubric` assertion when `goldAsserts` contains at least one
 *   `llm-rubric`, otherwise an empty list. Further rubrics and all
 *   non-rubric assertions are dropped.
 *
 * @param goldAsserts - Resolved assertions of the gold entry
 * @param mode - Baseline rubric mode
 * @returns A new array of assertions for the baseline entry
 */
export function buildBaselineAsserts(goldAsserts, mode) {
    switch (mode) {
        case "none":
            return [];
        case "full":
            return Array.from(goldAsserts);
        default:
            break;
    }
    // Abbreviated: the presence of any llm-rubric yields exactly one
    // shortened task-completion rubric; nothing else is carried over.
    const rubricCount = goldAsserts.filter((a) => a.type === "llm-rubric").length;
    if (rubricCount === 0) {
        return [];
    }
    return [
        {
            type: "llm-rubric",
            value: 'Score task completion from 0 to 100 (same criteria as above).\nReturn ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}',
        },
    ];
}
|
|
124
|
+
/**
 * Reset the module-level rubric config cache to `null`.
 * Intended for tests that need a fresh load of the rubric config.
 */
export function clearRubricCache() {
    cachedRubricConfig = null;
}
|
|
130
|
+
/**
 * Expand one task definition into Promptfoo test entries.
 *
 * Always produces a gold entry; additionally produces a baseline entry
 * unless `mode` is 'agentic' or the task sets `baseline.enabled` to false.
 *
 * Templated assertions are resolved via `resolveAssert`, and a doc-coverage
 * rubric is appended when `task.doc_coverage` is truthy.
 *
 * @param task - Single task definition (`assert`, `description`, `vars`,
 *   optional `baseline` and `doc_coverage`)
 * @param rubricConfig - Loaded rubric configuration
 * @param mode - Expansion mode:
 *   - `'baseline'` (default): the gold entry is restricted to
 *     `prompts: ['with-docs']` and the baseline entry to
 *     `prompts: ['without-docs']`, which prevents a cartesian product when
 *     several prompts are configured.
 *   - `'agentic'`: gold entry only, without a `prompts` filter. Per the
 *     original design note, the agentic prompt does not use `{{docs}}`, so a
 *     baseline entry would only duplicate the gold prompt.
 * @returns Array holding the gold entry and, when generated, the baseline entry
 */
export function expandTask(task, rubricConfig, mode = "baseline") {
    // Resolved assertions are the basis for both the gold and baseline entries.
    const goldAsserts = task.assert.map((a) => resolveAssert(a, rubricConfig));
    if (task.doc_coverage) {
        // Opt-in doc-coverage rubric; dimension metadata is attached only when
        // the template declares a dimension.
        const dimension = rubricConfig.templates["doc-coverage"]?.dimension;
        const rubricText = assembleRubric("doc-coverage", [], rubricConfig);
        const metadataPart = dimension
            ? { metadata: { dimension, maxScore: 100 } }
            : {};
        goldAsserts.push({
            type: "llm-rubric",
            value: rubricText,
            ...metadataPart,
        });
    }
    // Gold entry: uses the task vars (including docs) as-is.
    const promptFilter = mode === "baseline" ? { prompts: ["with-docs"] } : {};
    const gold = {
        assert: [...goldAsserts],
        description: `${task.description} (gold)`,
        ...promptFilter,
        vars: { ...task.vars },
    };
    // No baseline entry in agentic mode or when the task explicitly opts out.
    if (mode === "agentic" || task.baseline?.enabled === false) {
        return [gold];
    }
    // Baseline entry: docs blanked out, restricted to the 'without-docs' prompt.
    const baselineAsserts = buildBaselineAsserts(goldAsserts, task.baseline?.rubric ?? "abbreviated");
    const assertPart = baselineAsserts.length > 0 ? { assert: baselineAsserts } : {};
    const baseline = {
        description: `${task.description} (baseline)`,
        prompts: ["without-docs"],
        vars: {
            ...task.vars,
            docs: "",
        },
        ...assertPart,
    };
    return [gold, baseline];
}
|
|
197
|
+
/**
 * Map a TaskDefinition (camelCase fields: `assertions`, `docCoverage`,
 * `taskPrompt`, `extraVars`) onto the single-task shape consumed by
 * expandTask() (`assert`, `doc_coverage`, `vars`).
 *
 * Each assertion is shallow-copied. `vars.docs` points at
 * `file://contexts/canonical/{id}.md` and `vars.task` carries the task
 * prompt; keys in `extraVars` are applied last and therefore win on conflict.
 */
function taskDefinitionToSingle(task) {
    const copiedAsserts = task.assertions.map((a) => Object.assign({}, a));
    const vars = Object.assign({
        docs: `file://contexts/canonical/${task.id}.md`,
        task: task.taskPrompt,
    }, task.extraVars);
    return {
        assert: copiedAsserts,
        baseline: task.baseline,
        description: task.description,
        doc_coverage: task.docCoverage,
        id: task.id,
        vars,
    };
}
|
|
215
|
+
/**
 * Expand TaskDefinition objects into Promptfoo-compatible test entries
 * without reading task YAML files: each definition is converted with
 * taskDefinitionToSingle() and then expanded with expandTask().
 *
 * @param tasks - Array of task definitions
 * @param rootDir - Root directory passed to loadRubricTemplates()
 * @param mode - Expansion mode forwarded to expandTask() (default 'baseline')
 * @returns `{ entries, stats }` where `stats.expandedTotal` is the number of
 *   generated entries and `stats.totalTasks` is `tasks.length`
 */
export function expandTaskDefinitions(tasks, rootDir, mode = "baseline") {
    const rubricConfig = loadRubricTemplates(rootDir);
    const entries = tasks.flatMap((task) => expandTask(taskDefinitionToSingle(task), rubricConfig, mode));
    const stats = {
        expandedTotal: entries.length,
        totalTasks: tasks.length,
    };
    return { entries, stats };
}
|
|
241
|
+
/**
 * Collect the `id` of every single-definition task found in the YAML files
 * of `{rootDir}/tasks`.
 *
 * Files are visited in sorted name order; dot-files and files without a
 * `.yaml`/`.yml` extension are ignored. Files whose content does not parse
 * to an array are skipped, as are entries that fail
 * isSingleTaskDefinition().
 *
 * @param rootDir - Directory that contains the `tasks` folder
 * @returns Task ids in file order, or an empty array if `tasks` is missing
 */
export function extractTaskIds(rootDir) {
    const tasksDir = resolve(rootDir, "tasks");
    if (!existsSync(tasksDir)) {
        return [];
    }
    const isTaskFile = (name) => !name.startsWith(".") && (name.endsWith(".yaml") || name.endsWith(".yml"));
    const ids = [];
    for (const file of readdirSync(tasksDir).filter(isTaskFile).sort()) {
        const parsed = load(readFileSync(resolve(tasksDir, file), "utf-8"));
        if (Array.isArray(parsed)) {
            parsed.forEach((entry) => {
                if (isSingleTaskDefinition(entry)) {
                    ids.push(entry.id);
                }
            });
        }
    }
    return ids;
}
|
|
267
|
+
// ---------------------------------------------------------------------------
|
|
268
|
+
// Core expansion logic
|
|
269
|
+
// ---------------------------------------------------------------------------
|
|
270
|
+
/**
 * Type guard for the single-definition task format.
 *
 * An entry qualifies when it is a non-null object with a string `id` and
 * with `description`, `vars` and `assert` keys present (their values are
 * not inspected).
 */
export function isSingleTaskDefinition(entry) {
    if (typeof entry !== "object" || entry === null) {
        return false;
    }
    if (!("id" in entry) || typeof entry.id !== "string") {
        return false;
    }
    return ["description", "vars", "assert"].every((key) => key in entry);
}
|
|
283
|
+
/**
 * Type guard for assertions written in the templated format.
 *
 * An assertion is templated when it is a non-null object whose `type` is
 * "llm-rubric", whose `template` is a string and whose `criteria` is an
 * array. Any other value — including `null` and `undefined` — yields
 * `false` instead of throwing.
 *
 * @param entry - Candidate assertion (any value)
 * @returns `true` if `entry` is a templated llm-rubric assertion
 */
export function isTemplatedAssert(entry) {
    // Guard against null/primitive entries before touching properties.
    if (typeof entry !== "object" || entry === null) {
        return false;
    }
    return (entry.type === "llm-rubric" &&
        typeof entry.template === "string" &&
        Array.isArray(entry.criteria));
}
|
|
293
|
+
// ---------------------------------------------------------------------------
|
|
294
|
+
// Task file loading
|
|
295
|
+
// ---------------------------------------------------------------------------
|
|
296
|
+
/**
 * Load every task YAML file under `{rootDir}/tasks` and expand its entries.
 *
 * Entries in the single-definition format (see isSingleTaskDefinition) are
 * expanded through expandTask(); all other entries are treated as legacy
 * and passed through unchanged.
 *
 * Filtering:
 * - `filter.areas` (non-empty): keeps only files whose name without the
 *   `.yaml`/`.yml` extension matches one of the areas, case-insensitively.
 * - `filter.taskIds` (non-empty): keeps only object entries that have an
 *   `id` contained in the list, so entries without an `id` are dropped.
 *
 * @param rootDir - Directory that contains `tasks/` and the rubric config
 * @param filter - Optional `{ areas?, taskIds? }` filter
 * @param mode - Expansion mode forwarded to expandTask() (default 'baseline')
 * @returns `{ entries, stats }`; all stats are zero and `entries` is empty
 *   when the `tasks` directory does not exist
 * @throws Error when a task file does not parse to an array
 */
export function loadAndExpandTasks(rootDir, filter, mode = "baseline") {
    const tasksDir = resolve(rootDir, "tasks");
    if (!existsSync(tasksDir)) {
        // tasks/ may be absent when task definitions come from another source
        return {
            entries: [],
            stats: {
                expandedTotal: 0,
                legacyEntries: 0,
                singleDefinitions: 0,
                totalFiles: 0,
            },
        };
    }
    const rubricConfig = loadRubricTemplates(rootDir);
    const isTaskFile = (name) => !name.startsWith(".") && (name.endsWith(".yaml") || name.endsWith(".yml"));
    let yamlFiles = readdirSync(tasksDir).filter(isTaskFile).sort();
    // Area filter: area name = lower-cased filename stem ("groq" matches "groq.yaml")
    if (filter?.areas && filter.areas.length > 0) {
        const allowedAreas = new Set(filter.areas.map((a) => a.toLowerCase()));
        yamlFiles = yamlFiles.filter((name) => allowedAreas.has(name.replace(/\.ya?ml$/, "").toLowerCase()));
    }
    const allowedIds = filter?.taskIds && filter.taskIds.length > 0
        ? new Set(filter.taskIds)
        : null;
    const matchesIdFilter = (entry) => typeof entry === "object" &&
        entry !== null &&
        "id" in entry &&
        allowedIds.has(entry.id);
    const entries = [];
    const counts = { single: 0, legacy: 0 };
    for (const file of yamlFiles) {
        const parsed = load(readFileSync(resolve(tasksDir, file), "utf-8"));
        if (!Array.isArray(parsed)) {
            throw new Error(`${file} did not parse to an array of tasks`);
        }
        const selected = allowedIds ? parsed.filter(matchesIdFilter) : parsed;
        for (const entry of selected) {
            if (!isSingleTaskDefinition(entry)) {
                // Legacy entry: passed through untouched
                counts.legacy += 1;
                entries.push(entry);
                continue;
            }
            counts.single += 1;
            entries.push(...expandTask(entry, rubricConfig, mode));
        }
    }
    // Report the active filter whenever either filter field is set
    if (filter?.areas || filter?.taskIds) {
        const described = [];
        if (filter.areas) {
            described.push(`areas=[${filter.areas.join(", ")}]`);
        }
        if (filter.taskIds) {
            described.push(`tasks=[${filter.taskIds.join(", ")}]`);
        }
        console.log(` Filter: ${described.join(", ")}`);
    }
    return {
        entries,
        stats: {
            expandedTotal: entries.length,
            legacyEntries: counts.legacy,
            singleDefinitions: counts.single,
            totalFiles: yamlFiles.length,
        },
    };
}
|
|
376
|
+
/** Resolved path of the file the cached rubric config was loaded from. */
let cachedRubricConfigPath = null;
/**
 * Load and validate `config/rubrics.yaml` from the given root directory.
 *
 * The parsed config is cached and reused for subsequent calls that resolve
 * to the same file. A call with a different `rootDir` reloads from that
 * directory instead of returning the config cached for another root.
 * clearRubricCache() forces a reload on the next call.
 *
 * @param rootDir - Directory that contains `config/rubrics.yaml`
 * @returns The validated rubric configuration
 * @throws Error when the file is missing or fails schema validation
 */
export function loadRubricTemplates(rootDir) {
    const filePath = resolve(rootDir, "config", "rubrics.yaml");
    // Serve from cache only when it was populated from this exact file.
    if (cachedRubricConfig && cachedRubricConfigPath === filePath) {
        return cachedRubricConfig;
    }
    if (!existsSync(filePath)) {
        throw new Error(`config/rubrics.yaml not found at ${filePath}`);
    }
    const raw = readFileSync(filePath, "utf-8");
    const parsed = load(raw);
    const result = RubricConfigSchema.safeParse(parsed);
    if (!result.success) {
        // One line per validation issue: "[path.to.field]: message"
        const messages = result.error.issues
            .map((i) => ` [${i.path.join(".")}]: ${i.message}`)
            .join("\n");
        throw new Error(`Invalid config/rubrics.yaml:\n${messages}`);
    }
    cachedRubricConfig = result.data;
    cachedRubricConfigPath = filePath;
    return result.data;
}
|
|
399
|
+
/**
 * Resolve one assertion.
 *
 * Non-templated assertions are returned unchanged (same object). Templated
 * ones are replaced by a plain `llm-rubric` assertion whose `value` is the
 * assembled rubric text; `weight` is carried over when defined, and
 * `metadata: { dimension, maxScore: 100 }` is added when the referenced
 * template declares a `dimension`.
 *
 * @param entry - Assertion from a task definition
 * @param rubricConfig - Loaded rubric configuration
 * @returns The resolved assertion
 */
export function resolveAssert(entry, rubricConfig) {
    if (!isTemplatedAssert(entry)) {
        return entry;
    }
    const dimension = rubricConfig.templates[entry.template]?.dimension;
    const value = assembleRubric(entry.template, entry.criteria, rubricConfig);
    const weightPart = entry.weight !== undefined ? { weight: entry.weight } : {};
    // Structured dimension metadata, only for templates that declare one
    const metadataPart = dimension
        ? { metadata: { dimension, maxScore: 100 } }
        : {};
    return {
        type: "llm-rubric",
        value,
        ...weightPart,
        ...metadataPart,
    };
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/failure-modes.ts
|
|
3
|
+
*
|
|
4
|
+
* Keyword-based failure mode classifier for grader reasoning text,
|
|
5
|
+
* cross-referenced with ceiling decomposition data.
|
|
6
|
+
*
|
|
7
|
+
* Phase 3a of the Scenario Matrix implementation.
|
|
8
|
+
*
|
|
9
|
+
* The classifier uses two signal sources:
|
|
10
|
+
* 1. Keyword matching on grader reason text (primary)
|
|
11
|
+
* 2. Ceiling decomposition structural signals (supplementary)
|
|
12
|
+
*
|
|
13
|
+
* When both sources agree, confidence is boosted. When only ceiling
|
|
14
|
+
* signals are available, they serve as a fallback for unclassified cases.
|
|
15
|
+
*
|
|
16
|
+
* @see docs/exec-plans/completed/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
17
|
+
*/
|
|
18
|
+
import type { FailureMode, FailureModeReport, FeatureScore, GraderJudgment } from "./types.js";
|
|
19
|
+
/**
|
|
20
|
+
* Build a complete failure mode report from grader judgments and scores.
|
|
21
|
+
*
|
|
22
|
+
* @param judgments - All grader judgments from the evaluation
|
|
23
|
+
* @param scores - Per-area feature scores (for ceiling decomposition)
|
|
24
|
+
* @returns Failure mode report with per-area breakdowns
|
|
25
|
+
*/
|
|
26
|
+
export declare function buildFailureModeReport(judgments: GraderJudgment[], scores: FeatureScore[]): FailureModeReport;
|
|
27
|
+
/**
|
|
28
|
+
* Classify the failure mode of a low-scoring grader judgment.
|
|
29
|
+
*
|
|
30
|
+
* Uses keyword matching on the reason text, then cross-references with
|
|
31
|
+
* ceiling decomposition data for structural confirmation.
|
|
32
|
+
*
|
|
33
|
+
* @param judgment - The grader judgment to classify
|
|
34
|
+
* @param ceilingScore - The area's ceiling score (with-docs best case)
|
|
35
|
+
* @param floorScore - The area's floor score (no-docs baseline)
|
|
36
|
+
* @returns Classified failure mode with confidence level
|
|
37
|
+
*/
|
|
38
|
+
export declare function classifyFailureMode(judgment: GraderJudgment, ceilingScore: number, floorScore: number): FailureMode;
|
|
39
|
+
/**
|
|
40
|
+
* Format a failure mode report for console output.
|
|
41
|
+
*/
|
|
42
|
+
export declare function formatFailureModesConsole(report: FailureModeReport): string;
|
|
43
|
+
/**
|
|
44
|
+
* Format a failure mode report as markdown for PR comments.
|
|
45
|
+
*/
|
|
46
|
+
export declare function formatFailureModesMarkdown(report: FailureModeReport): string;
|