@sanity/ailf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -0
- package/bin/ailf.js +64 -0
- package/canonical/grader-references/README.md +88 -0
- package/canonical/grader-references/groq.yaml +234 -0
- package/canonical/grader-references/studio-setup.yaml +275 -0
- package/canonical/reference-solutions/.gitkeep +1 -0
- package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
- package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
- package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
- package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
- package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
- package/canonical/reference-solutions/groq/joins-references.ts +300 -0
- package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
- package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
- package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
- package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
- package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
- package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
- package/config/bigquery/README.md +74 -0
- package/config/bigquery/views/area_scores.sql +87 -0
- package/config/bigquery/views/reports.sql +49 -0
- package/config/features.yaml +116 -0
- package/config/models.yaml +115 -0
- package/config/prompts.yaml +75 -0
- package/config/rubrics.yaml +62 -0
- package/config/schedules.yaml +43 -0
- package/config/sinks.yaml +54 -0
- package/config/sources.yaml +51 -0
- package/config/thresholds.yaml +49 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
- package/dist/_vendor/ailf-core/examples/index.js +285 -0
- package/dist/_vendor/ailf-core/index.d.ts +17 -0
- package/dist/_vendor/ailf-core/index.js +17 -0
- package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
- package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
- package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
- package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
- package/dist/_vendor/ailf-core/ports/context.js +14 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
- package/dist/_vendor/ailf-core/ports/index.js +7 -0
- package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
- package/dist/_vendor/ailf-core/ports/logger.js +11 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
- package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/schemas/index.js +16 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
- package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
- package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
- package/dist/_vendor/ailf-core/services/index.js +12 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/scoring.js +222 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
- package/dist/_vendor/ailf-core/types/index.js +21 -0
- package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
- package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
- package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
- package/dist/_vendor/ailf-shared/document-ref.js +1 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -0
- package/dist/_vendor/ailf-shared/index.js +16 -0
- package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
- package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
- package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
- package/dist/_vendor/ailf-shared/score-grades.js +23 -0
- package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
- package/dist/adapters/cache/content-lake-cache.js +59 -0
- package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
- package/dist/adapters/cache/filesystem-cache.js +54 -0
- package/dist/adapters/cache/index.d.ts +2 -0
- package/dist/adapters/cache/index.js +2 -0
- package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
- package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
- package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
- package/dist/adapters/config-sources/file-config-adapter.js +96 -0
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +2 -0
- package/dist/adapters/doc-fetchers/index.d.ts +1 -0
- package/dist/adapters/doc-fetchers/index.js +1 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
- package/dist/adapters/eval-runners/index.d.ts +1 -0
- package/dist/adapters/eval-runners/index.js +1 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
- package/dist/adapters/index.d.ts +12 -0
- package/dist/adapters/index.js +12 -0
- package/dist/adapters/loggers/console-logger.d.ts +22 -0
- package/dist/adapters/loggers/console-logger.js +54 -0
- package/dist/adapters/loggers/index.d.ts +9 -0
- package/dist/adapters/loggers/index.js +9 -0
- package/dist/adapters/loggers/json-logger.d.ts +18 -0
- package/dist/adapters/loggers/json-logger.js +33 -0
- package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
- package/dist/adapters/loggers/quiet-logger.js +30 -0
- package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/composite-task-source.js +59 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
- package/dist/adapters/task-sources/index.d.ts +7 -0
- package/dist/adapters/task-sources/index.js +7 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
- package/dist/adapters/task-sources/repo-schemas.js +234 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
- package/dist/adapters/task-sources/repo-task-source.js +104 -0
- package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
- package/dist/adapters/task-sources/repo-trigger.js +153 -0
- package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
- package/dist/adapters/task-sources/repo-validation.js +164 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
- package/dist/adapters/task-sources/yaml-task-source.js +136 -0
- package/dist/agent-observer/agentic-provider.d.ts +132 -0
- package/dist/agent-observer/agentic-provider.js +983 -0
- package/dist/agent-observer/classifier.d.ts +62 -0
- package/dist/agent-observer/classifier.js +269 -0
- package/dist/agent-observer/index.d.ts +7 -0
- package/dist/agent-observer/index.js +4 -0
- package/dist/agent-observer/pricing.d.ts +35 -0
- package/dist/agent-observer/pricing.js +82 -0
- package/dist/agent-observer/provider.d.ts +77 -0
- package/dist/agent-observer/provider.js +151 -0
- package/dist/agent-observer/proxy.d.ts +91 -0
- package/dist/agent-observer/proxy.js +321 -0
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/agent-observer/types.d.ts +137 -0
- package/dist/agent-observer/types.js +16 -0
- package/dist/assertions/source-isolation.d.ts +72 -0
- package/dist/assertions/source-isolation.js +117 -0
- package/dist/cli.d.ts +24 -0
- package/dist/cli.js +199 -0
- package/dist/commands/agent-report.d.ts +5 -0
- package/dist/commands/agent-report.js +69 -0
- package/dist/commands/baseline.d.ts +9 -0
- package/dist/commands/baseline.js +141 -0
- package/dist/commands/cache.d.ts +13 -0
- package/dist/commands/cache.js +135 -0
- package/dist/commands/calculate-scores.d.ts +8 -0
- package/dist/commands/calculate-scores.js +48 -0
- package/dist/commands/compare.d.ts +8 -0
- package/dist/commands/compare.js +120 -0
- package/dist/commands/completion.d.ts +18 -0
- package/dist/commands/completion.js +260 -0
- package/dist/commands/coverage-audit.d.ts +7 -0
- package/dist/commands/coverage-audit.js +40 -0
- package/dist/commands/discovery-report.d.ts +10 -0
- package/dist/commands/discovery-report.js +44 -0
- package/dist/commands/eval.d.ts +9 -0
- package/dist/commands/eval.js +35 -0
- package/dist/commands/explain-handler.d.ts +34 -0
- package/dist/commands/explain-handler.js +719 -0
- package/dist/commands/fetch-docs.d.ts +8 -0
- package/dist/commands/fetch-docs.js +128 -0
- package/dist/commands/generate-configs.d.ts +8 -0
- package/dist/commands/generate-configs.js +46 -0
- package/dist/commands/grader/index.d.ts +11 -0
- package/dist/commands/grader/index.js +118 -0
- package/dist/commands/init.d.ts +19 -0
- package/dist/commands/init.js +150 -0
- package/dist/commands/interactive.d.ts +12 -0
- package/dist/commands/interactive.js +238 -0
- package/dist/commands/lookup-doc.d.ts +15 -0
- package/dist/commands/lookup-doc.js +84 -0
- package/dist/commands/measure-retrieval.d.ts +5 -0
- package/dist/commands/measure-retrieval.js +65 -0
- package/dist/commands/pipeline-action.d.ts +71 -0
- package/dist/commands/pipeline-action.js +305 -0
- package/dist/commands/pipeline.d.ts +62 -0
- package/dist/commands/pipeline.js +53 -0
- package/dist/commands/pr-comment.d.ts +8 -0
- package/dist/commands/pr-comment.js +47 -0
- package/dist/commands/publish.d.ts +26 -0
- package/dist/commands/publish.js +253 -0
- package/dist/commands/readiness-report.d.ts +10 -0
- package/dist/commands/readiness-report.js +104 -0
- package/dist/commands/shared/options.d.ts +29 -0
- package/dist/commands/shared/options.js +57 -0
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +16 -0
- package/dist/commands/validate-tasks.js +93 -0
- package/dist/commands/validate.d.ts +9 -0
- package/dist/commands/validate.js +73 -0
- package/dist/commands/webhook-server.d.ts +5 -0
- package/dist/commands/webhook-server.js +30 -0
- package/dist/commands/weekly-digest.d.ts +10 -0
- package/dist/commands/weekly-digest.js +104 -0
- package/dist/composition-root.d.ts +26 -0
- package/dist/composition-root.js +107 -0
- package/dist/interpolate.d.ts +26 -0
- package/dist/interpolate.js +70 -0
- package/dist/job-store.d.ts +104 -0
- package/dist/job-store.js +188 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.d.ts +27 -0
- package/dist/orchestration/build-app-context.js +81 -0
- package/dist/orchestration/build-step-sequence.d.ts +15 -0
- package/dist/orchestration/build-step-sequence.js +84 -0
- package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
- package/dist/orchestration/config-to-source-overrides.js +28 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/index.d.ts +11 -0
- package/dist/orchestration/index.js +11 -0
- package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
- package/dist/orchestration/pipeline-orchestrator.js +153 -0
- package/dist/orchestration/step-runner.d.ts +20 -0
- package/dist/orchestration/step-runner.js +88 -0
- package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
- package/dist/orchestration/steps/calculate-scores-step.js +95 -0
- package/dist/orchestration/steps/callback-step.d.ts +24 -0
- package/dist/orchestration/steps/callback-step.js +76 -0
- package/dist/orchestration/steps/compare-step.d.ts +14 -0
- package/dist/orchestration/steps/compare-step.js +92 -0
- package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
- package/dist/orchestration/steps/discovery-report-step.js +55 -0
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
- package/dist/orchestration/steps/fetch-docs-step.js +135 -0
- package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
- package/dist/orchestration/steps/gap-analysis-step.js +136 -0
- package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
- package/dist/orchestration/steps/generate-configs-step.js +85 -0
- package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
- package/dist/orchestration/steps/grader-consistency-step.js +64 -0
- package/dist/orchestration/steps/index.d.ts +19 -0
- package/dist/orchestration/steps/index.js +19 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
- package/dist/orchestration/steps/publish-report-step.js +216 -0
- package/dist/orchestration/steps/readiness-step.d.ts +13 -0
- package/dist/orchestration/steps/readiness-step.js +91 -0
- package/dist/orchestration/steps/report-step.d.ts +12 -0
- package/dist/orchestration/steps/report-step.js +49 -0
- package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
- package/dist/orchestration/steps/run-eval-step.js +195 -0
- package/dist/orchestration/steps/validate-step.d.ts +12 -0
- package/dist/orchestration/steps/validate-step.js +41 -0
- package/dist/pipeline/agent-behavior-report.d.ts +53 -0
- package/dist/pipeline/agent-behavior-report.js +132 -0
- package/dist/pipeline/attribution.d.ts +47 -0
- package/dist/pipeline/attribution.js +226 -0
- package/dist/pipeline/baseline.d.ts +37 -0
- package/dist/pipeline/baseline.js +141 -0
- package/dist/pipeline/cache.d.ts +101 -0
- package/dist/pipeline/cache.js +283 -0
- package/dist/pipeline/calculate-scores.d.ts +102 -0
- package/dist/pipeline/calculate-scores.js +1128 -0
- package/dist/pipeline/callback-delivery.d.ts +50 -0
- package/dist/pipeline/callback-delivery.js +89 -0
- package/dist/pipeline/checks.d.ts +39 -0
- package/dist/pipeline/checks.js +280 -0
- package/dist/pipeline/classify-url.d.ts +61 -0
- package/dist/pipeline/classify-url.js +93 -0
- package/dist/pipeline/compare.d.ts +31 -0
- package/dist/pipeline/compare.js +208 -0
- package/dist/pipeline/coverage-audit.d.ts +39 -0
- package/dist/pipeline/coverage-audit.js +165 -0
- package/dist/pipeline/degradations.d.ts +85 -0
- package/dist/pipeline/degradations.js +242 -0
- package/dist/pipeline/discovery-report.d.ts +55 -0
- package/dist/pipeline/discovery-report.js +178 -0
- package/dist/pipeline/eval-constants.d.ts +68 -0
- package/dist/pipeline/eval-constants.js +111 -0
- package/dist/pipeline/eval-fingerprint.d.ts +66 -0
- package/dist/pipeline/eval-fingerprint.js +175 -0
- package/dist/pipeline/expand-tasks.d.ts +220 -0
- package/dist/pipeline/expand-tasks.js +421 -0
- package/dist/pipeline/failure-modes.d.ts +46 -0
- package/dist/pipeline/failure-modes.js +348 -0
- package/dist/pipeline/fetch-url-content.d.ts +44 -0
- package/dist/pipeline/fetch-url-content.js +93 -0
- package/dist/pipeline/gap-analysis.d.ts +48 -0
- package/dist/pipeline/gap-analysis.js +231 -0
- package/dist/pipeline/generate-configs.d.ts +72 -0
- package/dist/pipeline/generate-configs.js +395 -0
- package/dist/pipeline/grader-api.d.ts +49 -0
- package/dist/pipeline/grader-api.js +200 -0
- package/dist/pipeline/grader-compare-runner.d.ts +44 -0
- package/dist/pipeline/grader-compare-runner.js +301 -0
- package/dist/pipeline/grader-comparison.d.ts +111 -0
- package/dist/pipeline/grader-comparison.js +161 -0
- package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
- package/dist/pipeline/grader-consistency-runner.js +270 -0
- package/dist/pipeline/grader-consistency.d.ts +103 -0
- package/dist/pipeline/grader-consistency.js +146 -0
- package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
- package/dist/pipeline/grader-sensitivity-runner.js +282 -0
- package/dist/pipeline/grader-sensitivity.d.ts +94 -0
- package/dist/pipeline/grader-sensitivity.js +144 -0
- package/dist/pipeline/grader-validate-runner.d.ts +38 -0
- package/dist/pipeline/grader-validate-runner.js +229 -0
- package/dist/pipeline/grader-validation.d.ts +107 -0
- package/dist/pipeline/grader-validation.js +169 -0
- package/dist/pipeline/map-request-to-config.d.ts +19 -0
- package/dist/pipeline/map-request-to-config.js +80 -0
- package/dist/pipeline/measure-retrieval.d.ts +59 -0
- package/dist/pipeline/measure-retrieval.js +111 -0
- package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
- package/dist/pipeline/mirror-repo-tasks.js +350 -0
- package/dist/pipeline/plan-format.d.ts +33 -0
- package/dist/pipeline/plan-format.js +202 -0
- package/dist/pipeline/plan.d.ts +169 -0
- package/dist/pipeline/plan.js +708 -0
- package/dist/pipeline/pr-comment.d.ts +19 -0
- package/dist/pipeline/pr-comment.js +502 -0
- package/dist/pipeline/probe.d.ts +52 -0
- package/dist/pipeline/probe.js +390 -0
- package/dist/pipeline/provenance.d.ts +47 -0
- package/dist/pipeline/provenance.js +146 -0
- package/dist/pipeline/readiness-report.d.ts +87 -0
- package/dist/pipeline/readiness-report.js +205 -0
- package/dist/pipeline/release-classification.d.ts +54 -0
- package/dist/pipeline/release-classification.js +238 -0
- package/dist/pipeline/release-report.d.ts +37 -0
- package/dist/pipeline/release-report.js +222 -0
- package/dist/pipeline/repo-eval-comment.d.ts +37 -0
- package/dist/pipeline/repo-eval-comment.js +165 -0
- package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
- package/dist/pipeline/repo-threshold-evaluator.js +162 -0
- package/dist/pipeline/resolve-mappings.d.ts +35 -0
- package/dist/pipeline/resolve-mappings.js +72 -0
- package/dist/pipeline/retrieval-metrics.d.ts +39 -0
- package/dist/pipeline/retrieval-metrics.js +136 -0
- package/dist/pipeline/reverse-mapping.d.ts +67 -0
- package/dist/pipeline/reverse-mapping.js +88 -0
- package/dist/pipeline/schemas.d.ts +9 -0
- package/dist/pipeline/schemas.js +9 -0
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +88 -0
- package/dist/pipeline/targeted-loo.js +203 -0
- package/dist/pipeline/thresholds.d.ts +27 -0
- package/dist/pipeline/thresholds.js +245 -0
- package/dist/pipeline/types.d.ts +10 -0
- package/dist/pipeline/types.js +10 -0
- package/dist/pipeline/validate.d.ts +67 -0
- package/dist/pipeline/validate.js +406 -0
- package/dist/pipeline/webhook-server.d.ts +37 -0
- package/dist/pipeline/webhook-server.js +133 -0
- package/dist/report-store.d.ts +84 -0
- package/dist/report-store.js +208 -0
- package/dist/sanity/client.d.ts +38 -0
- package/dist/sanity/client.js +86 -0
- package/dist/sanity/portable-text.d.ts +11 -0
- package/dist/sanity/portable-text.js +211 -0
- package/dist/sanity/queries.d.ts +133 -0
- package/dist/sanity/queries.js +300 -0
- package/dist/schedules/digest.d.ts +116 -0
- package/dist/schedules/digest.js +156 -0
- package/dist/schedules/index.d.ts +12 -0
- package/dist/schedules/index.js +10 -0
- package/dist/schedules/loader.d.ts +31 -0
- package/dist/schedules/loader.js +73 -0
- package/dist/schedules/schema.d.ts +9 -0
- package/dist/schedules/schema.js +9 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +21 -0
- package/dist/scripts/validate-task-sources.js +210 -0
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/bigquery/index.d.ts +131 -0
- package/dist/sinks/bigquery/index.js +222 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/index.d.ts +23 -0
- package/dist/sinks/index.js +18 -0
- package/dist/sinks/loader.d.ts +18 -0
- package/dist/sinks/loader.js +82 -0
- package/dist/sinks/retry.d.ts +24 -0
- package/dist/sinks/retry.js +52 -0
- package/dist/sinks/schema.d.ts +9 -0
- package/dist/sinks/schema.js +9 -0
- package/dist/sinks/slack/format.d.ts +65 -0
- package/dist/sinks/slack/format.js +327 -0
- package/dist/sinks/slack/index.d.ts +27 -0
- package/dist/sinks/slack/index.js +78 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +59 -0
- package/dist/sinks/types.js +44 -0
- package/dist/sinks/webhook/index.d.ts +19 -0
- package/dist/sinks/webhook/index.js +50 -0
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/sources.d.ts +104 -0
- package/dist/sources.js +292 -0
- package/dist/webhook/budget.d.ts +42 -0
- package/dist/webhook/budget.js +60 -0
- package/dist/webhook/debounce.d.ts +67 -0
- package/dist/webhook/debounce.js +76 -0
- package/dist/webhook/dispatch.d.ts +45 -0
- package/dist/webhook/dispatch.js +84 -0
- package/dist/webhook/eval-request-handler.d.ts +87 -0
- package/dist/webhook/eval-request-handler.js +181 -0
- package/dist/webhook/handler.d.ts +88 -0
- package/dist/webhook/handler.js +203 -0
- package/dist/webhook/index.d.ts +17 -0
- package/dist/webhook/index.js +12 -0
- package/dist/webhook/types.d.ts +109 -0
- package/dist/webhook/types.js +10 -0
- package/package.json +72 -0
- package/tasks/.expanded.agentic.yaml +51 -0
- package/tasks/.expanded.yaml +66 -0
- package/tasks/frameworks.yaml +98 -0
- package/tasks/functions.yaml +51 -0
- package/tasks/groq.yaml +216 -0
- package/tasks/nextjs-live.yaml +62 -0
- package/tasks/studio-setup.yaml +111 -0
- package/tasks/visual-editing.yaml +120 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/eval-fingerprint.ts
|
|
3
|
+
*
|
|
4
|
+
* Computes a deterministic fingerprint of all inputs that affect evaluation
|
|
5
|
+
* output. Used for cross-environment cache lookup: when running in CI, the
|
|
6
|
+
* pipeline can query the Sanity Content Lake for a previous report with an
|
|
7
|
+
* identical fingerprint and skip the expensive eval step.
|
|
8
|
+
*
|
|
9
|
+
* The fingerprint captures everything that would change evaluation results:
|
|
10
|
+
* - Evaluation mode (baseline, observed, agentic)
|
|
11
|
+
* - Model configuration (which models, their settings)
|
|
12
|
+
* - Grader model identity (different graders score differently)
|
|
13
|
+
* - Prompt templates (different instructions → different outputs)
|
|
14
|
+
* - Rubric templates (different criteria → different scores)
|
|
15
|
+
* - Task definitions (what's being evaluated)
|
|
16
|
+
* - Reference solutions (used by grader assertions)
|
|
17
|
+
* - Documentation content (the docs being evaluated — the primary variable)
|
|
18
|
+
* - Filter flags (which subset of tasks is included)
|
|
19
|
+
*
|
|
20
|
+
* The fingerprint intentionally EXCLUDES:
|
|
21
|
+
* - Source name/URL (content matters, not origin)
|
|
22
|
+
* - Git metadata (informational, not eval-affecting)
|
|
23
|
+
* - Trigger type (manual vs CI → same inputs → same results)
|
|
24
|
+
* - Report tags (human labels)
|
|
25
|
+
*
|
|
26
|
+
* @see docs/design-docs/content-lake-eval-caching.md
|
|
27
|
+
*/
|
|
28
|
+
import type { EvalMode, FilterOptions } from "./types.js";
|
|
29
|
+
/** Inputs needed to compute an evaluation fingerprint. */
|
|
30
|
+
export interface FingerprintInput {
|
|
31
|
+
/** Filter options (areas, taskIds) — determines which tasks are included */
|
|
32
|
+
filter?: FilterOptions;
|
|
33
|
+
/** Grader model identifier (e.g., "anthropic:messages:claude-opus-4-5-20251101") */
|
|
34
|
+
graderModel: string;
|
|
35
|
+
/** Evaluation mode */
|
|
36
|
+
mode: EvalMode;
|
|
37
|
+
/** Path to the packages/eval root directory */
|
|
38
|
+
rootDir: string;
|
|
39
|
+
}
|
|
40
|
+
/**
|
|
41
|
+
* Collect all file paths that contribute to the evaluation fingerprint.
|
|
42
|
+
*
|
|
43
|
+
* This is similar to `getStepInputPaths()` in `cache.ts` but is more
|
|
44
|
+
* comprehensive and explicitly designed for cross-environment cache keys:
|
|
45
|
+
*
|
|
46
|
+
* - Includes `config/prompts.yaml` and `config/rubrics.yaml` directly
|
|
47
|
+
* (the local cache only includes them indirectly via generated configs)
|
|
48
|
+
* - Includes `config/models.yaml` (model configuration)
|
|
49
|
+
* - Includes task definitions and reference solutions
|
|
50
|
+
* - Includes the actual documentation content (contexts/canonical/*.md)
|
|
51
|
+
* - Respects filter flags to only include relevant files
|
|
52
|
+
*/
|
|
53
|
+
export declare function collectFingerprintInputPaths(rootDir: string, filter?: FilterOptions): string[];
|
|
54
|
+
/**
|
|
55
|
+
* Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
|
|
56
|
+
*
|
|
57
|
+
* The fingerprint is content-addressed: identical inputs always produce
|
|
58
|
+
* the same fingerprint, regardless of the environment (local, CI, etc.).
|
|
59
|
+
*
|
|
60
|
+
* Reuses the existing `hashFiles()` from `cache.ts` to hash file content,
|
|
61
|
+
* and adds non-file context (mode, grader model, filter flags) as
|
|
62
|
+
* additional context strings.
|
|
63
|
+
*
|
|
64
|
+
* @returns SHA-256 hex string (64 characters)
|
|
65
|
+
*/
|
|
66
|
+
export declare function computeEvalFingerprint(input: FingerprintInput): string;
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/eval-fingerprint.ts
|
|
3
|
+
*
|
|
4
|
+
* Computes a deterministic fingerprint of all inputs that affect evaluation
|
|
5
|
+
* output. Used for cross-environment cache lookup: when running in CI, the
|
|
6
|
+
* pipeline can query the Sanity Content Lake for a previous report with an
|
|
7
|
+
* identical fingerprint and skip the expensive eval step.
|
|
8
|
+
*
|
|
9
|
+
* The fingerprint captures everything that would change evaluation results:
|
|
10
|
+
* - Evaluation mode (baseline, observed, agentic)
|
|
11
|
+
* - Model configuration (which models, their settings)
|
|
12
|
+
* - Grader model identity (different graders score differently)
|
|
13
|
+
* - Prompt templates (different instructions → different outputs)
|
|
14
|
+
* - Rubric templates (different criteria → different scores)
|
|
15
|
+
* - Task definitions (what's being evaluated)
|
|
16
|
+
* - Reference solutions (used by grader assertions)
|
|
17
|
+
* - Documentation content (the docs being evaluated — the primary variable)
|
|
18
|
+
* - Filter flags (which subset of tasks is included)
|
|
19
|
+
*
|
|
20
|
+
* The fingerprint intentionally EXCLUDES:
|
|
21
|
+
* - Source name/URL (content matters, not origin)
|
|
22
|
+
* - Git metadata (informational, not eval-affecting)
|
|
23
|
+
* - Trigger type (manual vs CI → same inputs → same results)
|
|
24
|
+
* - Report tags (human labels)
|
|
25
|
+
*
|
|
26
|
+
* @see docs/design-docs/content-lake-eval-caching.md
|
|
27
|
+
*/
|
|
28
|
+
import { existsSync, readdirSync, statSync } from "fs";
|
|
29
|
+
import { join, resolve } from "path";
|
|
30
|
+
import { hashFiles } from "./cache.js";
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
// Constants
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
/**
 * Version tag mixed into every fingerprint hash. Changing it invalidates all
 * fingerprints already stored in the Content Lake, with no need to clear the
 * store. Bump it whenever a new input is added to the hash.
 */
const FINGERPRINT_VERSION = "eval-fingerprint-v1";
|
|
40
|
+
/**
 * Collect all file paths that contribute to the evaluation fingerprint.
 *
 * This is similar to `getStepInputPaths()` in `cache.ts` but is more
 * comprehensive and explicitly designed for cross-environment cache keys:
 *
 * - Includes `config/prompts.yaml` and `config/rubrics.yaml` directly
 *   (the local cache only includes them indirectly via generated configs)
 * - Includes `config/models.yaml` (model configuration)
 * - Includes task definitions and reference solutions
 * - Includes the actual documentation content (contexts/canonical/*.md)
 * - Respects the area filter when selecting task files
 *
 * Paths are emitted group by group: config files, task files, reference
 * solutions, then canonical context files. Task and context file names are
 * sorted here so their order does not depend on the order in which the
 * filesystem lists directory entries; the order of reference-solution paths
 * is whatever `collectFilesRecursive()` yields. Missing files and
 * directories are skipped silently.
 *
 * @param rootDir - Directory that all relative input locations are resolved against
 * @param filter - Optional filter options; only `areas` affects the file list
 * @returns Paths of the existing input files
 */
export function collectFingerprintInputPaths(rootDir, filter) {
    const r = (rel) => resolve(rootDir, rel);
    const paths = [];

    // -----------------------------------------------------------------------
    // Config files — always included (when present on disk)
    // -----------------------------------------------------------------------
    const configFiles = [
        "config/models.yaml",
        "config/prompts.yaml",
        "config/rubrics.yaml",
    ];
    for (const f of configFiles) {
        const p = r(f);
        if (existsSync(p)) {
            paths.push(p);
        }
    }

    // -----------------------------------------------------------------------
    // Task files — filtered if --area is set
    // -----------------------------------------------------------------------
    const tasksDir = r("tasks");
    if (existsSync(tasksDir)) {
        // Loop-invariant: an absent or empty area list means "no filtering".
        const areas = filter?.areas && filter.areas.length > 0 ? filter.areas : undefined;
        const taskFiles = readdirSync(tasksDir)
            .filter((f) => f.endsWith(".yaml") || f.endsWith(".yml"))
            .filter((f) => !f.startsWith(".")) // exclude .expanded.yaml
            // Directory listing order is not guaranteed; sort so the path list
            // is stable, matching how context files are handled below.
            .sort();
        for (const f of taskFiles) {
            // If area filter is set, only include matching task files
            if (areas) {
                const stem = f.replace(/\.ya?ml$/, "");
                if (!areas.includes(stem)) {
                    continue;
                }
            }
            paths.push(join(tasksDir, f));
        }
    }

    // -----------------------------------------------------------------------
    // Reference solutions — all included (they're referenced by tasks)
    // -----------------------------------------------------------------------
    const refDir = r("canonical/reference-solutions");
    if (existsSync(refDir)) {
        collectFilesRecursive(refDir, paths);
    }

    // -----------------------------------------------------------------------
    // Canonical context files — the documentation content being evaluated
    // This is the KEY differentiator from the local cache (which doesn't
    // include Sanity document content in the fetch-docs cache key).
    // -----------------------------------------------------------------------
    const canonicalDir = r("contexts/canonical");
    if (existsSync(canonicalDir)) {
        const contextFiles = readdirSync(canonicalDir)
            .filter((f) => f.endsWith(".md"))
            .sort();
        for (const f of contextFiles) {
            // If area or task filter is set, we include all context files anyway
            // because context filenames map to task IDs, and task-to-area mapping
            // requires reading the YAML. It's safer to include all — a superset
            // doesn't cause false cache hits, only potential false misses when
            // a non-matching context changes. This is acceptable: the filter
            // flags in the context strings differentiate the fingerprints.
            paths.push(join(canonicalDir, f));
        }
    }

    return paths;
}
|
|
116
|
+
/**
 * Produce a deterministic SHA-256 fingerprint covering all evaluation inputs.
 *
 * The value is content-addressed: the same inputs yield the same
 * fingerprint no matter where it is computed (local, CI, etc.).
 *
 * File content is hashed through the existing `hashFiles()` from `cache.ts`;
 * the non-file inputs (mode, grader model, filter flags) are handed to it
 * as additional context strings.
 *
 * @param input - Mode, grader model, optional filter, and root directory
 * @returns SHA-256 hex string (64 characters)
 */
export function computeEvalFingerprint(input) {
    const scopedAreas = input.filter?.areas ?? [];
    const scopedTaskIds = input.filter?.taskIds ?? [];

    // Non-file inputs. The filter flags are appended only when non-empty, so
    // scoped runs get a different fingerprint than unscoped ones. Each list
    // is copied before sorting to leave the caller's arrays untouched.
    const contextStrings = [
        FINGERPRINT_VERSION,
        `mode:${input.mode}`,
        `grader:${input.graderModel}`,
        ...(scopedAreas.length > 0 ? [`areas:${[...scopedAreas].sort().join(",")}`] : []),
        ...(scopedTaskIds.length > 0 ? [`tasks:${[...scopedTaskIds].sort().join(",")}`] : []),
    ];

    // Every file that affects eval output.
    const inputPaths = collectFingerprintInputPaths(input.rootDir, input.filter);

    // File content and context strings are hashed together.
    return hashFiles(inputPaths, contextStrings);
}
|
|
154
|
+
// ---------------------------------------------------------------------------
|
|
155
|
+
// Helpers
|
|
156
|
+
// ---------------------------------------------------------------------------
|
|
157
|
+
/**
|
|
158
|
+
* Recursively collect all file paths under a directory.
|
|
159
|
+
* Skips hidden files and directories (starting with '.').
|
|
160
|
+
*/
|
|
161
|
+
function collectFilesRecursive(dir, paths) {
|
|
162
|
+
const entries = readdirSync(dir);
|
|
163
|
+
for (const entry of entries) {
|
|
164
|
+
if (entry.startsWith("."))
|
|
165
|
+
continue;
|
|
166
|
+
const fullPath = join(dir, entry);
|
|
167
|
+
const stat = statSync(fullPath);
|
|
168
|
+
if (stat.isDirectory()) {
|
|
169
|
+
collectFilesRecursive(fullPath, paths);
|
|
170
|
+
}
|
|
171
|
+
else if (stat.isFile()) {
|
|
172
|
+
paths.push(fullPath);
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
}
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/expand-tasks.ts
|
|
3
|
+
*
|
|
4
|
+
* Reads task YAML files in the single-definition format and expands each
|
|
5
|
+
* task into gold + baseline Promptfoo test entries. This eliminates the
|
|
6
|
+
* manual duplication where every task had to be written twice.
|
|
7
|
+
*
|
|
8
|
+
* Rubric templates from config/rubrics.yaml are resolved at expansion time:
|
|
9
|
+
* tasks specify `template` + `criteria`, and the expander assembles
|
|
10
|
+
* the full rubric text by injecting criteria into the template.
|
|
11
|
+
*
|
|
12
|
+
* Structured dimension metadata (Approach 5):
|
|
13
|
+
* When a rubric template has a `dimension` field, the resolved assertion
|
|
14
|
+
* includes `metadata.dimension` and `metadata.maxScore`. This flows through
|
|
15
|
+
* Promptfoo into component results, allowing the scoring engine to classify
|
|
16
|
+
* rubrics structurally instead of via heuristic string matching.
|
|
17
|
+
* See docs/design-docs/structured-dimensions.md.
|
|
18
|
+
*
|
|
19
|
+
* Single-definition format:
|
|
20
|
+
* - id: groq-blog-queries
|
|
21
|
+
* description: "GROQ - Blog queries with filtering and pagination"
|
|
22
|
+
* doc_coverage: true
|
|
23
|
+
* vars:
|
|
24
|
+
* task: |
|
|
25
|
+
* Write GROQ queries for a Sanity blog application: ...
|
|
26
|
+
* docs: file://contexts/canonical/groq-blog-queries.md
|
|
27
|
+
* assert:
|
|
28
|
+
* - type: llm-rubric
|
|
29
|
+
* template: task-completion
|
|
30
|
+
* criteria:
|
|
31
|
+
* - GROQ filter with _type == "post"
|
|
32
|
+
* - Projection with aliased slug field
|
|
33
|
+
* - type: contains-any
|
|
34
|
+
* value: ["client.fetch", "createClient"]
|
|
35
|
+
* baseline:
|
|
36
|
+
* enabled: true
|
|
37
|
+
* rubric: abbreviated
|
|
38
|
+
*
|
|
39
|
+
* Expands to:
|
|
40
|
+
* 1. Gold entry — uses vars.docs as-is, resolves templates, appends doc-coverage
|
|
41
|
+
* 2. Baseline entry — sets docs: "", adds transform, uses abbreviated rubric
|
|
42
|
+
*/
|
|
43
|
+
import type { TaskDefinition } from "../_vendor/ailf-core/index.d.ts";
|
|
44
|
+
import { type RubricConfig } from "./schemas.js";
|
|
45
|
+
import type { FilterOptions } from "./types.js";
|
|
46
|
+
/** An assertion entry of either kind: templated or value-based. */
export type AssertEntry = TemplatedAssert | ValueAssert;
|
|
48
|
+
/** Output shape: one Promptfoo-compatible test entry. */
export interface ExpandedTestEntry {
    assert?: ValueAssert[];
    description: string;
    /**
     * Promptfoo prompt filter limiting which prompts this test runs against,
     * matched on prompt `id` or `label`. If omitted, every prompt is allowed
     * (Promptfoo's default cartesian product behavior).
     *
     * Gold entries use `['with-docs']` (ceiling measurement).
     * Baseline entries use `['without-docs']` (floor measurement).
     *
     * See: evaluation-ceiling.md for the floor/ceiling/actual decomposition.
     */
    prompts?: string[];
    vars: Record<string, unknown>;
}
|
|
65
|
+
/** Task entry in the old paired format, recognizable by its lack of an `id` field. */
export interface LegacyTaskEntry {
    assert?: AssertEntry[];
    description: string;
    transform?: string;
    vars?: Record<string, unknown>;
}
|
|
72
|
+
/** Input shape: one task written in the new single-definition format. */
export interface SingleTaskDefinition {
    /** Grading assertions (applied to gold; optionally abbreviated for baseline). */
    assert: AssertEntry[];
    /** Options controlling generation of the baseline variant. */
    baseline?: {
        /** Whether a baseline variant is generated. Default: true. */
        enabled?: boolean;
        /**
         * Rubric mode: 'full' copies all asserts, 'abbreviated' generates a
         * summary rubric, 'none' omits rubric asserts. Default: 'abbreviated'.
         */
        rubric?: "abbreviated" | "full" | "none";
    };
    /** Human-readable description of what this task tests. */
    description: string;
    /** Opt-in: auto-generate a documentation coverage rubric for gold. */
    doc_coverage?: boolean;
    /** Explicit task ID — determines the canonical context filename. */
    id: string;
    /** Template variables: the task prompt and the docs path. */
    vars: {
        task: string;
        docs: string;
        [key: string]: unknown;
    };
}
|
|
97
|
+
/** Assertion that points at a rubric template rather than carrying a value. */
export interface TemplatedAssert {
    criteria: string[];
    template: string;
    type: "llm-rubric";
    weight?: number;
}
|
|
104
|
+
/** Standard assertion that carries a value; additional keys are permitted. */
export interface ValueAssert {
    [key: string]: unknown;
    type: string;
    value?: unknown;
    weight?: number;
}
|
|
111
|
+
/**
 * Build the complete rubric text from a rubric template and a list of criteria.
 *
 * Output format:
 *   {header}
 *   - {scale[0]}
 *   - {scale[1]}
 *   ...
 *
 *   {criteria_label}
 *   - {criteria[0]}
 *   - {criteria[1]}
 *   ...
 *
 *   {footer}
 *
 * @param templateKey - Key of the rubric template to use
 * @param criteria - Criteria lines injected into the template
 * @param rubricConfig - Rubric configuration holding the templates
 * @returns The assembled rubric text
 */
export declare function assembleRubric(templateKey: string, criteria: string[], rubricConfig: RubricConfig): string;
|
|
128
|
+
/**
 * Derive the baseline assertions from the gold assertions, per rubric mode.
 *
 * - 'full': Copy all assertions as-is
 * - 'abbreviated': Keep only the first llm-rubric (task completion) with
 *   a shortened prompt, plus all non-rubric assertions
 * - 'none': No assertions at all
 *
 * NOTE(review): the `baseline.rubric` docs on `SingleTaskDefinition` describe
 * 'none' as omitting rubric asserts only — confirm which description matches
 * the implementation.
 *
 * @param goldAsserts - Resolved assertions of the gold entry
 * @param mode - Rubric mode for the baseline entry
 * @returns Assertions to attach to the baseline entry
 */
export declare function buildBaselineAsserts(goldAsserts: ValueAssert[], mode: "abbreviated" | "full" | "none"): ValueAssert[];
|
|
137
|
+
/**
 * Drop the cached rubric config. Intended for use in tests.
 */
export declare function clearRubricCache(): void;
|
|
141
|
+
/**
 * Turn one task definition into its gold + baseline Promptfoo test entries.
 * Yields 1 entry (gold only) when baseline is disabled, 2 entries otherwise.
 *
 * Templated assertions are resolved, and doc-coverage is appended if the
 * task opted in.
 *
 * @param task - The single-definition task to expand
 * @param rubricConfig - Rubric configuration used to resolve templated assertions
 * @param mode - Controls which entries are generated and how:
 *   - `'baseline'` (default): Gold + baseline entries with `prompts` filter
 *     to prevent cartesian product with multiple prompts. Gold entries get
 *     `prompts: ['with-docs']`, baseline entries get `prompts: ['without-docs']`.
 *   - `'agentic'`: Gold entries only, no `prompts` filter (agentic mode has
 *     a single prompt that doesn't use `{{docs}}`; baseline entries would be
 *     pure waste — identical prompts, wasted API calls).
 */
export declare function expandTask(task: SingleTaskDefinition, rubricConfig: RubricConfig, mode?: "agentic" | "baseline"): ExpandedTestEntry[];
|
|
156
|
+
/**
 * Expand TaskDefinition[] values (from any TaskSource adapter) into
 * Promptfoo-compatible test entries. TaskSource-aware counterpart of
 * loadAndExpandTasks(): it skips YAML file I/O and operates directly on the
 * canonical domain type.
 *
 * @param tasks - Task definitions from any TaskSource adapter
 * @param rootDir - Eval package root (needed to load rubric templates)
 * @param mode - Expansion mode: 'baseline' (gold + baseline) or 'agentic' (gold only)
 * @returns Expanded test entries and statistics
 */
export declare function expandTaskDefinitions(tasks: TaskDefinition[], rootDir: string, mode?: "agentic" | "baseline"): {
    entries: ExpandedTestEntry[];
    stats: {
        totalTasks: number;
        expandedTotal: number;
    };
};
|
|
174
|
+
/**
 * Collect every task ID found in the task files. Only entries in the new
 * single-definition format (those carrying an `id` field) are covered.
 *
 * @param rootDir - Root directory containing the task files
 */
export declare function extractTaskIds(rootDir: string): string[];
|
|
179
|
+
/**
 * Type guard for the new single-definition format.
 * What sets such an entry apart is the presence of an `id` field.
 */
export declare function isSingleTaskDefinition(entry: unknown): entry is SingleTaskDefinition;
|
|
184
|
+
/**
 * Type guard telling whether an assertion is written in the templated format.
 */
export declare function isTemplatedAssert(entry: AssertEntry): entry is TemplatedAssert;
|
|
188
|
+
/**
 * Read every task file in the tasks/ directory and expand its entries.
 * Both the new single-definition format (has `id`) and the legacy paired
 * format (no `id`) are accepted. Legacy entries pass through unchanged.
 *
 * @param rootDir - Root directory containing the tasks/ directory
 * @param filter - Optional filter options
 * @param mode - Controls expansion behavior:
 *   - `'baseline'` (default): Gold + baseline entries with prompt filters.
 *   - `'agentic'`: Gold entries only, no prompt filters.
 *
 * Returns the expanded entries grouped by source file.
 */
export declare function loadAndExpandTasks(rootDir: string, filter?: FilterOptions, mode?: "agentic" | "baseline"): {
    /** All expanded test entries, in order. */
    entries: ExpandedTestEntry[];
    /** Statistics about what was processed. */
    stats: {
        totalFiles: number;
        singleDefinitions: number;
        legacyEntries: number;
        expandedTotal: number;
    };
};
|
|
210
|
+
/**
 * Read and validate config/rubrics.yaml under the given root directory.
 * The result is cached for later calls that use the same rootDir.
 */
export declare function loadRubricTemplates(rootDir: string): RubricConfig;
|
|
215
|
+
/**
 * Resolve one assertion. A templated assertion has its rubric text
 * assembled, and gains structured dimension metadata when its template has
 * a `dimension` field. Any other assertion is passed through unchanged.
 */
export declare function resolveAssert(entry: AssertEntry, rubricConfig: RubricConfig): ValueAssert;
|