@sanity/ailf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -0
- package/bin/ailf.js +64 -0
- package/canonical/grader-references/README.md +88 -0
- package/canonical/grader-references/groq.yaml +234 -0
- package/canonical/grader-references/studio-setup.yaml +275 -0
- package/canonical/reference-solutions/.gitkeep +1 -0
- package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
- package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
- package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
- package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
- package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
- package/canonical/reference-solutions/groq/joins-references.ts +300 -0
- package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
- package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
- package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
- package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
- package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
- package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
- package/config/bigquery/README.md +74 -0
- package/config/bigquery/views/area_scores.sql +87 -0
- package/config/bigquery/views/reports.sql +49 -0
- package/config/features.yaml +116 -0
- package/config/models.yaml +115 -0
- package/config/prompts.yaml +75 -0
- package/config/rubrics.yaml +62 -0
- package/config/schedules.yaml +43 -0
- package/config/sinks.yaml +54 -0
- package/config/sources.yaml +51 -0
- package/config/thresholds.yaml +49 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
- package/dist/_vendor/ailf-core/examples/index.js +285 -0
- package/dist/_vendor/ailf-core/index.d.ts +17 -0
- package/dist/_vendor/ailf-core/index.js +17 -0
- package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
- package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
- package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
- package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
- package/dist/_vendor/ailf-core/ports/context.js +14 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
- package/dist/_vendor/ailf-core/ports/index.js +7 -0
- package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
- package/dist/_vendor/ailf-core/ports/logger.js +11 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
- package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/schemas/index.js +16 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
- package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
- package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
- package/dist/_vendor/ailf-core/services/index.js +12 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/scoring.js +222 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
- package/dist/_vendor/ailf-core/types/index.js +21 -0
- package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
- package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
- package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
- package/dist/_vendor/ailf-shared/document-ref.js +1 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -0
- package/dist/_vendor/ailf-shared/index.js +16 -0
- package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
- package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
- package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
- package/dist/_vendor/ailf-shared/score-grades.js +23 -0
- package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
- package/dist/adapters/cache/content-lake-cache.js +59 -0
- package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
- package/dist/adapters/cache/filesystem-cache.js +54 -0
- package/dist/adapters/cache/index.d.ts +2 -0
- package/dist/adapters/cache/index.js +2 -0
- package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
- package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
- package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
- package/dist/adapters/config-sources/file-config-adapter.js +96 -0
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +2 -0
- package/dist/adapters/doc-fetchers/index.d.ts +1 -0
- package/dist/adapters/doc-fetchers/index.js +1 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
- package/dist/adapters/eval-runners/index.d.ts +1 -0
- package/dist/adapters/eval-runners/index.js +1 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
- package/dist/adapters/index.d.ts +12 -0
- package/dist/adapters/index.js +12 -0
- package/dist/adapters/loggers/console-logger.d.ts +22 -0
- package/dist/adapters/loggers/console-logger.js +54 -0
- package/dist/adapters/loggers/index.d.ts +9 -0
- package/dist/adapters/loggers/index.js +9 -0
- package/dist/adapters/loggers/json-logger.d.ts +18 -0
- package/dist/adapters/loggers/json-logger.js +33 -0
- package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
- package/dist/adapters/loggers/quiet-logger.js +30 -0
- package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/composite-task-source.js +59 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
- package/dist/adapters/task-sources/index.d.ts +7 -0
- package/dist/adapters/task-sources/index.js +7 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
- package/dist/adapters/task-sources/repo-schemas.js +234 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
- package/dist/adapters/task-sources/repo-task-source.js +104 -0
- package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
- package/dist/adapters/task-sources/repo-trigger.js +153 -0
- package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
- package/dist/adapters/task-sources/repo-validation.js +164 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
- package/dist/adapters/task-sources/yaml-task-source.js +136 -0
- package/dist/agent-observer/agentic-provider.d.ts +132 -0
- package/dist/agent-observer/agentic-provider.js +983 -0
- package/dist/agent-observer/classifier.d.ts +62 -0
- package/dist/agent-observer/classifier.js +269 -0
- package/dist/agent-observer/index.d.ts +7 -0
- package/dist/agent-observer/index.js +4 -0
- package/dist/agent-observer/pricing.d.ts +35 -0
- package/dist/agent-observer/pricing.js +82 -0
- package/dist/agent-observer/provider.d.ts +77 -0
- package/dist/agent-observer/provider.js +151 -0
- package/dist/agent-observer/proxy.d.ts +91 -0
- package/dist/agent-observer/proxy.js +321 -0
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/agent-observer/types.d.ts +137 -0
- package/dist/agent-observer/types.js +16 -0
- package/dist/assertions/source-isolation.d.ts +72 -0
- package/dist/assertions/source-isolation.js +117 -0
- package/dist/cli.d.ts +24 -0
- package/dist/cli.js +199 -0
- package/dist/commands/agent-report.d.ts +5 -0
- package/dist/commands/agent-report.js +69 -0
- package/dist/commands/baseline.d.ts +9 -0
- package/dist/commands/baseline.js +141 -0
- package/dist/commands/cache.d.ts +13 -0
- package/dist/commands/cache.js +135 -0
- package/dist/commands/calculate-scores.d.ts +8 -0
- package/dist/commands/calculate-scores.js +48 -0
- package/dist/commands/compare.d.ts +8 -0
- package/dist/commands/compare.js +120 -0
- package/dist/commands/completion.d.ts +18 -0
- package/dist/commands/completion.js +260 -0
- package/dist/commands/coverage-audit.d.ts +7 -0
- package/dist/commands/coverage-audit.js +40 -0
- package/dist/commands/discovery-report.d.ts +10 -0
- package/dist/commands/discovery-report.js +44 -0
- package/dist/commands/eval.d.ts +9 -0
- package/dist/commands/eval.js +35 -0
- package/dist/commands/explain-handler.d.ts +34 -0
- package/dist/commands/explain-handler.js +719 -0
- package/dist/commands/fetch-docs.d.ts +8 -0
- package/dist/commands/fetch-docs.js +128 -0
- package/dist/commands/generate-configs.d.ts +8 -0
- package/dist/commands/generate-configs.js +46 -0
- package/dist/commands/grader/index.d.ts +11 -0
- package/dist/commands/grader/index.js +118 -0
- package/dist/commands/init.d.ts +19 -0
- package/dist/commands/init.js +150 -0
- package/dist/commands/interactive.d.ts +12 -0
- package/dist/commands/interactive.js +238 -0
- package/dist/commands/lookup-doc.d.ts +15 -0
- package/dist/commands/lookup-doc.js +84 -0
- package/dist/commands/measure-retrieval.d.ts +5 -0
- package/dist/commands/measure-retrieval.js +65 -0
- package/dist/commands/pipeline-action.d.ts +71 -0
- package/dist/commands/pipeline-action.js +305 -0
- package/dist/commands/pipeline.d.ts +62 -0
- package/dist/commands/pipeline.js +53 -0
- package/dist/commands/pr-comment.d.ts +8 -0
- package/dist/commands/pr-comment.js +47 -0
- package/dist/commands/publish.d.ts +26 -0
- package/dist/commands/publish.js +253 -0
- package/dist/commands/readiness-report.d.ts +10 -0
- package/dist/commands/readiness-report.js +104 -0
- package/dist/commands/shared/options.d.ts +29 -0
- package/dist/commands/shared/options.js +57 -0
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +16 -0
- package/dist/commands/validate-tasks.js +93 -0
- package/dist/commands/validate.d.ts +9 -0
- package/dist/commands/validate.js +73 -0
- package/dist/commands/webhook-server.d.ts +5 -0
- package/dist/commands/webhook-server.js +30 -0
- package/dist/commands/weekly-digest.d.ts +10 -0
- package/dist/commands/weekly-digest.js +104 -0
- package/dist/composition-root.d.ts +26 -0
- package/dist/composition-root.js +107 -0
- package/dist/interpolate.d.ts +26 -0
- package/dist/interpolate.js +70 -0
- package/dist/job-store.d.ts +104 -0
- package/dist/job-store.js +188 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.d.ts +27 -0
- package/dist/orchestration/build-app-context.js +81 -0
- package/dist/orchestration/build-step-sequence.d.ts +15 -0
- package/dist/orchestration/build-step-sequence.js +84 -0
- package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
- package/dist/orchestration/config-to-source-overrides.js +28 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/index.d.ts +11 -0
- package/dist/orchestration/index.js +11 -0
- package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
- package/dist/orchestration/pipeline-orchestrator.js +153 -0
- package/dist/orchestration/step-runner.d.ts +20 -0
- package/dist/orchestration/step-runner.js +88 -0
- package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
- package/dist/orchestration/steps/calculate-scores-step.js +95 -0
- package/dist/orchestration/steps/callback-step.d.ts +24 -0
- package/dist/orchestration/steps/callback-step.js +76 -0
- package/dist/orchestration/steps/compare-step.d.ts +14 -0
- package/dist/orchestration/steps/compare-step.js +92 -0
- package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
- package/dist/orchestration/steps/discovery-report-step.js +55 -0
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
- package/dist/orchestration/steps/fetch-docs-step.js +135 -0
- package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
- package/dist/orchestration/steps/gap-analysis-step.js +136 -0
- package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
- package/dist/orchestration/steps/generate-configs-step.js +85 -0
- package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
- package/dist/orchestration/steps/grader-consistency-step.js +64 -0
- package/dist/orchestration/steps/index.d.ts +19 -0
- package/dist/orchestration/steps/index.js +19 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
- package/dist/orchestration/steps/publish-report-step.js +216 -0
- package/dist/orchestration/steps/readiness-step.d.ts +13 -0
- package/dist/orchestration/steps/readiness-step.js +91 -0
- package/dist/orchestration/steps/report-step.d.ts +12 -0
- package/dist/orchestration/steps/report-step.js +49 -0
- package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
- package/dist/orchestration/steps/run-eval-step.js +195 -0
- package/dist/orchestration/steps/validate-step.d.ts +12 -0
- package/dist/orchestration/steps/validate-step.js +41 -0
- package/dist/pipeline/agent-behavior-report.d.ts +53 -0
- package/dist/pipeline/agent-behavior-report.js +132 -0
- package/dist/pipeline/attribution.d.ts +47 -0
- package/dist/pipeline/attribution.js +226 -0
- package/dist/pipeline/baseline.d.ts +37 -0
- package/dist/pipeline/baseline.js +141 -0
- package/dist/pipeline/cache.d.ts +101 -0
- package/dist/pipeline/cache.js +283 -0
- package/dist/pipeline/calculate-scores.d.ts +102 -0
- package/dist/pipeline/calculate-scores.js +1128 -0
- package/dist/pipeline/callback-delivery.d.ts +50 -0
- package/dist/pipeline/callback-delivery.js +89 -0
- package/dist/pipeline/checks.d.ts +39 -0
- package/dist/pipeline/checks.js +280 -0
- package/dist/pipeline/classify-url.d.ts +61 -0
- package/dist/pipeline/classify-url.js +93 -0
- package/dist/pipeline/compare.d.ts +31 -0
- package/dist/pipeline/compare.js +208 -0
- package/dist/pipeline/coverage-audit.d.ts +39 -0
- package/dist/pipeline/coverage-audit.js +165 -0
- package/dist/pipeline/degradations.d.ts +85 -0
- package/dist/pipeline/degradations.js +242 -0
- package/dist/pipeline/discovery-report.d.ts +55 -0
- package/dist/pipeline/discovery-report.js +178 -0
- package/dist/pipeline/eval-constants.d.ts +68 -0
- package/dist/pipeline/eval-constants.js +111 -0
- package/dist/pipeline/eval-fingerprint.d.ts +66 -0
- package/dist/pipeline/eval-fingerprint.js +175 -0
- package/dist/pipeline/expand-tasks.d.ts +220 -0
- package/dist/pipeline/expand-tasks.js +421 -0
- package/dist/pipeline/failure-modes.d.ts +46 -0
- package/dist/pipeline/failure-modes.js +348 -0
- package/dist/pipeline/fetch-url-content.d.ts +44 -0
- package/dist/pipeline/fetch-url-content.js +93 -0
- package/dist/pipeline/gap-analysis.d.ts +48 -0
- package/dist/pipeline/gap-analysis.js +231 -0
- package/dist/pipeline/generate-configs.d.ts +72 -0
- package/dist/pipeline/generate-configs.js +395 -0
- package/dist/pipeline/grader-api.d.ts +49 -0
- package/dist/pipeline/grader-api.js +200 -0
- package/dist/pipeline/grader-compare-runner.d.ts +44 -0
- package/dist/pipeline/grader-compare-runner.js +301 -0
- package/dist/pipeline/grader-comparison.d.ts +111 -0
- package/dist/pipeline/grader-comparison.js +161 -0
- package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
- package/dist/pipeline/grader-consistency-runner.js +270 -0
- package/dist/pipeline/grader-consistency.d.ts +103 -0
- package/dist/pipeline/grader-consistency.js +146 -0
- package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
- package/dist/pipeline/grader-sensitivity-runner.js +282 -0
- package/dist/pipeline/grader-sensitivity.d.ts +94 -0
- package/dist/pipeline/grader-sensitivity.js +144 -0
- package/dist/pipeline/grader-validate-runner.d.ts +38 -0
- package/dist/pipeline/grader-validate-runner.js +229 -0
- package/dist/pipeline/grader-validation.d.ts +107 -0
- package/dist/pipeline/grader-validation.js +169 -0
- package/dist/pipeline/map-request-to-config.d.ts +19 -0
- package/dist/pipeline/map-request-to-config.js +80 -0
- package/dist/pipeline/measure-retrieval.d.ts +59 -0
- package/dist/pipeline/measure-retrieval.js +111 -0
- package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
- package/dist/pipeline/mirror-repo-tasks.js +350 -0
- package/dist/pipeline/plan-format.d.ts +33 -0
- package/dist/pipeline/plan-format.js +202 -0
- package/dist/pipeline/plan.d.ts +169 -0
- package/dist/pipeline/plan.js +708 -0
- package/dist/pipeline/pr-comment.d.ts +19 -0
- package/dist/pipeline/pr-comment.js +502 -0
- package/dist/pipeline/probe.d.ts +52 -0
- package/dist/pipeline/probe.js +390 -0
- package/dist/pipeline/provenance.d.ts +47 -0
- package/dist/pipeline/provenance.js +146 -0
- package/dist/pipeline/readiness-report.d.ts +87 -0
- package/dist/pipeline/readiness-report.js +205 -0
- package/dist/pipeline/release-classification.d.ts +54 -0
- package/dist/pipeline/release-classification.js +238 -0
- package/dist/pipeline/release-report.d.ts +37 -0
- package/dist/pipeline/release-report.js +222 -0
- package/dist/pipeline/repo-eval-comment.d.ts +37 -0
- package/dist/pipeline/repo-eval-comment.js +165 -0
- package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
- package/dist/pipeline/repo-threshold-evaluator.js +162 -0
- package/dist/pipeline/resolve-mappings.d.ts +35 -0
- package/dist/pipeline/resolve-mappings.js +72 -0
- package/dist/pipeline/retrieval-metrics.d.ts +39 -0
- package/dist/pipeline/retrieval-metrics.js +136 -0
- package/dist/pipeline/reverse-mapping.d.ts +67 -0
- package/dist/pipeline/reverse-mapping.js +88 -0
- package/dist/pipeline/schemas.d.ts +9 -0
- package/dist/pipeline/schemas.js +9 -0
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +88 -0
- package/dist/pipeline/targeted-loo.js +203 -0
- package/dist/pipeline/thresholds.d.ts +27 -0
- package/dist/pipeline/thresholds.js +245 -0
- package/dist/pipeline/types.d.ts +10 -0
- package/dist/pipeline/types.js +10 -0
- package/dist/pipeline/validate.d.ts +67 -0
- package/dist/pipeline/validate.js +406 -0
- package/dist/pipeline/webhook-server.d.ts +37 -0
- package/dist/pipeline/webhook-server.js +133 -0
- package/dist/report-store.d.ts +84 -0
- package/dist/report-store.js +208 -0
- package/dist/sanity/client.d.ts +38 -0
- package/dist/sanity/client.js +86 -0
- package/dist/sanity/portable-text.d.ts +11 -0
- package/dist/sanity/portable-text.js +211 -0
- package/dist/sanity/queries.d.ts +133 -0
- package/dist/sanity/queries.js +300 -0
- package/dist/schedules/digest.d.ts +116 -0
- package/dist/schedules/digest.js +156 -0
- package/dist/schedules/index.d.ts +12 -0
- package/dist/schedules/index.js +10 -0
- package/dist/schedules/loader.d.ts +31 -0
- package/dist/schedules/loader.js +73 -0
- package/dist/schedules/schema.d.ts +9 -0
- package/dist/schedules/schema.js +9 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +21 -0
- package/dist/scripts/validate-task-sources.js +210 -0
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/bigquery/index.d.ts +131 -0
- package/dist/sinks/bigquery/index.js +222 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/index.d.ts +23 -0
- package/dist/sinks/index.js +18 -0
- package/dist/sinks/loader.d.ts +18 -0
- package/dist/sinks/loader.js +82 -0
- package/dist/sinks/retry.d.ts +24 -0
- package/dist/sinks/retry.js +52 -0
- package/dist/sinks/schema.d.ts +9 -0
- package/dist/sinks/schema.js +9 -0
- package/dist/sinks/slack/format.d.ts +65 -0
- package/dist/sinks/slack/format.js +327 -0
- package/dist/sinks/slack/index.d.ts +27 -0
- package/dist/sinks/slack/index.js +78 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +59 -0
- package/dist/sinks/types.js +44 -0
- package/dist/sinks/webhook/index.d.ts +19 -0
- package/dist/sinks/webhook/index.js +50 -0
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/sources.d.ts +104 -0
- package/dist/sources.js +292 -0
- package/dist/webhook/budget.d.ts +42 -0
- package/dist/webhook/budget.js +60 -0
- package/dist/webhook/debounce.d.ts +67 -0
- package/dist/webhook/debounce.js +76 -0
- package/dist/webhook/dispatch.d.ts +45 -0
- package/dist/webhook/dispatch.js +84 -0
- package/dist/webhook/eval-request-handler.d.ts +87 -0
- package/dist/webhook/eval-request-handler.js +181 -0
- package/dist/webhook/handler.d.ts +88 -0
- package/dist/webhook/handler.js +203 -0
- package/dist/webhook/index.d.ts +17 -0
- package/dist/webhook/index.js +12 -0
- package/dist/webhook/types.d.ts +109 -0
- package/dist/webhook/types.js +10 -0
- package/package.json +72 -0
- package/tasks/.expanded.agentic.yaml +51 -0
- package/tasks/.expanded.yaml +66 -0
- package/tasks/frameworks.yaml +98 -0
- package/tasks/functions.yaml +51 -0
- package/tasks/groq.yaml +216 -0
- package/tasks/nextjs-live.yaml +62 -0
- package/tasks/studio-setup.yaml +111 -0
- package/tasks/visual-editing.yaml +120 -0
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Baseline.ts
|
|
3
|
+
*
|
|
4
|
+
* Manages historical baseline snapshots of evaluation scores.
|
|
5
|
+
* Allows saving, comparing, and listing score baselines over time.
|
|
6
|
+
*
|
|
7
|
+
* Usage:
|
|
8
|
+
* pnpm baseline:save # save current scores as baseline
|
|
9
|
+
* pnpm baseline:save --tag "pre-groq" # save with a descriptive tag
|
|
10
|
+
* pnpm baseline:compare # compare current vs latest baseline
|
|
11
|
+
* pnpm baseline:history # list all saved baselines
|
|
12
|
+
*/
|
|
13
|
+
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
|
|
14
|
+
import { dirname, join, resolve } from "path";
|
|
15
|
+
import { fileURLToPath } from "url";
|
|
16
|
+
// ES modules have no built-in __dirname, so derive it from this module's own URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
17
|
+
// Two directory levels above this module (presumably the package root — confirm against the build layout).
const ROOT = resolve(__dirname, "..", "..");
|
|
18
|
+
// Directory that holds the saved baseline snapshot files (*.json).
const BASELINES_DIR = join(ROOT, "results", "baselines");
|
|
19
|
+
// Current score summary; this is the file that gets snapshotted and compared against baselines.
const SCORE_SUMMARY_PATH = join(ROOT, "results", "latest", "score-summary.json");
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
// Compare
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
/**
 * Compare the current score summary against a saved baseline.
 *
 * Per-feature scores (and costs, when present) from the current
 * score-summary.json are diffed against the chosen baseline file. Features
 * that exist only in the current summary are compared against a baseline
 * score of 0; features that exist only in the baseline are reported with a
 * current score of 0 (no cost fields are attached to those entries).
 *
 * @param baselineFile - Optional filename inside the baselines directory.
 *   When null/undefined, the first entry returned by listBaselines() is used.
 * @returns On failure (no current summary, no saved baselines, or the
 *   requested file is missing): `{ message, success: false }`. On success:
 *   `{ comparisons, message, overallDelta, success: true }`, where
 *   `comparisons` is sorted by descending score delta and `overallDelta` is
 *   the difference between the individually rounded overall averages.
 * @throws Errors from readFileSync / JSON.parse propagate unchanged (for
 *   example when a file is unreadable or contains invalid JSON).
 */
export function compareBaseline(baselineFile) {
    if (!existsSync(SCORE_SUMMARY_PATH)) {
        return {
            message: "No current score-summary.json found.",
            success: false,
        };
    }
    // Find baseline to compare against
    const baselines = listBaselines();
    if (baselines.length === 0) {
        return {
            message: "No baselines saved yet. Run 'pnpm baseline:save' first.",
            success: false,
        };
    }
    const targetFile = baselineFile ?? baselines[0].filename;
    const baselinePath = join(BASELINES_DIR, targetFile);
    if (!existsSync(baselinePath)) {
        return {
            message: `Baseline file not found: ${targetFile}`,
            success: false,
        };
    }
    const current = JSON.parse(readFileSync(SCORE_SUMMARY_PATH, "utf-8"));
    const baseline = JSON.parse(readFileSync(baselinePath, "utf-8"));
    const baselineMap = new Map(baseline.scores.map((s) => [s.feature, s.totalScore]));
    const baselineCostMap = new Map(baseline.scores.map((s) => [s.feature, s.totalCost ?? 0]));
    const comparisons = current.scores.map((s) => {
        const baseScore = baselineMap.get(s.feature) ?? 0;
        const currentCost = s.totalCost ?? 0;
        const baseCost = baselineCostMap.get(s.feature) ?? 0;
        return {
            baseline: baseScore,
            // A cost of 0 is treated the same as "no cost recorded".
            costBaseline: baseCost > 0 ? baseCost : undefined,
            costCurrent: currentCost > 0 ? currentCost : undefined,
            costDelta: currentCost > 0 || baseCost > 0 ? currentCost - baseCost : undefined,
            current: s.totalScore,
            delta: s.totalScore - baseScore,
            feature: s.feature,
        };
    });
    // Check for areas in baseline but not in current. A Set of the current
    // feature names gives O(1) membership checks instead of rescanning the
    // comparisons array for every baseline entry.
    const currentFeatures = new Set(current.scores.map((s) => s.feature));
    for (const [feature, score] of baselineMap) {
        if (!currentFeatures.has(feature)) {
            comparisons.push({
                baseline: score,
                current: 0,
                delta: -score,
                feature,
            });
        }
    }
    // Largest improvement first, largest regression last.
    comparisons.sort((a, b) => b.delta - a.delta);
    const overallDelta = Math.round(current.overall.avgScore) - Math.round(baseline.overall.avgScore);
    return {
        comparisons,
        message: `Compared against ${targetFile}`,
        overallDelta,
        success: true,
    };
}
|
|
84
|
+
/**
 * List the saved baseline snapshots.
 *
 * Reads every `*.json` file in the baselines directory, in reverse
 * lexicographic filename order (newest first when filenames start with a
 * timestamp), and returns a summary record for each.
 *
 * @returns An array of `{ areaCount, avgScore, filename, graderCost, tag,
 *   timestamp, totalCost }`; an empty array when the baselines directory does
 *   not exist. `graderCost`, `tag` and `totalCost` are undefined when the
 *   snapshot does not carry them.
 * @throws Errors from readFileSync / JSON.parse propagate unchanged, as does
 *   a TypeError when a snapshot lacks `scores` or `overall`.
 */
export function listBaselines() {
    if (!existsSync(BASELINES_DIR)) {
        return [];
    }
    const jsonNames = readdirSync(BASELINES_DIR).filter((name) => name.endsWith(".json"));
    jsonNames.sort();
    jsonNames.reverse(); // Newest first
    const summaries = [];
    for (const filename of jsonNames) {
        const data = JSON.parse(readFileSync(join(BASELINES_DIR, filename), "utf-8"));
        summaries.push({
            areaCount: data.scores.length,
            avgScore: Math.round(data.overall.avgScore),
            filename,
            graderCost: data.overall.cost?.graderTotal,
            tag: data.baselineMeta?.tag,
            timestamp: data.timestamp,
            totalCost: data.overall.cost?.total,
        });
    }
    return summaries;
}
|
|
106
|
+
/**
 * Snapshot the current score summary into the baselines directory.
 *
 * The snapshot is the parsed content of SCORE_SUMMARY_PATH plus a
 * `baselineMeta` object holding the save time and the (optional) tag.
 *
 * @param tag Optional label. For the filename it is sanitized (anything other
 *   than letters, digits and "-" becomes "-", then lowercased); inside the
 *   snapshot it is stored unmodified. An empty string counts as "no tag".
 * @returns `{ message, success }`. `success` is false, and nothing is
 *   written, when the score summary file does not exist.
 */
export function saveBaseline(tag) {
    if (!existsSync(SCORE_SUMMARY_PATH)) {
        return {
            message: "No score-summary.json found. Run 'pnpm calculate-scores' first.",
            success: false,
        };
    }
    const summary = JSON.parse(readFileSync(SCORE_SUMMARY_PATH, "utf-8"));
    mkdirSync(BASELINES_DIR, { recursive: true });
    const savedAtIso = new Date().toISOString();
    // Filename stem is YYYYMMDD_HH_mm_ss in UTC, e.g.
    // "2024-01-02T03:04:05.678Z" -> "20240102_03_04_05".
    const stamp = savedAtIso
        .slice(0, 19)
        .split("-")
        .join("")
        .replace(/[T:]/g, "_");
    let suffix = "";
    if (tag) {
        suffix = `_${tag.replace(/[^a-z0-9-]/gi, "-").toLowerCase()}`;
    }
    const filename = `${stamp}${suffix}.json`;
    const snapshot = {
        ...summary,
        baselineMeta: {
            savedAt: savedAtIso,
            // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string tag should be treated as no tag
            tag: tag || undefined,
        },
    };
    writeFileSync(join(BASELINES_DIR, filename), JSON.stringify(snapshot, null, 2));
    const roundedAvg = Math.round(summary.overall.avgScore);
    const areaTotal = summary.scores.length;
    return {
        message: `Saved baseline to results/baselines/${filename} (avg: ${roundedAvg}, ${areaTotal} areas)`,
        success: true,
    };
}
|
|
141
|
+
// ---------------------------------------------------------------------------
|
|
142
|
+
// CLI
|
|
143
|
+
// ---------------------------------------------------------------------------
|
|
144
|
+
// Run the CLI only when this module is the script being executed
// (its path ends with baseline.ts or baseline.js).
if (["baseline.ts", "baseline.js"].some((suffix) => process.argv[1]?.endsWith(suffix))) {
    const cliArgs = process.argv.slice(2);
    // First positional argument selects the command; no arguments means "save".
    const command = cliArgs[0] || "save";
    /** Token following `--<name>`, or undefined when the flag is absent or is the last argument. */
    const optionValue = (name) => {
        const flagIndex = cliArgs.indexOf(`--${name}`);
        if (flagIndex === -1 || flagIndex + 1 >= cliArgs.length) {
            return undefined;
        }
        return cliArgs[flagIndex + 1];
    };
    /** Trend marker for a numeric delta: up, down, or unchanged. */
    const trendIcon = (delta) => {
        if (delta > 0) {
            return "📈";
        }
        if (delta < 0) {
            return "📉";
        }
        return "➡️";
    };
    /** Score delta as text: "+N" for gains, "=" for zero, plain number otherwise. */
    const formatPoints = (delta) => {
        if (delta > 0) {
            return `+${delta}`;
        }
        if (delta === 0) {
            return "=";
        }
        return String(delta);
    };
    /** Cost delta as signed dollars with four decimals, or "=" when unchanged. */
    const formatCostDelta = (delta) => {
        if (delta > 0) {
            return `+$${delta.toFixed(4)}`;
        }
        if (delta < 0) {
            return `-$${Math.abs(delta).toFixed(4)}`;
        }
        return "=";
    };
    /** Column header plus rule shared by the score and cost comparison tables. */
    const printComparisonHeader = () => {
        console.log(` ${"Feature Area".padEnd(18)}${"Current".padEnd(10)}${"Baseline".padEnd(10)}Delta`);
        console.log(` ${"-".repeat(50)}`);
    };
    /** One table row; `current` and `baseline` are already-formatted strings. */
    const printComparisonRow = (feature, current, baseline, delta, deltaText) => {
        console.log(` ${feature.padEnd(18)}${current.padEnd(10)}${baseline.padEnd(10)}${trendIcon(delta)} ${deltaText}`);
    };
    /** `compare [--file <name>]`: print score (and, if present, cost) deltas against a baseline. */
    const runCompare = () => {
        const file = optionValue("file");
        console.log("=== Baseline Comparison ===\n");
        const result = compareBaseline(file);
        if (!result.success) {
            console.error(` ❌ ${result.message}`);
            process.exit(1);
        }
        console.log(` ${result.message}\n`);
        printComparisonHeader();
        for (const row of result.comparisons) {
            printComparisonRow(row.feature, String(row.current), String(row.baseline), row.delta, formatPoints(row.delta));
        }
        // Cost table is printed only for rows that carry cost data on either side.
        const costRows = result.comparisons.filter((row) => row.costCurrent !== undefined || row.costBaseline !== undefined);
        if (costRows.length > 0) {
            console.log();
            console.log(" Cost Comparison:");
            printComparisonHeader();
            for (const row of costRows) {
                const costDelta = row.costDelta ?? 0;
                printComparisonRow(row.feature, `$${(row.costCurrent ?? 0).toFixed(4)}`, `$${(row.costBaseline ?? 0).toFixed(4)}`, costDelta, formatCostDelta(costDelta));
            }
        }
        console.log();
        console.log(` Overall: ${trendIcon(result.overallDelta)} ${formatPoints(result.overallDelta)} points`);
    };
    /** `history`: list saved baselines, adding a cost column when any baseline has cost data. */
    const runHistory = () => {
        console.log("=== Baseline History ===\n");
        const baselines = listBaselines();
        if (baselines.length === 0) {
            console.log(" No baselines saved yet.");
            return;
        }
        const showCosts = baselines.some((entry) => entry.totalCost !== undefined || entry.graderCost !== undefined);
        const costHeading = showCosts ? "Cost".padEnd(10) : "";
        console.log(` ${"Date".padEnd(22)}${"Avg".padEnd(6)}${"Areas".padEnd(7)}${costHeading}Tag`);
        console.log(` ${"-".repeat(showCosts ? 60 : 50)}`);
        for (const entry of baselines) {
            const when = new Date(entry.timestamp).toLocaleString();
            let costCell = "";
            if (showCosts) {
                // Displayed cost is total + grader cost; "-" when the sum is not positive.
                const combined = (entry.totalCost ?? 0) + (entry.graderCost ?? 0);
                costCell = (combined > 0 ? `$${combined.toFixed(2)}` : "-").padEnd(10);
            }
            console.log(` ${when.padEnd(22)}${String(entry.avgScore).padEnd(6)}${String(entry.areaCount).padEnd(7)}${costCell}${entry.tag ?? ""}`);
        }
    };
    /** `save [--tag <label>]`: snapshot the current score summary. */
    const runSave = () => {
        const tag = optionValue("tag");
        console.log("=== Saving baseline snapshot ===\n");
        const outcome = saveBaseline(tag);
        if (outcome.success) {
            console.log(` ✅ ${outcome.message}`);
        }
        else {
            console.error(` ❌ ${outcome.message}`);
            process.exit(1);
        }
    };
    if (command === "compare") {
        runCompare();
    }
    else if (command === "history") {
        runHistory();
    }
    else if (command === "save") {
        runSave();
    }
    else {
        console.error(`Unknown command: "${command}". Use: save, history, compare`);
        process.exit(1);
    }
}
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Calculate-scores.ts
|
|
3
|
+
*
|
|
4
|
+
* Reads Promptfoo evaluation output and computes the AI Literacy Score
|
|
5
|
+
* for each feature area. Each dimension is scored on a uniform 0–100 scale:
|
|
6
|
+
*
|
|
7
|
+
* Task Completion (0–100) — Can the LLM implement the feature?
|
|
8
|
+
* Code Correctness (0–100) — Is the code idiomatic and correct?
|
|
9
|
+
* Doc Coverage (0–100) — Did docs provide the needed info?
|
|
10
|
+
*
|
|
11
|
+
* Dimensions are combined into a weighted composite (0–100) using weights
|
|
12
|
+
* from config/rubrics.yaml (default: Task×0.50 + Code×0.25 + Docs×0.25).
|
|
13
|
+
*
|
|
14
|
+
* Additionally compares with-docs vs without-docs scores to calculate
|
|
15
|
+
* the "Doc Lift" — how much documentation helps vs parametric knowledge.
|
|
16
|
+
*
|
|
17
|
+
* When tests are run with the InstrumentedProvider (agent-observer),
|
|
18
|
+
* this script also aggregates and reports agent behavior data: which
|
|
19
|
+
* documentation pages were visited, what searches were performed, and
|
|
20
|
+
* overall network activity patterns.
|
|
21
|
+
*/
|
|
22
|
+
import "dotenv/config";
|
|
23
|
+
import type { FeatureScore, GraderJudgment, PerModelEntry } from "../pipeline/types.js";
|
|
24
|
+
/**
 * A single entry of a grading result's `componentResults` array.
 *
 * Only `pass` is required; the assertion descriptor, reason text and
 * numeric score are all optional.
 */
export interface ComponentResult {
    pass: boolean;
    score?: number;
    reason?: string;
    /** Descriptor of the assertion behind this result, when available. */
    assertion?: {
        type: string;
        value?: string;
        /** Structured metadata propagated from rubric templates (Approach 5). */
        metadata?: { [key: string]: unknown };
    };
}
|
|
35
|
+
/**
 * The `results` object of a raw Promptfoo output file: the per-test results
 * plus run-level statistics.
 */
export interface PromptfooResultsWrapper {
    results: Array<RawTestResult>;
    stats: {
        successes: number;
        failures: number;
        /** Token counts for the run; `assertions` is a nested, optional breakdown with the same three counters. */
        tokenUsage?: {
            total: number;
            prompt: number;
            completion: number;
            assertions?: {
                total: number;
                prompt: number;
                completion: number;
            };
        };
    };
}
|
|
52
|
+
/**
 * Top-level shape of a Promptfoo output file as read from disk.
 *
 * Only `results` and the optional `config.defaultTest.options` provider
 * fields are typed; any other top-level key is allowed as `unknown`.
 */
export interface RawPromptfooFile {
    results: PromptfooResultsWrapper;
    config?: {
        defaultTest?: {
            options?: {
                provider?: string;
                rubricProvider?: string;
            };
        };
    };
    /** Any additional top-level keys are accepted but untyped. */
    [key: string]: unknown;
}
|
|
64
|
+
/**
 * One test result exactly as it appears in Promptfoo output.
 *
 * Unlike `TestResult`, `gradingResult` may be null, `cost` is optional, and
 * the provider and test case are still nested objects.
 */
export interface RawTestResult {
    response: {
        output: string;
    };
    vars: { [name: string]: string };
    /** Null when no grading result is attached to the test. */
    gradingResult: {
        componentResults: ComponentResult[];
        pass: boolean;
    } | null;
    cost?: number;
    error?: string;
    metadata?: { [key: string]: unknown };
    provider?: {
        id?: string;
        label?: string;
    };
    testCase?: {
        description?: string;
        vars?: { [name: string]: string };
    };
}
|
|
85
|
+
/**
 * A test result in the shape the scoring functions consume.
 *
 * Compared with `RawTestResult`: `cost`, `description` and `gradingResult`
 * are always present, and the provider is flattened into `providerId` /
 * `providerLabel`.
 */
export interface TestResult {
    description: string;
    cost: number;
    vars: { [name: string]: string };
    response: {
        output: string;
    };
    gradingResult: {
        pass: boolean;
        componentResults: ComponentResult[];
    };
    /** Provider identifier (e.g., "openai:gpt-4o") */
    providerId?: string;
    /** Provider label (e.g., "GPT-4o") */
    providerLabel?: string;
    metadata?: { [key: string]: unknown };
}
|
|
102
|
+
/**
 * Structured URL data returned by `extractUrlMetadata`: URLs split into
 * Sanity and other URLs, with counts.
 */
export interface UrlMetadata {
    sanityUrls: Array<string>;
    otherUrls: Array<string>;
    // NOTE(review): presumably sanityUrlCount === sanityUrls.length and
    // totalUrlCount covers both lists — confirm against the implementation.
    sanityUrlCount: number;
    totalUrlCount: number;
}
|
|
108
|
+
/**
|
|
109
|
+
* Calculate scores grouped by model. Each model gets its own FeatureScore[]
|
|
110
|
+
* and model-level aggregates.
|
|
111
|
+
*
|
|
112
|
+
* Uses the provider.id from Promptfoo results to identify models.
|
|
113
|
+
* Falls back to provider.label, then "unknown" if neither is available.
|
|
114
|
+
*
|
|
115
|
+
* @returns Array of per-model entries, or null if only one model was used
|
|
116
|
+
* (per-model breakdown is redundant when there's only one model).
|
|
117
|
+
*/
|
|
118
|
+
export declare function calculateScoresPerModel(resultsPath: string, weights: Record<string, number>): null | PerModelEntry[];
|
|
119
|
+
/**
 * Map a grading component to one of the three scoring dimensions
 * (`taskCompletion`, `codeCorrectness`, `docCoverage`).
 *
 * @returns The dimension key, or null.
 *   NOTE(review): null presumably means the component is not one of the
 *   scored rubrics — confirm against the implementation.
 */
export declare function classifyRubric(component: ComponentResult): "codeCorrectness" | "docCoverage" | "taskCompletion" | null;
|
|
120
|
+
/**
 * Derive a feature-area name from a description string.
 *
 * NOTE(review): the matching rules and the value returned for an
 * unrecognized description are not visible in this declaration file —
 * confirm against the implementation.
 */
export declare function detectFeatureArea(description: string): string;
|
|
121
|
+
/**
|
|
122
|
+
* Extract grader judgments (reason text + scores) from evaluation results.
|
|
123
|
+
*
|
|
124
|
+
* This preserves the grader's natural language reasoning for downstream
|
|
125
|
+
* analysis (failure mode classification, gap analysis). Each llm-rubric
|
|
126
|
+
* assertion produces one GraderJudgment entry.
|
|
127
|
+
*
|
|
128
|
+
* Phase 3a prerequisite: structured judgment data for failure mode extraction.
|
|
129
|
+
*/
|
|
130
|
+
export declare function extractGraderJudgments(resultsPath: string): GraderJudgment[];
|
|
131
|
+
/**
|
|
132
|
+
* Finds the URL-extraction assertion result in a test's componentResults
|
|
133
|
+
* and parses the structured JSON from its `reason` field.
|
|
134
|
+
*/
|
|
135
|
+
export declare function extractUrlMetadata(test: TestResult): null | UrlMetadata;
|
|
136
|
+
/**
 * Extract a numeric score from a grading component.
 *
 * NOTE(review): presumably on the 0–100 scale described in the file header,
 * with some fallback when `component.score` is absent — confirm against the
 * implementation.
 */
export declare function parseRubricScore(component: ComponentResult): number;
|
|
137
|
+
/**
|
|
138
|
+
* Score agentic evaluation results. In agentic mode, all test entries are
|
|
139
|
+
* gold-only (no baseline entries — the .expanded.agentic.yaml fix ensures this).
|
|
140
|
+
* The model retrieves docs via tools, so all results map to "actual" scores.
|
|
141
|
+
*
|
|
142
|
+
* Returns a record keyed by feature area with the composite actual score.
|
|
143
|
+
*/
|
|
144
|
+
/**
 * Per-feature-area value of the record returned by `scoreAgenticResults`
 * and accepted by `mergeScores`.
 */
interface ActualScoreEntry {
    /** Composite "actual" score for the feature area. */
    actualScore: number;
    // Per-dimension scores feeding the composite.
    taskCompletion: number;
    codeCorrectness: number;
    docCoverage: number;
    // NOTE(review): presumably the number of tests aggregated and their
    // summed cost — confirm against the implementation.
    testCount: number;
    totalCost: number;
}
|
|
152
|
+
/**
 * Score agentic evaluation results.
 *
 * @param resultsPath Path to the evaluation results to score.
 * @param weights Numeric weights keyed by name.
 *   NOTE(review): presumably the per-dimension weights from
 *   config/rubrics.yaml — confirm against the caller.
 * @returns A record keyed by feature area, one `ActualScoreEntry` per area.
 */
export declare function scoreAgenticResults(resultsPath: string, weights: Record<string, number>): Record<string, ActualScoreEntry>;
|
|
153
|
+
/**
|
|
154
|
+
* Merge baseline FeatureScore[] with agentic actual scores to produce
|
|
155
|
+
* the full three-layer decomposition.
|
|
156
|
+
*
|
|
157
|
+
* The merge is per feature area. For each area:
|
|
158
|
+
* - If baseline data exists: floor, ceiling, docLift, docQualityGap are populated
|
|
159
|
+
* - If agentic data exists: actualScore is populated
|
|
160
|
+
* - If both exist: retrievalGap and infrastructureEfficiency are computed
|
|
161
|
+
*
|
|
162
|
+
* @param baselineScores Floor/ceiling scores from baseline evaluation (may be empty)
|
|
163
|
+
* @param agenticScores Actual scores from agentic evaluation (may be empty)
|
|
164
|
+
*/
|
|
165
|
+
export declare function mergeScores(baselineScores: FeatureScore[], agenticScores: Record<string, ActualScoreEntry>): FeatureScore[];
|
|
166
|
+
export {};
|