@sanity/ailf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -0
- package/bin/ailf.js +64 -0
- package/canonical/grader-references/README.md +88 -0
- package/canonical/grader-references/groq.yaml +234 -0
- package/canonical/grader-references/studio-setup.yaml +275 -0
- package/canonical/reference-solutions/.gitkeep +1 -0
- package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
- package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
- package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
- package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
- package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
- package/canonical/reference-solutions/groq/joins-references.ts +300 -0
- package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
- package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
- package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
- package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
- package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
- package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
- package/config/bigquery/README.md +74 -0
- package/config/bigquery/views/area_scores.sql +87 -0
- package/config/bigquery/views/reports.sql +49 -0
- package/config/features.yaml +116 -0
- package/config/models.yaml +115 -0
- package/config/prompts.yaml +75 -0
- package/config/rubrics.yaml +62 -0
- package/config/schedules.yaml +43 -0
- package/config/sinks.yaml +54 -0
- package/config/sources.yaml +51 -0
- package/config/thresholds.yaml +49 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
- package/dist/_vendor/ailf-core/examples/index.js +285 -0
- package/dist/_vendor/ailf-core/index.d.ts +17 -0
- package/dist/_vendor/ailf-core/index.js +17 -0
- package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
- package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
- package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
- package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
- package/dist/_vendor/ailf-core/ports/context.js +14 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
- package/dist/_vendor/ailf-core/ports/index.js +7 -0
- package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
- package/dist/_vendor/ailf-core/ports/logger.js +11 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
- package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/schemas/index.js +16 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
- package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
- package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
- package/dist/_vendor/ailf-core/services/index.js +12 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/scoring.js +222 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
- package/dist/_vendor/ailf-core/types/index.js +21 -0
- package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
- package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
- package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
- package/dist/_vendor/ailf-shared/document-ref.js +1 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -0
- package/dist/_vendor/ailf-shared/index.js +16 -0
- package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
- package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
- package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
- package/dist/_vendor/ailf-shared/score-grades.js +23 -0
- package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
- package/dist/adapters/cache/content-lake-cache.js +59 -0
- package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
- package/dist/adapters/cache/filesystem-cache.js +54 -0
- package/dist/adapters/cache/index.d.ts +2 -0
- package/dist/adapters/cache/index.js +2 -0
- package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
- package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
- package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
- package/dist/adapters/config-sources/file-config-adapter.js +96 -0
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +2 -0
- package/dist/adapters/doc-fetchers/index.d.ts +1 -0
- package/dist/adapters/doc-fetchers/index.js +1 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
- package/dist/adapters/eval-runners/index.d.ts +1 -0
- package/dist/adapters/eval-runners/index.js +1 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
- package/dist/adapters/index.d.ts +12 -0
- package/dist/adapters/index.js +12 -0
- package/dist/adapters/loggers/console-logger.d.ts +22 -0
- package/dist/adapters/loggers/console-logger.js +54 -0
- package/dist/adapters/loggers/index.d.ts +9 -0
- package/dist/adapters/loggers/index.js +9 -0
- package/dist/adapters/loggers/json-logger.d.ts +18 -0
- package/dist/adapters/loggers/json-logger.js +33 -0
- package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
- package/dist/adapters/loggers/quiet-logger.js +30 -0
- package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/composite-task-source.js +59 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
- package/dist/adapters/task-sources/index.d.ts +7 -0
- package/dist/adapters/task-sources/index.js +7 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
- package/dist/adapters/task-sources/repo-schemas.js +234 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
- package/dist/adapters/task-sources/repo-task-source.js +104 -0
- package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
- package/dist/adapters/task-sources/repo-trigger.js +153 -0
- package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
- package/dist/adapters/task-sources/repo-validation.js +164 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
- package/dist/adapters/task-sources/yaml-task-source.js +136 -0
- package/dist/agent-observer/agentic-provider.d.ts +132 -0
- package/dist/agent-observer/agentic-provider.js +983 -0
- package/dist/agent-observer/classifier.d.ts +62 -0
- package/dist/agent-observer/classifier.js +269 -0
- package/dist/agent-observer/index.d.ts +7 -0
- package/dist/agent-observer/index.js +4 -0
- package/dist/agent-observer/pricing.d.ts +35 -0
- package/dist/agent-observer/pricing.js +82 -0
- package/dist/agent-observer/provider.d.ts +77 -0
- package/dist/agent-observer/provider.js +151 -0
- package/dist/agent-observer/proxy.d.ts +91 -0
- package/dist/agent-observer/proxy.js +321 -0
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/agent-observer/types.d.ts +137 -0
- package/dist/agent-observer/types.js +16 -0
- package/dist/assertions/source-isolation.d.ts +72 -0
- package/dist/assertions/source-isolation.js +117 -0
- package/dist/cli.d.ts +24 -0
- package/dist/cli.js +199 -0
- package/dist/commands/agent-report.d.ts +5 -0
- package/dist/commands/agent-report.js +69 -0
- package/dist/commands/baseline.d.ts +9 -0
- package/dist/commands/baseline.js +141 -0
- package/dist/commands/cache.d.ts +13 -0
- package/dist/commands/cache.js +135 -0
- package/dist/commands/calculate-scores.d.ts +8 -0
- package/dist/commands/calculate-scores.js +48 -0
- package/dist/commands/compare.d.ts +8 -0
- package/dist/commands/compare.js +120 -0
- package/dist/commands/completion.d.ts +18 -0
- package/dist/commands/completion.js +260 -0
- package/dist/commands/coverage-audit.d.ts +7 -0
- package/dist/commands/coverage-audit.js +40 -0
- package/dist/commands/discovery-report.d.ts +10 -0
- package/dist/commands/discovery-report.js +44 -0
- package/dist/commands/eval.d.ts +9 -0
- package/dist/commands/eval.js +35 -0
- package/dist/commands/explain-handler.d.ts +34 -0
- package/dist/commands/explain-handler.js +719 -0
- package/dist/commands/fetch-docs.d.ts +8 -0
- package/dist/commands/fetch-docs.js +128 -0
- package/dist/commands/generate-configs.d.ts +8 -0
- package/dist/commands/generate-configs.js +46 -0
- package/dist/commands/grader/index.d.ts +11 -0
- package/dist/commands/grader/index.js +118 -0
- package/dist/commands/init.d.ts +19 -0
- package/dist/commands/init.js +150 -0
- package/dist/commands/interactive.d.ts +12 -0
- package/dist/commands/interactive.js +238 -0
- package/dist/commands/lookup-doc.d.ts +15 -0
- package/dist/commands/lookup-doc.js +84 -0
- package/dist/commands/measure-retrieval.d.ts +5 -0
- package/dist/commands/measure-retrieval.js +65 -0
- package/dist/commands/pipeline-action.d.ts +71 -0
- package/dist/commands/pipeline-action.js +305 -0
- package/dist/commands/pipeline.d.ts +62 -0
- package/dist/commands/pipeline.js +53 -0
- package/dist/commands/pr-comment.d.ts +8 -0
- package/dist/commands/pr-comment.js +47 -0
- package/dist/commands/publish.d.ts +26 -0
- package/dist/commands/publish.js +253 -0
- package/dist/commands/readiness-report.d.ts +10 -0
- package/dist/commands/readiness-report.js +104 -0
- package/dist/commands/shared/options.d.ts +29 -0
- package/dist/commands/shared/options.js +57 -0
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +16 -0
- package/dist/commands/validate-tasks.js +93 -0
- package/dist/commands/validate.d.ts +9 -0
- package/dist/commands/validate.js +73 -0
- package/dist/commands/webhook-server.d.ts +5 -0
- package/dist/commands/webhook-server.js +30 -0
- package/dist/commands/weekly-digest.d.ts +10 -0
- package/dist/commands/weekly-digest.js +104 -0
- package/dist/composition-root.d.ts +26 -0
- package/dist/composition-root.js +107 -0
- package/dist/interpolate.d.ts +26 -0
- package/dist/interpolate.js +70 -0
- package/dist/job-store.d.ts +104 -0
- package/dist/job-store.js +188 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.d.ts +27 -0
- package/dist/orchestration/build-app-context.js +81 -0
- package/dist/orchestration/build-step-sequence.d.ts +15 -0
- package/dist/orchestration/build-step-sequence.js +84 -0
- package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
- package/dist/orchestration/config-to-source-overrides.js +28 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/index.d.ts +11 -0
- package/dist/orchestration/index.js +11 -0
- package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
- package/dist/orchestration/pipeline-orchestrator.js +153 -0
- package/dist/orchestration/step-runner.d.ts +20 -0
- package/dist/orchestration/step-runner.js +88 -0
- package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
- package/dist/orchestration/steps/calculate-scores-step.js +95 -0
- package/dist/orchestration/steps/callback-step.d.ts +24 -0
- package/dist/orchestration/steps/callback-step.js +76 -0
- package/dist/orchestration/steps/compare-step.d.ts +14 -0
- package/dist/orchestration/steps/compare-step.js +92 -0
- package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
- package/dist/orchestration/steps/discovery-report-step.js +55 -0
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
- package/dist/orchestration/steps/fetch-docs-step.js +135 -0
- package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
- package/dist/orchestration/steps/gap-analysis-step.js +136 -0
- package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
- package/dist/orchestration/steps/generate-configs-step.js +85 -0
- package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
- package/dist/orchestration/steps/grader-consistency-step.js +64 -0
- package/dist/orchestration/steps/index.d.ts +19 -0
- package/dist/orchestration/steps/index.js +19 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
- package/dist/orchestration/steps/publish-report-step.js +216 -0
- package/dist/orchestration/steps/readiness-step.d.ts +13 -0
- package/dist/orchestration/steps/readiness-step.js +91 -0
- package/dist/orchestration/steps/report-step.d.ts +12 -0
- package/dist/orchestration/steps/report-step.js +49 -0
- package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
- package/dist/orchestration/steps/run-eval-step.js +195 -0
- package/dist/orchestration/steps/validate-step.d.ts +12 -0
- package/dist/orchestration/steps/validate-step.js +41 -0
- package/dist/pipeline/agent-behavior-report.d.ts +53 -0
- package/dist/pipeline/agent-behavior-report.js +132 -0
- package/dist/pipeline/attribution.d.ts +47 -0
- package/dist/pipeline/attribution.js +226 -0
- package/dist/pipeline/baseline.d.ts +37 -0
- package/dist/pipeline/baseline.js +141 -0
- package/dist/pipeline/cache.d.ts +101 -0
- package/dist/pipeline/cache.js +283 -0
- package/dist/pipeline/calculate-scores.d.ts +102 -0
- package/dist/pipeline/calculate-scores.js +1128 -0
- package/dist/pipeline/callback-delivery.d.ts +50 -0
- package/dist/pipeline/callback-delivery.js +89 -0
- package/dist/pipeline/checks.d.ts +39 -0
- package/dist/pipeline/checks.js +280 -0
- package/dist/pipeline/classify-url.d.ts +61 -0
- package/dist/pipeline/classify-url.js +93 -0
- package/dist/pipeline/compare.d.ts +31 -0
- package/dist/pipeline/compare.js +208 -0
- package/dist/pipeline/coverage-audit.d.ts +39 -0
- package/dist/pipeline/coverage-audit.js +165 -0
- package/dist/pipeline/degradations.d.ts +85 -0
- package/dist/pipeline/degradations.js +242 -0
- package/dist/pipeline/discovery-report.d.ts +55 -0
- package/dist/pipeline/discovery-report.js +178 -0
- package/dist/pipeline/eval-constants.d.ts +68 -0
- package/dist/pipeline/eval-constants.js +111 -0
- package/dist/pipeline/eval-fingerprint.d.ts +66 -0
- package/dist/pipeline/eval-fingerprint.js +175 -0
- package/dist/pipeline/expand-tasks.d.ts +220 -0
- package/dist/pipeline/expand-tasks.js +421 -0
- package/dist/pipeline/failure-modes.d.ts +46 -0
- package/dist/pipeline/failure-modes.js +348 -0
- package/dist/pipeline/fetch-url-content.d.ts +44 -0
- package/dist/pipeline/fetch-url-content.js +93 -0
- package/dist/pipeline/gap-analysis.d.ts +48 -0
- package/dist/pipeline/gap-analysis.js +231 -0
- package/dist/pipeline/generate-configs.d.ts +72 -0
- package/dist/pipeline/generate-configs.js +395 -0
- package/dist/pipeline/grader-api.d.ts +49 -0
- package/dist/pipeline/grader-api.js +200 -0
- package/dist/pipeline/grader-compare-runner.d.ts +44 -0
- package/dist/pipeline/grader-compare-runner.js +301 -0
- package/dist/pipeline/grader-comparison.d.ts +111 -0
- package/dist/pipeline/grader-comparison.js +161 -0
- package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
- package/dist/pipeline/grader-consistency-runner.js +270 -0
- package/dist/pipeline/grader-consistency.d.ts +103 -0
- package/dist/pipeline/grader-consistency.js +146 -0
- package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
- package/dist/pipeline/grader-sensitivity-runner.js +282 -0
- package/dist/pipeline/grader-sensitivity.d.ts +94 -0
- package/dist/pipeline/grader-sensitivity.js +144 -0
- package/dist/pipeline/grader-validate-runner.d.ts +38 -0
- package/dist/pipeline/grader-validate-runner.js +229 -0
- package/dist/pipeline/grader-validation.d.ts +107 -0
- package/dist/pipeline/grader-validation.js +169 -0
- package/dist/pipeline/map-request-to-config.d.ts +19 -0
- package/dist/pipeline/map-request-to-config.js +80 -0
- package/dist/pipeline/measure-retrieval.d.ts +59 -0
- package/dist/pipeline/measure-retrieval.js +111 -0
- package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
- package/dist/pipeline/mirror-repo-tasks.js +350 -0
- package/dist/pipeline/plan-format.d.ts +33 -0
- package/dist/pipeline/plan-format.js +202 -0
- package/dist/pipeline/plan.d.ts +169 -0
- package/dist/pipeline/plan.js +708 -0
- package/dist/pipeline/pr-comment.d.ts +19 -0
- package/dist/pipeline/pr-comment.js +502 -0
- package/dist/pipeline/probe.d.ts +52 -0
- package/dist/pipeline/probe.js +390 -0
- package/dist/pipeline/provenance.d.ts +47 -0
- package/dist/pipeline/provenance.js +146 -0
- package/dist/pipeline/readiness-report.d.ts +87 -0
- package/dist/pipeline/readiness-report.js +205 -0
- package/dist/pipeline/release-classification.d.ts +54 -0
- package/dist/pipeline/release-classification.js +238 -0
- package/dist/pipeline/release-report.d.ts +37 -0
- package/dist/pipeline/release-report.js +222 -0
- package/dist/pipeline/repo-eval-comment.d.ts +37 -0
- package/dist/pipeline/repo-eval-comment.js +165 -0
- package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
- package/dist/pipeline/repo-threshold-evaluator.js +162 -0
- package/dist/pipeline/resolve-mappings.d.ts +35 -0
- package/dist/pipeline/resolve-mappings.js +72 -0
- package/dist/pipeline/retrieval-metrics.d.ts +39 -0
- package/dist/pipeline/retrieval-metrics.js +136 -0
- package/dist/pipeline/reverse-mapping.d.ts +67 -0
- package/dist/pipeline/reverse-mapping.js +88 -0
- package/dist/pipeline/schemas.d.ts +9 -0
- package/dist/pipeline/schemas.js +9 -0
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +88 -0
- package/dist/pipeline/targeted-loo.js +203 -0
- package/dist/pipeline/thresholds.d.ts +27 -0
- package/dist/pipeline/thresholds.js +245 -0
- package/dist/pipeline/types.d.ts +10 -0
- package/dist/pipeline/types.js +10 -0
- package/dist/pipeline/validate.d.ts +67 -0
- package/dist/pipeline/validate.js +406 -0
- package/dist/pipeline/webhook-server.d.ts +37 -0
- package/dist/pipeline/webhook-server.js +133 -0
- package/dist/report-store.d.ts +84 -0
- package/dist/report-store.js +208 -0
- package/dist/sanity/client.d.ts +38 -0
- package/dist/sanity/client.js +86 -0
- package/dist/sanity/portable-text.d.ts +11 -0
- package/dist/sanity/portable-text.js +211 -0
- package/dist/sanity/queries.d.ts +133 -0
- package/dist/sanity/queries.js +300 -0
- package/dist/schedules/digest.d.ts +116 -0
- package/dist/schedules/digest.js +156 -0
- package/dist/schedules/index.d.ts +12 -0
- package/dist/schedules/index.js +10 -0
- package/dist/schedules/loader.d.ts +31 -0
- package/dist/schedules/loader.js +73 -0
- package/dist/schedules/schema.d.ts +9 -0
- package/dist/schedules/schema.js +9 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +21 -0
- package/dist/scripts/validate-task-sources.js +210 -0
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/bigquery/index.d.ts +131 -0
- package/dist/sinks/bigquery/index.js +222 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/index.d.ts +23 -0
- package/dist/sinks/index.js +18 -0
- package/dist/sinks/loader.d.ts +18 -0
- package/dist/sinks/loader.js +82 -0
- package/dist/sinks/retry.d.ts +24 -0
- package/dist/sinks/retry.js +52 -0
- package/dist/sinks/schema.d.ts +9 -0
- package/dist/sinks/schema.js +9 -0
- package/dist/sinks/slack/format.d.ts +65 -0
- package/dist/sinks/slack/format.js +327 -0
- package/dist/sinks/slack/index.d.ts +27 -0
- package/dist/sinks/slack/index.js +78 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +59 -0
- package/dist/sinks/types.js +44 -0
- package/dist/sinks/webhook/index.d.ts +19 -0
- package/dist/sinks/webhook/index.js +50 -0
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/sources.d.ts +104 -0
- package/dist/sources.js +292 -0
- package/dist/webhook/budget.d.ts +42 -0
- package/dist/webhook/budget.js +60 -0
- package/dist/webhook/debounce.d.ts +67 -0
- package/dist/webhook/debounce.js +76 -0
- package/dist/webhook/dispatch.d.ts +45 -0
- package/dist/webhook/dispatch.js +84 -0
- package/dist/webhook/eval-request-handler.d.ts +87 -0
- package/dist/webhook/eval-request-handler.js +181 -0
- package/dist/webhook/handler.d.ts +88 -0
- package/dist/webhook/handler.js +203 -0
- package/dist/webhook/index.d.ts +17 -0
- package/dist/webhook/index.js +12 -0
- package/dist/webhook/types.d.ts +109 -0
- package/dist/webhook/types.js +10 -0
- package/package.json +72 -0
- package/tasks/.expanded.agentic.yaml +51 -0
- package/tasks/.expanded.yaml +66 -0
- package/tasks/frameworks.yaml +98 -0
- package/tasks/functions.yaml +51 -0
- package/tasks/groq.yaml +216 -0
- package/tasks/nextjs-live.yaml +62 -0
- package/tasks/studio-setup.yaml +111 -0
- package/tasks/visual-editing.yaml +120 -0
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/degradations.ts
|
|
3
|
+
*
|
|
4
|
+
* Programmatic code degradation strategies for sensitivity testing.
|
|
5
|
+
*
|
|
6
|
+
* Each strategy takes a "good" reference solution (string) and returns a
|
|
7
|
+
* "bad" version that should score lower on a specific dimension:
|
|
8
|
+
*
|
|
9
|
+
* - Task Completion: remove key functional sections
|
|
10
|
+
* - Code Correctness: introduce anti-patterns and deprecated APIs
|
|
11
|
+
* - Doc Coverage: strip documentation references, add hallucinated details
|
|
12
|
+
*
|
|
13
|
+
* These are deterministic, pure functions — no randomness, no side effects.
|
|
14
|
+
*
|
|
15
|
+
* @see docs/exec-plans/completed/grader-reliability.md — Phase 4
|
|
16
|
+
*/
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
// Task Completion degradations
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
/**
 * Task Completion degradation: keep only the top half of the source.
 *
 * The input is split on "\n" and the first `floor(lineCount / 2)` lines are
 * retained. A blank line and a placeholder comment are appended in place of
 * the discarded remainder, so the result reads like an unfinished answer.
 */
export const removeBottomHalf = {
  /**
   * @param {string} source - Reference solution text.
   * @returns {string} The leading half of `source` followed by a blank line
   *   and the "remaining implementation not provided" marker.
   */
  apply(source) {
    const allLines = source.split("\n");
    const cutoff = Math.floor(allLines.length / 2);
    const truncated = [
      ...allLines.slice(0, cutoff),
      "",
      "// ... (remaining implementation not provided)",
    ];
    return truncated.join("\n");
  },
  description: "Remove bottom half of code — missing key functionality",
  targetDimension: "taskCompletion",
};
|
|
36
|
+
/**
 * Remove all export statements, including multi-line exported declarations.
 * Missing exports = incomplete API surface → lower Task Completion.
 *
 * A statement is treated as an export when its first line (after trimming)
 * starts with "export " or matches `export {`. If that line leaves a bracket
 * ((), [], {}) or a template literal open, the following lines are dropped as
 * well until everything opened by the statement is closed again. Dropping only
 * the first line would leave the body and closing delimiters of
 * `export function …`, `export { … }` or an exported template literal behind
 * as orphaned, unparseable fragments.
 *
 * The scanner is deliberately lightweight, not a parser: it skips quoted
 * strings (per line), template literals and `//` comments, but does not
 * understand block comments, regex literals or `${}` nesting inside templates.
 */
export const removeExports = {
  /**
   * @param {string} source - Reference solution text.
   * @returns {string} `source` with every export statement removed.
   */
  apply(source) {
    const isExportStart = (trimmed) =>
      trimmed.startsWith("export ") || /^export\s*\{/.test(trimmed);

    // Scanner state for the export statement currently being dropped.
    let depth = 0;
    let inTemplate = false;

    // Update depth / inTemplate with the delimiters found on one line.
    const scan = (line) => {
      let quote = null;
      for (let i = 0; i < line.length; i += 1) {
        const ch = line[i];
        if (ch === "\\") {
          // Skip the escaped character so \" or \` cannot end a literal.
          i += 1;
          continue;
        }
        if (quote !== null) {
          if (ch === quote) {
            quote = null;
          }
          continue;
        }
        if (inTemplate) {
          if (ch === "`") {
            inTemplate = false;
          }
          continue;
        }
        if (ch === "/" && line[i + 1] === "/") {
          // Rest of the line is a comment.
          break;
        }
        if (ch === '"' || ch === "'") {
          quote = ch;
        } else if (ch === "`") {
          inTemplate = true;
        } else if (ch === "(" || ch === "[" || ch === "{") {
          depth += 1;
        } else if (ch === ")" || ch === "]" || ch === "}") {
          depth -= 1;
        }
      }
    };

    const kept = [];
    let skipping = false;
    for (const line of source.split("\n")) {
      if (!skipping) {
        if (!isExportStart(line.trim())) {
          kept.push(line);
          continue;
        }
        // A new export statement begins: start from a clean scanner state.
        depth = 0;
        inTemplate = false;
      }
      scan(line);
      // Keep dropping lines while the export statement is still open.
      skipping = depth > 0 || inTemplate;
    }
    return kept.join("\n");
  },
  description: "Remove export statements — incomplete public API",
  targetDimension: "taskCompletion",
};
|
|
59
|
+
/**
 * Task Completion degradation: turn implemented blocks into empty skeletons.
 *
 * Every `{` that is followed by a line break is matched, lazily, up to the
 * first later line that begins (after optional whitespace) with `}`. When the
 * text in between has at least three non-blank lines it is replaced by a TODO
 * comment and a `throw`; shorter bodies are left untouched so that small
 * object literals survive.
 *
 * Because the match is lazy, a body containing nested blocks ends at the first
 * closing-brace line it meets rather than at the block's own closing brace.
 */
export const stubFunctions = {
  /**
   * @param {string} source - Reference solution text.
   * @returns {string} `source` with qualifying braced bodies replaced by stubs.
   */
  apply(source) {
    const bracedBlock = /(\{)\s*\n([\s\S]*?)(\n\s*\})/g;
    const toStub = (whole, opening, inner, closing) => {
      const meaningfulLines = inner
        .split("\n")
        .filter((line) => line.trim().length > 0);
      // Fewer than three non-blank lines: leave the block exactly as written.
      return meaningfulLines.length >= 3
        ? `${opening}\n // TODO: implement\n throw new Error("Not implemented")\n${closing}`
        : whole;
    };
    return source.replace(bracedBlock, toStub);
  },
  description: "Replace function bodies with TODO stubs — no implementation",
  targetDimension: "taskCompletion",
};
|
|
79
|
+
// ---------------------------------------------------------------------------
|
|
80
|
+
// Code Correctness degradations
|
|
81
|
+
// ---------------------------------------------------------------------------
|
|
82
|
+
/**
 * Replace modern API calls with deprecated/incorrect patterns.
 * Targets Sanity-specific patterns that the grader should catch.
 *
 * Rewrites are applied in order:
 *  1. `import { createClient }` from "@sanity/client" / "next-sanity" becomes
 *     a default `sanityClient` import.
 *  2. An import from "sanity" that mentions `defineType` is replaced by a
 *     comment, and `defineType({` / `defineField({` become plain `({`.
 *  3. Any `apiVersion: "<value>"` is pinned to "2021-03-25".
 *  4. `useCdn: true` becomes `useCdn: false`.
 *
 * The annotations added in steps 3 and 4 are block comments. These rewrites
 * happen in the middle of an object literal, where a `//` comment would
 * swallow the trailing `,` or `})` on the same line and make the result
 * unparseable instead of merely outdated.
 */
export const introduceDeprecatedAPIs = {
  /**
   * @param {string} source - Reference solution text.
   * @returns {string} `source` with the deprecated patterns substituted.
   */
  apply(source) {
    // [pattern, replacement] pairs, applied sequentially.
    const rewrites = [
      // Replace createClient with deprecated default import
      [
        /import\s*\{\s*createClient\s*\}\s*from\s*["']@sanity\/client["']/g,
        'import sanityClient from "@sanity/client" // deprecated default import',
      ],
      [
        /import\s*\{\s*createClient\s*\}\s*from\s*["']next-sanity["']/g,
        'import sanityClient from "next-sanity" // deprecated default import',
      ],
      // Replace defineType/defineField with plain objects
      [
        /import\s*\{[^}]*defineType[^}]*\}\s*from\s*["']sanity["']/g,
        "// Using plain objects instead of defineType/defineField",
      ],
      [/defineType\(\{/g, "({"],
      [/defineField\(\{/g, "({"],
      // Replace apiVersion with very old version (block comment keeps the
      // rest of the line, e.g. the trailing comma, as code)
      [
        /apiVersion:\s*["'][^"']+["']/g,
        'apiVersion: "2021-03-25" /* outdated API version */',
      ],
      // Replace useCdn: true with useCdn: false (incorrect for read-only)
      [/useCdn:\s*true/g, "useCdn: false /* unnecessary */"],
    ];
    return rewrites.reduce(
      (text, [pattern, replacement]) => text.replace(pattern, replacement),
      source,
    );
  },
  description: "Replace modern APIs with deprecated patterns",
  targetDimension: "codeCorrectness",
};
|
|
105
|
+
/**
 * Degradation strategy: corrupt GROQ queries with syntax that looks
 * plausible but is not valid GROQ.
 */
export const introduceGroqErrors = {
    /**
     * @param source - Source text containing GROQ queries.
     * @returns The text with each substitution below applied globally.
     */
    apply(source) {
        const corruptions = [
            // "->" dereference becomes a plain "."
            { find: /->/g, swap: "." },
            // "[0...10]" slice becomes "[0:10]"
            { find: /\[(\d+)\.\.\.(\d+)\]/g, swap: "[$1:$2]" },
            // references( becomes the non-existent refs(
            { find: /references\(/g, swap: "refs(" },
            // "| order(" pipe becomes a ".sort(" call
            { find: /\|\s*order\(/g, swap: ".sort(" },
        ];
        let degraded = source;
        for (const { find, swap } of corruptions) {
            degraded = degraded.replace(find, swap);
        }
        return degraded;
    },
    description: "Introduce GROQ syntax errors — plausible but broken queries",
    targetDimension: "codeCorrectness",
};
|
|
125
|
+
/**
 * Degradation strategy: strip TypeScript typing and fall back to `any`.
 * The result is meant to still work while being an anti-pattern.
 */
export const removeTypes = {
    /**
     * @param source - TypeScript source text.
     * @returns The text after the four regex passes below, applied in order.
     */
    apply(source) {
        // 1. Drop whole interface/type declarations (up to a "}" at line start).
        const withoutDeclarations = source.replace(
            /^(?:export\s+)?(?:interface|type)\s+\w+[\s\S]*?^\}/gm,
            "",
        );
        // 2. Capitalised type annotations (optionally "[]" / a union tail) -> any.
        const withAnyAnnotations = withoutDeclarations.replace(
            /:\s*[A-Z]\w+(?:\[\])?(?:\s*\|[^,)]+)?/g,
            ": any",
        );
        // 3. Promise<Whatever> -> Promise<any>.
        const withAnyPromises = withAnyAnnotations.replace(/Promise<[^>]+>/g, "Promise<any>");
        // 4. Remove remaining capitalised generic arguments such as <Foo> or <Foo[]>.
        return withAnyPromises.replace(/<[A-Z]\w+(?:\[\])?>/g, "");
    },
    description: "Strip TypeScript types and use 'any' — works but anti-pattern",
    targetDimension: "codeCorrectness",
};
|
|
145
|
+
// ---------------------------------------------------------------------------
|
|
146
|
+
// Doc Coverage degradations
|
|
147
|
+
// ---------------------------------------------------------------------------
|
|
148
|
+
/**
 * Degradation strategy: remove comments and documentation so the response
 * shows no evidence of documentation usage.
 */
export const stripComments = {
    /**
     * @param source - Source text to strip.
     * @returns The text without comment-only lines, trailing `//` comments or
     *   block comments, and with runs of blank lines collapsed.
     */
    apply(source) {
        // A line is dropped entirely when, after trimming, it starts with "//"
        // or looks like part of a JSDoc block ("/**", "*", "*/").
        const isCommentLine = (line) => {
            const trimmed = line.trim();
            return (
                trimmed.startsWith("//") ||
                trimmed.startsWith("*") ||
                trimmed.startsWith("/**") ||
                trimmed.startsWith("*/")
            );
        };
        return (
            source
                .split("\n")
                .filter((line) => !isCommentLine(line))
                .join("\n")
                // Remove trailing inline comments. The class excludes "\n" so the
                // match stays on one line; a negated class without it also matches
                // line breaks and would delete the following lines of code up to
                // the next quote character. Comments containing a quote are left
                // alone so that "//" inside string literals (URLs) survives.
                .replace(/\s*\/\/[^"'\n]*$/gm, "")
                // Remove remaining /* … */ comments, including multi-line ones.
                .replace(/\/\*[\s\S]*?\*\//g, "")
                // Collapse three or more consecutive line breaks into two.
                .replace(/\n{3,}/g, "\n\n")
        );
    },
    description: "Remove all comments — no evidence of documentation usage",
    targetDimension: "docCoverage",
};
|
|
179
|
+
/**
 * Degradation strategy: prepend confident-looking calls to APIs and packages
 * that do not exist, followed by the untouched original source.
 */
export const addHallucinations = {
    /**
     * @param source - Source text to degrade.
     * @returns The fabricated snippet, a line break, then `source`.
     */
    apply(source) {
        const fabricatedSnippet = [
            "\n// Enable real-time sync (recommended for production)",
            'const sync = client.enableRealTimeSync({ mode: "aggressive" })',
            "",
            "// Configure auto-indexing for faster queries",
            'client.configureIndex({ fields: ["title", "slug"], type: "fulltext" })',
            "",
            "// Use built-in caching middleware",
            'import { withSanityCache } from "@sanity/cache-middleware"',
            "const cachedClient = withSanityCache(client, { ttl: 3600 })",
            "",
        ].join("\n");
        return `${fabricatedSnippet}\n${source}`;
    },
    description: "Add hallucinated APIs — confident but wrong information",
    targetDimension: "docCoverage",
};
|
|
202
|
+
/**
 * Degradation strategy: point `from "…"` specifiers at plausible-looking but
 * wrong package names, as if the author had guessed at the API surface.
 */
export const wrongImports = {
    /**
     * @param source - Source text whose import specifiers should be swapped.
     * @returns The text with each known specifier replaced by its wrong twin.
     */
    apply(source) {
        // Pattern for the real specifier -> text that replaces the whole `from "…"`.
        const swaps = [
            [/from\s*["']@sanity\/client["']/g, 'from "sanity-client"'],
            [/from\s*["']next-sanity["']/g, 'from "@next/sanity"'],
            [/from\s*["']sanity["']/g, 'from "@sanity/core"'],
            [/from\s*["']sanity\/presentation["']/g, 'from "@sanity/presentation-tool"'],
            [/from\s*["']sanity\/structure["']/g, 'from "@sanity/desk-tool"'],
            [/from\s*["']sanity\/functions["']/g, 'from "@sanity/serverless"'],
        ];
        let rewritten = source;
        for (const [realSpecifier, wrongSpecifier] of swaps) {
            rewritten = rewritten.replace(realSpecifier, wrongSpecifier);
        }
        return rewritten;
    },
    description: "Replace imports with wrong packages — guessing at API surface",
    targetDimension: "docCoverage",
};
|
|
221
|
+
// ---------------------------------------------------------------------------
|
|
222
|
+
// Strategy registry
|
|
223
|
+
// ---------------------------------------------------------------------------
|
|
224
|
+
/**
 * Every degradation strategy in one flat list, ordered by the dimension each
 * one targets (three per dimension).
 */
export const DEGRADATION_STRATEGIES = [
    // taskCompletion
    [removeBottomHalf, removeExports, stubFunctions],
    // codeCorrectness
    [introduceDeprecatedAPIs, introduceGroqErrors, removeTypes],
    // docCoverage
    [stripComments, addHallucinations, wrongImports],
].flat();
|
|
239
|
+
/**
 * Select the strategies whose `targetDimension` equals the given dimension.
 *
 * @param dimension - Dimension name to match (strict equality).
 * @returns A new array with the matching entries of DEGRADATION_STRATEGIES,
 *   in registry order; empty when nothing matches.
 */
export function getStrategiesForDimension(dimension) {
    const matching = [];
    for (const strategy of DEGRADATION_STRATEGIES) {
        if (strategy.targetDimension === dimension) {
            matching.push(strategy);
        }
    }
    return matching;
}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/discovery-report.ts
|
|
3
|
+
*
|
|
4
|
+
* Generates an agent discoverability report from agentic mode retrieval
|
|
5
|
+
* metrics. Reads score-summary.json (which contains `retrievalMetrics`
|
|
6
|
+
* from agentic evaluation) and produces a markdown report showing:
|
|
7
|
+
*
|
|
8
|
+
* - Retrieval summary (recall, precision, F1)
|
|
9
|
+
* - Per-area retrieval breakdown
|
|
10
|
+
* - Invisible documents (never retrieved by any task)
|
|
11
|
+
* - Recommendations for improving discoverability
|
|
12
|
+
*
|
|
13
|
+
* All functions accept rootDir as a parameter — no module-level constants.
|
|
14
|
+
* No process.argv parsing. No env var fallbacks.
|
|
15
|
+
*
|
|
16
|
+
* Phase 5c of the Scenario Matrix implementation (Scenarios 4.1 and 4.2).
|
|
17
|
+
*
|
|
18
|
+
* @see docs/design-docs/retrieval-metrics.md
|
|
19
|
+
*/
|
|
20
|
+
import type { AreaRetrievalMetrics, RetrievalMetrics, ScoreSummary } from "./types.js";
|
|
21
|
+
/** Structured result of a discovery-report run, ready to be formatted. */
export interface DiscoveryReport {
    /** Per-area retrieval metrics that remain after the area filter is applied */
    areas: AreaRetrievalMetrics[];
    /** Base URL taken from the score summary's source config, when present */
    baseUrl: string | undefined;
    /** Documents that no task ever retrieved, with the tasks that expected them */
    invisibleDocs: InvisibleDoc[];
    /** Aggregate retrieval metrics for the included areas */
    overall: RetrievalMetrics["overall"];
    /** Human-readable, actionable suggestions */
    recommendations: string[];
    /** ISO timestamp of the evaluation the report is based on */
    timestamp: string;
    /** Number of canonical docs across the included areas */
    totalCanonicalDocs: number;
    /** Number of canonical docs that were successfully retrieved */
    totalHits: number;
}
|
|
39
|
+
/** A canonical document that was never retrieved by any task. */
export interface InvisibleDoc {
    /** Tasks that list this document in their canonical_docs */
    referencedBy: string[];
    /** Slug identifying the document */
    slug: string;
}
|
|
45
|
+
/**
 * Render a discovery report as markdown text.
 *
 * @param report - The structured report to render
 * @returns The markdown document as a single string
 */
export declare function formatDiscoveryMarkdown(
    report: DiscoveryReport,
): string;
|
|
49
|
+
/**
 * Build a structured discovery report out of a score summary.
 *
 * @param summary - Parsed score-summary.json
 * @param areaFilter - Area names to include; omitted or empty means all areas
 * @returns The assembled report
 */
export declare function generateDiscoveryReport(
    summary: ScoreSummary,
    areaFilter?: string[],
): DiscoveryReport;
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// Core logic (exported for testing)
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
/**
 * Render a discovery report as markdown.
 *
 * The output always contains the header and the retrieval summary table; the
 * per-area table, the invisible-documents list and the recommendations list
 * are each emitted only when they have at least one entry.
 *
 * @param report - Report object as produced by generateDiscoveryReport.
 * @returns Markdown text with lines joined by "\n".
 */
export function formatDiscoveryMarkdown(report) {
    const md = ["## 🔍 Agent Discoverability Report", ""];
    if (report.baseUrl) {
        md.push(`**Base URL:** ${report.baseUrl}`);
    }
    const { areas, invisibleDocs, overall, recommendations } = report;
    // Summary table
    md.push(
        "**Mode:** Agentic",
        "",
        "### Retrieval Summary",
        "",
        "| Metric | Value |",
        "|---|---|",
        `| Recall (canonical docs found) | ${pct(overall.avgRecall)} (${report.totalHits}/${report.totalCanonicalDocs}) |`,
        `| Precision (relevant docs fetched) | ${pct(overall.avgPrecision)} |`,
        `| F1 Score | ${overall.avgF1.toFixed(2)} |`,
        `| Invisible docs | ${invisibleDocs.length} |`,
        "",
    );
    // One table row per area, in sortedAreas() order
    if (areas.length > 0) {
        const areaRows = sortedAreas(areas).map(
            (entry) => `| ${entry.area} | ${pct(entry.avgRecall)} | ${pct(entry.avgPrecision)} | ${entry.avgF1.toFixed(2)} | ${entry.taskCount} |`,
        );
        md.push(
            "### Per-Area Breakdown",
            "",
            "| Area | Recall | Precision | F1 | Tasks |",
            "|---|---|---|---|---|",
            ...areaRows,
            "",
        );
    }
    // Bullet per invisible document with the tasks that reference it
    if (invisibleDocs.length > 0) {
        md.push("### Invisible Documents (never retrieved by any task)", "");
        for (const { referencedBy, slug } of invisibleDocs) {
            md.push(`- \`${slug}\` — referenced by ${referencedBy.join(", ")}`);
        }
        md.push("");
    }
    // Numbered recommendations, starting at 1
    if (recommendations.length > 0) {
        md.push("### Recommendations", "");
        for (const [index, recommendation] of recommendations.entries()) {
            md.push(`${index + 1}. ${recommendation}`);
        }
        md.push("");
    }
    return md.join("\n");
}
|
|
59
|
+
/**
 * Assemble a structured discovery report from a score summary.
 *
 * @param summary - Parsed score-summary.json; must carry `retrievalMetrics`.
 * @param areaFilter - Optional area names to include (all areas when omitted
 *   or empty).
 * @returns The report: filtered areas, overall metrics, invisible docs,
 *   recommendations, and canonical-doc / hit totals.
 * @throws {Error} When `retrievalMetrics` is missing, or when a non-empty
 *   filter matches none of the available areas.
 */
export function generateDiscoveryReport(summary, areaFilter) {
    const { retrievalMetrics } = summary;
    if (!retrievalMetrics) {
        throw new Error("score-summary.json does not contain retrievalMetrics. Run an agentic evaluation first: pnpm pipeline -- --mode agentic");
    }
    const filterRequested = areaFilter != null && areaFilter.length > 0;
    const selectedAreas = filterRequested
        ? retrievalMetrics.areas.filter(({ area }) => areaFilter.includes(area))
        : retrievalMetrics.areas;
    if (filterRequested && selectedAreas.length === 0) {
        const requested = areaFilter.join(", ");
        const available = retrievalMetrics.areas.map(({ area }) => area).join(", ");
        throw new Error(`No retrieval data found for area(s): ${requested}. Available areas: ${available}`);
    }
    // The stored overall figures are reused only when no area was dropped;
    // otherwise they are recomputed from the selected areas.
    const nothingDropped = selectedAreas.length === retrievalMetrics.areas.length;
    const overall = nothingDropped ? retrievalMetrics.overall : computeOverall(selectedAreas);
    const invisibleDocs = buildInvisibleDocs(selectedAreas);
    // Totals count distinct slugs across every task of the selected areas.
    const tasks = selectedAreas.flatMap((entry) => entry.tasks);
    const totalCanonicalDocs = new Set(tasks.flatMap((task) => task.expected)).size;
    const totalHits = new Set(tasks.flatMap((task) => task.hits)).size;
    const recommendations = generateRecommendations(invisibleDocs, selectedAreas, overall);
    return {
        areas: selectedAreas,
        baseUrl: summary.source?.baseUrl,
        invisibleDocs,
        overall,
        recommendations,
        timestamp: summary.timestamp,
        totalCanonicalDocs,
        totalHits,
    };
}
|
|
104
|
+
// ---------------------------------------------------------------------------
|
|
105
|
+
// Helpers (alphabetical for perfectionist/sort-modules)
|
|
106
|
+
// ---------------------------------------------------------------------------
|
|
107
|
+
/**
 * Find documents that were expected but never retrieved by any task.
 *
 * A slug counts as invisible when it appears in some task's `missed` list and
 * in no task's `retrieved` list across all given areas.
 *
 * @param areas - Area metrics; each area has `tasks`, and each task has
 *   `taskId`, `missed` and `retrieved`.
 * @returns One `{ referencedBy, slug }` entry per invisible slug, where
 *   `referencedBy` is the sorted list of task ids that missed it. Entries are
 *   ordered by descending number of referencing tasks.
 */
function buildInvisibleDocs(areas) {
    // Collect everything retrieved anywhere once, so each missed slug needs a
    // single Set lookup instead of a rescan of every task's retrieved list.
    const retrievedAnywhere = new Set();
    for (const area of areas) {
        for (const task of area.tasks) {
            for (const slug of task.retrieved) {
                retrievedAnywhere.add(slug);
            }
        }
    }
    // slug -> ids of the tasks that missed it
    const slugToTasks = new Map();
    for (const area of areas) {
        for (const task of area.tasks) {
            for (const slug of task.missed) {
                if (retrievedAnywhere.has(slug)) {
                    continue;
                }
                let taskIds = slugToTasks.get(slug);
                if (taskIds === undefined) {
                    taskIds = new Set();
                    slugToTasks.set(slug, taskIds);
                }
                taskIds.add(task.taskId);
            }
        }
    }
    return [...slugToTasks.entries()]
        .map(([slug, taskIds]) => ({
            referencedBy: [...taskIds].sort(),
            slug,
        }))
        .sort((a, b) => b.referencedBy.length - a.referencedBy.length);
}
|
|
131
|
+
/**
 * Combine per-area averages into overall averages, weighting every area by
 * its `taskCount`.
 *
 * @param areas - Area metrics with `avgRecall`, `avgPrecision`, `avgF1` and
 *   `taskCount`.
 * @returns `{ avgF1, avgPrecision, avgRecall }`; all zeros when there are no
 *   areas or the task counts sum to zero.
 */
function computeOverall(areas) {
    const zeroed = () => ({ avgF1: 0, avgPrecision: 0, avgRecall: 0 });
    if (areas.length === 0) {
        return zeroed();
    }
    let totalTasks = 0;
    let recallSum = 0;
    let precisionSum = 0;
    let f1Sum = 0;
    for (const area of areas) {
        totalTasks += area.taskCount;
        recallSum += area.avgRecall * area.taskCount;
        precisionSum += area.avgPrecision * area.taskCount;
        f1Sum += area.avgF1 * area.taskCount;
    }
    // Guard against dividing by zero when no area contributed any task.
    if (totalTasks === 0) {
        return zeroed();
    }
    return {
        avgF1: f1Sum / totalTasks,
        avgPrecision: precisionSum / totalTasks,
        avgRecall: recallSum / totalTasks,
    };
}
|
|
145
|
+
/**
 * Derive the list of recommendation strings for a report.
 *
 * Order of the output: up to five "add to llms.txt" items, one cross-linking
 * item when anything is invisible, one item per area with recall below 0.5,
 * one item per area with precision below 0.5, and finally a restructure hint
 * when the overall F1 is below 0.6.
 *
 * @param invisibleDocs - Entries with `slug` and `referencedBy`.
 * @param areas - Area metrics with `area`, `avgRecall` and `avgPrecision`.
 * @param overall - Overall metrics; only `avgF1` is read.
 * @returns The recommendation strings in the order described above.
 */
function generateRecommendations(invisibleDocs, areas, overall) {
    const llmsTxtItems = invisibleDocs
        .filter((doc) => doc.referencedBy.length > 0)
        .slice(0, 5)
        .map((doc) => {
            const count = doc.referencedBy.length;
            return `Add \`${doc.slug}\` to llms.txt (referenced by ${count} ${count === 1 ? "task" : "tasks"})`;
        });
    const crossLinkItems = [];
    if (invisibleDocs.length > 0) {
        const plural = invisibleDocs.length === 1 ? "" : "s";
        crossLinkItems.push(`Improve cross-linking to ${invisibleDocs.length} invisible document${plural}`);
    }
    const recallItems = areas
        .filter((entry) => entry.avgRecall < 0.5)
        .map((entry) => `Investigate low recall in \`${entry.area}\` (${pct(entry.avgRecall)}) — agents miss most canonical docs`);
    const precisionItems = areas
        .filter((entry) => entry.avgPrecision < 0.5)
        .map((entry) => `Review search relevance for \`${entry.area}\` (precision ${pct(entry.avgPrecision)}) — agents fetch many irrelevant docs`);
    const overallItems = overall.avgF1 < 0.6
        ? ["Overall F1 is below 0.60 — consider a documentation restructure for agent accessibility"]
        : [];
    return [
        ...llmsTxtItems,
        ...crossLinkItems,
        ...recallItems,
        ...precisionItems,
        ...overallItems,
    ];
}
|
|
173
|
+
/**
 * Format a fraction as a whole-number percentage string.
 *
 * @param value - Fraction to format (for example 0.5).
 * @returns `value * 100` rounded with Math.round, followed by "%".
 */
function pct(value) {
    const wholePercent = Math.round(value * 100);
    return `${wholePercent}%`;
}
|
|
176
|
+
/**
 * Return the areas ordered by their `area` name.
 *
 * @param areas - Area entries, each with a string `area` property.
 * @returns A new array sorted with `localeCompare`; the input is not mutated.
 */
function sortedAreas(areas) {
    const byAreaName = (left, right) => left.area.localeCompare(right.area);
    const copy = Array.from(areas);
    copy.sort(byAreaName);
    return copy;
}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Constants and types shared across evaluation steps.
|
|
3
|
+
*
|
|
4
|
+
* Extracted from pipeline/steps/eval-step.ts so that the legacy step
|
|
5
|
+
* files can be deleted while tests and other modules retain access
|
|
6
|
+
* to these shared definitions.
|
|
7
|
+
*/
|
|
8
|
+
import type { ConcreteEvalMode, DebugOptions, FilterOptions, StepResult } from "../_vendor/ailf-core/index.d.ts";
|
|
9
|
+
/** Maps each concrete eval mode to the Promptfoo config file it uses */
export declare const CONFIG_FILES: Record<
    ConcreteEvalMode,
    string
>;
|
|
11
|
+
/**
 * Maps each concrete eval mode to the file its eval results are written to
 * (the path is set through the config's outputPath)
 */
export declare const RESULTS_FILES: Record<
    ConcreteEvalMode,
    string
>;
|
|
13
|
+
/** A step result bundled with cache metadata that downstream steps can use */
export interface EvalStepResult {
    /** Computed eval fingerprint, published as part of provenance */
    evalFingerprint?: string;
    /** True when the result was served from the remote cache */
    remoteCacheHit?: boolean;
    /** The underlying step result */
    stepResult: StepResult;
}
|
|
22
|
+
/** Settings for the remote cache (fingerprint lookup in the Content Lake) */
export interface RemoteCacheOptions {
    /** Marks a debug run; debug runs do not use the remote cache */
    debug?: boolean;
    /** Filter options that feed into the fingerprint computation */
    filter?: FilterOptions;
    /** Identifier of the grader model, as listed in models.yaml */
    graderModel: string;
    /** Turns the remote cache lookup off (--no-remote-cache) */
    noRemoteCache?: boolean;
    /** Sanity API token used to read cached reports */
    sanityToken?: string;
}
|
|
35
|
+
/**
 * The subset of a raw Promptfoo result entry that error scanning reads.
 * Every field is optional.
 */
export interface RawResult {
    /** Description stored directly on the entry */
    description?: string;
    /** Error text attached to the entry, if any */
    error?: string;
    /** Grading outcome; may be null */
    gradingResult?: null | {
        pass: boolean;
    };
    /** Provider that produced the entry */
    provider?: {
        id?: string;
        label?: string;
    };
    success?: boolean;
    /** Test case the entry belongs to */
    testCase?: {
        description?: string;
    };
}
|
|
51
|
+
/**
 * Turn debug options into promptfoo filter flags.
 *
 * @param debug - Debug options, if any
 * @returns The flags as a single string
 */
export declare function buildFilterFlags(
    debug?: DebugOptions,
): string;
|
|
55
|
+
/**
 * Read the Promptfoo share URL out of the eval results JSON.
 *
 * Promptfoo adds a `shareableUrl` field to the results file when
 * `PROMPTFOO_API_KEY` is set.
 *
 * @param resultsPath - Path of the eval results JSON file
 * @returns The share URL, or undefined when none is available
 */
export declare function extractShareUrl(
    resultsPath: string,
): string | undefined;
|
|
62
|
+
/**
 * Summarise errored or failed tests found in the eval results JSON in a
 * human-readable form.
 *
 * @param resultsPath - Path of the eval results JSON file
 * @returns The summary text, or null when nothing is worth reporting
 */
export declare function scanResultsForErrors(
    resultsPath: string,
): null | string;
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Constants and types shared across evaluation steps.
|
|
3
|
+
*
|
|
4
|
+
* Extracted from pipeline/steps/eval-step.ts so that the legacy step
|
|
5
|
+
* files can be deleted while tests and other modules retain access
|
|
6
|
+
* to these shared definitions.
|
|
7
|
+
*/
|
|
8
|
+
import { existsSync, readFileSync } from "fs";
|
|
9
|
+
/**
 * Promptfoo config file name for each concrete eval mode
 * (agentic, baseline, observed).
 */
export const CONFIG_FILES = {
    "agentic": "promptfooconfig.agentic.yaml",
    "baseline": "promptfooconfig.yaml",
    "observed": "promptfooconfig.observed.yaml",
};
|
|
15
|
+
/**
 * Eval results file for each concrete eval mode. Every mode writes to its own
 * file (the path is set in the corresponding config's outputPath).
 */
export const RESULTS_FILES = {
    "agentic": "results/latest/eval-results-agentic.json",
    "baseline": "results/latest/eval-results.json",
    "observed": "results/latest/eval-results-observed.json",
};
|
|
21
|
+
/**
 * Build promptfoo filter flags from debug options.
 *
 * @param debug - Optional debug options. Reads `enabled`, `pattern`,
 *   `sample` and `firstN`.
 * @returns An empty string when debugging is not enabled; otherwise the
 *   flags joined by spaces and prefixed with a single space. When enabled
 *   without any filter, `--filter-first-n 2` is used as the default.
 */
export function buildFilterFlags(debug) {
    if (!debug?.enabled)
        return "";
    const flags = [];
    if (debug.pattern) {
        // The pattern is wrapped in single quotes, so an embedded single quote
        // would end the quoting early. Escape it as '\'' (close, literal quote,
        // reopen). NOTE(review): assumes the result is spliced into a POSIX
        // shell command line — confirm against the caller.
        const quotedPattern = String(debug.pattern).replace(/'/g, `'\\''`);
        flags.push(`--filter-pattern '${quotedPattern}'`);
    }
    if (debug.sample) {
        flags.push(`--filter-sample ${debug.sample}`);
    }
    if (debug.firstN) {
        flags.push(`--filter-first-n ${debug.firstN}`);
    }
    // Default: first 2 tests when no other filters specified
    if (flags.length === 0) {
        flags.push("--filter-first-n 2");
    }
    return " " + flags.join(" ");
}
|
|
43
|
+
/**
 * Read the Promptfoo share URL from the eval results JSON.
 *
 * Promptfoo writes a `shareableUrl` field into the results file when
 * `PROMPTFOO_API_KEY` is set.
 *
 * @param resultsPath - Path of the eval results JSON file.
 * @returns The `shareableUrl` value, or undefined when the file is missing,
 *   cannot be read or parsed, or has no such field.
 */
export function extractShareUrl(resultsPath) {
    if (!existsSync(resultsPath)) {
        return undefined;
    }
    try {
        const { shareableUrl } = JSON.parse(readFileSync(resultsPath, "utf-8"));
        // A null field is reported as undefined, same as a missing one.
        return shareableUrl ?? undefined;
    }
    catch {
        // Best effort: unreadable or malformed results simply yield no URL.
        return undefined;
    }
}
|
|
61
|
+
/**
 * Summarise errored entries of the eval results JSON as a boxed,
 * human-readable block of text.
 *
 * An entry counts as errored when its `gradingResult` is exactly null. The
 * entries are read from `results.results` of the parsed file.
 *
 * @param resultsPath - Path of the eval results JSON file.
 * @returns The summary text, or null when the file is missing, unreadable,
 *   has no results array, or contains no errored entries.
 */
export function scanResultsForErrors(resultsPath) {
    if (!existsSync(resultsPath)) {
        return null;
    }
    let parsed;
    try {
        parsed = JSON.parse(readFileSync(resultsPath, "utf-8"));
    }
    catch {
        return null;
    }
    const entries = parsed?.results?.results;
    if (!Array.isArray(entries)) {
        return null;
    }
    // Keep only entries without a grading result and reduce them to the
    // three fields shown in the report.
    const failures = entries
        .filter((entry) => entry.gradingResult === null)
        .map((entry) => {
            let error = "Provider returned no scorable result";
            if (entry.error) {
                const text = typeof entry.error === "string" ? entry.error : JSON.stringify(entry.error);
                // Long error payloads are cut to 200 characters.
                error = text.slice(0, 200);
            }
            return {
                description: entry.testCase?.description ?? entry.description ?? "unknown",
                error,
                provider: entry.provider?.label ?? entry.provider?.id ?? "unknown",
            };
        });
    if (failures.length === 0) {
        return null;
    }
    const total = entries.length;
    const report = [
        ` ┌─────────────────────────────────────────────────────────────`,
        ` │ ⚠️ ${failures.length} of ${total} eval result(s) errored (no gradingResult)`,
        ` │`,
    ];
    for (const failure of failures) {
        report.push(` │ ✗ [${failure.provider}] ${failure.description}`, ` │ → ${failure.error}`);
    }
    // Add a hint when at least a quarter of all results errored.
    const errorRate = Math.round((failures.length / total) * 100);
    if (errorRate >= 25) {
        report.push(
            ` │`,
            ` │ 🔥 High error rate (${errorRate}%) — check API keys, rate limits,`,
            ` │ or model availability. Errored results are excluded from scoring.`,
        );
    }
    report.push(` └─────────────────────────────────────────────────────────────`);
    return report.join("\n");
}
|