@sanity/ailf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -0
- package/bin/ailf.js +64 -0
- package/canonical/grader-references/README.md +88 -0
- package/canonical/grader-references/groq.yaml +234 -0
- package/canonical/grader-references/studio-setup.yaml +275 -0
- package/canonical/reference-solutions/.gitkeep +1 -0
- package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
- package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
- package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
- package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
- package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
- package/canonical/reference-solutions/groq/joins-references.ts +300 -0
- package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
- package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
- package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
- package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
- package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
- package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
- package/config/bigquery/README.md +74 -0
- package/config/bigquery/views/area_scores.sql +87 -0
- package/config/bigquery/views/reports.sql +49 -0
- package/config/features.yaml +116 -0
- package/config/models.yaml +115 -0
- package/config/prompts.yaml +75 -0
- package/config/rubrics.yaml +62 -0
- package/config/schedules.yaml +43 -0
- package/config/sinks.yaml +54 -0
- package/config/sources.yaml +51 -0
- package/config/thresholds.yaml +49 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
- package/dist/_vendor/ailf-core/examples/index.js +285 -0
- package/dist/_vendor/ailf-core/index.d.ts +17 -0
- package/dist/_vendor/ailf-core/index.js +17 -0
- package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
- package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
- package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
- package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
- package/dist/_vendor/ailf-core/ports/context.js +14 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
- package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
- package/dist/_vendor/ailf-core/ports/index.js +7 -0
- package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
- package/dist/_vendor/ailf-core/ports/logger.js +11 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
- package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
- package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/schemas/index.js +16 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
- package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
- package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
- package/dist/_vendor/ailf-core/services/index.js +12 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/scoring.js +222 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
- package/dist/_vendor/ailf-core/types/index.js +21 -0
- package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
- package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
- package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
- package/dist/_vendor/ailf-shared/document-ref.js +1 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -0
- package/dist/_vendor/ailf-shared/index.js +16 -0
- package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
- package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
- package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
- package/dist/_vendor/ailf-shared/score-grades.js +23 -0
- package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
- package/dist/adapters/cache/content-lake-cache.js +59 -0
- package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
- package/dist/adapters/cache/filesystem-cache.js +54 -0
- package/dist/adapters/cache/index.d.ts +2 -0
- package/dist/adapters/cache/index.js +2 -0
- package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
- package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
- package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
- package/dist/adapters/config-sources/file-config-adapter.js +96 -0
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +2 -0
- package/dist/adapters/doc-fetchers/index.d.ts +1 -0
- package/dist/adapters/doc-fetchers/index.js +1 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
- package/dist/adapters/eval-runners/index.d.ts +1 -0
- package/dist/adapters/eval-runners/index.js +1 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
- package/dist/adapters/index.d.ts +12 -0
- package/dist/adapters/index.js +12 -0
- package/dist/adapters/loggers/console-logger.d.ts +22 -0
- package/dist/adapters/loggers/console-logger.js +54 -0
- package/dist/adapters/loggers/index.d.ts +9 -0
- package/dist/adapters/loggers/index.js +9 -0
- package/dist/adapters/loggers/json-logger.d.ts +18 -0
- package/dist/adapters/loggers/json-logger.js +33 -0
- package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
- package/dist/adapters/loggers/quiet-logger.js +30 -0
- package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/composite-task-source.js +59 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
- package/dist/adapters/task-sources/index.d.ts +7 -0
- package/dist/adapters/task-sources/index.js +7 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
- package/dist/adapters/task-sources/repo-schemas.js +234 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
- package/dist/adapters/task-sources/repo-task-source.js +104 -0
- package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
- package/dist/adapters/task-sources/repo-trigger.js +153 -0
- package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
- package/dist/adapters/task-sources/repo-validation.js +164 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
- package/dist/adapters/task-sources/yaml-task-source.js +136 -0
- package/dist/agent-observer/agentic-provider.d.ts +132 -0
- package/dist/agent-observer/agentic-provider.js +983 -0
- package/dist/agent-observer/classifier.d.ts +62 -0
- package/dist/agent-observer/classifier.js +269 -0
- package/dist/agent-observer/index.d.ts +7 -0
- package/dist/agent-observer/index.js +4 -0
- package/dist/agent-observer/pricing.d.ts +35 -0
- package/dist/agent-observer/pricing.js +82 -0
- package/dist/agent-observer/provider.d.ts +77 -0
- package/dist/agent-observer/provider.js +151 -0
- package/dist/agent-observer/proxy.d.ts +91 -0
- package/dist/agent-observer/proxy.js +321 -0
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/agent-observer/types.d.ts +137 -0
- package/dist/agent-observer/types.js +16 -0
- package/dist/assertions/source-isolation.d.ts +72 -0
- package/dist/assertions/source-isolation.js +117 -0
- package/dist/cli.d.ts +24 -0
- package/dist/cli.js +199 -0
- package/dist/commands/agent-report.d.ts +5 -0
- package/dist/commands/agent-report.js +69 -0
- package/dist/commands/baseline.d.ts +9 -0
- package/dist/commands/baseline.js +141 -0
- package/dist/commands/cache.d.ts +13 -0
- package/dist/commands/cache.js +135 -0
- package/dist/commands/calculate-scores.d.ts +8 -0
- package/dist/commands/calculate-scores.js +48 -0
- package/dist/commands/compare.d.ts +8 -0
- package/dist/commands/compare.js +120 -0
- package/dist/commands/completion.d.ts +18 -0
- package/dist/commands/completion.js +260 -0
- package/dist/commands/coverage-audit.d.ts +7 -0
- package/dist/commands/coverage-audit.js +40 -0
- package/dist/commands/discovery-report.d.ts +10 -0
- package/dist/commands/discovery-report.js +44 -0
- package/dist/commands/eval.d.ts +9 -0
- package/dist/commands/eval.js +35 -0
- package/dist/commands/explain-handler.d.ts +34 -0
- package/dist/commands/explain-handler.js +719 -0
- package/dist/commands/fetch-docs.d.ts +8 -0
- package/dist/commands/fetch-docs.js +128 -0
- package/dist/commands/generate-configs.d.ts +8 -0
- package/dist/commands/generate-configs.js +46 -0
- package/dist/commands/grader/index.d.ts +11 -0
- package/dist/commands/grader/index.js +118 -0
- package/dist/commands/init.d.ts +19 -0
- package/dist/commands/init.js +150 -0
- package/dist/commands/interactive.d.ts +12 -0
- package/dist/commands/interactive.js +238 -0
- package/dist/commands/lookup-doc.d.ts +15 -0
- package/dist/commands/lookup-doc.js +84 -0
- package/dist/commands/measure-retrieval.d.ts +5 -0
- package/dist/commands/measure-retrieval.js +65 -0
- package/dist/commands/pipeline-action.d.ts +71 -0
- package/dist/commands/pipeline-action.js +305 -0
- package/dist/commands/pipeline.d.ts +62 -0
- package/dist/commands/pipeline.js +53 -0
- package/dist/commands/pr-comment.d.ts +8 -0
- package/dist/commands/pr-comment.js +47 -0
- package/dist/commands/publish.d.ts +26 -0
- package/dist/commands/publish.js +253 -0
- package/dist/commands/readiness-report.d.ts +10 -0
- package/dist/commands/readiness-report.js +104 -0
- package/dist/commands/shared/options.d.ts +29 -0
- package/dist/commands/shared/options.js +57 -0
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +16 -0
- package/dist/commands/validate-tasks.js +93 -0
- package/dist/commands/validate.d.ts +9 -0
- package/dist/commands/validate.js +73 -0
- package/dist/commands/webhook-server.d.ts +5 -0
- package/dist/commands/webhook-server.js +30 -0
- package/dist/commands/weekly-digest.d.ts +10 -0
- package/dist/commands/weekly-digest.js +104 -0
- package/dist/composition-root.d.ts +26 -0
- package/dist/composition-root.js +107 -0
- package/dist/interpolate.d.ts +26 -0
- package/dist/interpolate.js +70 -0
- package/dist/job-store.d.ts +104 -0
- package/dist/job-store.js +188 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.d.ts +27 -0
- package/dist/orchestration/build-app-context.js +81 -0
- package/dist/orchestration/build-step-sequence.d.ts +15 -0
- package/dist/orchestration/build-step-sequence.js +84 -0
- package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
- package/dist/orchestration/config-to-source-overrides.js +28 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/index.d.ts +11 -0
- package/dist/orchestration/index.js +11 -0
- package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
- package/dist/orchestration/pipeline-orchestrator.js +153 -0
- package/dist/orchestration/step-runner.d.ts +20 -0
- package/dist/orchestration/step-runner.js +88 -0
- package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
- package/dist/orchestration/steps/calculate-scores-step.js +95 -0
- package/dist/orchestration/steps/callback-step.d.ts +24 -0
- package/dist/orchestration/steps/callback-step.js +76 -0
- package/dist/orchestration/steps/compare-step.d.ts +14 -0
- package/dist/orchestration/steps/compare-step.js +92 -0
- package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
- package/dist/orchestration/steps/discovery-report-step.js +55 -0
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
- package/dist/orchestration/steps/fetch-docs-step.js +135 -0
- package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
- package/dist/orchestration/steps/gap-analysis-step.js +136 -0
- package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
- package/dist/orchestration/steps/generate-configs-step.js +85 -0
- package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
- package/dist/orchestration/steps/grader-consistency-step.js +64 -0
- package/dist/orchestration/steps/index.d.ts +19 -0
- package/dist/orchestration/steps/index.js +19 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
- package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
- package/dist/orchestration/steps/publish-report-step.js +216 -0
- package/dist/orchestration/steps/readiness-step.d.ts +13 -0
- package/dist/orchestration/steps/readiness-step.js +91 -0
- package/dist/orchestration/steps/report-step.d.ts +12 -0
- package/dist/orchestration/steps/report-step.js +49 -0
- package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
- package/dist/orchestration/steps/run-eval-step.js +195 -0
- package/dist/orchestration/steps/validate-step.d.ts +12 -0
- package/dist/orchestration/steps/validate-step.js +41 -0
- package/dist/pipeline/agent-behavior-report.d.ts +53 -0
- package/dist/pipeline/agent-behavior-report.js +132 -0
- package/dist/pipeline/attribution.d.ts +47 -0
- package/dist/pipeline/attribution.js +226 -0
- package/dist/pipeline/baseline.d.ts +37 -0
- package/dist/pipeline/baseline.js +141 -0
- package/dist/pipeline/cache.d.ts +101 -0
- package/dist/pipeline/cache.js +283 -0
- package/dist/pipeline/calculate-scores.d.ts +102 -0
- package/dist/pipeline/calculate-scores.js +1128 -0
- package/dist/pipeline/callback-delivery.d.ts +50 -0
- package/dist/pipeline/callback-delivery.js +89 -0
- package/dist/pipeline/checks.d.ts +39 -0
- package/dist/pipeline/checks.js +280 -0
- package/dist/pipeline/classify-url.d.ts +61 -0
- package/dist/pipeline/classify-url.js +93 -0
- package/dist/pipeline/compare.d.ts +31 -0
- package/dist/pipeline/compare.js +208 -0
- package/dist/pipeline/coverage-audit.d.ts +39 -0
- package/dist/pipeline/coverage-audit.js +165 -0
- package/dist/pipeline/degradations.d.ts +85 -0
- package/dist/pipeline/degradations.js +242 -0
- package/dist/pipeline/discovery-report.d.ts +55 -0
- package/dist/pipeline/discovery-report.js +178 -0
- package/dist/pipeline/eval-constants.d.ts +68 -0
- package/dist/pipeline/eval-constants.js +111 -0
- package/dist/pipeline/eval-fingerprint.d.ts +66 -0
- package/dist/pipeline/eval-fingerprint.js +175 -0
- package/dist/pipeline/expand-tasks.d.ts +220 -0
- package/dist/pipeline/expand-tasks.js +421 -0
- package/dist/pipeline/failure-modes.d.ts +46 -0
- package/dist/pipeline/failure-modes.js +348 -0
- package/dist/pipeline/fetch-url-content.d.ts +44 -0
- package/dist/pipeline/fetch-url-content.js +93 -0
- package/dist/pipeline/gap-analysis.d.ts +48 -0
- package/dist/pipeline/gap-analysis.js +231 -0
- package/dist/pipeline/generate-configs.d.ts +72 -0
- package/dist/pipeline/generate-configs.js +395 -0
- package/dist/pipeline/grader-api.d.ts +49 -0
- package/dist/pipeline/grader-api.js +200 -0
- package/dist/pipeline/grader-compare-runner.d.ts +44 -0
- package/dist/pipeline/grader-compare-runner.js +301 -0
- package/dist/pipeline/grader-comparison.d.ts +111 -0
- package/dist/pipeline/grader-comparison.js +161 -0
- package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
- package/dist/pipeline/grader-consistency-runner.js +270 -0
- package/dist/pipeline/grader-consistency.d.ts +103 -0
- package/dist/pipeline/grader-consistency.js +146 -0
- package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
- package/dist/pipeline/grader-sensitivity-runner.js +282 -0
- package/dist/pipeline/grader-sensitivity.d.ts +94 -0
- package/dist/pipeline/grader-sensitivity.js +144 -0
- package/dist/pipeline/grader-validate-runner.d.ts +38 -0
- package/dist/pipeline/grader-validate-runner.js +229 -0
- package/dist/pipeline/grader-validation.d.ts +107 -0
- package/dist/pipeline/grader-validation.js +169 -0
- package/dist/pipeline/map-request-to-config.d.ts +19 -0
- package/dist/pipeline/map-request-to-config.js +80 -0
- package/dist/pipeline/measure-retrieval.d.ts +59 -0
- package/dist/pipeline/measure-retrieval.js +111 -0
- package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
- package/dist/pipeline/mirror-repo-tasks.js +350 -0
- package/dist/pipeline/plan-format.d.ts +33 -0
- package/dist/pipeline/plan-format.js +202 -0
- package/dist/pipeline/plan.d.ts +169 -0
- package/dist/pipeline/plan.js +708 -0
- package/dist/pipeline/pr-comment.d.ts +19 -0
- package/dist/pipeline/pr-comment.js +502 -0
- package/dist/pipeline/probe.d.ts +52 -0
- package/dist/pipeline/probe.js +390 -0
- package/dist/pipeline/provenance.d.ts +47 -0
- package/dist/pipeline/provenance.js +146 -0
- package/dist/pipeline/readiness-report.d.ts +87 -0
- package/dist/pipeline/readiness-report.js +205 -0
- package/dist/pipeline/release-classification.d.ts +54 -0
- package/dist/pipeline/release-classification.js +238 -0
- package/dist/pipeline/release-report.d.ts +37 -0
- package/dist/pipeline/release-report.js +222 -0
- package/dist/pipeline/repo-eval-comment.d.ts +37 -0
- package/dist/pipeline/repo-eval-comment.js +165 -0
- package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
- package/dist/pipeline/repo-threshold-evaluator.js +162 -0
- package/dist/pipeline/resolve-mappings.d.ts +35 -0
- package/dist/pipeline/resolve-mappings.js +72 -0
- package/dist/pipeline/retrieval-metrics.d.ts +39 -0
- package/dist/pipeline/retrieval-metrics.js +136 -0
- package/dist/pipeline/reverse-mapping.d.ts +67 -0
- package/dist/pipeline/reverse-mapping.js +88 -0
- package/dist/pipeline/schemas.d.ts +9 -0
- package/dist/pipeline/schemas.js +9 -0
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +88 -0
- package/dist/pipeline/targeted-loo.js +203 -0
- package/dist/pipeline/thresholds.d.ts +27 -0
- package/dist/pipeline/thresholds.js +245 -0
- package/dist/pipeline/types.d.ts +10 -0
- package/dist/pipeline/types.js +10 -0
- package/dist/pipeline/validate.d.ts +67 -0
- package/dist/pipeline/validate.js +406 -0
- package/dist/pipeline/webhook-server.d.ts +37 -0
- package/dist/pipeline/webhook-server.js +133 -0
- package/dist/report-store.d.ts +84 -0
- package/dist/report-store.js +208 -0
- package/dist/sanity/client.d.ts +38 -0
- package/dist/sanity/client.js +86 -0
- package/dist/sanity/portable-text.d.ts +11 -0
- package/dist/sanity/portable-text.js +211 -0
- package/dist/sanity/queries.d.ts +133 -0
- package/dist/sanity/queries.js +300 -0
- package/dist/schedules/digest.d.ts +116 -0
- package/dist/schedules/digest.js +156 -0
- package/dist/schedules/index.d.ts +12 -0
- package/dist/schedules/index.js +10 -0
- package/dist/schedules/loader.d.ts +31 -0
- package/dist/schedules/loader.js +73 -0
- package/dist/schedules/schema.d.ts +9 -0
- package/dist/schedules/schema.js +9 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +21 -0
- package/dist/scripts/validate-task-sources.js +210 -0
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/bigquery/index.d.ts +131 -0
- package/dist/sinks/bigquery/index.js +222 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/index.d.ts +23 -0
- package/dist/sinks/index.js +18 -0
- package/dist/sinks/loader.d.ts +18 -0
- package/dist/sinks/loader.js +82 -0
- package/dist/sinks/retry.d.ts +24 -0
- package/dist/sinks/retry.js +52 -0
- package/dist/sinks/schema.d.ts +9 -0
- package/dist/sinks/schema.js +9 -0
- package/dist/sinks/slack/format.d.ts +65 -0
- package/dist/sinks/slack/format.js +327 -0
- package/dist/sinks/slack/index.d.ts +27 -0
- package/dist/sinks/slack/index.js +78 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +59 -0
- package/dist/sinks/types.js +44 -0
- package/dist/sinks/webhook/index.d.ts +19 -0
- package/dist/sinks/webhook/index.js +50 -0
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/sources.d.ts +104 -0
- package/dist/sources.js +292 -0
- package/dist/webhook/budget.d.ts +42 -0
- package/dist/webhook/budget.js +60 -0
- package/dist/webhook/debounce.d.ts +67 -0
- package/dist/webhook/debounce.js +76 -0
- package/dist/webhook/dispatch.d.ts +45 -0
- package/dist/webhook/dispatch.js +84 -0
- package/dist/webhook/eval-request-handler.d.ts +87 -0
- package/dist/webhook/eval-request-handler.js +181 -0
- package/dist/webhook/handler.d.ts +88 -0
- package/dist/webhook/handler.js +203 -0
- package/dist/webhook/index.d.ts +17 -0
- package/dist/webhook/index.js +12 -0
- package/dist/webhook/types.d.ts +109 -0
- package/dist/webhook/types.js +10 -0
- package/package.json +72 -0
- package/tasks/.expanded.agentic.yaml +51 -0
- package/tasks/.expanded.yaml +66 -0
- package/tasks/frameworks.yaml +98 -0
- package/tasks/functions.yaml +51 -0
- package/tasks/groq.yaml +216 -0
- package/tasks/nextjs-live.yaml +62 -0
- package/tasks/studio-setup.yaml +111 -0
- package/tasks/visual-editing.yaml +120 -0
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* types.ts
|
|
3
|
+
*
|
|
4
|
+
* Data structures for capturing AI agent behavior during evaluation.
|
|
5
|
+
*
|
|
6
|
+
* When an AI agent runs a task (e.g., "implement a Sanity studio schema"),
|
|
7
|
+
* it may browse documentation, search for APIs, download code samples, etc.
|
|
8
|
+
* These types describe the observed network activity so we can answer:
|
|
9
|
+
*
|
|
10
|
+
* - What URLs did the agent visit?
|
|
11
|
+
* - What content did it actually download?
|
|
12
|
+
* - Did it search sanity.io/docs? What queries?
|
|
13
|
+
* - How long did it spend fetching vs generating?
|
|
14
|
+
* - Did it find the *right* documentation pages?
|
|
15
|
+
*/
|
|
16
|
+
/** Record of one observation session: every observed request plus categorized lists (API calls, doc page visits, searches, external requests), timing, and a roll-up summary. */
export interface AgentBehaviorLog {
|
|
17
|
+
/** Sanity API calls (non-docs) */
|
|
18
|
+
apiCalls: ApiCall[];
|
|
19
|
+
/** Sanity doc pages the agent visited */
|
|
20
|
+
docPageVisits: DocPageVisit[];
|
|
21
|
+
/** ISO 8601 timestamp when observation ended */
|
|
22
|
+
endedAt: string;
|
|
23
|
+
/** Requests to non-Sanity domains */
|
|
24
|
+
externalRequests: ExternalRequest[];
|
|
25
|
+
/** Time spent waiting on network requests, in ms */
|
|
26
|
+
networkDurationMs: number;
|
|
27
|
+
/** Provider that was observed (e.g., "openai:gpt-4o") */
|
|
28
|
+
provider: string;
|
|
29
|
+
/** Every HTTP request/response observed, in order */
|
|
30
|
+
requests: ObservedRequest[];
|
|
31
|
+
/** Search queries the agent performed */
|
|
32
|
+
searchQueries: SearchQuery[];
|
|
33
|
+
/** Unique identifier for this observation session */
|
|
34
|
+
sessionId: string;
|
|
35
|
+
/** ISO 8601 timestamp when observation started */
|
|
36
|
+
startedAt: string;
|
|
37
|
+
/** Roll-up stats for this session (counts, unique lists, usage flags) */
summary: AgentBehaviorSummary;
|
|
38
|
+
/** Task description from the test case */
|
|
39
|
+
taskDescription: string;
|
|
40
|
+
/** Total wall-clock time for the test, in ms */
|
|
41
|
+
totalDurationMs: number;
|
|
42
|
+
}
|
|
43
|
+
/** Roll-up stats for quick analysis */
|
|
44
|
+
export interface AgentBehaviorSummary {
|
|
45
|
+
/** Number of Sanity API calls */
|
|
46
|
+
apiCallCount: number;
|
|
47
|
+
/** Number of sanity.io doc pages visited */
// NOTE(review): not stated whether repeat visits to the same page are counted; docSlugsVisited holds the unique slugs.
|
|
48
|
+
docPagesVisited: number;
|
|
49
|
+
/** List of unique sanity.io doc slugs visited */
|
|
50
|
+
docSlugsVisited: string[];
|
|
51
|
+
/** List of unique external domains contacted */
|
|
52
|
+
externalDomains: string[];
|
|
53
|
+
/** Number of external (non-Sanity) requests */
|
|
54
|
+
externalRequestCount: number;
|
|
55
|
+
/** Number of search queries performed */
|
|
56
|
+
searchesPerformed: number;
|
|
57
|
+
/** Total bytes downloaded */
// NOTE(review): presumably the sum of ObservedRequest.responseSize — confirm against the code that builds the summary.
|
|
58
|
+
totalBytesDownloaded: number;
|
|
59
|
+
/** Total network time in ms */
// NOTE(review): relationship to AgentBehaviorLog.networkDurationMs is not stated here — confirm whether they can differ.
|
|
60
|
+
totalNetworkMs: number;
|
|
61
|
+
/** Total number of HTTP requests observed */
|
|
62
|
+
totalRequests: number;
|
|
63
|
+
/** List of unique search queries */
|
|
64
|
+
uniqueSearchQueries: string[];
|
|
65
|
+
/** Number of unique URLs visited */
|
|
66
|
+
uniqueUrls: number;
|
|
67
|
+
/** Whether the agent visited any sanity.io docs at all */
|
|
68
|
+
usedDocs: boolean;
|
|
69
|
+
/** Whether the agent performed any searches */
|
|
70
|
+
usedSearch: boolean;
|
|
71
|
+
}
|
|
72
|
+
/** An API call to Sanity's API (not docs) */
|
|
73
|
+
export interface ApiCall {
|
|
74
|
+
/** API endpoint path */
|
|
75
|
+
endpoint: string;
|
|
76
|
+
/** HTTP method */
|
|
77
|
+
method: string;
|
|
78
|
+
/** Timestamp of the call */
// NOTE(review): format is not stated; ObservedRequest.timestamp is ISO 8601 — presumably the same, confirm.
|
|
79
|
+
timestamp: string;
|
|
80
|
+
/** Full URL */
|
|
81
|
+
url: string;
|
|
82
|
+
}
|
|
83
|
+
/** A page the agent visited on sanity.io/docs */
|
|
84
|
+
export interface DocPageVisit {
|
|
85
|
+
/** Response size in bytes */
|
|
86
|
+
contentSize: number;
|
|
87
|
+
/** Slug extracted from the URL, e.g., "groq-introduction" */
|
|
88
|
+
slug: string;
|
|
89
|
+
/** Timestamp of the visit */
|
|
90
|
+
timestamp: string;
|
|
91
|
+
/** Page title if extractable from response; omitted otherwise */
|
|
92
|
+
title?: string;
|
|
93
|
+
/** URL of the visited doc page (the URL from which `slug` is extracted) */
url: string;
|
|
94
|
+
}
|
|
95
|
+
/**
 * A request whose target URL is not a Sanity URL.
 */
export interface ExternalRequest {
    /** Domain, as extracted from `url`. */
    domain: string;
    /** HTTP method of the request. */
    method: string;
    /** Timestamp of the request. */
    timestamp: string;
    /** URL that was requested. */
    url: string;
}
|
|
103
|
+
/**
 * One HTTP request/response pair observed during a test run.
 */
export interface ObservedRequest {
    /** Body of the request (e.g. for POST searches), cut off at maxBodyBytes. */
    body?: string;
    /** Content-Type reported by the response. */
    contentType?: string;
    /** Request headers considered relevant, such as Accept or User-Agent. */
    headers: Record<string, string>;
    /** Milliseconds elapsed between the start of the request and the completed response. */
    latencyMs: number;
    /** HTTP method of the request. */
    method: string;
    /** Leading characters of the response body, showing what the agent actually read. */
    responsePreview?: string;
    /** Size of the response body, in bytes. */
    responseSize: number;
    /** Sequence number; monotonic within a single test run. */
    seq: number;
    /** HTTP status code returned by the response. */
    statusCode: number;
    /** ISO 8601 timestamp taken when the request was initiated. */
    timestamp: string;
    /** The complete URL that was requested. */
    url: string;
}
|
|
127
|
+
/**
 * A search the agent carried out.
 */
export interface SearchQuery {
    /** The query string that was extracted from the request. */
    query: string;
    /** How many results came back; only set when that could be detected. */
    resultCount?: number;
    /** Timestamp of the search. */
    timestamp: string;
    /** URL of the search endpoint that was used. */
    url: string;
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* types.ts
|
|
3
|
+
*
|
|
4
|
+
* Data structures for capturing AI agent behavior during evaluation.
|
|
5
|
+
*
|
|
6
|
+
* When an AI agent runs a task (e.g., "implement a Sanity studio schema"),
|
|
7
|
+
* it may browse documentation, search for APIs, download code samples, etc.
|
|
8
|
+
* These types describe the observed network activity so we can answer:
|
|
9
|
+
*
|
|
10
|
+
* - What URLs did the agent visit?
|
|
11
|
+
* - What content did it actually download?
|
|
12
|
+
* - Did it search sanity.io/docs? What queries?
|
|
13
|
+
* - How long did it spend fetching vs generating?
|
|
14
|
+
* - Did it find the *right* documentation pages?
|
|
15
|
+
*/
|
|
16
|
+
export {};
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* source-isolation.ts
|
|
3
|
+
*
|
|
4
|
+
* Promptfoo custom assertion: verifies that the agentic provider only
|
|
5
|
+
* fetched documentation from the configured allowed origins.
|
|
6
|
+
*
|
|
7
|
+
* Compiled to dist/assertions/source-isolation.js and referenced via
|
|
8
|
+
* file:// in generated Promptfoo configs.
|
|
9
|
+
*
|
|
10
|
+
* The assertion has weight: 0 — it doesn't affect scores. It surfaces
|
|
11
|
+
* as an advisory pass/fail signal in results.
|
|
12
|
+
*
|
|
13
|
+
* @see docs/exec-plans/completed/source-aware-eval-isolation.md (Phase 3b)
|
|
14
|
+
*/
|
|
15
|
+
/**
 * Result of `analyzeSourceIsolation`: how many doc page visits stayed on the
 * allowed origins and how many did not.
 */
export interface SourceIsolationReport {
    /** Count of blocked requests. */
    blocked: number;
    /** Fraction of doc visits that went to an allowed origin; 1.0 is perfect isolation. */
    isolationScore: number;
    /** Number of visits that were not on an allowed origin. */
    offOrigin: number;
    /** URLs of the visits counted in `offOrigin`. */
    offOriginUrls: string[];
    /** Number of visits that were on an allowed origin. */
    onOrigin: number;
    /** Visit count keyed by hostname. */
    originBreakdown: Record<string, number>;
    /** Total number of visits analyzed. */
    total: number;
}
|
|
24
|
+
/**
 * The subset of recorded agent behavior that the source-isolation assertion
 * declares: doc page visits and raw requests, both optional.
 */
interface AgentBehaviorData {
    /** Doc page visits, each carrying the page slug and its URL. */
    docPageVisits?: Array<{
        slug: string;
        url: string;
    }>;
    /** Requests, each carrying the response status code and the URL. */
    requests?: Array<{
        statusCode: number;
        url: string;
    }>;
}
|
|
34
|
+
/**
 * Shape of the context argument the assertion function receives. Every level
 * is optional, so consumers must tolerate missing metadata.
 */
interface AssertionContext {
    /** Provider response; agent behavior, when present, sits under `metadata.agentBehavior`. */
    providerResponse?: {
        metadata?: {
            agentBehavior?: AgentBehaviorData;
        };
    };
    /** String variables keyed by name. */
    vars?: Record<string, string>;
}
|
|
42
|
+
/**
 * Value returned by the assertion function.
 */
interface GradingResult {
    /** Whether the assertion passed. */
    pass: boolean;
    /** Human-readable explanation of the outcome. */
    reason: string;
    /** Numeric score attached to the outcome. */
    score: number;
}
|
|
47
|
+
/**
 * Computes an isolation report for a list of doc page visits.
 *
 * The isolation score is the share of doc-fetching requests that targeted an
 * allowed origin; 1.0 means every request stayed on an allowed origin.
 *
 * @param docPageVisits - Classified doc page visit records (only `url` is required)
 * @param allowedOrigins - Origin patterns (glob-capable)
 * @returns The isolation report, including the score and a breakdown
 */
export declare function analyzeSourceIsolation(
    docPageVisits: Array<{ url: string }>,
    allowedOrigins: string[],
): SourceIsolationReport;
|
|
60
|
+
/**
 * Custom assertion function for Promptfoo.
 *
 * Promptfoo invokes it once per test case when the config references it as:
 *   type: javascript
 *   value: file://dist/assertions/source-isolation.js
 *   weight: 0
 *
 * @param _output - Text output of the model (not used)
 * @param context - Promptfoo assertion context carrying the provider metadata
 * @returns The grading result for the test case
 */
export default function (
    _output: string,
    context: AssertionContext,
): GradingResult;
|
|
72
|
+
export {};
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* source-isolation.ts
|
|
3
|
+
*
|
|
4
|
+
* Promptfoo custom assertion: verifies that the agentic provider only
|
|
5
|
+
* fetched documentation from the configured allowed origins.
|
|
6
|
+
*
|
|
7
|
+
* Compiled to dist/assertions/source-isolation.js and referenced via
|
|
8
|
+
* file:// in generated Promptfoo configs.
|
|
9
|
+
*
|
|
10
|
+
* The assertion has weight: 0 — it doesn't affect scores. It surfaces
|
|
11
|
+
* as an advisory pass/fail signal in results.
|
|
12
|
+
*
|
|
13
|
+
* @see docs/exec-plans/completed/source-aware-eval-isolation.md (Phase 3b)
|
|
14
|
+
*/
|
|
15
|
+
import { isAllowedOrigin } from "../sources.js";
|
|
16
|
+
/**
 * Analyze doc page visits and compute an isolation score.
 *
 * The isolation score is the fraction of doc-fetching requests that went
 * to an allowed origin. A score of 1.0 means perfect isolation.
 *
 * When either argument is empty the function short-circuits: every visit is
 * reported as on-origin, the score is 1.0 and the breakdown is empty.
 *
 * A visit whose URL cannot be parsed (or whose origin check throws) is
 * counted as off-origin.
 *
 * @param docPageVisits - Classified doc page visit records
 * @param allowedOrigins - Origin patterns (glob-capable)
 * @returns Isolation report with score and breakdown
 */
export function analyzeSourceIsolation(docPageVisits, allowedOrigins) {
    if (allowedOrigins.length === 0 || docPageVisits.length === 0) {
        return {
            blocked: 0,
            isolationScore: 1.0,
            offOrigin: 0,
            offOriginUrls: [],
            onOrigin: docPageVisits.length,
            originBreakdown: {},
            total: docPageVisits.length,
        };
    }
    let onOrigin = 0;
    let offOrigin = 0;
    const offOriginUrls = [];
    // Count per hostname in a Map rather than a plain object: hostnames come
    // from observed URLs, and names such as "constructor" or "__proto__"
    // would otherwise collide with members inherited from Object.prototype.
    const hostnameCounts = new Map();
    for (const visit of docPageVisits) {
        try {
            // Strip a leading "www." so both spellings share one bucket.
            const hostname = new URL(visit.url).hostname.replace(/^www\./, "");
            hostnameCounts.set(hostname, (hostnameCounts.get(hostname) ?? 0) + 1);
            if (isAllowedOrigin(visit.url, allowedOrigins)) {
                onOrigin++;
            }
            else {
                offOrigin++;
                offOriginUrls.push(visit.url);
            }
        }
        catch {
            offOrigin++;
            offOriginUrls.push(visit.url);
        }
    }
    const total = onOrigin + offOrigin;
    const isolationScore = total > 0 ? onOrigin / total : 1.0;
    return {
        blocked: 0, // Blocked requests aren't in docPageVisits — they're caught earlier
        isolationScore,
        offOrigin,
        offOriginUrls,
        onOrigin,
        // Object.fromEntries defines own data properties, so even "__proto__"
        // ends up as an ordinary key of the returned record.
        originBreakdown: Object.fromEntries(hostnameCounts),
        total,
    };
}
|
|
71
|
+
// ---------------------------------------------------------------------------
|
|
72
|
+
// Promptfoo assertion entry point
|
|
73
|
+
// ---------------------------------------------------------------------------
|
|
74
|
+
/**
 * Custom assertion function for Promptfoo.
 *
 * Promptfoo invokes it once per test case when the config references it as:
 *   type: javascript
 *   value: file://dist/assertions/source-isolation.js
 *   weight: 0
 *
 * The assertion passes with score 1 when there is nothing to check: no
 * recorded agent behavior, no doc page visits, or no allowed origins in the
 * DOC_ALLOWED_ORIGINS environment variable (a comma-separated list).
 * Otherwise it passes only if no doc visit was off-origin, and the score is
 * the isolation score from the analysis.
 *
 * @param output - The model's text output (unused)
 * @param context - Promptfoo assertion context with provider metadata
 */
export default function (_output, context) {
    const nothingToCheck = (reason) => ({ pass: true, reason, score: 1 });
    const behavior = context.providerResponse?.metadata?.agentBehavior;
    if (!behavior) {
        return nothingToCheck("No agent behavior recorded");
    }
    const docVisits = behavior.docPageVisits ?? [];
    if (docVisits.length === 0) {
        return nothingToCheck("No doc page visits recorded");
    }
    // Allowed origins arrive through the environment (set by pipeline.ts)
    const allowedOrigins = [];
    for (const entry of (process.env.DOC_ALLOWED_ORIGINS || "").split(",")) {
        const origin = entry.trim();
        if (origin) {
            allowedOrigins.push(origin);
        }
    }
    if (allowedOrigins.length === 0) {
        return nothingToCheck(`No origin sandboxing configured (${docVisits.length} doc visits)`);
    }
    const { isolationScore, offOrigin, offOriginUrls, onOrigin } = analyzeSourceIsolation(docVisits, allowedOrigins);
    if (offOrigin === 0) {
        return {
            pass: true,
            reason: `All doc fetches on-origin (${onOrigin} visits, origins: ${allowedOrigins.join(", ")})`,
            score: isolationScore,
        };
    }
    return {
        pass: false,
        reason: `${offOrigin} off-origin doc fetch(es): ${offOriginUrls.join(", ")}`,
        score: isolationScore,
    };
}
|
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* cli.ts — AILF CLI entry point.
|
|
4
|
+
*
|
|
5
|
+
* Unified command-line interface for the AI Literacy Framework.
|
|
6
|
+
* All evaluation commands are exposed as subcommands under `ailf`.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* ailf pipeline [flags] # full evaluation pipeline
|
|
10
|
+
* ailf compare [flags] # compare evaluation runs
|
|
11
|
+
* ailf baseline <cmd> [flags] # baseline management
|
|
12
|
+
* ailf validate [flags] # config validation
|
|
13
|
+
* ailf completion bash # generate shell completions
|
|
14
|
+
* ailf --help # list all commands
|
|
15
|
+
*
|
|
16
|
+
* Global options:
|
|
17
|
+
* --verbose / -v # increase log output
|
|
18
|
+
* --quiet / -q # suppress non-error output
|
|
19
|
+
* --dotenv <path> # override default .env path
|
|
20
|
+
*
|
|
21
|
+
* Dev mode (without building):
|
|
22
|
+
* tsx src/cli.ts pipeline --debug
|
|
23
|
+
*/
|
|
24
|
+
export {};
|
package/dist/cli.js
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/* oxlint-disable import/first -- imports are intentionally interleaved with
|
|
3
|
+
command registration for readability and lazy loading */
|
|
4
|
+
/**
|
|
5
|
+
* cli.ts — AILF CLI entry point.
|
|
6
|
+
*
|
|
7
|
+
* Unified command-line interface for the AI Literacy Framework.
|
|
8
|
+
* All evaluation commands are exposed as subcommands under `ailf`.
|
|
9
|
+
*
|
|
10
|
+
* Usage:
|
|
11
|
+
* ailf pipeline [flags] # full evaluation pipeline
|
|
12
|
+
* ailf compare [flags] # compare evaluation runs
|
|
13
|
+
* ailf baseline <cmd> [flags] # baseline management
|
|
14
|
+
* ailf validate [flags] # config validation
|
|
15
|
+
* ailf completion bash # generate shell completions
|
|
16
|
+
* ailf --help # list all commands
|
|
17
|
+
*
|
|
18
|
+
* Global options:
|
|
19
|
+
* --verbose / -v # increase log output
|
|
20
|
+
* --quiet / -q # suppress non-error output
|
|
21
|
+
* --dotenv <path> # override default .env path
|
|
22
|
+
*
|
|
23
|
+
* Dev mode (without building):
|
|
24
|
+
* tsx src/cli.ts pipeline --debug
|
|
25
|
+
*/
|
|
26
|
+
import { config as dotenvConfig } from "dotenv";
|
|
27
|
+
import { existsSync, readFileSync } from "fs";
|
|
28
|
+
import { dirname, resolve } from "path";
|
|
29
|
+
import { fileURLToPath } from "url";
|
|
30
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
31
|
+
const ROOT = resolve(__dirname, "..");
|
|
32
|
+
/** Path to the eval package root (packages/eval). Used by --explain. */
|
|
33
|
+
const EVAL_ROOT = ROOT;
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
// Load .env — must happen before Commander parses so that .env()
|
|
36
|
+
// fallbacks resolve correctly.
|
|
37
|
+
//
|
|
38
|
+
// Resolution order:
|
|
39
|
+
// 1. Explicit --dotenv <path> flag
|
|
40
|
+
// 2. Monorepo root .env (../../.env relative to packages/eval/)
|
|
41
|
+
// 3. Caller's working directory .env
|
|
42
|
+
//
|
|
43
|
+
// This allows the CLI to work both in the monorepo (dev) and when
|
|
44
|
+
// installed globally via npm (production).
|
|
45
|
+
// ---------------------------------------------------------------------------
|
|
46
|
+
/**
 * Work out which .env file to load, before Commander has parsed anything.
 *
 * Resolution order:
 *   1. An explicit `--dotenv <path>` or `--dotenv=<path>` argument
 *   2. The monorepo root .env (two levels above ROOT), if that file exists
 *   3. `.env` in AILF_CALLER_CWD, or in the current working directory when
 *      that variable is unset
 *
 * @returns Absolute path of the chosen .env file. Only the monorepo
 *   candidate is checked for existence here, so the result may not exist.
 */
function resolveEnvPath() {
    const inlinePrefix = "--dotenv=";
    const { argv } = process;
    for (let i = 0; i < argv.length; i++) {
        const arg = argv[i];
        // Space-separated form: the value is the following argument.
        if (arg === "--dotenv" && argv[i + 1]) {
            return resolve(argv[i + 1]);
        }
        // Inline form, which Commander accepts as well; without this branch
        // the flag was honoured by the parser but ignored when loading .env.
        if (arg.startsWith(inlinePrefix) && arg.length > inlinePrefix.length) {
            return resolve(arg.slice(inlinePrefix.length));
        }
    }
    // Monorepo root .env (dev mode)
    const monorepoEnv = resolve(ROOT, "..", "..", ".env");
    if (existsSync(monorepoEnv)) {
        return monorepoEnv;
    }
    // Caller's working directory .env (npm install mode)
    const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
    return resolve(callerCwd, ".env");
}
|
|
59
|
+
// Load the resolved .env file when it exists. `override: true` lets values
// from the file replace variables that are already set in the environment.
const envPath = resolveEnvPath();
if (existsSync(envPath)) {
    dotenvConfig({ override: true, path: envPath });
}
// ---------------------------------------------------------------------------
// Pre-scan for --verbose / --quiet to make them available globally before
// Commander parses. Commands can check process.env.AILF_LOG_LEVEL.
// Verbose wins when both kinds of flag are present.
// ---------------------------------------------------------------------------
const argvHasAny = (...flags) => flags.some((flag) => process.argv.includes(flag));
if (argvHasAny("--verbose", "-v")) {
    process.env.AILF_LOG_LEVEL = "verbose";
}
else if (argvHasAny("--quiet", "-q")) {
    process.env.AILF_LOG_LEVEL = "quiet";
}
|
|
73
|
+
// ---------------------------------------------------------------------------
|
|
74
|
+
// Build CLI program
|
|
75
|
+
// ---------------------------------------------------------------------------
|
|
76
|
+
import { Command } from "commander";
|
|
77
|
+
// The version reported by `ailf --version` is read from the package.json
// that sits in ROOT.
const pkgPath = resolve(ROOT, "package.json");
const pkg = JSON.parse(readFileSync(pkgPath, "utf-8"));
// Root command carrying the global options shared by every subcommand.
const program = new Command();
program.name("ailf");
program.description("AI Literacy Framework — evaluate how well docs enable AI coding tools");
program.version(pkg.version);
program.option("-v, --verbose", "Increase log output");
program.option("-q, --quiet", "Suppress non-error output");
program.option("--dotenv <path>", "Override default .env file path");
program.option("--explain", "Show execution plan without running");
program.option("--format <fmt>", "Output format for --explain (console, json)", "console");
program.option("-y, --yes", "With --explain: show plan then prompt to confirm execution");
|
|
90
|
+
// ---------------------------------------------------------------------------
|
|
91
|
+
// Global --explain hook — intercepts any command before execution
|
|
92
|
+
// ---------------------------------------------------------------------------
|
|
93
|
+
// When the global --explain option is set, hand the matched command to the
// lazily imported explain handler instead of running it, then exit with
// status 0. The handler signals "user confirmed, go ahead" by throwing an
// object that carries `__proceedArgv`; in that case the program is parsed
// again with that argv. Any other thrown value is rethrown unchanged.
// NOTE(review): after the re-parse this hook returns normally, so Commander
// presumably continues into the originally matched action as well — confirm
// the command cannot run twice on the --explain --yes path.
program.hook("preAction", async (thisCommand, actionCommand) => {
    const globalOpts = thisCommand.opts();
    if (!globalOpts.explain) {
        return;
    }
    const { handleExplain } = await import("./commands/explain-handler.js");
    try {
        await handleExplain(actionCommand, globalOpts.yes ?? false, EVAL_ROOT);
        process.exit(0);
    }
    catch (err) {
        // Sentinel from --yes confirmation: user wants to proceed
        const isProceedSentinel = typeof err === "object" && err !== null && "__proceedArgv" in err;
        if (!isProceedSentinel) {
            throw err;
        }
        const filteredArgv = err.__proceedArgv;
        console.log("\n ▸ Proceeding with execution…\n");
        await program.parseAsync(filteredArgv);
    }
});
|
|
115
|
+
// ---------------------------------------------------------------------------
|
|
116
|
+
// Register commands
|
|
117
|
+
// ---------------------------------------------------------------------------
|
|
118
|
+
// Pipeline — the main orchestrator
|
|
119
|
+
import { createPipelineCommand } from "./commands/pipeline.js";
|
|
120
|
+
program.addCommand(createPipelineCommand());
|
|
121
|
+
// Compare — structured score comparison
|
|
122
|
+
import { createCompareCommand } from "./commands/compare.js";
|
|
123
|
+
program.addCommand(createCompareCommand());
|
|
124
|
+
// Baseline — save/compare/history
|
|
125
|
+
import { createBaselineCommand } from "./commands/baseline.js";
|
|
126
|
+
program.addCommand(createBaselineCommand());
|
|
127
|
+
// Validate — config validation
|
|
128
|
+
import { createValidateCommand } from "./commands/validate.js";
|
|
129
|
+
program.addCommand(createValidateCommand());
|
|
130
|
+
// Coverage audit — feature coverage analysis
|
|
131
|
+
import { createCoverageAuditCommand } from "./commands/coverage-audit.js";
|
|
132
|
+
program.addCommand(createCoverageAuditCommand());
|
|
133
|
+
// Weekly digest — trend digest delivery
|
|
134
|
+
import { createWeeklyDigestCommand } from "./commands/weekly-digest.js";
|
|
135
|
+
program.addCommand(createWeeklyDigestCommand());
|
|
136
|
+
// Readiness report — launch readiness checklist
|
|
137
|
+
import { createReadinessReportCommand } from "./commands/readiness-report.js";
|
|
138
|
+
program.addCommand(createReadinessReportCommand());
|
|
139
|
+
// Discovery report — agent discoverability analysis
|
|
140
|
+
import { createDiscoveryReportCommand } from "./commands/discovery-report.js";
|
|
141
|
+
program.addCommand(createDiscoveryReportCommand());
|
|
142
|
+
// Grader — reliability tools (consistency, compare, sensitivity, validate)
|
|
143
|
+
import { createGraderCommand } from "./commands/grader/index.js";
|
|
144
|
+
program.addCommand(createGraderCommand());
|
|
145
|
+
// Fetch docs — pull documentation from Sanity CMS
|
|
146
|
+
import { createFetchDocsCommand } from "./commands/fetch-docs.js";
|
|
147
|
+
program.addCommand(createFetchDocsCommand());
|
|
148
|
+
// Generate configs — generate promptfoo config files
|
|
149
|
+
import { createGenerateConfigsCommand } from "./commands/generate-configs.js";
|
|
150
|
+
program.addCommand(createGenerateConfigsCommand());
|
|
151
|
+
// Calculate scores — compute AI Literacy Scores from eval results
|
|
152
|
+
import { createCalculateScoresCommand } from "./commands/calculate-scores.js";
|
|
153
|
+
program.addCommand(createCalculateScoresCommand());
|
|
154
|
+
// Eval — direct promptfoo eval passthrough
|
|
155
|
+
import { createEvalCommand } from "./commands/eval.js";
|
|
156
|
+
program.addCommand(createEvalCommand());
|
|
157
|
+
// PR comment — generate markdown PR comment
|
|
158
|
+
import { createPrCommentCommand } from "./commands/pr-comment.js";
|
|
159
|
+
program.addCommand(createPrCommentCommand());
|
|
160
|
+
// Publish — standalone report publishing to Sanity Content Lake
|
|
161
|
+
import { createPublishCommand } from "./commands/publish.js";
|
|
162
|
+
program.addCommand(createPublishCommand());
|
|
163
|
+
// Agent report — agent behavior observation report
|
|
164
|
+
import { createAgentReportCommand } from "./commands/agent-report.js";
|
|
165
|
+
program.addCommand(createAgentReportCommand());
|
|
166
|
+
// Cache — local pipeline cache management
|
|
167
|
+
import { createCacheCommand } from "./commands/cache.js";
|
|
168
|
+
program.addCommand(createCacheCommand());
|
|
169
|
+
// Webhook server — local development server
|
|
170
|
+
import { createWebhookServerCommand } from "./commands/webhook-server.js";
|
|
171
|
+
program.addCommand(createWebhookServerCommand());
|
|
172
|
+
// Lookup doc — search Sanity for documentation articles
|
|
173
|
+
import { createLookupDocCommand } from "./commands/lookup-doc.js";
|
|
174
|
+
program.addCommand(createLookupDocCommand());
|
|
175
|
+
// Measure retrieval — retrieval quality measurement
|
|
176
|
+
import { createMeasureRetrievalCommand } from "./commands/measure-retrieval.js";
|
|
177
|
+
program.addCommand(createMeasureRetrievalCommand());
|
|
178
|
+
// Init — initialize a directory for AILF
|
|
179
|
+
import { createInitCommand } from "./commands/init.js";
|
|
180
|
+
program.addCommand(createInitCommand());
|
|
181
|
+
// Validate tasks — standalone repo task validation
|
|
182
|
+
import { createValidateTasksCommand } from "./commands/validate-tasks.js";
|
|
183
|
+
program.addCommand(createValidateTasksCommand());
|
|
184
|
+
// Interactive — guided wizard
|
|
185
|
+
import { createInteractiveCommand } from "./commands/interactive.js";
|
|
186
|
+
program.addCommand(createInteractiveCommand());
|
|
187
|
+
// Shell completion — must be registered last (needs full program tree)
|
|
188
|
+
import { createCompletionCommand } from "./commands/completion.js";
|
|
189
|
+
program.addCommand(createCompletionCommand(program));
|
|
190
|
+
// ---------------------------------------------------------------------------
|
|
191
|
+
// Parse and run — default to interactive mode when no arguments given
|
|
192
|
+
// ---------------------------------------------------------------------------
|
|
193
|
+
// A bare `ailf` (nothing after the script path) starts the interactive
// wizard by appending the "interactive" subcommand; any other invocation is
// parsed exactly as given.
const launchedWithoutArgs = process.argv.length <= 2;
await (launchedWithoutArgs
    ? program.parseAsync([...process.argv, "interactive"])
    : program.parseAsync());
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* agent-report command — generate an agent behavior observation report.
|
|
3
|
+
*/
|
|
4
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
5
|
+
import { dirname, join } from "path";
|
|
6
|
+
import { Command } from "commander";
|
|
7
|
+
import { analyzeResults } from "../pipeline/agent-behavior-report.js";
|
|
8
|
+
/**
 * Build the `agent-report` command.
 *
 * The command reads an eval results JSON file (the optional `results-path`
 * argument, defaulting to results/latest/eval-results.json under the package
 * root), passes its results array to `analyzeResults`, and writes a summary
 * to results/latest/agent-behavior-report.json under the package root.
 *
 * A missing results file or any thrown error sets the process exit code to 1
 * and prints a message instead of propagating.
 *
 * @returns The configured Commander command
 */
export function createAgentReportCommand() {
    return new Command("agent-report")
        .description("Generate an agent behavior observation report from eval results")
        .argument("[results-path]", "Path to eval-results.json (default: results/latest/eval-results.json)")
        .action(async (resultsPath) => {
        try {
            // Convert the module URL with fileURLToPath rather than reading
            // URL.pathname: pathname keeps percent-escapes (a space stays
            // "%20") and a leading slash before Windows drive letters, which
            // both produce paths that do not exist on disk.
            const { fileURLToPath } = await import("node:url");
            // Package root: two directories above this module's directory.
            const ROOT = join(dirname(fileURLToPath(import.meta.url)), "..", "..");
            const resolvedPath = resultsPath ?? join(ROOT, "results", "latest", "eval-results.json");
            if (!existsSync(resolvedPath)) {
                console.error(`Results file not found: ${resolvedPath}`);
                console.error("Run an evaluation first: pnpm eval:observed");
                process.exitCode = 1;
                return;
            }
            console.log(`Reading results from: ${resolvedPath}`);
            console.log();
            const json = JSON.parse(readFileSync(resolvedPath, "utf-8"));
            // The results array is either `results` itself or nested one
            // level deeper as `results.results`.
            const rawResults = Array.isArray(json.results)
                ? json.results
                : json.results.results;
            const analysis = analyzeResults(rawResults);
            if (!analysis.hasData) {
                console.log("No agent behavior data found in the results.");
                console.log("Make sure you ran the evaluation with the observed config:");
                console.log("  pnpm eval:observed");
                return;
            }
            // Write JSON report
            const outDir = join(ROOT, "results", "latest");
            mkdirSync(outDir, { recursive: true });
            const reportData = {
                features: analysis.features.map((f) => ({
                    avgDocPages: f.avgDocPages,
                    avgNetworkMs: f.avgNetworkMs,
                    avgSearches: f.avgSearches,
                    canonicalCoverage: f.canonicalCoverage,
                    canonicalSlugs: f.canonicalSlugs,
                    docSlugsVisited: f.allDocSlugs,
                    externalDomains: f.allExternalDomains,
                    feature: f.feature,
                    searchQueries: f.allSearchQueries,
                    taskCount: f.tasks.length,
                })),
                tasks: analysis.tasks.map((t) => ({
                    behavior: t.behavior,
                    description: t.description,
                    feature: t.feature,
                    hasDocs: t.hasDocs,
                })),
                timestamp: new Date().toISOString(),
                totalTasks: analysis.tasks.length,
            };
            writeFileSync(join(outDir, "agent-behavior-report.json"), JSON.stringify(reportData, null, 2));
            console.log("Agent behavior report written to results/latest/agent-behavior-report.json");
        }
        catch (err) {
            process.exitCode = 1;
            // Always report the failure; thrown values that are not Error
            // instances used to be dropped without any output.
            console.error(err instanceof Error ? err.message : String(err));
        }
    });
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* baseline command — manage historical baseline snapshots of evaluation scores.
|
|
3
|
+
*
|
|
4
|
+
* Wraps the core baseline functions from pipeline/baseline.ts behind a
|
|
5
|
+
* Commander subcommand interface: `baseline save`, `baseline compare`,
|
|
6
|
+
* `baseline history`.
|
|
7
|
+
*/
|
|
8
|
+
import { Command } from "commander";
|
|
9
|
+
/**
 * Creates the `baseline` command.
 *
 * @returns A Commander `Command`; per this file's header it exposes the
 *   `baseline save`, `baseline compare` and `baseline history` subcommands.
 */
export declare function createBaselineCommand(): Command;
|