@sanity/ailf 2.0.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +28 -23
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
- package/dist/_vendor/ailf-core/config-helpers.js +29 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
- package/dist/_vendor/ailf-core/examples/index.js +208 -114
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
- package/dist/_vendor/ailf-tasks/cli.js +61 -0
- package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
- package/dist/_vendor/ailf-tasks/index.js +16 -0
- package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
- package/dist/_vendor/ailf-tasks/parser.js +73 -0
- package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
- package/dist/_vendor/ailf-tasks/schemas.js +180 -0
- package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
- package/dist/_vendor/ailf-tasks/validation.js +162 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +6 -1
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
- package/dist/adapters/task-sources/index.d.ts +1 -2
- package/dist/adapters/task-sources/index.js +1 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
- package/dist/adapters/task-sources/repo-schemas.js +2 -2
- package/dist/adapters/task-sources/repo-task-source.js +1 -1
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
- package/dist/adapters/task-sources/task-file-loader.js +20 -6
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +2 -3
- package/dist/commands/init.js +56 -170
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/composition-root.d.ts +2 -3
- package/dist/composition-root.js +27 -14
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +30 -16
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +50 -15
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +52 -32
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/checks.d.ts +8 -3
- package/dist/pipeline/checks.js +23 -3
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
- package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
- package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
- package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +15 -8
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +15 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/pipeline/mirror-repo-tasks.js +1 -1
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +67 -29
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +25 -25
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -0,0 +1,459 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Generate-configs.ts
|
|
3
|
+
*
|
|
4
|
+
* Reads config/models.yaml (the central model registry) and generates all
|
|
5
|
+
* promptfoo config files with the correct provider entries.
|
|
6
|
+
*
|
|
7
|
+
* This keeps model definitions in one place — add a model to config/models.yaml
|
|
8
|
+
* and run `pnpm generate-configs` to propagate it to all eval modes.
|
|
9
|
+
*
|
|
10
|
+
* Generated configs:
|
|
11
|
+
* - promptfooconfig.yaml (baseline: with-docs vs without-docs)
|
|
12
|
+
* - promptfooconfig.observed.yaml (instrumented HTTP recording)
|
|
13
|
+
* - promptfooconfig.agentic.yaml (agentic tool-calling: naive vs optimized)
|
|
14
|
+
*
|
|
15
|
+
* Usage:
|
|
16
|
+
* pnpm generate-configs
|
|
17
|
+
* # or
|
|
18
|
+
* tsx src/scripts/generate-configs.ts
|
|
19
|
+
*/
|
|
20
|
+
import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
|
|
21
|
+
import { dirname, resolve } from "path";
|
|
22
|
+
import { fileURLToPath } from "url";
|
|
23
|
+
import { dump, load } from "js-yaml";
|
|
24
|
+
import { loadAndExpandTasks } from "../pipeline/expand-tasks.js";
|
|
25
|
+
import { validateModelsYaml } from "../pipeline/validate.js";
|
|
26
|
+
import { loadSource } from "../sources.js";
|
|
27
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
28
|
+
const ROOT = resolve(__dirname, "../..");
|
|
29
|
+
// ---------------------------------------------------------------------------
|
|
30
|
+
// Helpers
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
/**
 * List the task YAML files that sit directly inside `<rootDir>/tasks`.
 *
 * Only entry names are inspected: a name qualifies when it ends in `.yaml`
 * or `.yml` and does not start with a dot. The directory is read
 * non-recursively.
 *
 * @param {string} rootDir - Directory expected to contain a `tasks/` folder.
 * @returns {string[]} `file://tasks/<name>` references, sorted by file name.
 * @throws {Error} If `<rootDir>/tasks` does not exist.
 */
export function discoverTaskFiles(rootDir) {
  const tasksDir = resolve(rootDir, "tasks");
  if (!existsSync(tasksDir)) {
    throw new Error(`tasks/ directory not found at ${tasksDir}`);
  }

  const yamlExtensions = [".yaml", ".yml"];
  const taskFileNames = [];
  for (const entryName of readdirSync(tasksDir)) {
    const isHidden = entryName.startsWith(".");
    const isYaml = yamlExtensions.some((ext) => entryName.endsWith(ext));
    if (isYaml && !isHidden) {
      taskFileNames.push(entryName);
    }
  }

  // Sort the bare names first so the ordering matches a plain name sort.
  taskFileNames.sort();
  return taskFileNames.map((entryName) => `file://tasks/${entryName}`);
}
|
|
43
|
+
/**
 * Promptfoo provider prefixes that precede — and are not part of — the model
 * name. The list is scanned top to bottom and the first match wins, so a
 * more specific prefix ("openai:chat:") must appear before the shorter one
 * it extends ("openai:").
 */
const PROVIDER_PREFIXES = [
  "anthropic:messages:",
  "openai:chat:",
  "openai:responses:",
  "openai:",
  "anthropic:",
  "google:",
];

/**
 * Strip the provider portion from a promptfoo provider ID, leaving the raw
 * API model name.
 *
 *   "openai:chat:gpt-5.2"                 → "gpt-5.2"
 *   "anthropic:messages:claude-opus-4-6"  → "claude-opus-4-6"
 *   "openai:gpt-4o"                       → "gpt-4o"
 *   "google:gemini-2.5-pro"               → "gemini-2.5-pro"
 *
 * IDs that match none of the known prefixes lose everything up to and
 * including their first colon ("openrouter:deepseek/deepseek-r1" →
 * "deepseek/deepseek-r1"); IDs without a colon are returned unchanged.
 *
 * @param {string} id - Promptfoo provider ID.
 * @returns {string} The model name portion of `id`.
 */
export function extractModelName(id) {
  const knownPrefix = PROVIDER_PREFIXES.find((candidate) => id.startsWith(candidate));
  if (knownPrefix !== undefined) {
    return id.slice(knownPrefix.length);
  }

  // Unknown provider: drop only the first colon-separated segment.
  const firstColon = id.indexOf(":");
  if (firstColon === -1) {
    return id;
  }
  return id.slice(firstColon + 1);
}
|
|
77
|
+
/**
 * Return the provider family of a promptfoo provider ID, i.e. the text
 * before its first colon.
 *
 *   "openai:chat:gpt-5.2"                 → "openai"
 *   "anthropic:messages:claude-opus-4-6"  → "anthropic"
 *   "google:gemini-2.5-pro"               → "google"
 *
 * An ID with no colon, or one that starts with a colon, is reported as
 * "openai".
 *
 * @param {string} id - Promptfoo provider ID.
 * @returns {string} The provider family.
 */
export function extractProvider(id) {
  const separatorAt = id.indexOf(":");
  if (separatorAt <= 0) {
    return "openai";
  }
  return id.slice(0, separatorAt);
}
|
|
88
|
+
/**
 * Load the prompt templates from `<rootDir>/config/prompts.yaml`.
 *
 * The file must define the top-level keys `with-docs`, `without-docs` and
 * `agentic`; each entry's `id`, `label` and `template` fields are copied into
 * the returned prompt as `id`, `label` and `raw`.
 *
 * @param {string} rootDir - Directory containing the `config/` folder.
 * @returns {{ agentic: object, withDocs: object, withoutDocs: object }}
 *   One `{ id, label, raw }` prompt per evaluation mode.
 * @throws {Error} If the file does not exist, or if any required key is
 *   absent (including when the document is empty or not a mapping).
 */
export function loadPrompts(rootDir) {
  const promptsPath = resolve(rootDir, "config", "prompts.yaml");
  if (!existsSync(promptsPath)) {
    throw new Error(`config/prompts.yaml not found at ${promptsPath}. This file is required — it defines the prompt templates for all evaluation modes.`);
  }

  const parsed = load(readFileSync(promptsPath, "utf-8"));
  // An empty or scalar-only document does not parse to a mapping. Treat it as
  // a mapping with no keys so the caller gets the descriptive error below
  // rather than a TypeError from indexing into undefined/null.
  const data = parsed !== null && typeof parsed === "object" ? parsed : {};

  const requiredKeys = ["with-docs", "without-docs", "agentic"];
  const missing = requiredKeys.filter((key) => !data[key]);
  if (missing.length > 0) {
    throw new Error(`config/prompts.yaml is missing required keys: ${missing.join(", ")}. Each prompt must have id, label, and template fields.`);
  }

  const toPrompt = (entry) => ({
    id: entry.id,
    label: entry.label,
    raw: entry.template,
  });
  return {
    agentic: toPrompt(data["agentic"]),
    withDocs: toPrompt(data["with-docs"]),
    withoutDocs: toPrompt(data["without-docs"]),
  };
}
|
|
111
|
+
// ---------------------------------------------------------------------------
|
|
112
|
+
// Config merging
|
|
113
|
+
// ---------------------------------------------------------------------------
|
|
114
|
+
/**
 * Build a provider config by layering three sources, later ones winning:
 *
 *   1. scalar entries of `defaults` (values that are objects or arrays are
 *      skipped; `null` counts as a scalar and is kept),
 *   2. every own enumerable property of `modelConfig`,
 *   3. every own enumerable property of `overrides`.
 *
 * None of the inputs is mutated. The merge is shallow.
 *
 * @param {object} [defaults] - Registry-wide defaults; a nullish value is
 *   treated as "no defaults".
 * @param {object} [modelConfig] - Model-specific settings.
 * @param {object} [overrides] - Mode-specific settings.
 * @returns {object} A new object holding the merged config.
 */
export function mergeConfig(defaults, modelConfig, overrides) {
  const result = {};
  // `defaults` is optional just like the other two layers; without this
  // fallback Object.entries() would throw on undefined/null.
  for (const [key, value] of Object.entries(defaults ?? {})) {
    const isScalar = value === null || typeof value !== "object";
    if (isScalar) {
      result[key] = value;
    }
  }
  if (modelConfig) {
    Object.assign(result, modelConfig);
  }
  if (overrides) {
    Object.assign(result, overrides);
  }
  return result;
}
|
|
133
|
+
// ---------------------------------------------------------------------------
|
|
134
|
+
// Model selection and loading
|
|
135
|
+
// ---------------------------------------------------------------------------
|
|
136
|
+
/**
 * Decide whether a model entry takes part in the given evaluation mode.
 *
 * A model with no `modes` property, or with an empty `modes` list, is not
 * restricted and matches every mode; otherwise `mode` must be listed.
 *
 * @param {{ modes?: string[] }} model - Model entry from the registry.
 * @param {string} mode - Mode name to test.
 * @returns {boolean} True when the model should be included for `mode`.
 */
export function modelMatchesMode(model, mode) {
  const { modes } = model;
  const isUnrestricted = !modes || modes.length === 0;
  return isUnrestricted || modes.includes(mode);
}
|
|
142
|
+
/**
 * Read `config/models.yaml` under ROOT and return its parsed contents.
 *
 * The result is whatever the YAML parser produces; no validation happens
 * here. A missing or unreadable file surfaces as the error thrown by
 * `readFileSync`.
 */
function loadModels() {
  const modelsPath = resolve(ROOT, "config", "models.yaml");
  const yamlText = readFileSync(modelsPath, "utf-8");
  return load(yamlText);
}
|
|
146
|
+
// ---------------------------------------------------------------------------
|
|
147
|
+
// Shared components
|
|
148
|
+
// ---------------------------------------------------------------------------
|
|
149
|
+
/**
 * URL extraction assertion — informational only (weight: 0).
 * The inline script collects the distinct http(s) URLs found in `output`,
 * separates those containing "sanity.io" from the rest, and reports the
 * lists and counts as a JSON string in `reason`. It always passes with a
 * score of 1.
 */
const URL_EXTRACTION_ASSERT = {
    type: "javascript",
    // String.raw keeps the regex backslashes literal, so nothing is double-escaped.
    value: String.raw`const urlPattern = /https?:\/\/[^\s\)\"\'\`>]+/g;
const urls = [...new Set((output.match(urlPattern) || []))];
const sanityUrls = urls.filter(u => u.includes('sanity.io'));
return {
pass: true,
score: 1,
reason: JSON.stringify({
sanityUrls,
otherUrls: urls.filter(u => !u.includes('sanity.io')),
totalUrlCount: urls.length,
sanityUrlCount: sanityUrls.length
})
};`,
    weight: 0,
};
|
|
166
|
+
/**
 * Source isolation assertion — advisory only (weight: 0).
 * Points at the compiled `source-isolation.js` assertion script, which checks
 * that the agentic provider fetched docs only from allowed origins.
 * generateAgenticConfig adds it to `defaultTest.assert` only when the
 * DOC_ALLOWED_ORIGINS environment variable is set.
 */
const SOURCE_ISOLATION_ASSERT = {
    // Tags results from this assertion with the "source-isolation" dimension.
    metadata: { dimension: "source-isolation" },
    type: "javascript",
    value: "file://dist/assertions/source-isolation.js",
    weight: 0,
};
|
|
177
|
+
// ---------------------------------------------------------------------------
|
|
178
|
+
// Config generators
|
|
179
|
+
// ---------------------------------------------------------------------------
|
|
180
|
+
/**
 * Build one agentic provider entry for `model` running in `agentMode`.
 * Private helper of generateAgenticConfig; key order is kept stable because it
 * determines the order of keys in the dumped YAML.
 */
function buildAgenticProvider(models, model, agentMode, labelSuffix, sourceConfig) {
    return {
        config: {
            ...mergeConfig(models.defaults, model.config, {
                agentMode,
                // NOTE(review): passed as an override, so a maxToolRounds set in
                // model.config is always replaced by the shared default — confirm
                // that per-model values are meant to be ignored.
                maxToolRounds: models.defaults.maxToolRounds ?? 5,
                model: extractModelName(model.id),
                provider: extractProvider(model.id),
            }),
            ...sourceConfig,
            observe: true,
            observerOptions: models.defaults.observerOptions ?? {},
        },
        id: "file://dist/agent-observer/agentic-provider.js",
        label: `${model.label} (${labelSuffix})`,
    };
}
/**
 * Build the Promptfoo config object for the agentic evaluation
 * (naive agent vs optimized agent).
 *
 * Every model matching "agentic-naive" yields a "(Naive Agent)" provider and
 * every model matching "agentic-optimized" yields an "(Optimized Agent)"
 * provider; naive providers are listed first.
 *
 * @param models - Parsed models config (`models`, `defaults`, `grader`, optional `maxConcurrency`).
 * @param tests - Test entries placed in the config unchanged.
 * @param prompts - Prompt set; only `prompts.agentic` is used.
 * @param source - Optional documentation source; when given, its base URL,
 *   llms.txt URL and optional origins/headers/priority domain are injected
 *   into every provider config.
 * @returns The config object for promptfooconfig.agentic.yaml.
 */
function generateAgenticConfig(models, tests, prompts, source) {
    const naiveModels = models.models.filter((m) => modelMatchesMode(m, "agentic-naive"));
    const optimizedModels = models.models.filter((m) => modelMatchesMode(m, "agentic-optimized"));
    // Build doc source config to inject into providers
    // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string means unset
    const searchMode = process.env.EVAL_SEARCH_MODE || "open";
    const sourceConfig = source
        ? {
            ...(source.allowedOrigins?.length
                ? { allowedOrigins: source.allowedOrigins }
                : {}),
            docBaseUrl: source.baseUrl,
            ...(source.headers && Object.keys(source.headers).length > 0
                ? { customHeaders: source.headers }
                : {}),
            llmsTxtUrl: source.llmsTxt,
            ...(source.priorityDomain
                ? { priorityDomain: source.priorityDomain }
                : {}),
            // Tool access control: search mode for web_search behavior.
            // "open" is the default and is therefore left out of the config.
            ...(searchMode !== "open" ? { searchMode } : {}),
        }
        : {};
    const providers = [
        ...naiveModels.map((model) => buildAgenticProvider(models, model, "naive", "Naive Agent", sourceConfig)),
        ...optimizedModels.map((model) => buildAgenticProvider(models, model, "optimized", "Optimized Agent", sourceConfig)),
    ];
    // Inject source isolation assertion when origin sandboxing is active
    const hasOriginSandbox = Boolean(process.env.DOC_ALLOWED_ORIGINS);
    const agenticAssertions = hasOriginSandbox ? [SOURCE_ISOLATION_ASSERT] : [];
    return {
        commandLineOptions: { table: false },
        defaultTest: {
            ...(agenticAssertions.length > 0 ? { assert: agenticAssertions } : {}),
            options: {
                provider: models.grader.id,
                rubricProvider: models.grader.id,
            },
        },
        description: "Sanity AI Literacy Evaluation — Agentic (naive vs optimized)",
        ...(models.maxConcurrency
            ? { evaluateOptions: { maxConcurrency: models.maxConcurrency } }
            : {}),
        outputPath: "results/latest/eval-results-agentic.json",
        prompts: [prompts.agentic],
        providers,
        tests,
    };
}
|
|
264
|
+
/**
 * Build the Promptfoo config object for the baseline evaluation.
 *
 * Each model matching the "baseline" mode becomes a provider addressed by its
 * own model id, and both the with-docs and without-docs prompts are run. The
 * URL extraction assertion is attached to every test via `defaultTest`.
 *
 * @param models - Parsed models config (`models`, `defaults`, `grader`, optional `maxConcurrency`).
 * @param tests - Test entries placed in the config unchanged.
 * @param prompts - Prompt set; `withDocs` and `withoutDocs` are used.
 * @returns The config object for promptfooconfig.yaml.
 */
function generateBaselineConfig(models, tests, prompts) {
    const providers = [];
    for (const model of models.models) {
        if (!modelMatchesMode(model, "baseline")) {
            continue;
        }
        providers.push({
            config: mergeConfig(models.defaults, model.config),
            id: model.id,
            label: model.label,
        });
    }
    // The grader model serves as both the default provider and the rubric provider.
    const graderOptions = {
        provider: models.grader.id,
        rubricProvider: models.grader.id,
    };
    // evaluateOptions is emitted only when a concurrency limit is configured.
    const concurrency = models.maxConcurrency
        ? { evaluateOptions: { maxConcurrency: models.maxConcurrency } }
        : {};
    return {
        commandLineOptions: { table: false },
        defaultTest: {
            assert: [URL_EXTRACTION_ASSERT],
            options: graderOptions,
        },
        description: "Sanity AI Literacy Evaluation — Baseline",
        ...concurrency,
        outputPath: "results/latest/eval-results.json",
        prompts: [prompts.withDocs, prompts.withoutDocs],
        providers,
        tests,
    };
}
|
|
290
|
+
/**
 * Build the Promptfoo config object for the observed evaluation.
 *
 * Each model matching the "observed" mode is routed through the compiled
 * agent-observer provider, with the bare model name passed as `modelName`
 * and the shared `observerOptions` default passed as `recordOptions`.
 *
 * @param models - Parsed models config (`models`, `defaults`, `grader`, optional `maxConcurrency`).
 * @param tests - Test entries placed in the config unchanged.
 * @param prompts - Prompt set; `withDocs` and `withoutDocs` are used.
 * @returns The config object for promptfooconfig.observed.yaml.
 */
function generateObservedConfig(models, tests, prompts) {
    const providers = [];
    for (const model of models.models) {
        if (!modelMatchesMode(model, "observed")) {
            continue;
        }
        const modelName = extractModelName(model.id);
        providers.push({
            config: {
                ...mergeConfig(models.defaults, model.config),
                modelName,
                observe: true,
                recordOptions: models.defaults.observerOptions ?? {},
            },
            id: "file://dist/agent-observer/provider.js",
            label: `${model.label} (Observed)`,
        });
    }
    // The grader model serves as both the default provider and the rubric provider.
    const graderOptions = {
        provider: models.grader.id,
        rubricProvider: models.grader.id,
    };
    // evaluateOptions is emitted only when a concurrency limit is configured.
    const concurrency = models.maxConcurrency
        ? { evaluateOptions: { maxConcurrency: models.maxConcurrency } }
        : {};
    return {
        commandLineOptions: { table: false },
        defaultTest: {
            options: graderOptions,
        },
        description: "Sanity AI Literacy Evaluation — Observed",
        ...concurrency,
        outputPath: "results/latest/eval-results-observed.json",
        prompts: [prompts.withDocs, prompts.withoutDocs],
        providers,
        tests,
    };
}
|
|
323
|
+
// ---------------------------------------------------------------------------
|
|
324
|
+
// Main
|
|
325
|
+
// ---------------------------------------------------------------------------
|
|
326
|
+
/**
 * Split a comma-separated environment value into trimmed, non-empty items.
 * Returns undefined when the value is unset or holds no usable item.
 */
function parseFilterList(raw) {
    if (!raw) {
        return undefined;
    }
    const items = raw
        .split(",")
        .map((item) => item.trim())
        .filter((item) => item.length > 0);
    return items.length > 0 ? items : undefined;
}
/**
 * Dump `entries` as YAML into `tasks/<fileName>` under ROOT, prefixed by
 * `header`, and log a confirmation line with the entry count.
 */
function writeExpandedTasks(fileName, header, entries) {
    const yamlText = dump(entries, {
        forceQuotes: false,
        lineWidth: 120,
        noRefs: true,
        quotingType: "'",
    });
    writeFileSync(resolve(ROOT, "tasks", fileName), `${header}${yamlText}`, "utf-8");
    console.log(` ✓ tasks/${fileName} (${entries.length} entries)`);
}
/**
 * Entry point: validate config/models.yaml, expand the task definitions, and
 * write the expanded task files plus the three Promptfoo configs
 * (baseline, observed, agentic).
 *
 * Inputs read from the environment / command line:
 * - EVAL_FILTER_AREAS, EVAL_FILTER_TASKS: optional comma-separated filters.
 *   Blank items (for example from a trailing comma) are ignored.
 * - process.argv[2] or DOC_SOURCE: optional documentation source name.
 *
 * Exits the process with status 1 when models.yaml has validation errors.
 * A documentation source that fails to load only produces a warning; the
 * configs are then generated without a source.
 */
function main() {
    // Validate config/models.yaml before generating configs
    const modelIssues = validateModelsYaml(ROOT);
    const modelErrors = modelIssues.filter((i) => i.severity === "error");
    if (modelErrors.length > 0) {
        console.error("❌ config/models.yaml validation failed:");
        for (const e of modelErrors) {
            console.error(` ERROR: ${e.message}`);
            if (e.path) {
                console.error(` at ${e.path}`);
            }
        }
        console.error("\nFix config/models.yaml before generating configs. Run 'pnpm validate' for details.");
        process.exit(1);
    }
    console.log("Loading config/models.yaml...");
    const models = loadModels();
    const activeModels = models.models.filter((m) => m.id && m.label);
    console.log(` Found ${activeModels.length} active model(s):`);
    for (const m of activeModels) {
        // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty array join → "all"
        const modes = m.modes?.join(", ") || "all";
        console.log(` - ${m.label} (${m.id}) → [${modes}]`);
    }
    // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty label falls through to id
    console.log(` Grader: ${models.grader.label || models.grader.id}`);
    // Build filter options from environment variables (set by pipeline.ts).
    // A filter is only applied when at least one list has a usable item.
    const areas = parseFilterList(process.env.EVAL_FILTER_AREAS);
    const taskIds = parseFilterList(process.env.EVAL_FILTER_TASKS);
    const filter = areas || taskIds ? { areas, taskIds } : undefined;
    // Expand tasks for baseline/observed mode (gold + baseline with prompt filters)
    const { entries, stats } = loadAndExpandTasks(ROOT, filter, "baseline");
    console.log(` Expanded ${stats.singleDefinitions} task(s) → ${stats.expandedTotal} test entries`);
    if (stats.legacyEntries > 0) {
        console.log(` ⚠ ${stats.legacyEntries} legacy (paired) entries passed through unchanged`);
    }
    if (filter) {
        const parts = [];
        if (filter.areas) {
            parts.push(`areas: ${filter.areas.join(", ")}`);
        }
        if (filter.taskIds) {
            parts.push(`tasks: ${filter.taskIds.join(", ")}`);
        }
        console.log(` Scoped to: ${parts.join("; ")}`);
    }
    // Expand tasks for agentic mode (gold entries only, no prompt filters).
    // Agentic mode has a single prompt that doesn't use {{docs}}, so baseline
    // entries would produce identical prompts — pure waste of API calls.
    const { entries: agenticEntries, stats: agenticStats } = loadAndExpandTasks(ROOT, filter, "agentic");
    console.log(` Agentic: ${agenticStats.expandedTotal} entries (gold only, no baseline)`);
    // Write expanded tasks to generated files for Promptfoo to consume
    writeExpandedTasks(".expanded.yaml", "# .expanded.yaml\n#\n# AUTO-GENERATED — do not edit directly.\n# Source: tasks/*.yaml (single-definition format)\n# Run: pnpm generate-configs\n\n", entries);
    writeExpandedTasks(".expanded.agentic.yaml", "# .expanded.agentic.yaml\n#\n# AUTO-GENERATED — do not edit directly.\n# Gold entries only (no baseline) for agentic evaluation mode.\n# Source: tasks/*.yaml (single-definition format)\n# Run: pnpm generate-configs\n\n", agenticEntries);
    const taskFiles = ["file://tasks/.expanded.yaml"];
    const agenticTaskFiles = ["file://tasks/.expanded.agentic.yaml"];
    // Load prompt templates
    const prompts = loadPrompts(ROOT);
    console.log(` Loaded prompts: ${Object.keys(prompts).join(", ")}`);
    // Load optional documentation source configuration
    // Usage: pnpm generate-configs [source-name]
    // Or: DOC_SOURCE=branch pnpm generate-configs
    const sourceName = process.argv[2] || process.env.DOC_SOURCE;
    let source;
    if (sourceName) {
        console.log(`\nLoading source: ${sourceName}`);
        try {
            source = loadSource(sourceName);
            console.log(` Base URL: ${source.baseUrl}`);
            console.log(` Dataset: ${source.dataset}`);
            if (source.allowedOrigins?.length) {
                console.log(` Allowed origins: ${source.allowedOrigins.join(", ")}`);
            }
        }
        catch (err) {
            // Best effort: warn and continue without a documentation source.
            const msg = err instanceof Error ? err.message : String(err);
            console.warn(`\n⚠ Failed to load source "${sourceName}": ${msg}`);
        }
    }
    console.log("\nGenerating configs...");
    // All three generated configs share the same header apart from the file name.
    const configHeader = (filename) => `# ${filename}\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`;
    writeConfig("promptfooconfig.yaml", generateBaselineConfig(models, taskFiles, prompts), configHeader("promptfooconfig.yaml"));
    writeConfig("promptfooconfig.observed.yaml", generateObservedConfig(models, taskFiles, prompts), configHeader("promptfooconfig.observed.yaml"));
    writeConfig("promptfooconfig.agentic.yaml", generateAgenticConfig(models, agenticTaskFiles, prompts, source), configHeader("promptfooconfig.agentic.yaml"));
    console.log("\nDone! Configs are ready.");
    if (source) {
        console.log(` (using doc source: ${sourceName})`);
    }
}
|
|
440
|
+
// ---------------------------------------------------------------------------
|
|
441
|
+
// File writing
|
|
442
|
+
// ---------------------------------------------------------------------------
|
|
443
|
+
/**
 * Serialize `config` to YAML and write it to `filename` under ROOT.
 * The file content is `header`, a newline, then the YAML; a confirmation
 * line is logged afterwards.
 *
 * @param filename - Output file name, resolved against ROOT.
 * @param config - Config object to dump as YAML.
 * @param header - Comment block placed at the top of the file.
 */
function writeConfig(filename, config, header) {
    const dumpOptions = {
        forceQuotes: false,
        lineWidth: 120,
        noRefs: true,
        quotingType: "'",
    };
    const yamlBody = dump(config, dumpOptions);
    writeFileSync(resolve(ROOT, filename), `${header}\n${yamlBody}`, "utf-8");
    console.log(` ✓ ${filename}`);
}
|
|
455
|
+
// Run main() only when this file is the script being executed (either the
// TypeScript source or the compiled JavaScript); importing the module, for
// example from tests, must not trigger config generation.
if (["generate-configs.ts", "generate-configs.js"].some((scriptName) => process.argv[1]?.endsWith(scriptName))) {
    main();
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* grader-api.ts
|
|
3
|
+
*
|
|
4
|
+
* Shared utility for calling LLM grading APIs from grader scripts.
|
|
5
|
+
*
|
|
6
|
+
* Dispatches to the correct provider API (OpenAI, Anthropic) based on the
|
|
7
|
+
* grader model prefix. Reads the appropriate API key from environment.
|
|
8
|
+
*
|
|
9
|
+
* Also exports `loadGraderModel()` to resolve the grader from
|
|
10
|
+
* `config/models.yaml`.
|
|
11
|
+
*/
|
|
12
|
+
/**
 * Call the grader model once to score a response against a rubric.
 *
 * Dispatches to the correct provider API (OpenAI or Anthropic) based on the
 * model prefix. Resolves to the number parsed from the grader's reply
 * (expected to be 0–100; the implementation does not range-check it), or
 * to null if the API call fails or no score can be parsed.
 *
 * @param graderModel - Promptfoo-style model ID, e.g. `openai:gpt-5` or
 *   `anthropic:messages:claude-opus-4-5-20251101`.
 * @param responseText - The assistant response to grade; only the first
 *   8000 characters are sent to the grader.
 * @param rubricText - The rubric the response is graded against.
 */
|
|
18
|
+
export declare function gradeOnce(graderModel: string, responseText: string, rubricText: string): Promise<null | number>;
|
|
19
|
+
/**
 * Load the grader model from `config/models.yaml`.
 * Returns both the model ID and human-readable label.
 * Falls back to `openai:gpt-5` / `GPT-5 (grader)` for whichever of the two
 * is not configured. If the file itself is missing, the implementation logs
 * an error and exits the process with status 1 instead of returning.
 */
|
|
24
|
+
export declare function loadGraderModel(): {
|
|
25
|
+
id: string;
|
|
26
|
+
label: string;
|
|
27
|
+
};
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* grader-api.ts
|
|
3
|
+
*
|
|
4
|
+
* Shared utility for calling LLM grading APIs from grader scripts.
|
|
5
|
+
*
|
|
6
|
+
* Dispatches to the correct provider API (OpenAI, Anthropic) based on the
|
|
7
|
+
* grader model prefix. Reads the appropriate API key from environment.
|
|
8
|
+
*
|
|
9
|
+
* Also exports `loadGraderModel()` to resolve the grader from
|
|
10
|
+
* `config/models.yaml`.
|
|
11
|
+
*/
|
|
12
|
+
import { config as dotenvConfig } from "dotenv";
|
|
13
|
+
import { existsSync, readFileSync } from "fs";
|
|
14
|
+
import { dirname, join, resolve } from "path";
|
|
15
|
+
import { fileURLToPath } from "url";
|
|
16
|
+
import { load } from "js-yaml";
|
|
17
|
+
// ES modules have no built-in __dirname, so derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// The eval package root sits two directories above this module.
const EVAL_ROOT = resolve(__dirname, "../..");
// Load the root .env (two levels above packages/eval/) so API keys are
// available even when this module is invoked via `tsx` from packages/eval/.
// Values from that file take precedence over variables that are already set.
const rootEnvPath = resolve(EVAL_ROOT, "../../.env");
if (existsSync(rootEnvPath)) {
    dotenvConfig({ path: rootEnvPath, override: true });
}
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
// Public API
|
|
27
|
+
// ---------------------------------------------------------------------------
|
|
28
|
+
/**
 * Ask the grader model for a single score of `responseText` against
 * `rubricText`.
 *
 * The provider is chosen from the prefix of `graderModel` ("anthropic" or
 * "openai"). Only the first 8000 characters of the response are sent.
 *
 * @param graderModel - Promptfoo-style model ID such as `openai:gpt-5`.
 * @param responseText - The assistant response being graded.
 * @param rubricText - The rubric to grade against.
 * @returns The parsed score, or null when the API call fails, throws, or the
 *   reply contains no parseable score. Errors raised while resolving the
 *   provider (unknown provider, missing API key) are not caught here.
 */
export async function gradeOnce(graderModel, responseText, rubricText) {
    const config = resolveProvider(graderModel);
    const prompt = `You are evaluating an AI assistant's response. Grade the response according to the following rubric.

## Response to evaluate:
${responseText.slice(0, 8000)}

## Rubric:
${rubricText}
`;
    try {
        const providerCalls = new Map([
            ["anthropic", callAnthropic],
            ["openai", callOpenAI],
        ]);
        const callProvider = providerCalls.get(graderModel.split(":")[0]);
        if (callProvider === undefined) {
            // resolveProvider already throws for unknown providers, but just in case
            return null;
        }
        const content = await callProvider(config, prompt);
        if (content === null) {
            return null;
        }
        const score = extractScore(content);
        if (score === null) {
            console.error(` ⚠ Could not parse grader response: ${content.slice(0, 100)}`);
        }
        return score;
    }
    catch (err) {
        const detail = err instanceof Error ? err.message : String(err);
        console.error(` ⚠ Grader call failed: ${detail}`);
        return null;
    }
}
|
|
70
|
+
// ---------------------------------------------------------------------------
|
|
71
|
+
// Grader model lookup and provider-specific API calls
|
|
72
|
+
// ---------------------------------------------------------------------------
|
|
73
|
+
/**
 * Resolve the grader model from `config/models.yaml` under EVAL_ROOT.
 *
 * Logs an error and exits the process with status 1 when the file does not
 * exist. Otherwise returns the configured grader `id` and `label`, using
 * `openai:gpt-5` and `GPT-5 (grader)` for whichever is missing.
 *
 * @returns `{ id, label }` describing the grader model.
 */
export function loadGraderModel() {
    const modelsPath = join(EVAL_ROOT, "config", "models.yaml");
    if (!existsSync(modelsPath)) {
        console.error("❌ config/models.yaml not found");
        process.exit(1);
    }
    const grader = load(readFileSync(modelsPath, "utf-8"))?.grader;
    const id = grader?.id ?? "openai:gpt-5";
    const label = grader?.label ?? "GPT-5 (grader)";
    return { id, label };
}
|
|
91
|
+
/**
 * Send `prompt` as a single user message to the Anthropic Messages endpoint
 * given in `config.baseUrl` and return the reply text.
 *
 * @param config - `{ apiKey, baseUrl, modelName }` for the request.
 * @param prompt - Full grading prompt.
 * @returns The text of the first "text" content block ("" when there is
 *   none), or null after logging when the response status is not OK.
 */
async function callAnthropic(config, prompt) {
    const { apiKey, baseUrl, modelName } = config;
    const payload = {
        max_tokens: 256,
        messages: [{ content: prompt, role: "user" }],
        model: modelName,
        temperature: 0.2,
    };
    const response = await fetch(baseUrl, {
        body: JSON.stringify(payload),
        headers: {
            "anthropic-version": "2023-06-01",
            "Content-Type": "application/json",
            "x-api-key": apiKey,
        },
        method: "POST",
    });
    if (response.ok) {
        const data = await response.json();
        // The reply may hold several content blocks; only the first text block is used.
        const firstText = data.content?.find((block) => block.type === "text");
        return firstText?.text ?? "";
    }
    const errorBody = await response.text();
    console.error(` ⚠ Grader API error (Anthropic): ${response.status} ${errorBody.slice(0, 200)}`);
    return null;
}
|
|
115
|
+
// ---------------------------------------------------------------------------
|
|
116
|
+
// Provider-specific API calls (OpenAI)
|
|
117
|
+
// ---------------------------------------------------------------------------
|
|
118
|
+
/**
 * Send `prompt` as a single user message to the OpenAI Chat Completions
 * endpoint given in `config.baseUrl` and return the reply text.
 *
 * The output budget is sent as `max_completion_tokens`, because the older
 * `max_tokens` parameter is deprecated and rejected by o-series and GPT-5
 * models. `temperature: 0.2` is sent only to models that accept a custom
 * temperature.
 *
 * @param config - `{ apiKey, baseUrl, modelName }` for the request.
 * @param prompt - Full grading prompt.
 * @returns The first choice's message content ("" when there is none), or
 *   null after logging when the response status is not OK.
 */
async function callOpenAI(config, prompt) {
    // o-series models (o1, o3, ...) and the base GPT-5 family (gpt-5,
    // gpt-5-mini, ...) only accept the default temperature. Dotted versions
    // such as gpt-5.2 are deliberately not matched and keep the explicit value.
    const usesFixedTemperature = /^(?:o\d|gpt-5(?![.\d]))/i.test(config.modelName ?? "");
    const payload = {
        // NOTE(review): for reasoning models this budget also covers reasoning
        // tokens, so 256 may leave little room for the visible answer — confirm.
        max_completion_tokens: 256,
        messages: [{ content: prompt, role: "user" }],
        model: config.modelName,
        ...(usesFixedTemperature ? {} : { temperature: 0.2 }),
    };
    const response = await fetch(config.baseUrl, {
        body: JSON.stringify(payload),
        headers: {
            Authorization: `Bearer ${config.apiKey}`,
            "Content-Type": "application/json",
        },
        method: "POST",
    });
    if (!response.ok) {
        const text = await response.text();
        console.error(` ⚠ Grader API error (OpenAI): ${response.status} ${text.slice(0, 200)}`);
        return null;
    }
    const data = await response.json();
    return data.choices?.[0]?.message?.content ?? "";
}
|
|
140
|
+
// ---------------------------------------------------------------------------
|
|
141
|
+
// Score extraction and provider resolution
|
|
142
|
+
// ---------------------------------------------------------------------------
|
|
143
|
+
/**
 * Extract a numeric score from the grader's reply.
 *
 * Strategies, in order:
 * 1. The whole reply is JSON with a numeric `score` property.
 * 2. The reply contains a `"score": <number>` pair somewhere, for example
 *    JSON wrapped in a Markdown code fence or surrounded by prose. This must
 *    run before the bare-number fallback, otherwise digits inside the reason
 *    text ("meets 3 of 5 criteria") would be mistaken for the score.
 * 3. The first run of digits anywhere in the reply.
 *
 * @param content - Raw text returned by the grader model.
 * @returns The extracted number, or null when the reply contains no digits.
 */
function extractScore(content) {
    // Try JSON parse first: {"score": 85, "reason": "..."}
    try {
        const parsed = JSON.parse(content);
        if (typeof parsed === "object" &&
            parsed !== null &&
            "score" in parsed &&
            typeof parsed.score === "number") {
            return parsed.score;
        }
    }
    catch {
        // Not JSON — fall through
    }
    // Embedded JSON: accept a quoted or unquoted number after a "score" key.
    const keyed = content.match(/"score"\s*:\s*"?(\d+(?:\.\d+)?)/);
    if (keyed) {
        return Number(keyed[1]);
    }
    // Fallback: extract first bare number
    const bare = content.match(/(\d+)/);
    if (bare) {
        return Number.parseInt(bare[1], 10);
    }
    return null;
}
|
|
163
|
+
/**
 * Parse a Promptfoo-style model ID and resolve the provider config.
 *
 * Supported formats:
 * - `openai:chat:gpt-5.2` → OpenAI, model = `gpt-5.2`
 * - `openai:gpt-5` → OpenAI, model = `gpt-5`
 * - `anthropic:messages:claude-opus-4-5-20251101` → Anthropic, model = `claude-opus-4-5-20251101`
 * - `anthropic:claude-sonnet-4` → Anthropic, model = `claude-sonnet-4`
 *
 * @param graderModel - Model ID in one of the formats above.
 * @returns `{ apiKey, baseUrl, modelName }` for the provider's API.
 * @throws Error when the provider is not supported, when the ID carries no
 *   model name, or when the provider's API key environment variable
 *   (OPENAI_API_KEY / ANTHROPIC_API_KEY) is not set.
 */
function resolveProvider(graderModel) {
    const [provider, ...segments] = graderModel.split(":");
    let apiKeyEnv;
    let baseUrl;
    let modelName;
    if (provider === "anthropic") {
        // "anthropic:messages:claude-opus-4-5" → "claude-opus-4-5"
        // "anthropic:claude-sonnet-4" → "claude-sonnet-4"
        const hasApiSegment = segments.length >= 2 && segments[0] === "messages";
        apiKeyEnv = "ANTHROPIC_API_KEY";
        baseUrl = "https://api.anthropic.com/v1/messages";
        modelName = (hasApiSegment ? segments.slice(1) : segments).join(":");
    }
    else if (provider === "openai") {
        // "openai:chat:gpt-5.2" → "gpt-5.2", "openai:gpt-5" → "gpt-5"
        apiKeyEnv = "OPENAI_API_KEY";
        baseUrl = "https://api.openai.com/v1/chat/completions";
        modelName = segments.length >= 2 ? segments.slice(1).join(":") : segments[0];
    }
    else {
        throw new Error(`Unsupported grader provider "${provider}" in model "${graderModel}". Supported: openai, anthropic.`);
    }
    // Fail here with a clear message instead of sending a request without a
    // model name and surfacing an opaque API error later.
    if (!modelName) {
        throw new Error(`Grader model "${graderModel}" does not include a model name. Expected e.g. "openai:gpt-5" or "anthropic:claude-sonnet-4".`);
    }
    const apiKey = process.env[apiKeyEnv];
    if (!apiKey) {
        throw new Error(`${apiKeyEnv} not set. Required for grader model: ${graderModel}`);
    }
    return {
        apiKey,
        baseUrl,
        modelName,
    };
}
|