@sanity/ailf 1.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +29 -12
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
- package/dist/_vendor/ailf-core/config-helpers.js +51 -2
- package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
- package/dist/_vendor/ailf-core/examples/index.js +213 -94
- package/dist/_vendor/ailf-core/index.d.ts +3 -2
- package/dist/_vendor/ailf-core/index.js +2 -1
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
- package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +7 -1
- package/dist/adapters/config-sources/ts-config-loader.js +21 -13
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
- package/dist/adapters/task-sources/index.d.ts +3 -4
- package/dist/adapters/task-sources/index.js +3 -4
- package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
- package/dist/adapters/task-sources/repo-schemas.js +228 -20
- package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
- package/dist/adapters/task-sources/repo-task-source.js +81 -122
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
- package/dist/adapters/task-sources/task-file-loader.js +21 -7
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/coverage-audit.js +3 -1
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +5 -4
- package/dist/commands/init.js +190 -25
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +15 -4
- package/dist/composition-root.js +100 -55
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/build-step-sequence.js +4 -2
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +32 -19
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +77 -26
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +51 -31
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
- package/dist/pipeline/compiler/literacy-bridge.js +2 -2
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
- package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
- package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/expand-tasks.d.ts +2 -2
- package/dist/pipeline/expand-tasks.js +2 -2
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +16 -9
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +16 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
- package/dist/pipeline/mirror-repo-tasks.js +10 -10
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +68 -30
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +32 -24
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* validate-tasks command — standalone validation of
|
|
2
|
+
* validate-tasks command — standalone validation of task files.
|
|
3
3
|
*
|
|
4
|
-
* Validates .ailf/tasks/*.yaml files against the
|
|
4
|
+
* Validates .ailf/tasks/*.yaml files against the CanonicalTaskSchema without
|
|
5
5
|
* running the full pipeline. Useful for pre-commit hooks and CI checks
|
|
6
6
|
* in external repos.
|
|
7
7
|
*
|
|
@@ -16,11 +16,11 @@ import { existsSync, readdirSync, readFileSync } from "fs";
|
|
|
16
16
|
import { resolve, relative } from "path";
|
|
17
17
|
import { Command } from "commander";
|
|
18
18
|
import { load } from "js-yaml";
|
|
19
|
-
import {
|
|
20
|
-
import {
|
|
19
|
+
import { detectLegacyFieldNames, parseCanonicalTaskFile, } from "../adapters/task-sources/repo-schemas.js";
|
|
20
|
+
import { validateCanonicalTasks, formatValidationResult, } from "../adapters/task-sources/repo-validation.js";
|
|
21
21
|
export function createValidateTasksCommand() {
|
|
22
22
|
return new Command("validate-tasks")
|
|
23
|
-
.description("Validate
|
|
23
|
+
.description("Validate task YAML files (.ailf/tasks/) against the canonical schema")
|
|
24
24
|
.argument("[path]", "Path to tasks directory (default: .ailf/tasks/)", ".ailf/tasks")
|
|
25
25
|
.option("--strict", "Treat warnings as errors", false)
|
|
26
26
|
.action(async (tasksPath, opts) => {
|
|
@@ -29,12 +29,12 @@ export function createValidateTasksCommand() {
|
|
|
29
29
|
const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
|
|
30
30
|
const resolvedPath = resolve(callerCwd, tasksPath);
|
|
31
31
|
if (!existsSync(resolvedPath)) {
|
|
32
|
-
console.error(
|
|
32
|
+
console.error(`Directory not found: ${resolvedPath}`);
|
|
33
33
|
process.exit(1);
|
|
34
34
|
}
|
|
35
35
|
const yamlFiles = readdirSync(resolvedPath).filter((f) => (f.endsWith(".yaml") || f.endsWith(".yml")) && !f.startsWith("."));
|
|
36
36
|
if (yamlFiles.length === 0) {
|
|
37
|
-
console.error(
|
|
37
|
+
console.error(`No YAML files found in ${resolvedPath}`);
|
|
38
38
|
process.exit(1);
|
|
39
39
|
}
|
|
40
40
|
console.log(`\nValidating ${yamlFiles.length} task file(s) in ${relative(process.cwd(), resolvedPath)}/\n`);
|
|
@@ -50,25 +50,36 @@ export function createValidateTasksCommand() {
|
|
|
50
50
|
}
|
|
51
51
|
catch (err) {
|
|
52
52
|
const msg = err instanceof Error ? err.message : String(err);
|
|
53
|
-
console.error(`
|
|
53
|
+
console.error(` ${file}: YAML parse error`);
|
|
54
54
|
console.error(` ${msg}\n`);
|
|
55
55
|
hasErrors = true;
|
|
56
56
|
continue;
|
|
57
57
|
}
|
|
58
58
|
if (!Array.isArray(parsed)) {
|
|
59
|
-
console.error(`
|
|
59
|
+
console.error(` ${file}: Expected a YAML array of task definitions`);
|
|
60
|
+
hasErrors = true;
|
|
61
|
+
continue;
|
|
62
|
+
}
|
|
63
|
+
// Detect legacy field names before Zod validation
|
|
64
|
+
const legacyWarnings = detectLegacyFieldNames(parsed, file);
|
|
65
|
+
if (legacyWarnings.length > 0) {
|
|
66
|
+
console.error(` ${file}: Uses legacy field names`);
|
|
67
|
+
for (const w of legacyWarnings) {
|
|
68
|
+
console.error(` ${w}`);
|
|
69
|
+
}
|
|
70
|
+
console.error();
|
|
60
71
|
hasErrors = true;
|
|
61
72
|
continue;
|
|
62
73
|
}
|
|
63
74
|
try {
|
|
64
|
-
const tasks =
|
|
65
|
-
console.log(`
|
|
75
|
+
const tasks = parseCanonicalTaskFile(parsed, file);
|
|
76
|
+
console.log(` ${file}: ${tasks.length} task${tasks.length === 1 ? "" : "s"} valid`);
|
|
66
77
|
totalTasks += tasks.length;
|
|
67
78
|
allTasks.push(...tasks);
|
|
68
79
|
}
|
|
69
80
|
catch (err) {
|
|
70
81
|
const msg = err instanceof Error ? err.message : String(err);
|
|
71
|
-
console.error(`
|
|
82
|
+
console.error(` ${file}: Schema validation failed`);
|
|
72
83
|
console.error(`${msg
|
|
73
84
|
.split("\n")
|
|
74
85
|
.map((l) => ` ${l}`)
|
|
@@ -79,7 +90,7 @@ export function createValidateTasksCommand() {
|
|
|
79
90
|
// Run semantic validation on all parsed tasks
|
|
80
91
|
if (allTasks.length > 0) {
|
|
81
92
|
console.log(); // blank line
|
|
82
|
-
const semanticResult =
|
|
93
|
+
const semanticResult = validateCanonicalTasks(allTasks);
|
|
83
94
|
const formatted = formatValidationResult(semanticResult);
|
|
84
95
|
console.log(formatted);
|
|
85
96
|
if (!semanticResult.valid) {
|
|
@@ -87,10 +98,10 @@ export function createValidateTasksCommand() {
|
|
|
87
98
|
}
|
|
88
99
|
if (opts.strict && semanticResult.warnings.length > 0) {
|
|
89
100
|
hasErrors = true;
|
|
90
|
-
console.log("\n
|
|
101
|
+
console.log("\n --strict mode: warnings treated as errors");
|
|
91
102
|
}
|
|
92
103
|
}
|
|
93
|
-
console.log(`\n${hasErrors ? "
|
|
104
|
+
console.log(`\n${hasErrors ? "FAIL" : "OK"} ${totalTasks} task${totalTasks === 1 ? "" : "s"} across ${yamlFiles.length} file${yamlFiles.length === 1 ? "" : "s"}\n`);
|
|
94
105
|
process.exit(hasErrors ? 1 : 0);
|
|
95
106
|
});
|
|
96
107
|
}
|
|
@@ -13,14 +13,25 @@
|
|
|
13
13
|
* - After: one factory, one place to change adapter wiring
|
|
14
14
|
*
|
|
15
15
|
* @see packages/core/src/ports/context.ts — AppContext interface
|
|
16
|
-
* @see docs/exec-plans/ports-and-adapters/phase-7-composition-root.md
|
|
16
|
+
* @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
|
|
17
17
|
*/
|
|
18
|
-
import { type AppContext, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
|
|
18
|
+
import { type AppContext, type AssertionRegistration, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
|
|
19
19
|
/**
|
|
20
20
|
* Create a fully wired AppContext from resolved configuration.
|
|
21
21
|
*
|
|
22
22
|
* Every adapter is constructed here and nowhere else (outside of tests).
|
|
23
|
-
* Swapping an adapter
|
|
24
|
-
* is a one-line change in this function.
|
|
23
|
+
* Swapping an adapter is a one-line change in this function.
|
|
25
24
|
*/
|
|
26
25
|
export declare function createAppContext(config: ResolvedConfig): AppContext;
|
|
26
|
+
/**
|
|
27
|
+
* Generic Promptfoo assertion types available to all evaluation modes.
|
|
28
|
+
*
|
|
29
|
+
* These are evaluation primitives (text matching, JSON validation, LLM grading)
|
|
30
|
+
* that aren't specific to any mode or domain. They're registered before any
|
|
31
|
+
* preset so every mode has access to them.
|
|
32
|
+
*
|
|
33
|
+
* `compatibleModes` is omitted — when undefined, the assertion is compatible
|
|
34
|
+
* with all modes. Mode-specific assertions can be registered by presets with
|
|
35
|
+
* explicit mode whitelists.
|
|
36
|
+
*/
|
|
37
|
+
export declare const FRAMEWORK_ASSERTIONS: AssertionRegistration[];
|
package/dist/composition-root.js
CHANGED
|
@@ -13,14 +13,18 @@
|
|
|
13
13
|
* - After: one factory, one place to change adapter wiring
|
|
14
14
|
*
|
|
15
15
|
* @see packages/core/src/ports/context.ts — AppContext interface
|
|
16
|
-
* @see docs/exec-plans/ports-and-adapters/phase-7-composition-root.md
|
|
16
|
+
* @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
|
|
17
17
|
*/
|
|
18
|
-
import {
|
|
18
|
+
import { join } from "node:path";
|
|
19
|
+
import { InMemoryPluginRegistry, NoOpArtifactCollector, } from "./_vendor/ailf-core/index.js";
|
|
20
|
+
import { FilesystemArtifactCollector } from "./artifact-capture/filesystem-collector.js";
|
|
19
21
|
import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js";
|
|
22
|
+
import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
|
|
20
23
|
import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
|
|
21
24
|
import { PromptfooEvalAdapter } from "./adapters/eval-runners/promptfoo-eval-adapter.js";
|
|
22
25
|
import { ConsoleLogger, JsonLogger, QuietLogger, } from "./adapters/loggers/index.js";
|
|
23
|
-
import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource,
|
|
26
|
+
import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource, } from "./adapters/task-sources/index.js";
|
|
27
|
+
import { createAgentHarnessBase, createKnowledgeProbeBase, createLiteracyModeBase, createMcpServerModeBase, } from "./pipeline/compiler/mode-bases/index.js";
|
|
24
28
|
import { createSanityLiteracyPreset } from "./pipeline/compiler/presets/index.js";
|
|
25
29
|
import { getSanityClient } from "./sanity/client.js";
|
|
26
30
|
import { ReportStore } from "./report-store.js";
|
|
@@ -29,8 +33,7 @@ import { loadSinks } from "./sinks/index.js";
|
|
|
29
33
|
* Create a fully wired AppContext from resolved configuration.
|
|
30
34
|
*
|
|
31
35
|
* Every adapter is constructed here and nowhere else (outside of tests).
|
|
32
|
-
* Swapping an adapter
|
|
33
|
-
* is a one-line change in this function.
|
|
36
|
+
* Swapping an adapter is a one-line change in this function.
|
|
34
37
|
*/
|
|
35
38
|
export function createAppContext(config) {
|
|
36
39
|
// Logger — selected by env var preferences
|
|
@@ -39,10 +42,12 @@ export function createAppContext(config) {
|
|
|
39
42
|
const cache = config.noCache ? undefined : createCache(config);
|
|
40
43
|
// Task source — selected by config.taskSourceType
|
|
41
44
|
const taskSource = createTaskSource(config);
|
|
42
|
-
// Plugin registry — mode
|
|
43
|
-
//
|
|
44
|
-
|
|
45
|
-
|
|
45
|
+
// Plugin registry — mode bases, assertions, presets, doc fetcher.
|
|
46
|
+
// External presets from config are loaded and registered after built-ins.
|
|
47
|
+
const externalPresets = config.presets && config.presets.length > 0
|
|
48
|
+
? loadExternalPresets(config.presets, config.rootDir)
|
|
49
|
+
: undefined;
|
|
50
|
+
const registry = createRegistry(config.rootDir, externalPresets);
|
|
46
51
|
// Doc fetcher — provided by the registered preset's factory
|
|
47
52
|
const docFetcherFactory = registry.getDocFetcherFactory();
|
|
48
53
|
const docFetcher = docFetcherFactory ? docFetcherFactory() : undefined;
|
|
@@ -52,8 +57,23 @@ export function createAppContext(config) {
|
|
|
52
57
|
const reportStore = createReportStore(config);
|
|
53
58
|
// Sinks — loaded from config/sinks
|
|
54
59
|
const sinks = loadSinks();
|
|
60
|
+
// Artifact collector — no-op by default, filesystem when --capture is set
|
|
61
|
+
const collector = config.captureEnabled
|
|
62
|
+
? new FilesystemArtifactCollector({
|
|
63
|
+
captureDir: config.captureDir ?? join(config.rootDir, "results", "captures"),
|
|
64
|
+
mode: config.mode,
|
|
65
|
+
compress: config.captureCompress ?? true,
|
|
66
|
+
extras: config.captureExtras ?? true,
|
|
67
|
+
pipeline: {
|
|
68
|
+
variant: config.variant,
|
|
69
|
+
source: config.source,
|
|
70
|
+
areas: config.areas,
|
|
71
|
+
},
|
|
72
|
+
})
|
|
73
|
+
: new NoOpArtifactCollector();
|
|
55
74
|
return {
|
|
56
75
|
cache,
|
|
76
|
+
collector,
|
|
57
77
|
config,
|
|
58
78
|
docFetcher,
|
|
59
79
|
evalRunner,
|
|
@@ -101,15 +121,12 @@ function createTaskSource(config) {
|
|
|
101
121
|
}
|
|
102
122
|
return new RepoTaskSource(config.repoTasksPath);
|
|
103
123
|
}
|
|
104
|
-
// Primary source —
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
process.env.SANITY_API_TOKEN ??
|
|
111
|
-
undefined,
|
|
112
|
-
}));
|
|
124
|
+
// Primary source — Content Lake (the only non-repo source remaining)
|
|
125
|
+
const primary = new ContentLakeTaskSource(getSanityClient({
|
|
126
|
+
token: process.env.AILF_REPORT_SANITY_API_TOKEN ??
|
|
127
|
+
process.env.SANITY_API_TOKEN ??
|
|
128
|
+
undefined,
|
|
129
|
+
}));
|
|
113
130
|
// If repo tasks path is set, combine primary + repo sources.
|
|
114
131
|
// This is the "augment" mode — repo tasks extend the primary source.
|
|
115
132
|
if (config.repoTasksPath) {
|
|
@@ -121,58 +138,86 @@ function createTaskSource(config) {
|
|
|
121
138
|
return primary;
|
|
122
139
|
}
|
|
123
140
|
// ---------------------------------------------------------------------------
|
|
124
|
-
//
|
|
141
|
+
// Layer 0: Framework built-in assertions
|
|
125
142
|
// ---------------------------------------------------------------------------
|
|
126
|
-
|
|
143
|
+
/**
|
|
144
|
+
* Generic Promptfoo assertion types available to all evaluation modes.
|
|
145
|
+
*
|
|
146
|
+
* These are evaluation primitives (text matching, JSON validation, LLM grading)
|
|
147
|
+
* that aren't specific to any mode or domain. They're registered before any
|
|
148
|
+
* preset so every mode has access to them.
|
|
149
|
+
*
|
|
150
|
+
* `compatibleModes` is omitted — when undefined, the assertion is compatible
|
|
151
|
+
* with all modes. Mode-specific assertions can be registered by presets with
|
|
152
|
+
* explicit mode whitelists.
|
|
153
|
+
*/
|
|
154
|
+
export const FRAMEWORK_ASSERTIONS = [
|
|
155
|
+
{
|
|
156
|
+
type: "contains",
|
|
157
|
+
label: "Contains text",
|
|
158
|
+
handlerModule: "promptfoo:builtin",
|
|
159
|
+
},
|
|
160
|
+
{
|
|
161
|
+
type: "contains-all",
|
|
162
|
+
label: "Contains all texts",
|
|
163
|
+
handlerModule: "promptfoo:builtin",
|
|
164
|
+
},
|
|
165
|
+
{
|
|
166
|
+
type: "contains-any",
|
|
167
|
+
label: "Contains any text",
|
|
168
|
+
handlerModule: "promptfoo:builtin",
|
|
169
|
+
},
|
|
170
|
+
{ type: "equals", label: "Exact match", handlerModule: "promptfoo:builtin" },
|
|
171
|
+
{ type: "regex", label: "Regex match", handlerModule: "promptfoo:builtin" },
|
|
172
|
+
{ type: "is-json", label: "Valid JSON", handlerModule: "promptfoo:builtin" },
|
|
127
173
|
{
|
|
128
|
-
|
|
129
|
-
label: "
|
|
130
|
-
|
|
131
|
-
rubricTemplateIds: [],
|
|
132
|
-
handlerModule: "./mode-handlers/knowledge-probe-handler.js",
|
|
174
|
+
type: "javascript",
|
|
175
|
+
label: "JavaScript assertion",
|
|
176
|
+
handlerModule: "promptfoo:builtin",
|
|
133
177
|
},
|
|
134
178
|
{
|
|
135
|
-
|
|
136
|
-
label: "
|
|
137
|
-
|
|
138
|
-
rubricTemplateIds: [
|
|
139
|
-
"mcp-input-validation",
|
|
140
|
-
"mcp-output-correctness",
|
|
141
|
-
"mcp-error-handling",
|
|
142
|
-
],
|
|
143
|
-
handlerModule: "./mode-handlers/mcp-server-handler.js",
|
|
179
|
+
type: "llm-rubric",
|
|
180
|
+
label: "LLM-graded rubric",
|
|
181
|
+
handlerModule: "promptfoo:builtin",
|
|
144
182
|
},
|
|
145
183
|
{
|
|
146
|
-
|
|
147
|
-
label: "
|
|
148
|
-
|
|
149
|
-
rubricTemplateIds: [],
|
|
150
|
-
handlerModule: "./mode-handlers/agent-harness-handler.js",
|
|
184
|
+
type: "similar",
|
|
185
|
+
label: "Semantic similarity",
|
|
186
|
+
handlerModule: "promptfoo:builtin",
|
|
151
187
|
},
|
|
152
188
|
];
|
|
153
189
|
/**
|
|
154
190
|
* Build and populate the plugin registry.
|
|
155
191
|
*
|
|
156
|
-
*
|
|
157
|
-
*
|
|
158
|
-
*
|
|
159
|
-
*
|
|
160
|
-
*
|
|
161
|
-
* the appropriate register method (registerMode, registerRubricTemplate, …).
|
|
162
|
-
* 3. After registration the rest of createAppContext() can pull capabilities
|
|
163
|
-
* from the registry (e.g. getDocFetcherFactory()) without knowing which
|
|
164
|
-
* preset provided them.
|
|
192
|
+
* Registration follows the five-layer model; the layers wired up here are:
|
|
193
|
+
*
|
|
194
|
+
* Layer 0: Framework built-in assertions (generic Promptfoo builtins)
|
|
195
|
+
* Layer 0.5: Mode bases (shared evaluation methodology per mode)
|
|
196
|
+
* Layer 1: Domain presets (domain-specific config targeting a mode base)
|
|
165
197
|
*
|
|
166
|
-
*
|
|
167
|
-
*
|
|
198
|
+
* Mode bases define HOW you evaluate (rubrics, scoring, prompts).
|
|
199
|
+
* Domain presets define WHAT you evaluate (sources, features, doc fetcher)
|
|
200
|
+
* and target a mode base by ID. When a preset is registered, it inherits
|
|
201
|
+
* its mode base's defaults and can optionally override them.
|
|
168
202
|
*/
|
|
169
|
-
function createRegistry(rootDir) {
|
|
203
|
+
function createRegistry(rootDir, externalPresets) {
|
|
170
204
|
const registry = new InMemoryPluginRegistry();
|
|
171
|
-
//
|
|
205
|
+
// Layer 0: Framework built-in assertions (available to all modes)
|
|
206
|
+
for (const assertion of FRAMEWORK_ASSERTIONS) {
|
|
207
|
+
registry.registerAssertion(assertion);
|
|
208
|
+
}
|
|
209
|
+
// Layer 0.5: Mode bases (evaluation methodology)
|
|
210
|
+
registry.registerModeBase(createLiteracyModeBase());
|
|
211
|
+
registry.registerModeBase(createMcpServerModeBase());
|
|
212
|
+
registry.registerModeBase(createKnowledgeProbeBase());
|
|
213
|
+
registry.registerModeBase(createAgentHarnessBase());
|
|
214
|
+
// Layer 1: Built-in domain presets
|
|
172
215
|
registry.registerPreset(createSanityLiteracyPreset({ rootDir }));
|
|
173
|
-
//
|
|
174
|
-
|
|
175
|
-
|
|
216
|
+
// Layer 1+: External domain presets (from config.presets)
|
|
217
|
+
if (externalPresets) {
|
|
218
|
+
for (const preset of externalPresets) {
|
|
219
|
+
registry.registerPreset(preset);
|
|
220
|
+
}
|
|
176
221
|
}
|
|
177
222
|
return registry;
|
|
178
223
|
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* features.ts — Product feature registry for documentation coverage auditing.
|
|
3
|
+
*
|
|
4
|
+
* Default features are provided by the sanity-literacy preset registered
|
|
5
|
+
* in the composition root. This file exists as an override point — any
|
|
6
|
+
* features defined here take precedence over preset-provided features
|
|
7
|
+
* during coverage auditing.
|
|
8
|
+
*
|
|
9
|
+
* To track custom features, define them here:
|
|
10
|
+
*
|
|
11
|
+
* export default defineFeatures({
|
|
12
|
+
* features: [
|
|
13
|
+
* { id: "my-feature", name: "My Feature", sections: ["api"], ... },
|
|
14
|
+
* ],
|
|
15
|
+
* })
|
|
16
|
+
*
|
|
17
|
+
* @see packages/eval/src/pipeline/compiler/presets/sanity-literacy.ts
|
|
18
|
+
* @see docs/archive/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
import { defineFeatures } from "../_vendor/ailf-core/index.js"
|
|
22
|
+
|
|
23
|
+
// Intentionally empty: no custom features are defined here. Add entries to
// `features` to override the preset-provided ones during coverage auditing.
export default defineFeatures({ features: [] })
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* models.ts — Central model registry for AILF evaluations.
|
|
3
|
+
*
|
|
4
|
+
* Define all models to test here. Each model declares which evaluation
|
|
5
|
+
* modes it participates in (e.g., "literacy", "mcp-server") and
|
|
6
|
+
* optionally which variants within those modes.
|
|
7
|
+
*
|
|
8
|
+
* When a model enrolls in a mode without specifying variants, all
|
|
9
|
+
* variants defined by the mode base are included by default.
|
|
10
|
+
*
|
|
11
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-1-ts-config-loading.md
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { defineModels } from "../_vendor/ailf-core/index.js"
|
|
15
|
+
|
|
16
|
+
export default defineModels({
|
|
17
|
+
models: [
|
|
18
|
+
// ── Anthropic ──────────────────────────────────────────────
|
|
19
|
+
{
|
|
20
|
+
id: "anthropic:messages:claude-opus-4-6",
|
|
21
|
+
label: "Claude Opus 4.6",
|
|
22
|
+
config: { temperature: 0.2, max_tokens: 4096 },
|
|
23
|
+
modes: ["literacy", "mcp-server", "knowledge-probe"],
|
|
24
|
+
// All literacy variants included by default (baseline, observed,
|
|
25
|
+
// agentic-naive, agentic-optimized)
|
|
26
|
+
},
|
|
27
|
+
|
|
28
|
+
// ── Google ─────────────────────────────────────────────────
|
|
29
|
+
// {
|
|
30
|
+
// id: "google:gemini-2.5-pro",
|
|
31
|
+
// label: "Gemini 2.5 Pro",
|
|
32
|
+
// config: { temperature: 0.2, max_tokens: 4096 },
|
|
33
|
+
// modes: ["literacy"],
|
|
34
|
+
// },
|
|
35
|
+
|
|
36
|
+
// ── OpenAI ─────────────────────────────────────────────────
|
|
37
|
+
{
|
|
38
|
+
id: "openai:chat:gpt-5.2",
|
|
39
|
+
label: "GPT 5.2",
|
|
40
|
+
config: { temperature: 0.2, max_tokens: 4096 },
|
|
41
|
+
modes: ["literacy", "knowledge-probe"],
|
|
42
|
+
// All literacy variants included by default
|
|
43
|
+
},
|
|
44
|
+
{
|
|
45
|
+
id: "openai:responses:gpt-5.4",
|
|
46
|
+
label: "GPT 5.4",
|
|
47
|
+
config: {
|
|
48
|
+
reasoning_effort: "medium",
|
|
49
|
+
max_output_tokens: 4096,
|
|
50
|
+
maxRetries: 1,
|
|
51
|
+
},
|
|
52
|
+
timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
|
|
53
|
+
modes: ["literacy", "mcp-server", "knowledge-probe"],
|
|
54
|
+
// All literacy variants included by default
|
|
55
|
+
},
|
|
56
|
+
|
|
57
|
+
// ── Disabled models (uncomment to enable) ──────────────────
|
|
58
|
+
// { id: "anthropic:claude-sonnet-4-20250514", label: "Claude Sonnet 4",
|
|
59
|
+
// config: { temperature: 0.2, max_tokens: 4096 },
|
|
60
|
+
// modes: ["literacy"],
|
|
61
|
+
// variants: { literacy: ["baseline"] } },
|
|
62
|
+
// { id: "anthropic:claude-3.5-sonnet-20241022", label: "Claude 3.5 Sonnet",
|
|
63
|
+
// config: { temperature: 0.2, max_tokens: 4096 },
|
|
64
|
+
// modes: ["literacy"],
|
|
65
|
+
// variants: { literacy: ["baseline", "agentic-naive", "agentic-optimized"] } },
|
|
66
|
+
// { id: "google:gemini-2.0-flash", label: "Gemini 2.0 Flash",
|
|
67
|
+
// config: { temperature: 0.2, max_tokens: 4096 },
|
|
68
|
+
// modes: ["literacy"],
|
|
69
|
+
// variants: { literacy: ["baseline"] } },
|
|
70
|
+
// { id: "openrouter:deepseek/deepseek-r1", label: "DeepSeek R1",
|
|
71
|
+
// config: { temperature: 0.2, max_tokens: 4096 },
|
|
72
|
+
// modes: ["literacy"],
|
|
73
|
+
// variants: { literacy: ["baseline"] } },
|
|
74
|
+
],
|
|
75
|
+
|
|
76
|
+
// ── Grading Model ──────────────────────────────────────────
|
|
77
|
+
// Which model scores the responses. Separate from the models being tested.
|
|
78
|
+
grader: {
|
|
79
|
+
id: "anthropic:messages:claude-opus-4-5-20251101",
|
|
80
|
+
label: "Claude Opus 4.5 (grader)",
|
|
81
|
+
},
|
|
82
|
+
|
|
83
|
+
// ── Evaluation Options ─────────────────────────────────────
|
|
84
|
+
evalBudgetMs: 1_200_000, // 20 min per eval mode — outer kill switch
|
|
85
|
+
maxConcurrency: 32, // max parallel API calls — benchmarked in DOC-1896
|
|
86
|
+
|
|
87
|
+
// ── Default Config ─────────────────────────────────────────
|
|
88
|
+
// Applied to all models unless overridden per-model.
|
|
89
|
+
defaults: {
|
|
90
|
+
temperature: 0.2,
|
|
91
|
+
max_tokens: 4096,
|
|
92
|
+
maxToolRounds: 5, // for agentic modes
|
|
93
|
+
observerOptions: {
|
|
94
|
+
maxPreviewBytes: 2048,
|
|
95
|
+
captureResponsePreview: true,
|
|
96
|
+
includePatterns: ["sanity.io", "sanity.dev", "cdn.sanity.io"],
|
|
97
|
+
sensitiveHeaders: ["authorization", "cookie", "x-api-key"],
|
|
98
|
+
},
|
|
99
|
+
},
|
|
100
|
+
})
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* prompts.ts — User-override prompt templates.
|
|
3
|
+
*
|
|
4
|
+
* Canonical literacy prompt templates now live in the literacy mode handler:
|
|
5
|
+
* src/pipeline/compiler/mode-handlers/literacy-handler.ts
|
|
6
|
+
*
|
|
7
|
+
* Each mode handler owns its own prompts via getPrompts(). This file exists
|
|
8
|
+
* for user-level overrides only. Add entries here to replace handler-owned
|
|
9
|
+
* defaults for specific prompt IDs.
|
|
10
|
+
*
|
|
11
|
+
* @see packages/eval/src/pipeline/compiler/mode-handlers/literacy-handler.ts
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { definePrompts } from "../_vendor/ailf-core/index.js"
|
|
15
|
+
|
|
16
|
+
// Intentionally empty: no user-level prompt overrides are defined. Add entries
// here to replace handler-owned defaults for specific prompt IDs.
export default definePrompts([])
|