@sanity/ailf 0.5.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/config/features.ts +23 -0
- package/config/models.ts +95 -0
- package/config/prompts.ts +16 -0
- package/config/rubrics.ts +225 -0
- package/config/schedules.ts +47 -0
- package/config/sinks.ts +37 -0
- package/config/sources.ts +21 -0
- package/config/thresholds.ts +61 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +171 -0
- package/dist/_vendor/ailf-core/config-helpers.js +170 -0
- package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
- package/dist/_vendor/ailf-core/env-helper.js +45 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/examples/index.js +25 -0
- package/dist/_vendor/ailf-core/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/index.js +5 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +17 -2
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
- package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +8 -2
- package/dist/_vendor/ailf-core/schemas/eval-config.js +17 -2
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +9 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +8 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -31
- package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -9
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
- package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/services/index.js +2 -1
- package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
- package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
- package/dist/_vendor/ailf-core/services/scoring.js +25 -15
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +332 -0
- package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +45 -83
- package/dist/_vendor/ailf-core/types/index.js +8 -1
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +257 -0
- package/dist/_vendor/ailf-core/types/plugin-registry.js +185 -0
- package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
- package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
- package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
- package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
- package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
- package/dist/_vendor/ailf-core/types/trace.js +18 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
- package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
- package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
- package/dist/_vendor/ailf-shared/index.d.ts +0 -1
- package/dist/_vendor/ailf-shared/index.js +0 -1
- package/dist/adapters/api-client/build-request.js +14 -13
- package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
- package/dist/adapters/config-sources/file-config-adapter.js +39 -12
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
- package/dist/adapters/config-sources/ts-config-loader.js +141 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
- package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +35 -39
- package/dist/adapters/task-sources/index.d.ts +3 -2
- package/dist/adapters/task-sources/index.js +3 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
- package/dist/adapters/task-sources/repo-schemas.js +227 -19
- package/dist/adapters/task-sources/repo-task-source.d.ts +16 -12
- package/dist/adapters/task-sources/repo-task-source.js +92 -80
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
- package/dist/adapters/task-sources/task-file-loader.js +83 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
- package/dist/adapters/task-sources/yaml-task-source.js +19 -16
- package/dist/cli.js +0 -2
- package/dist/commands/baseline.js +4 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/coverage-audit.js +9 -1
- package/dist/commands/explain-handler.js +25 -23
- package/dist/commands/fetch-docs.js +3 -2
- package/dist/commands/generate-configs.js +1 -1
- package/dist/commands/init.d.ts +6 -4
- package/dist/commands/init.js +302 -23
- package/dist/commands/interactive.js +11 -7
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +16 -6
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +4 -2
- package/dist/commands/pr-comment.js +1 -1
- package/dist/commands/publish.js +2 -2
- package/dist/commands/readiness-report.js +13 -6
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +13 -1
- package/dist/composition-root.js +99 -4
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +28 -8
- package/dist/orchestration/steps/calculate-scores-step.js +24 -11
- package/dist/orchestration/steps/fetch-docs-step.js +8 -7
- package/dist/orchestration/steps/gap-analysis-step.js +8 -7
- package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
- package/dist/orchestration/steps/generate-configs-step.js +261 -51
- package/dist/orchestration/steps/grader-consistency-step.js +7 -4
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/readiness-step.js +5 -6
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
- package/dist/orchestration/steps/run-eval-step.js +8 -7
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/cache.js +36 -8
- package/dist/pipeline/calculate-scores.d.ts +2 -4
- package/dist/pipeline/calculate-scores.js +43 -113
- package/dist/pipeline/checks.js +2 -2
- package/dist/pipeline/compare.js +8 -8
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +392 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +404 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
- package/dist/pipeline/compiler/assertion-mapper.js +175 -0
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
- package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
- package/dist/pipeline/compiler/config-loader.d.ts +56 -0
- package/dist/pipeline/compiler/config-loader.js +111 -0
- package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
- package/dist/pipeline/compiler/fixture-resolver.js +113 -0
- package/dist/pipeline/compiler/hash.d.ts +11 -0
- package/dist/pipeline/compiler/hash.js +18 -0
- package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
- package/dist/pipeline/compiler/ignore-fields.js +113 -0
- package/dist/pipeline/compiler/index.d.ts +29 -0
- package/dist/pipeline/compiler/index.js +45 -0
- package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
- package/dist/pipeline/compiler/literacy-bridge.js +172 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +15 -0
- package/dist/pipeline/compiler/mode-handlers/index.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/index.d.ts +9 -0
- package/dist/pipeline/compiler/presets/index.js +8 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +42 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.js +208 -0
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
- package/dist/pipeline/compiler/provider-assembler.js +137 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
- package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
- package/dist/pipeline/compiler/sandbox/index.js +11 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
- package/dist/pipeline/compiler/scoring-bridge.js +114 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
- package/dist/pipeline/compiler/task-graph-builder.js +291 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
- package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
- package/dist/pipeline/compiler/telemetry/index.js +19 -0
- package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
- package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
- package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
- package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
- package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
- package/dist/pipeline/compiler/variable-resolver.js +115 -0
- package/dist/pipeline/coverage-audit.d.ts +15 -5
- package/dist/pipeline/coverage-audit.js +41 -22
- package/dist/pipeline/eval-constants.d.ts +16 -6
- package/dist/pipeline/eval-constants.js +25 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -9
- package/dist/pipeline/expand-tasks.d.ts +19 -10
- package/dist/pipeline/expand-tasks.js +34 -28
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +2 -2
- package/dist/pipeline/generate-configs.d.ts +22 -4
- package/dist/pipeline/generate-configs.js +53 -24
- package/dist/pipeline/grader-api.d.ts +3 -3
- package/dist/pipeline/grader-api.js +5 -12
- package/dist/pipeline/grader-compare-runner.js +20 -27
- package/dist/pipeline/grader-comparison.d.ts +4 -8
- package/dist/pipeline/grader-comparison.js +11 -17
- package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
- package/dist/pipeline/grader-consistency-runner.js +16 -20
- package/dist/pipeline/grader-consistency.d.ts +6 -10
- package/dist/pipeline/grader-consistency.js +13 -32
- package/dist/pipeline/grader-sensitivity-runner.js +7 -5
- package/dist/pipeline/grader-sensitivity.d.ts +2 -6
- package/dist/pipeline/grader-sensitivity.js +10 -10
- package/dist/pipeline/grader-validate-runner.js +7 -5
- package/dist/pipeline/grader-validation.d.ts +2 -6
- package/dist/pipeline/grader-validation.js +14 -22
- package/dist/pipeline/map-request-to-config.js +7 -1
- package/dist/pipeline/mirror-repo-tasks.d.ts +13 -13
- package/dist/pipeline/mirror-repo-tasks.js +22 -21
- package/dist/pipeline/normalize-mode.d.ts +49 -0
- package/dist/pipeline/normalize-mode.js +64 -0
- package/dist/pipeline/plan.d.ts +5 -2
- package/dist/pipeline/plan.js +134 -78
- package/dist/pipeline/pr-comment.js +2 -0
- package/dist/pipeline/profile-resolution.d.ts +22 -14
- package/dist/pipeline/profile-resolution.js +41 -19
- package/dist/pipeline/provenance.d.ts +2 -2
- package/dist/pipeline/provenance.js +12 -17
- package/dist/pipeline/release-report.js +4 -4
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/rubric-loader.d.ts +20 -0
- package/dist/pipeline/rubric-loader.js +37 -0
- package/dist/pipeline/validate.d.ts +4 -4
- package/dist/pipeline/validate.js +64 -53
- package/dist/schedules/loader.js +18 -8
- package/dist/scripts/migrate-task-mode.d.ts +24 -0
- package/dist/scripts/migrate-task-mode.js +85 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +15 -15
- package/dist/sinks/loader.js +5 -7
- package/dist/sources.d.ts +7 -7
- package/dist/sources.js +22 -24
- package/dist/webhook/dispatch.js +2 -1
- package/package.json +15 -4
- package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
- package/tasks/literacy/frameworks.task.ts +128 -0
- package/tasks/literacy/functions.task.ts +69 -0
- package/tasks/literacy/groq.task.ts +258 -0
- package/tasks/literacy/nextjs-live.task.ts +75 -0
- package/tasks/literacy/studio-setup.task.ts +131 -0
- package/tasks/literacy/visual-editing.task.ts +146 -0
- package/config/features.yaml +0 -116
- package/config/models.yaml +0 -116
- package/config/prompts.yaml +0 -75
- package/config/rubrics.yaml +0 -81
- package/config/schedules.yaml +0 -43
- package/config/sinks.yaml +0 -54
- package/config/sources.yaml +0 -51
- package/config/thresholds.yaml +0 -49
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Pipeline step: Generate Promptfoo configuration files.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
4
|
+
* ALL modes route through the Plugin Registry. The step looks up the mode
|
|
5
|
+
* handler via ctx.registry.getMode() and delegates compilation to it.
|
|
6
|
+
*
|
|
7
|
+
* Literacy mode has a variant strategy: baseline/agentic/observed/full.
|
|
8
|
+
* When the variant is "full", the handler is called twice (baseline + agentic)
|
|
9
|
+
* and three YAML files are written. Other modes produce one YAML file.
|
|
7
10
|
*/
|
|
11
|
+
import { LiteracyVariant } from "../../pipeline/normalize-mode.js";
|
|
8
12
|
import { getStepInputPaths } from "../../pipeline/cache.js";
|
|
9
13
|
import { checkGeneratedConfigsExist } from "../../pipeline/checks.js";
|
|
10
|
-
import { generateConfigs } from "../../pipeline/generate-configs.js";
|
|
11
14
|
import { validateModelsYaml } from "../../pipeline/validate.js";
|
|
12
15
|
import { loadSource } from "../../sources.js";
|
|
13
16
|
import { configToSourceOverrides } from "../config-to-source-overrides.js";
|
|
@@ -19,68 +22,246 @@ export class GenerateConfigsStep {
|
|
|
19
22
|
}
|
|
20
23
|
async execute(ctx, state) {
|
|
21
24
|
const start = Date.now();
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
const
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
25
|
+
const mode = ctx.config.mode;
|
|
26
|
+
// Look up mode handler in the registry
|
|
27
|
+
const registration = ctx.registry.getMode(mode);
|
|
28
|
+
if (!registration) {
|
|
29
|
+
return {
|
|
30
|
+
durationMs: Date.now() - start,
|
|
31
|
+
error: `No handler registered for mode "${mode}". ` +
|
|
32
|
+
`Available modes: ${ctx.registry
|
|
33
|
+
.getModes()
|
|
34
|
+
.map((m) => m.id)
|
|
35
|
+
.join(", ")}`,
|
|
36
|
+
status: "failed",
|
|
37
|
+
};
|
|
38
|
+
}
|
|
30
39
|
try {
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
40
|
+
// Dynamically import the handler module
|
|
41
|
+
const handlerModule = await import(`../../pipeline/compiler/${registration.handlerModule}`);
|
|
42
|
+
const handler = handlerModule.handler;
|
|
43
|
+
if (!handler?.compileTask) {
|
|
44
|
+
return {
|
|
45
|
+
durationMs: Date.now() - start,
|
|
46
|
+
error: `Handler module for "${mode}" does not export a valid ModeHandler`,
|
|
47
|
+
status: "failed",
|
|
48
|
+
};
|
|
49
|
+
}
|
|
50
|
+
// Load tasks
|
|
51
|
+
const tasks = await this.loadTasks(ctx, mode, state);
|
|
52
|
+
if (tasks.length === 0) {
|
|
53
|
+
return {
|
|
54
|
+
durationMs: Date.now() - start,
|
|
55
|
+
error: `No ${mode} tasks found. Create *.task.ts files in ` +
|
|
56
|
+
`packages/eval/tasks/${mode}/`,
|
|
57
|
+
status: "failed",
|
|
58
|
+
};
|
|
59
|
+
}
|
|
60
|
+
// Load models
|
|
61
|
+
const { loadModelsAndProviders } = await import("../../pipeline/compiler/provider-assembler.js");
|
|
62
|
+
const overrides = configToSourceOverrides(ctx.config);
|
|
63
|
+
const resolvedSource = ctx.config.source
|
|
64
|
+
? loadSource(ctx.config.source, overrides)
|
|
37
65
|
: undefined;
|
|
38
|
-
|
|
66
|
+
const { models, providers } = loadModelsAndProviders(ctx.config.rootDir, resolvedSource, ctx.config.searchMode, ctx.config.allowedOrigins);
|
|
67
|
+
// Literacy mode: variant expansion (baseline + agentic → 3 YAML files)
|
|
68
|
+
if (mode === "literacy") {
|
|
69
|
+
return this.compileLiteracyVariants(ctx, handler, tasks, models, providers, start);
|
|
70
|
+
}
|
|
71
|
+
// All other modes: single compilation → single YAML file
|
|
72
|
+
return this.compileSingleMode(ctx, handler, tasks, mode, models, start);
|
|
39
73
|
}
|
|
40
74
|
catch (err) {
|
|
75
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
41
76
|
return {
|
|
42
77
|
durationMs: Date.now() - start,
|
|
43
|
-
error:
|
|
78
|
+
error: `${mode} compilation failed: ${msg}`,
|
|
44
79
|
status: "failed",
|
|
45
80
|
};
|
|
46
81
|
}
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
82
|
+
}
|
|
83
|
+
// ---------------------------------------------------------------------------
|
|
84
|
+
// Literacy variant compilation (baseline + agentic → 3 YAML files)
|
|
85
|
+
// ---------------------------------------------------------------------------
|
|
86
|
+
async compileLiteracyVariants(ctx, handler, tasks, models, providers, start) {
|
|
87
|
+
ctx.logger.info(`Compiling ${tasks.length} literacy task(s) via registry handler...`);
|
|
88
|
+
// Filter models per variant
|
|
89
|
+
const baselineModels = models.models
|
|
90
|
+
.filter((m) => !m.modes || m.modes.includes(LiteracyVariant.STANDARD))
|
|
91
|
+
.map((m) => ({
|
|
92
|
+
id: m.id,
|
|
93
|
+
label: m.label,
|
|
94
|
+
}));
|
|
95
|
+
const agenticModels = models.models
|
|
96
|
+
.filter((m) => !m.modes ||
|
|
97
|
+
m.modes.includes("agentic-naive") ||
|
|
98
|
+
m.modes.includes("agentic-optimized"))
|
|
99
|
+
.map((m) => ({
|
|
100
|
+
id: m.id,
|
|
101
|
+
label: m.label,
|
|
102
|
+
}));
|
|
103
|
+
// Load rubric config for template resolution
|
|
104
|
+
let rubricConfig;
|
|
105
|
+
try {
|
|
106
|
+
const { loadRubricTemplates } = await import("../../pipeline/rubric-loader.js");
|
|
107
|
+
rubricConfig = loadRubricTemplates(ctx.config.rootDir);
|
|
108
|
+
}
|
|
109
|
+
catch {
|
|
110
|
+
ctx.logger.warn(" ⚠ Could not load rubric config — templates will not resolve");
|
|
111
|
+
}
|
|
112
|
+
// Compile for each variant
|
|
113
|
+
const baselineResults = this.compileAll(handler, tasks, {
|
|
114
|
+
rootDir: ctx.config.rootDir,
|
|
115
|
+
graderProvider: models.grader.id,
|
|
116
|
+
models: baselineModels,
|
|
117
|
+
rubricConfig,
|
|
118
|
+
evalMode: LiteracyVariant.STANDARD,
|
|
119
|
+
});
|
|
120
|
+
const agenticResults = this.compileAll(handler, tasks, {
|
|
121
|
+
rootDir: ctx.config.rootDir,
|
|
122
|
+
graderProvider: models.grader.id,
|
|
123
|
+
models: agenticModels,
|
|
124
|
+
rubricConfig,
|
|
125
|
+
evalMode: LiteracyVariant.AGENTIC,
|
|
126
|
+
});
|
|
127
|
+
// Log warnings
|
|
128
|
+
for (const w of [...baselineResults.warnings, ...agenticResults.warnings]) {
|
|
129
|
+
ctx.logger.warn(` ⚠ ${w}`);
|
|
130
|
+
}
|
|
131
|
+
ctx.logger.info(` Compiled ${tasks.length} task(s) → ${baselineResults.tests.length} baseline + ${agenticResults.tests.length} agentic entries`);
|
|
132
|
+
// Write 3 YAML files via the literacy-specific writer
|
|
133
|
+
const { writeCompiledLiteracyConfigs } = await import("../../pipeline/compiler/compiler-to-yaml.js");
|
|
134
|
+
writeCompiledLiteracyConfigs(baselineResults, agenticResults, providers, {
|
|
135
|
+
rootDir: ctx.config.rootDir,
|
|
136
|
+
graderProvider: models.grader.id,
|
|
137
|
+
maxConcurrency: models.maxConcurrency,
|
|
138
|
+
logger: ctx.logger,
|
|
139
|
+
});
|
|
140
|
+
return this.checkLiteracyPostconditions(ctx, start);
|
|
141
|
+
}
|
|
142
|
+
// ---------------------------------------------------------------------------
|
|
143
|
+
// Single-mode compilation (all non-literacy modes)
|
|
144
|
+
// ---------------------------------------------------------------------------
|
|
145
|
+
async compileSingleMode(ctx, handler, tasks, mode, models, start) {
|
|
146
|
+
ctx.logger.info(`Compiling ${tasks.length} ${mode} task(s) via registry handler...`);
|
|
147
|
+
// Filter models to those that declare this mode in their modes array
|
|
148
|
+
const modeModels = models.models
|
|
149
|
+
.filter((m) => !m.modes || m.modes.includes(mode))
|
|
150
|
+
.map((m) => ({
|
|
151
|
+
id: m.id,
|
|
152
|
+
label: m.label,
|
|
153
|
+
config: m.config,
|
|
154
|
+
}));
|
|
155
|
+
const merged = this.compileAll(handler, tasks, {
|
|
156
|
+
rootDir: ctx.config.rootDir,
|
|
157
|
+
graderProvider: models.grader.id,
|
|
158
|
+
models: modeModels,
|
|
159
|
+
});
|
|
160
|
+
for (const w of merged.warnings) {
|
|
161
|
+
ctx.logger.warn(` ⚠ ${w}`);
|
|
162
|
+
}
|
|
163
|
+
ctx.logger.info(` Compiled ${tasks.length} task(s) → ${merged.tests.length} test entries`);
|
|
164
|
+
const { writeCompiledModeConfig } = await import("../../pipeline/compiler/compiler-to-yaml.js");
|
|
165
|
+
writeCompiledModeConfig(merged, mode, {
|
|
166
|
+
rootDir: ctx.config.rootDir,
|
|
167
|
+
graderProvider: models.grader.id,
|
|
168
|
+
maxConcurrency: models.maxConcurrency,
|
|
169
|
+
logger: ctx.logger,
|
|
170
|
+
});
|
|
171
|
+
return {
|
|
172
|
+
durationMs: Date.now() - start,
|
|
173
|
+
status: "success",
|
|
174
|
+
summary: `Generated promptfooconfig.${mode}.yaml`,
|
|
175
|
+
};
|
|
176
|
+
}
|
|
177
|
+
// ---------------------------------------------------------------------------
|
|
178
|
+
// Task loading — unified for all modes
|
|
179
|
+
// ---------------------------------------------------------------------------
|
|
180
|
+
async loadTasks(ctx, mode, state) {
|
|
181
|
+
const { resolve } = await import("path");
|
|
182
|
+
const { discoverTsTaskFiles, loadTsTaskFile } = await import("../../adapters/task-sources/task-file-loader.js");
|
|
183
|
+
// Discover task files from the mode-specific directory and --repo-tasks-path
|
|
184
|
+
const tasksDir = resolve(ctx.config.rootDir, "tasks", mode);
|
|
185
|
+
const dirs = [tasksDir];
|
|
186
|
+
// Also search --repo-tasks-path (e.g., .ailf/tasks/) for repo-based tasks
|
|
187
|
+
if (ctx.config.repoTasksPath) {
|
|
188
|
+
const repoDir = resolve(ctx.config.repoTasksPath);
|
|
189
|
+
if (!dirs.includes(repoDir)) {
|
|
190
|
+
dirs.push(repoDir);
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
const tasks = [];
|
|
194
|
+
for (const dir of dirs) {
|
|
195
|
+
const files = discoverTsTaskFiles(dir);
|
|
196
|
+
for (const file of files) {
|
|
197
|
+
const raw = await loadTsTaskFile(file);
|
|
198
|
+
for (const t of raw.tasks) {
|
|
199
|
+
const task = t;
|
|
200
|
+
// Filter to matching mode (skip tasks from other modes in same dir)
|
|
201
|
+
if (!("mode" in task) || task.mode === mode) {
|
|
202
|
+
tasks.push(task);
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
// Apply area/task/tag filters
|
|
208
|
+
const filtered = this.applyFilters(ctx, tasks);
|
|
209
|
+
// Release auto-scope
|
|
51
210
|
if (state.releaseAutoScope && !ctx.config.noAutoScope) {
|
|
52
211
|
const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
|
|
53
|
-
const beforeCount =
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
212
|
+
const beforeCount = filtered.length;
|
|
213
|
+
const scoped = filtered.filter((t) => "id" in t && scopedIds.has(t.id));
|
|
214
|
+
ctx.logger.info(` 🎯 Auto-scoped to ${scoped.length} of ${beforeCount} task(s) affected by release`);
|
|
215
|
+
return scoped;
|
|
57
216
|
}
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
: undefined,
|
|
68
|
-
logger: ctx.logger,
|
|
69
|
-
resolvedSource,
|
|
70
|
-
rootDir: ctx.config.rootDir,
|
|
71
|
-
searchMode: ctx.config.searchMode,
|
|
72
|
-
source: ctx.config.source,
|
|
73
|
-
tasks,
|
|
217
|
+
return filtered;
|
|
218
|
+
}
|
|
219
|
+
applyFilters(ctx, tasks) {
|
|
220
|
+
let result = tasks;
|
|
221
|
+
if (ctx.config.areas?.length) {
|
|
222
|
+
const allowed = new Set(ctx.config.areas.map((a) => a.toLowerCase()));
|
|
223
|
+
result = result.filter((t) => {
|
|
224
|
+
const area = t.area?.toLowerCase();
|
|
225
|
+
return area && allowed.has(area);
|
|
74
226
|
});
|
|
75
227
|
}
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
};
|
|
228
|
+
if (ctx.config.tasks?.length) {
|
|
229
|
+
const allowed = new Set(ctx.config.tasks);
|
|
230
|
+
result = result.filter((t) => {
|
|
231
|
+
const id = t.id;
|
|
232
|
+
return id && allowed.has(id);
|
|
233
|
+
});
|
|
82
234
|
}
|
|
83
|
-
|
|
235
|
+
if (ctx.config.tags?.length) {
|
|
236
|
+
const allowed = new Set(ctx.config.tags);
|
|
237
|
+
result = result.filter((t) => {
|
|
238
|
+
const tags = t.tags;
|
|
239
|
+
return tags?.some((tag) => allowed.has(tag));
|
|
240
|
+
});
|
|
241
|
+
}
|
|
242
|
+
return result;
|
|
243
|
+
}
|
|
244
|
+
// ---------------------------------------------------------------------------
|
|
245
|
+
// Compilation helpers
|
|
246
|
+
// ---------------------------------------------------------------------------
|
|
247
|
+
/**
|
|
248
|
+
* Compile all tasks through a handler, merging results.
|
|
249
|
+
* For literacy mode, ctx can carry evalMode as an extension.
|
|
250
|
+
*/
|
|
251
|
+
compileAll(handler, tasks, ctx) {
|
|
252
|
+
const results = [];
|
|
253
|
+
const warnings = [];
|
|
254
|
+
for (const task of tasks) {
|
|
255
|
+
const result = handler.compileTask(task, ctx);
|
|
256
|
+
results.push(result);
|
|
257
|
+
warnings.push(...result.warnings);
|
|
258
|
+
}
|
|
259
|
+
return mergeCompileResults(results);
|
|
260
|
+
}
|
|
261
|
+
// ---------------------------------------------------------------------------
|
|
262
|
+
// Postcondition checks
|
|
263
|
+
// ---------------------------------------------------------------------------
|
|
264
|
+
checkLiteracyPostconditions(ctx, start) {
|
|
84
265
|
const configIssues = checkGeneratedConfigsExist(ctx.config.rootDir);
|
|
85
266
|
const configErrors = configIssues.filter((i) => i.severity === "error");
|
|
86
267
|
if (configErrors.length > 0) {
|
|
@@ -100,3 +281,32 @@ export class GenerateConfigsStep {
|
|
|
100
281
|
return getStepInputPaths(ctx.config.rootDir, "generate-configs");
|
|
101
282
|
}
|
|
102
283
|
}
|
|
284
|
+
// ---------------------------------------------------------------------------
|
|
285
|
+
// Helpers
|
|
286
|
+
// ---------------------------------------------------------------------------
|
|
287
|
+
/**
 * Combine several per-task compile results into a single result.
 *
 * `tests` and `warnings` are concatenated in input order. `providers` and
 * `prompts` are copied from the first result only (empty arrays when there
 * are no results). That is correct for single-mode compilation, where every
 * task shares the same provider set; cross-mode merging with per-task
 * provider overrides would need deduplication here.
 *
 * `extras` objects are shallow-merged, later results overriding earlier
 * keys; the `extras` property is omitted when the merge yields no keys.
 *
 * @param results Per-task compile results.
 * @returns The merged compile result.
 */
function mergeCompileResults(results) {
    const head = results[0];
    const providers = head?.providers ?? [];
    const prompts = head?.prompts ?? [];

    const tests = results.reduce((all, result) => all.concat(result.tests), []);
    const warnings = results.reduce((all, result) => all.concat(result.warnings), []);

    const extras = results.reduce(
        (combined, result) => (result.extras ? { ...combined, ...result.extras } : combined),
        {},
    );

    const merged = { providers, tests, prompts, warnings };
    const hasExtras = Object.keys(extras).length > 0;
    return hasExtras ? { ...merged, extras } : merged;
}
|
|
@@ -6,8 +6,9 @@
|
|
|
6
6
|
*/
|
|
7
7
|
import { existsSync } from "fs";
|
|
8
8
|
import { resolve } from "path";
|
|
9
|
+
import { LiteracyVariant } from "../../pipeline/normalize-mode.js";
|
|
9
10
|
import { checkResultsExist } from "../../pipeline/checks.js";
|
|
10
|
-
import {
|
|
11
|
+
import { resultsFileForMode } from "../../pipeline/eval-constants.js";
|
|
11
12
|
import { runGraderConsistency } from "../../pipeline/grader-consistency-runner.js";
|
|
12
13
|
export class GraderConsistencyStep {
|
|
13
14
|
name = "grader-consistency";
|
|
@@ -18,10 +19,12 @@ export class GraderConsistencyStep {
|
|
|
18
19
|
async execute(ctx) {
|
|
19
20
|
const start = Date.now();
|
|
20
21
|
const replications = ctx.config.graderReplications ?? 5;
|
|
21
|
-
const
|
|
22
|
-
?
|
|
22
|
+
const primaryResultsRun = ctx.config.mode === "literacy"
|
|
23
|
+
? ctx.config.variant === LiteracyVariant.FULL
|
|
24
|
+
? LiteracyVariant.STANDARD
|
|
25
|
+
: (ctx.config.variant ?? LiteracyVariant.STANDARD)
|
|
23
26
|
: ctx.config.mode;
|
|
24
|
-
const resultsFile =
|
|
27
|
+
const resultsFile = resultsFileForMode(primaryResultsRun);
|
|
25
28
|
// Precondition: results file exists
|
|
26
29
|
const resultsIssues = checkResultsExist(ctx.config.rootDir, resultsFile);
|
|
27
30
|
const resultsErrors = resultsIssues.filter((i) => i.severity === "error");
|
|
@@ -43,7 +43,7 @@ export class MirrorRepoTasksStep {
|
|
|
43
43
|
// RepoTaskSource via a fresh instance.
|
|
44
44
|
const { RepoTaskSource } = await import("../../adapters/task-sources/repo-task-source.js");
|
|
45
45
|
const repoSource = new RepoTaskSource(ctx.config.repoTasksPath);
|
|
46
|
-
const repoTasks = await repoSource.loadTasks();
|
|
46
|
+
const repoTasks = (await repoSource.loadTasks()).filter((t) => t.mode === "literacy");
|
|
47
47
|
if (repoTasks.length === 0) {
|
|
48
48
|
return {
|
|
49
49
|
durationMs: Date.now() - start,
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
*/
|
|
7
7
|
import { existsSync, readFileSync, writeFileSync } from "fs";
|
|
8
8
|
import { resolve } from "path";
|
|
9
|
-
import {
|
|
9
|
+
import { tryLoadConfigFile } from "../../pipeline/compiler/config-loader.js";
|
|
10
10
|
import { formatReadinessMarkdown, generateReadinessReport, } from "../../pipeline/readiness-report.js";
|
|
11
11
|
import { ThresholdConfigSchema } from "../../pipeline/schemas.js";
|
|
12
12
|
export class ReadinessStep {
|
|
@@ -20,7 +20,6 @@ export class ReadinessStep {
|
|
|
20
20
|
const start = Date.now();
|
|
21
21
|
try {
|
|
22
22
|
const scoreSummaryPath = resolve(root, "results", "latest", "score-summary.json");
|
|
23
|
-
const thresholdsPath = resolve(root, "config", "thresholds.yaml");
|
|
24
23
|
if (!existsSync(scoreSummaryPath)) {
|
|
25
24
|
return {
|
|
26
25
|
durationMs: Date.now() - start,
|
|
@@ -28,16 +27,16 @@ export class ReadinessStep {
|
|
|
28
27
|
status: "failed",
|
|
29
28
|
};
|
|
30
29
|
}
|
|
31
|
-
|
|
30
|
+
const thresholdsLoaded = tryLoadConfigFile("thresholds", root);
|
|
31
|
+
if (!thresholdsLoaded) {
|
|
32
32
|
return {
|
|
33
33
|
durationMs: Date.now() - start,
|
|
34
|
-
error: "config/thresholds
|
|
34
|
+
error: "config/thresholds not found",
|
|
35
35
|
status: "failed",
|
|
36
36
|
};
|
|
37
37
|
}
|
|
38
38
|
const scoreSummary = JSON.parse(readFileSync(scoreSummaryPath, "utf-8"));
|
|
39
|
-
const
|
|
40
|
-
const thresholdConfig = ThresholdConfigSchema.parse(rawThresholds);
|
|
39
|
+
const thresholdConfig = ThresholdConfigSchema.parse(thresholdsLoaded.data);
|
|
41
40
|
const gapPath = resolve(root, "results", "latest", "gap-analysis.json");
|
|
42
41
|
const gapAnalysis = existsSync(gapPath)
|
|
43
42
|
? JSON.parse(readFileSync(gapPath, "utf-8"))
|
|
@@ -5,12 +5,11 @@
|
|
|
5
5
|
* invocation. Builds a clean env object for the subprocess instead of
|
|
6
6
|
* polluting global process.env.
|
|
7
7
|
*/
|
|
8
|
-
import type { ConcreteEvalMode } from "../../_vendor/ailf-shared/index.d.ts";
|
|
9
8
|
import type { AppContext, PipelineState, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
10
9
|
export declare class RunEvalStep implements PipelineStep {
|
|
11
10
|
private readonly mode;
|
|
12
11
|
readonly name: string;
|
|
13
|
-
constructor(mode:
|
|
12
|
+
constructor(mode: string);
|
|
14
13
|
check(): ValidationIssue[];
|
|
15
14
|
execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
|
|
16
15
|
cacheInputs(ctx: AppContext): string[];
|
|
@@ -10,7 +10,7 @@ import { resolve } from "path";
|
|
|
10
10
|
import { getStepInputPaths } from "../../pipeline/cache.js";
|
|
11
11
|
import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../../pipeline/checks.js";
|
|
12
12
|
import { computeEvalFingerprint } from "../../pipeline/eval-fingerprint.js";
|
|
13
|
-
import { buildFilterFlags,
|
|
13
|
+
import { buildFilterFlags, configFileForMode, resultsFileForMode, scanResultsForErrors, } from "../../pipeline/eval-constants.js";
|
|
14
14
|
export class RunEvalStep {
|
|
15
15
|
mode;
|
|
16
16
|
name;
|
|
@@ -59,7 +59,8 @@ export class RunEvalStep {
|
|
|
59
59
|
// The generated Promptfoo config still includes their "without-docs"
|
|
60
60
|
// variant (testing model knowledge alone), which doesn't need a
|
|
61
61
|
// context file.
|
|
62
|
-
|
|
62
|
+
// Bridge: narrow to literacy tasks with docs
|
|
63
|
+
const tasksWithDocs = tasks.filter((t) => t.mode === "literacy" && (t.context?.docs?.length ?? 0) > 0);
|
|
63
64
|
const taskIds = tasksWithDocs.map((t) => t.id);
|
|
64
65
|
const contextIssues = checkCanonicalContextsExist(rootDir, taskIds);
|
|
65
66
|
const contextErrors = contextIssues.filter((i) => i.severity === "error");
|
|
@@ -123,7 +124,7 @@ export class RunEvalStep {
|
|
|
123
124
|
};
|
|
124
125
|
// Only set env vars that differ from defaults — the subprocess inherits
|
|
125
126
|
// process.env via PromptfooEvalAdapter's { ...process.env, ...config.env }
|
|
126
|
-
if (ctx.config.mode !== "
|
|
127
|
+
if (ctx.config.mode !== "literacy") {
|
|
127
128
|
subprocessEnv.EVAL_MODE = ctx.config.mode;
|
|
128
129
|
}
|
|
129
130
|
if (ctx.config.searchMode !== "open") {
|
|
@@ -135,7 +136,7 @@ export class RunEvalStep {
|
|
|
135
136
|
// -----------------------------------------------------------------
|
|
136
137
|
// Execute — use the EvalRunner port
|
|
137
138
|
// -----------------------------------------------------------------
|
|
138
|
-
const configFile =
|
|
139
|
+
const configFile = configFileForMode(this.mode);
|
|
139
140
|
const filterFlags = buildFilterFlags(debug);
|
|
140
141
|
const result = await ctx.evalRunner.run({
|
|
141
142
|
concurrency,
|
|
@@ -145,7 +146,7 @@ export class RunEvalStep {
|
|
|
145
146
|
});
|
|
146
147
|
// Check if results were written despite non-zero exit
|
|
147
148
|
if (result.status === "failed") {
|
|
148
|
-
const resultsExist = checkResultsExist(rootDir,
|
|
149
|
+
const resultsExist = checkResultsExist(rootDir, resultsFileForMode(this.mode));
|
|
149
150
|
const hasResults = resultsExist.filter((i) => i.severity === "error").length === 0;
|
|
150
151
|
if (!hasResults) {
|
|
151
152
|
return {
|
|
@@ -156,7 +157,7 @@ export class RunEvalStep {
|
|
|
156
157
|
}
|
|
157
158
|
}
|
|
158
159
|
// Postcondition: results file exists
|
|
159
|
-
const resultsIssues = checkResultsExist(rootDir,
|
|
160
|
+
const resultsIssues = checkResultsExist(rootDir, resultsFileForMode(this.mode));
|
|
160
161
|
const resultsErrors = resultsIssues.filter((i) => i.severity === "error");
|
|
161
162
|
if (resultsErrors.length > 0) {
|
|
162
163
|
return {
|
|
@@ -166,7 +167,7 @@ export class RunEvalStep {
|
|
|
166
167
|
};
|
|
167
168
|
}
|
|
168
169
|
// Scan results for errors
|
|
169
|
-
const errorSummary = scanResultsForErrors(resolve(rootDir,
|
|
170
|
+
const errorSummary = scanResultsForErrors(resolve(rootDir, resultsFileForMode(this.mode)));
|
|
170
171
|
if (errorSummary) {
|
|
171
172
|
console.log();
|
|
172
173
|
console.log(errorSummary);
|
package/dist/pipeline/cache.d.ts
CHANGED
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
*
|
|
11
11
|
* Cache invalidation triggers:
|
|
12
12
|
* - Content change: any input file's content changes → hash changes → miss
|
|
13
|
-
* - Config change: config/models
|
|
13
|
+
* - Config change: config/models, config/sources, tasks/*.yaml changes → miss
|
|
14
14
|
* - Manual bypass: --no-cache flag skips all cache lookups
|
|
15
15
|
* - Cache clear: delete results/cache/ to start fresh
|
|
16
16
|
*/
|
package/dist/pipeline/cache.js
CHANGED
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
*
|
|
11
11
|
* Cache invalidation triggers:
|
|
12
12
|
* - Content change: any input file's content changes → hash changes → miss
|
|
13
|
-
* - Config change: config/models
|
|
13
|
+
* - Config change: config/models, config/sources, tasks/*.yaml changes → miss
|
|
14
14
|
* - Manual bypass: --no-cache flag skips all cache lookups
|
|
15
15
|
* - Cache clear: delete results/cache/ to start fresh
|
|
16
16
|
*/
|
|
@@ -18,6 +18,19 @@ import { createHash } from "crypto";
|
|
|
18
18
|
import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync, } from "fs";
|
|
19
19
|
import { join, resolve } from "path";
|
|
20
20
|
// ---------------------------------------------------------------------------
|
|
21
|
+
// Helpers
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
/**
 * Resolve the first existing config file for `name` (matches the
 * loadConfigFile priority chain).
 *
 * Candidates are `config/<name><ext>` under `rootDir`, probed in the order
 * `.ts`, `.js`, `.yaml`, `.yml`, `.json`.
 *
 * @param rootDir Directory that contains the `config/` folder.
 * @param name    Config base name without extension (e.g. "models").
 * @returns Absolute path of the first candidate that exists, or `undefined`.
 */
function resolveConfig(rootDir, name) {
    const extensionsByPriority = [".ts", ".js", ".yaml", ".yml", ".json"];
    return extensionsByPriority
        .map((ext) => resolve(rootDir, `config/${name}${ext}`))
        .find((candidate) => existsSync(candidate));
}
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
21
34
|
// Constants
|
|
22
35
|
// ---------------------------------------------------------------------------
|
|
23
36
|
const CACHE_DIR_NAME = "cache";
|
|
@@ -79,7 +92,10 @@ export function getStepInputPaths(rootDir, step) {
|
|
|
79
92
|
const isBaseline = step === "eval-baseline" || step === "eval";
|
|
80
93
|
const isAgentic = step === "eval-agentic" || step === "eval";
|
|
81
94
|
const isObserved = step === "eval-observed" || step === "eval";
|
|
82
|
-
const paths = [
|
|
95
|
+
const paths = [];
|
|
96
|
+
const modelsPath = resolveConfig(rootDir, "models");
|
|
97
|
+
if (modelsPath)
|
|
98
|
+
paths.push(modelsPath);
|
|
83
99
|
// Config files — only the relevant ones for this mode
|
|
84
100
|
if (isBaseline) {
|
|
85
101
|
paths.push(r("promptfooconfig.yaml"));
|
|
@@ -130,25 +146,37 @@ export function getStepInputPaths(rootDir, step) {
|
|
|
130
146
|
return paths;
|
|
131
147
|
}
|
|
132
148
|
case "fetch-docs": {
|
|
133
|
-
// Inputs: config
|
|
134
|
-
const paths = [
|
|
149
|
+
// Inputs: config sources + models, task files
|
|
150
|
+
const paths = [];
|
|
151
|
+
const sourcesPath = resolveConfig(rootDir, "sources");
|
|
152
|
+
const modelsPath2 = resolveConfig(rootDir, "models");
|
|
153
|
+
if (sourcesPath)
|
|
154
|
+
paths.push(sourcesPath);
|
|
155
|
+
if (modelsPath2)
|
|
156
|
+
paths.push(modelsPath2);
|
|
135
157
|
// Include all task files (they define feature areas)
|
|
136
158
|
const tasksDir = r("tasks");
|
|
137
159
|
if (existsSync(tasksDir)) {
|
|
138
160
|
const taskFiles = readdirSync(tasksDir)
|
|
139
|
-
.filter((f) =>
|
|
161
|
+
.filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
|
|
140
162
|
.map((f) => join(tasksDir, f));
|
|
141
163
|
paths.push(...taskFiles);
|
|
142
164
|
}
|
|
143
165
|
return paths;
|
|
144
166
|
}
|
|
145
167
|
case "generate-configs": {
|
|
146
|
-
// Inputs: config
|
|
147
|
-
const paths = [
|
|
168
|
+
// Inputs: config models + sources, all task files
|
|
169
|
+
const paths = [];
|
|
170
|
+
const modelsPath3 = resolveConfig(rootDir, "models");
|
|
171
|
+
const sourcesPath2 = resolveConfig(rootDir, "sources");
|
|
172
|
+
if (modelsPath3)
|
|
173
|
+
paths.push(modelsPath3);
|
|
174
|
+
if (sourcesPath2)
|
|
175
|
+
paths.push(sourcesPath2);
|
|
148
176
|
const tasksDir = r("tasks");
|
|
149
177
|
if (existsSync(tasksDir)) {
|
|
150
178
|
const taskFiles = readdirSync(tasksDir)
|
|
151
|
-
.filter((f) =>
|
|
179
|
+
.filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
|
|
152
180
|
.map((f) => join(tasksDir, f));
|
|
153
181
|
paths.push(...taskFiles);
|
|
154
182
|
}
|
|
@@ -1,9 +1,7 @@
|
|
|
1
|
-
import type
|
|
1
|
+
import { type ActualScoreEntry, type ComponentResult, type Logger, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
|
|
2
2
|
import { type ResolvedSourceConfig } from "../sources.js";
|
|
3
|
-
import { type ActualScoreEntry, type ComponentResult } from "../_vendor/ailf-core/index.d.ts";
|
|
4
3
|
import type { GraderJudgment, PerModelEntry } from "./types.js";
|
|
5
|
-
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.d.ts";
|
|
6
|
-
export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
|
|
4
|
+
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
|
|
7
5
|
export interface PromptfooResultsWrapper {
|
|
8
6
|
results: RawTestResult[];
|
|
9
7
|
stats: {
|