@sanity/ailf 0.5.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/config/features.ts +23 -0
- package/config/models.ts +95 -0
- package/config/prompts.ts +16 -0
- package/config/rubrics.ts +225 -0
- package/config/schedules.ts +47 -0
- package/config/sinks.ts +37 -0
- package/config/sources.ts +21 -0
- package/config/thresholds.ts +61 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +171 -0
- package/dist/_vendor/ailf-core/config-helpers.js +170 -0
- package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
- package/dist/_vendor/ailf-core/env-helper.js +45 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/examples/index.js +25 -0
- package/dist/_vendor/ailf-core/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/index.js +5 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +17 -2
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
- package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +8 -2
- package/dist/_vendor/ailf-core/schemas/eval-config.js +17 -2
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +9 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +8 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -31
- package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -9
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
- package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/services/index.js +2 -1
- package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
- package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
- package/dist/_vendor/ailf-core/services/scoring.js +25 -15
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +332 -0
- package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +45 -83
- package/dist/_vendor/ailf-core/types/index.js +8 -1
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +257 -0
- package/dist/_vendor/ailf-core/types/plugin-registry.js +185 -0
- package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
- package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
- package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
- package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
- package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
- package/dist/_vendor/ailf-core/types/trace.js +18 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
- package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
- package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
- package/dist/_vendor/ailf-shared/index.d.ts +0 -1
- package/dist/_vendor/ailf-shared/index.js +0 -1
- package/dist/adapters/api-client/build-request.js +14 -13
- package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
- package/dist/adapters/config-sources/file-config-adapter.js +39 -12
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
- package/dist/adapters/config-sources/ts-config-loader.js +141 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
- package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +35 -39
- package/dist/adapters/task-sources/index.d.ts +3 -2
- package/dist/adapters/task-sources/index.js +3 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
- package/dist/adapters/task-sources/repo-schemas.js +227 -19
- package/dist/adapters/task-sources/repo-task-source.d.ts +16 -12
- package/dist/adapters/task-sources/repo-task-source.js +92 -80
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
- package/dist/adapters/task-sources/task-file-loader.js +83 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
- package/dist/adapters/task-sources/yaml-task-source.js +19 -16
- package/dist/cli.js +0 -2
- package/dist/commands/baseline.js +4 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/coverage-audit.js +9 -1
- package/dist/commands/explain-handler.js +25 -23
- package/dist/commands/fetch-docs.js +3 -2
- package/dist/commands/generate-configs.js +1 -1
- package/dist/commands/init.d.ts +6 -4
- package/dist/commands/init.js +302 -23
- package/dist/commands/interactive.js +11 -7
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +16 -6
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +4 -2
- package/dist/commands/pr-comment.js +1 -1
- package/dist/commands/publish.js +2 -2
- package/dist/commands/readiness-report.js +13 -6
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +13 -1
- package/dist/composition-root.js +99 -4
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +28 -8
- package/dist/orchestration/steps/calculate-scores-step.js +24 -11
- package/dist/orchestration/steps/fetch-docs-step.js +8 -7
- package/dist/orchestration/steps/gap-analysis-step.js +8 -7
- package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
- package/dist/orchestration/steps/generate-configs-step.js +261 -51
- package/dist/orchestration/steps/grader-consistency-step.js +7 -4
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/readiness-step.js +5 -6
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
- package/dist/orchestration/steps/run-eval-step.js +8 -7
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/cache.js +36 -8
- package/dist/pipeline/calculate-scores.d.ts +2 -4
- package/dist/pipeline/calculate-scores.js +43 -113
- package/dist/pipeline/checks.js +2 -2
- package/dist/pipeline/compare.js +8 -8
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +392 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +404 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
- package/dist/pipeline/compiler/assertion-mapper.js +175 -0
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
- package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
- package/dist/pipeline/compiler/config-loader.d.ts +56 -0
- package/dist/pipeline/compiler/config-loader.js +111 -0
- package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
- package/dist/pipeline/compiler/fixture-resolver.js +113 -0
- package/dist/pipeline/compiler/hash.d.ts +11 -0
- package/dist/pipeline/compiler/hash.js +18 -0
- package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
- package/dist/pipeline/compiler/ignore-fields.js +113 -0
- package/dist/pipeline/compiler/index.d.ts +29 -0
- package/dist/pipeline/compiler/index.js +45 -0
- package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
- package/dist/pipeline/compiler/literacy-bridge.js +172 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +15 -0
- package/dist/pipeline/compiler/mode-handlers/index.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/index.d.ts +9 -0
- package/dist/pipeline/compiler/presets/index.js +8 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +42 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.js +208 -0
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
- package/dist/pipeline/compiler/provider-assembler.js +137 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
- package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
- package/dist/pipeline/compiler/sandbox/index.js +11 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
- package/dist/pipeline/compiler/scoring-bridge.js +114 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
- package/dist/pipeline/compiler/task-graph-builder.js +291 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
- package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
- package/dist/pipeline/compiler/telemetry/index.js +19 -0
- package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
- package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
- package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
- package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
- package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
- package/dist/pipeline/compiler/variable-resolver.js +115 -0
- package/dist/pipeline/coverage-audit.d.ts +15 -5
- package/dist/pipeline/coverage-audit.js +41 -22
- package/dist/pipeline/eval-constants.d.ts +16 -6
- package/dist/pipeline/eval-constants.js +25 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -9
- package/dist/pipeline/expand-tasks.d.ts +19 -10
- package/dist/pipeline/expand-tasks.js +34 -28
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +2 -2
- package/dist/pipeline/generate-configs.d.ts +22 -4
- package/dist/pipeline/generate-configs.js +53 -24
- package/dist/pipeline/grader-api.d.ts +3 -3
- package/dist/pipeline/grader-api.js +5 -12
- package/dist/pipeline/grader-compare-runner.js +20 -27
- package/dist/pipeline/grader-comparison.d.ts +4 -8
- package/dist/pipeline/grader-comparison.js +11 -17
- package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
- package/dist/pipeline/grader-consistency-runner.js +16 -20
- package/dist/pipeline/grader-consistency.d.ts +6 -10
- package/dist/pipeline/grader-consistency.js +13 -32
- package/dist/pipeline/grader-sensitivity-runner.js +7 -5
- package/dist/pipeline/grader-sensitivity.d.ts +2 -6
- package/dist/pipeline/grader-sensitivity.js +10 -10
- package/dist/pipeline/grader-validate-runner.js +7 -5
- package/dist/pipeline/grader-validation.d.ts +2 -6
- package/dist/pipeline/grader-validation.js +14 -22
- package/dist/pipeline/map-request-to-config.js +7 -1
- package/dist/pipeline/mirror-repo-tasks.d.ts +13 -13
- package/dist/pipeline/mirror-repo-tasks.js +22 -21
- package/dist/pipeline/normalize-mode.d.ts +49 -0
- package/dist/pipeline/normalize-mode.js +64 -0
- package/dist/pipeline/plan.d.ts +5 -2
- package/dist/pipeline/plan.js +134 -78
- package/dist/pipeline/pr-comment.js +2 -0
- package/dist/pipeline/profile-resolution.d.ts +22 -14
- package/dist/pipeline/profile-resolution.js +41 -19
- package/dist/pipeline/provenance.d.ts +2 -2
- package/dist/pipeline/provenance.js +12 -17
- package/dist/pipeline/release-report.js +4 -4
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/rubric-loader.d.ts +20 -0
- package/dist/pipeline/rubric-loader.js +37 -0
- package/dist/pipeline/validate.d.ts +4 -4
- package/dist/pipeline/validate.js +64 -53
- package/dist/schedules/loader.js +18 -8
- package/dist/scripts/migrate-task-mode.d.ts +24 -0
- package/dist/scripts/migrate-task-mode.js +85 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +15 -15
- package/dist/sinks/loader.js +5 -7
- package/dist/sources.d.ts +7 -7
- package/dist/sources.js +22 -24
- package/dist/webhook/dispatch.js +2 -1
- package/package.json +15 -4
- package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
- package/tasks/literacy/frameworks.task.ts +128 -0
- package/tasks/literacy/functions.task.ts +69 -0
- package/tasks/literacy/groq.task.ts +258 -0
- package/tasks/literacy/nextjs-live.task.ts +75 -0
- package/tasks/literacy/studio-setup.task.ts +131 -0
- package/tasks/literacy/visual-editing.task.ts +146 -0
- package/config/features.yaml +0 -116
- package/config/models.yaml +0 -116
- package/config/prompts.yaml +0 -75
- package/config/rubrics.yaml +0 -81
- package/config/schedules.yaml +0 -43
- package/config/sinks.yaml +0 -54
- package/config/sources.yaml +0 -51
- package/config/thresholds.yaml +0 -49
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Example MCP server task definitions — Sanity MCP Server reference.
|
|
3
|
+
*
|
|
4
|
+
* These are reference implementations showing how to author MCP server
|
|
5
|
+
* evaluation tasks. They model the real Sanity MCP Server (mcp.sanity.io)
|
|
6
|
+
* and its actual tool names. They serve as:
|
|
7
|
+
*
|
|
8
|
+
* 1. Documentation — developers learn the API by reading examples
|
|
9
|
+
* 2. Test fixtures — the compiler tests compile these and verify output
|
|
10
|
+
* 3. Validation — proves the compiler handles real-world task shapes
|
|
11
|
+
* 4. Reference — shows all transport types, auth patterns, and assertion styles
|
|
12
|
+
*
|
|
13
|
+
* NOTE: These are NOT executable without a valid SANITY_MCP_AUTH_TOKEN.
|
|
14
|
+
* They exist purely for compilation testing and as authoring examples.
|
|
15
|
+
* For real evaluation, write tasks in an external repo's .ailf/tasks/
|
|
16
|
+
* directory or a standalone config file.
|
|
17
|
+
*
|
|
18
|
+
* @see https://www.sanity.io/docs/ai/mcp-server — Sanity MCP Server docs
|
|
19
|
+
* @see https://www.promptfoo.dev/docs/providers/mcp/ — Promptfoo MCP provider
|
|
20
|
+
*/
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// Shared server config — reusable across tasks targeting the same server
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
/**
 * Connection settings shared by every example task in this file.
 *
 * Each task assigns this same object (by reference) to its `serverConfig`
 * field, so the endpoint and auth settings are declared only once.
 */
const SANITY_MCP_SERVER = {
    transport: "streamable-http",
    url: "https://mcp.sanity.io",
    // The token is a `{{env.…}}` placeholder string, not a literal secret.
    // NOTE(review): presumably substituted downstream — confirm where.
    auth: { type: "bearer", token: "{{env.SANITY_MCP_AUTH_TOKEN}}" },
};
|
|
32
|
+
// ---------------------------------------------------------------------------
|
|
33
|
+
// Task 1: GROQ query via query_documents tool
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
/**
 * Example task: fetch documents with GROQ through the `query_documents` tool.
 *
 * The prompt asks for the titles and slugs of "article" documents, newest
 * first, limited to five results. The declared assertions require a
 * `query_documents` tool call, the substrings `_type` and `article`, and an
 * `llm-rubric` check (template `mcp-output-correctness`) covering the filter,
 * projection, ordering and slice.
 */
export const queryDocumentsTask = {
    mode: "mcp-server",
    id: "mcp-query-documents",
    title: "MCP: Query documents with GROQ",
    // Long strings are kept as ordered segments and joined with no separator.
    description: [
        "Use the Sanity MCP server's query_documents tool to fetch ",
        "documents from a dataset using GROQ. Verify the model constructs ",
        "a valid GROQ query and interprets the response correctly.",
    ].join(""),
    area: "mcp",
    difficulty: "intermediate",
    tags: ["mcp", "groq", "query"],
    serverConfig: SANITY_MCP_SERVER,
    capabilities: ["query_documents", "get_schema"],
    prompt: {
        text: [
            "Using the Sanity MCP server, query for all documents of type ",
            '"article" in the production dataset. Return their titles and ',
            "slugs, ordered by creation date descending. Limit to 5 results.",
        ].join(""),
    },
    assertions: [
        { type: "tool-called", value: "query_documents" },
        // Two substring assertions, in this order: "_type" then "article".
        ...["_type", "article"].map((value) => ({ type: "contains", value })),
        {
            type: "llm-rubric",
            template: "mcp-output-correctness",
            criteria: [
                "Constructs a valid GROQ query with _type filter",
                "Includes a projection with title and slug fields",
                "Orders by _createdAt descending",
                "Limits results to 5 with slice syntax [0...5] or [0..4]",
            ],
        },
    ],
};
|
|
72
|
+
// ---------------------------------------------------------------------------
|
|
73
|
+
// Task 2: Schema inspection via get_schema tool
|
|
74
|
+
// ---------------------------------------------------------------------------
|
|
75
|
+
/**
 * Example task: retrieve the workspace schema through the `get_schema` tool
 * and describe it.
 *
 * The prompt asks for a list of document types plus a field-level description
 * of the most complex one, with emphasis on reference and Portable Text
 * ("block") fields. The declared assertions require a `get_schema` tool call
 * and an `llm-rubric` check (template `mcp-output-correctness`).
 */
export const inspectSchemaTask = {
    mode: "mcp-server",
    id: "mcp-get-schema",
    title: "MCP: Inspect schema and describe document types",
    // Long strings are kept as ordered segments and joined with no separator.
    description: [
        "Use the Sanity MCP server's get_schema tool to inspect the ",
        "content model and describe the available document types.",
    ].join(""),
    area: "mcp",
    difficulty: "basic",
    tags: ["mcp", "schema"],
    serverConfig: SANITY_MCP_SERVER,
    capabilities: ["get_schema"],
    prompt: {
        text: [
            "Using the Sanity MCP server, retrieve the schema for the current ",
            "workspace. List all document types and describe the fields of the ",
            'most complex one. Focus on reference fields and "block" (Portable Text) fields.',
        ].join(""),
    },
    assertions: [
        { type: "tool-called", value: "get_schema" },
        {
            type: "llm-rubric",
            template: "mcp-output-correctness",
            criteria: [
                "Successfully calls get_schema to retrieve the workspace schema",
                "Identifies and lists the document types",
                "Describes field structures including types and references",
                "Correctly identifies Portable Text (block) fields",
            ],
        },
    ],
};
|
|
109
|
+
// ---------------------------------------------------------------------------
|
|
110
|
+
// Task 3: Multi-step create → verify → publish (multi-turn)
|
|
111
|
+
// ---------------------------------------------------------------------------
|
|
112
|
+
/**
 * Example task: a three-turn document lifecycle — create a draft, verify it
 * with a GROQ query, then publish it.
 *
 * Unlike the single-prompt examples, this task supplies `multiTurn.turns`
 * (three consecutive user messages) instead of a `prompt`. The declared
 * assertions require calls to `create_documents_from_json`,
 * `query_documents` and `publish_documents`, plus two `llm-rubric` checks:
 * one using the `mcp-output-correctness` template and one using
 * `mcp-error-handling`.
 */
export const createAndPublishTask = {
    mode: "mcp-server",
    id: "mcp-create-publish",
    title: "MCP: Create a draft document and publish it",
    description: [
        "Use the Sanity MCP server to create a new draft document, ",
        "verify it was created, then publish it. Tests the multi-step ",
        "workflow of create → verify → publish.",
    ].join(""),
    area: "mcp",
    difficulty: "advanced",
    tags: ["mcp", "crud", "multi-step"],
    serverConfig: SANITY_MCP_SERVER,
    capabilities: [
        "create_documents_from_json",
        "get_document",
        "publish_documents",
        "query_documents",
    ],
    multiTurn: {
        // Every turn is a user message; only the content differs per step.
        turns: [
            // Step 1 — create the draft.
            [
                "Create a new draft article document with the title ",
                '"MCP Integration Test" and a slug of "mcp-integration-test". ',
                "Use the create_documents_from_json tool.",
            ].join(""),
            // Step 2 — verify the draft by slug.
            [
                "Now verify the draft was created by querying for it. ",
                "Use query_documents with a GROQ query that fetches the ",
                'document by its slug "mcp-integration-test".',
            ].join(""),
            // Step 3 — publish.
            "Finally, publish the draft document using the publish_documents tool.",
        ].map((content) => ({ role: "user", content })),
    },
    assertions: [
        // One tool-called assertion per lifecycle step, in workflow order.
        ...[
            "create_documents_from_json",
            "query_documents",
            "publish_documents",
        ].map((value) => ({ type: "tool-called", value })),
        {
            type: "llm-rubric",
            template: "mcp-output-correctness",
            criteria: [
                "Creates a draft document with the correct _type and fields",
                "Verifies the draft exists via GROQ query",
                "Publishes the draft using the correct document ID",
                "Handles the multi-step workflow in the correct order",
            ],
        },
        {
            type: "llm-rubric",
            template: "mcp-error-handling",
            criteria: [
                "Handles potential errors (missing type, invalid schema) gracefully",
                "Verifies each step before proceeding to the next",
            ],
        },
    ],
};
|
|
178
|
+
// ---------------------------------------------------------------------------
|
|
179
|
+
// Task 4: Semantic search (embeddings)
|
|
180
|
+
// ---------------------------------------------------------------------------
|
|
181
|
+
/**
 * Example MCP task: discover the available embeddings indices, then run a
 * semantic search against one of them and report the best matches.
 */
export const semanticSearchTask = {
    mode: "mcp-server",
    id: "mcp-semantic-search",
    title: "MCP: Semantic search for related content",
    // Long strings are assembled from fragments joined by a single space.
    description: [
        "Use the Sanity MCP server's semantic_search tool to find",
        "content related to a natural language query.",
    ].join(" "),
    area: "mcp",
    difficulty: "intermediate",
    tags: ["mcp", "search", "embeddings"],
    serverConfig: SANITY_MCP_SERVER,
    capabilities: [
        "semantic_search",
        "list_embeddings_indices",
        "get_document",
    ],
    prompt: {
        text: [
            "Using the Sanity MCP server, first list the available embeddings",
            "indices, then perform a semantic search for content related to",
            '"how to set up real-time previews in Next.js". Return the top 3',
            "most relevant documents with their titles and relevance scores.",
        ].join(" "),
    },
    assertions: [
        // Both tools must be invoked: discovery first, then the search itself.
        { type: "tool-called", value: "list_embeddings_indices" },
        { type: "tool-called", value: "semantic_search" },
        // Graded rubric describing what a correct answer looks like.
        {
            type: "llm-rubric",
            template: "mcp-output-correctness",
            criteria: [
                "Lists available embeddings indices before searching",
                "Selects an appropriate index for the search",
                "Constructs a meaningful search query",
                "Returns relevant results with titles and scores",
            ],
        },
    ],
};
|
|
217
|
+
// ---------------------------------------------------------------------------
|
|
218
|
+
// Task 5: Local stdio server (shows command-based transport)
|
|
219
|
+
// ---------------------------------------------------------------------------
|
|
220
|
+
/**
|
|
221
|
+
* Demonstrates a stdio-based MCP server (local process) for contrast
|
|
222
|
+
* with the remote streamable-http tasks above.
|
|
223
|
+
*/
|
|
224
|
+
export const stdioServerTask = {
|
|
225
|
+
mode: "mcp-server",
|
|
226
|
+
id: "mcp-stdio-local",
|
|
227
|
+
title: "MCP: Local stdio server — getDocument",
|
|
228
|
+
description: "Call a locally running MCP server via stdio transport to " +
|
|
229
|
+
"fetch a document. Demonstrates command-based server config.",
|
|
230
|
+
area: "mcp",
|
|
231
|
+
difficulty: "basic",
|
|
232
|
+
tags: ["mcp", "stdio", "local"],
|
|
233
|
+
serverConfig: {
|
|
234
|
+
transport: "stdio",
|
|
235
|
+
command: "node dist/sanity-mcp-server.js",
|
|
236
|
+
env: {
|
|
237
|
+
SANITY_PROJECT_ID: "test-project",
|
|
238
|
+
SANITY_DATASET: "production",
|
|
239
|
+
},
|
|
240
|
+
startupTimeoutMs: 5000,
|
|
241
|
+
},
|
|
242
|
+
capabilities: ["tools/list", "tools/call"],
|
|
243
|
+
prompt: {
|
|
244
|
+
text: "Fetch the document with ID 'article-123' from the Sanity Content Lake.",
|
|
245
|
+
},
|
|
246
|
+
assertions: [
|
|
247
|
+
{ type: "tool-called", value: "getDocument" },
|
|
248
|
+
{ type: "contains", value: "article-123" },
|
|
249
|
+
],
|
|
250
|
+
options: { timeout: 30_000 },
|
|
251
|
+
};
|
|
252
|
+
/**
 * Every example MCP task defined in this module, in declaration order.
 * Used by compiler tests.
 */
export const allMCPExampleTasks = [
    queryDocumentsTask, inspectSchemaTask, createAndPublishTask,
    semanticSearchTask, stdioServerTask,
];
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Assertion mapping and builders for agent harness tasks.
|
|
3
|
+
*
|
|
4
|
+
* Handles agent-specific assertion types (file-exists, file-contains,
|
|
5
|
+
* command-succeeds, diff-matches) as well as standard pass-through
|
|
6
|
+
* assertion types.
|
|
7
|
+
*/
|
|
8
|
+
import type { PromptfooAssertion } from "../../assertion-mapper.js";
|
|
9
|
+
import type { AgentHarnessCompileOptions } from "./types.js";
|
|
10
|
+
/**
 * Map one agent-harness task assertion to a Promptfoo assertion.
 *
 * Agent-specific types (`file-exists`, `file-contains`, `command-succeeds`,
 * `diff-matches`) are compiled by their dedicated builders; standard types
 * are passed through.
 *
 * @param assertion - Task assertion; `type` selects the mapping.
 * @param options - Compile options; `graderProvider`, when set, is attached
 *   to `llm-rubric` assertions as `provider`.
 * @param warnings - Collector that receives a message when the assertion
 *   type is not recognised.
 * @returns The mapped assertion. The declared type also allows `null`.
 */
export declare function mapAgentAssertion(assertion: {
    type: string;
    [k: string]: unknown;
}, options: AgentHarnessCompileOptions | undefined, warnings: string[]): PromptfooAssertion | null;
|
|
14
|
+
/**
 * Build a `javascript` assertion that checks whether a file exists.
 *
 * @param assertion - `value` is the file path, resolved against
 *   `context.vars.__workingDir`; a numeric `weight` is copied to the result.
 * @returns A Promptfoo assertion whose script fails for paths that resolve
 *   outside the working directory.
 */
export declare function buildFileExistsAssertion(assertion: {
    type: string;
    [k: string]: unknown;
}): PromptfooAssertion;
|
|
18
|
+
/**
 * Build a `javascript` assertion that checks whether a file contains a
 * given substring.
 *
 * @param assertion - `value` is read as `{ path, content }`; the path is
 *   resolved against `context.vars.__workingDir`. A numeric `weight` is
 *   copied to the result.
 * @returns A Promptfoo assertion whose script fails when the path resolves
 *   outside the working directory, the file is missing, or the content is
 *   not found.
 */
export declare function buildFileContainsAssertion(assertion: {
    type: string;
    [k: string]: unknown;
}): PromptfooAssertion;
|
|
22
|
+
/**
 * SECURITY: Trusted-input boundary.
 *
 * The `command-succeeds` assertion executes an arbitrary shell command
 * inside the sandbox's working directory. The command string comes from
 * task definitions (YAML or TypeScript config files), which are authored
 * by developers — not from user input or LLM output.
 *
 * This is intentional: the assertion is designed to verify agent output
 * by running build/test commands (e.g., "npm test", "tsc --noEmit").
 *
 * Task definitions are the trust boundary. If you accept task definitions
 * from untrusted sources, validate commands against an allowlist first.
 *
 * @param assertion - `value` is the shell command to run; a numeric
 *   `weight` is copied to the result.
 * @returns A Promptfoo `javascript` assertion that passes when the command
 *   completes without throwing.
 */
export declare function buildCommandSucceedsAssertion(assertion: {
    type: string;
    [k: string]: unknown;
}): PromptfooAssertion;
|
|
40
|
+
/**
 * Build a `javascript` assertion that inspects the output of `git diff`
 * run in the working directory.
 *
 * @param assertion - When `value` is a string, the diff must contain it;
 *   otherwise any non-empty diff passes. A numeric `weight` is copied to
 *   the result.
 * @returns A Promptfoo assertion; its script reports a failure when
 *   `git diff` itself cannot be run.
 */
export declare function buildDiffMatchesAssertion(assertion: {
    type: string;
    [k: string]: unknown;
}): PromptfooAssertion;
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Assertion mapping and builders for agent harness tasks.
|
|
3
|
+
*
|
|
4
|
+
* Handles agent-specific assertion types (file-exists, file-contains,
|
|
5
|
+
* command-succeeds, diff-matches) as well as standard pass-through
|
|
6
|
+
* assertion types.
|
|
7
|
+
*/
|
|
8
|
+
// ---------------------------------------------------------------------------
|
|
9
|
+
// Assertion mapping
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
/**
 * Map one agent-harness task assertion to a Promptfoo assertion.
 *
 * - Agent-specific types are compiled by their dedicated builders.
 * - Standard types keep `type`, `value` (when present) and a numeric `weight`.
 * - `llm-rubric` additionally receives `options.graderProvider` as `provider`.
 * - Unknown types are passed through with a warning. They keep `value` and a
 *   numeric `weight`, so an unrecognised assertion does not silently lose
 *   its scoring weight.
 *
 * @param assertion - Task assertion; `type` selects the mapping.
 * @param options - Optional compile options (`graderProvider` is read).
 * @param warnings - Mutable list that receives a message for unknown types.
 * @returns The Promptfoo assertion for this task assertion.
 */
export function mapAgentAssertion(assertion, options, warnings) {
    // Shared shape for every pass-through branch: optional fields are only
    // emitted when the task actually supplied them.
    const passThrough = (type) => ({
        type,
        ...("value" in assertion ? { value: assertion.value } : {}),
        ...(typeof assertion.weight === "number"
            ? { weight: assertion.weight }
            : {}),
    });
    switch (assertion.type) {
        case "file-exists":
            return buildFileExistsAssertion(assertion);
        case "file-contains":
            return buildFileContainsAssertion(assertion);
        case "command-succeeds":
            return buildCommandSucceedsAssertion(assertion);
        case "diff-matches":
            return buildDiffMatchesAssertion(assertion);
        // Standard assertions pass through
        case "contains":
        case "equals":
        case "regex":
        case "is-json":
        case "javascript":
        case "python":
            return passThrough(assertion.type);
        case "llm-rubric":
            return {
                ...passThrough("llm-rubric"),
                ...(options?.graderProvider
                    ? { provider: options.graderProvider }
                    : {}),
            };
        default:
            warnings.push(`Agent task: unknown assertion type "${assertion.type}" — passed through`);
            return passThrough(assertion.type);
    }
}
|
|
54
|
+
// ---------------------------------------------------------------------------
|
|
55
|
+
// Agent-specific assertion builders
|
|
56
|
+
// ---------------------------------------------------------------------------
|
|
57
|
+
export function buildFileExistsAssertion(assertion) {
|
|
58
|
+
const filePath = String(assertion.value ?? "");
|
|
59
|
+
// Use JSON.stringify for all interpolated values in generated JS to
|
|
60
|
+
// prevent broken strings from filePaths containing quotes/backslashes
|
|
61
|
+
const safeFilePath = JSON.stringify(filePath);
|
|
62
|
+
return {
|
|
63
|
+
type: "javascript",
|
|
64
|
+
value: `// file-exists: ${filePath}\n` +
|
|
65
|
+
`(function() {\n` +
|
|
66
|
+
` const fs = require('fs');\n` +
|
|
67
|
+
` const path = require('path');\n` +
|
|
68
|
+
` const workDir = path.resolve(context.vars.__workingDir || '.');\n` +
|
|
69
|
+
` const target = path.resolve(workDir, ${safeFilePath});\n` +
|
|
70
|
+
` if (!target.startsWith(workDir + path.sep) && target !== workDir) {\n` +
|
|
71
|
+
` return { pass: false, score: 0, reason: 'Path traversal: ' + ${safeFilePath} + ' escapes sandbox' };\n` +
|
|
72
|
+
` }\n` +
|
|
73
|
+
` const exists = fs.existsSync(target);\n` +
|
|
74
|
+
` return {\n` +
|
|
75
|
+
` pass: exists,\n` +
|
|
76
|
+
` score: exists ? 1 : 0,\n` +
|
|
77
|
+
` reason: exists\n` +
|
|
78
|
+
` ? 'File exists: ' + ${safeFilePath}\n` +
|
|
79
|
+
` : 'Expected file not found: ' + ${safeFilePath},\n` +
|
|
80
|
+
` };\n` +
|
|
81
|
+
`})()`,
|
|
82
|
+
...(typeof assertion.weight === "number"
|
|
83
|
+
? { weight: assertion.weight }
|
|
84
|
+
: {}),
|
|
85
|
+
};
|
|
86
|
+
}
|
|
87
|
+
export function buildFileContainsAssertion(assertion) {
|
|
88
|
+
const config = assertion.value;
|
|
89
|
+
const filePath = config?.path ?? "";
|
|
90
|
+
const expectedContent = config?.content ?? "";
|
|
91
|
+
const safeFilePath = JSON.stringify(filePath);
|
|
92
|
+
return {
|
|
93
|
+
type: "javascript",
|
|
94
|
+
value: `// file-contains: ${filePath}\n` +
|
|
95
|
+
`(function() {\n` +
|
|
96
|
+
` const fs = require('fs');\n` +
|
|
97
|
+
` const path = require('path');\n` +
|
|
98
|
+
` const workDir = path.resolve(context.vars.__workingDir || '.');\n` +
|
|
99
|
+
` const target = path.resolve(workDir, ${safeFilePath});\n` +
|
|
100
|
+
` if (!target.startsWith(workDir + path.sep) && target !== workDir) {\n` +
|
|
101
|
+
` return { pass: false, score: 0, reason: 'Path traversal: ' + ${safeFilePath} + ' escapes sandbox' };\n` +
|
|
102
|
+
` }\n` +
|
|
103
|
+
` if (!fs.existsSync(target)) {\n` +
|
|
104
|
+
` return { pass: false, score: 0, reason: 'File not found: ' + ${safeFilePath} };\n` +
|
|
105
|
+
` }\n` +
|
|
106
|
+
` const content = fs.readFileSync(target, 'utf-8');\n` +
|
|
107
|
+
` const contains = content.includes(${JSON.stringify(expectedContent)});\n` +
|
|
108
|
+
` return {\n` +
|
|
109
|
+
` pass: contains,\n` +
|
|
110
|
+
` score: contains ? 1 : 0,\n` +
|
|
111
|
+
` reason: contains\n` +
|
|
112
|
+
` ? 'File contains expected content'\n` +
|
|
113
|
+
` : 'File does not contain expected content',\n` +
|
|
114
|
+
` };\n` +
|
|
115
|
+
`})()`,
|
|
116
|
+
...(typeof assertion.weight === "number"
|
|
117
|
+
? { weight: assertion.weight }
|
|
118
|
+
: {}),
|
|
119
|
+
};
|
|
120
|
+
}
|
|
121
|
+
/**
|
|
122
|
+
* SECURITY: Trusted-input boundary.
|
|
123
|
+
*
|
|
124
|
+
* The `command-succeeds` assertion executes an arbitrary shell command
|
|
125
|
+
* inside the sandbox's working directory. The command string comes from
|
|
126
|
+
* task definitions (YAML or TypeScript config files), which are authored
|
|
127
|
+
* by developers — not from user input or LLM output.
|
|
128
|
+
*
|
|
129
|
+
* This is intentional: the assertion is designed to verify agent output
|
|
130
|
+
* by running build/test commands (e.g., "npm test", "tsc --noEmit").
|
|
131
|
+
*
|
|
132
|
+
* Task definitions are the trust boundary. If you accept task definitions
|
|
133
|
+
* from untrusted sources, validate commands against an allowlist first.
|
|
134
|
+
*/
|
|
135
|
+
export function buildCommandSucceedsAssertion(assertion) {
|
|
136
|
+
const command = String(assertion.value ?? "");
|
|
137
|
+
return {
|
|
138
|
+
type: "javascript",
|
|
139
|
+
value: `// command-succeeds: ${command}\n` +
|
|
140
|
+
`(function() {\n` +
|
|
141
|
+
` const { execSync } = require('child_process');\n` +
|
|
142
|
+
` const workDir = context.vars.__workingDir || '.';\n` +
|
|
143
|
+
` try {\n` +
|
|
144
|
+
` execSync(${JSON.stringify(command)}, { cwd: workDir, timeout: 30000 });\n` +
|
|
145
|
+
` return { pass: true, score: 1, reason: 'Command succeeded: ' + ${JSON.stringify(command)} };\n` +
|
|
146
|
+
` } catch (err) {\n` +
|
|
147
|
+
` return {\n` +
|
|
148
|
+
` pass: false,\n` +
|
|
149
|
+
` score: 0,\n` +
|
|
150
|
+
` reason: 'Command failed: ' + (err.message || err),\n` +
|
|
151
|
+
` };\n` +
|
|
152
|
+
` }\n` +
|
|
153
|
+
`})()`,
|
|
154
|
+
...(typeof assertion.weight === "number"
|
|
155
|
+
? { weight: assertion.weight }
|
|
156
|
+
: {}),
|
|
157
|
+
};
|
|
158
|
+
}
|
|
159
|
+
export function buildDiffMatchesAssertion(assertion) {
|
|
160
|
+
const expected = assertion.value;
|
|
161
|
+
return {
|
|
162
|
+
type: "javascript",
|
|
163
|
+
value: `// diff-matches\n` +
|
|
164
|
+
`(function() {\n` +
|
|
165
|
+
` const { execSync } = require('child_process');\n` +
|
|
166
|
+
` const workDir = context.vars.__workingDir || '.';\n` +
|
|
167
|
+
` try {\n` +
|
|
168
|
+
` const diff = execSync('git diff', { cwd: workDir, encoding: 'utf-8' });\n` +
|
|
169
|
+
` const expected = ${JSON.stringify(expected)};\n` +
|
|
170
|
+
` if (typeof expected === 'string') {\n` +
|
|
171
|
+
` const contains = diff.includes(expected);\n` +
|
|
172
|
+
` return {\n` +
|
|
173
|
+
` pass: contains,\n` +
|
|
174
|
+
` score: contains ? 1 : 0,\n` +
|
|
175
|
+
` reason: contains ? 'Diff matches expected pattern' : 'Diff does not match',\n` +
|
|
176
|
+
` };\n` +
|
|
177
|
+
` }\n` +
|
|
178
|
+
` return { pass: diff.length > 0, score: diff.length > 0 ? 1 : 0, reason: 'Diff exists' };\n` +
|
|
179
|
+
` } catch (err) {\n` +
|
|
180
|
+
` return { pass: false, score: 0, reason: 'Failed to get diff: ' + err.message };\n` +
|
|
181
|
+
` }\n` +
|
|
182
|
+
`})()`,
|
|
183
|
+
...(typeof assertion.weight === "number"
|
|
184
|
+
? { weight: assertion.weight }
|
|
185
|
+
: {}),
|
|
186
|
+
};
|
|
187
|
+
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent harness task compilation — core compilation logic.
|
|
3
|
+
*
|
|
4
|
+
* Maps agent harness task definitions to Promptfoo configuration with:
|
|
5
|
+
* - Claude Agent SDK / OpenAI Codex SDK providers
|
|
6
|
+
* - Tool permission configuration (preset/allowed/disallowed)
|
|
7
|
+
* - Sandbox setup/teardown via Promptfoo extensions
|
|
8
|
+
* - Fixture provisioning into sandbox working directory
|
|
9
|
+
*/
|
|
10
|
+
import type { AgentHarnessTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
11
|
+
import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
|
|
12
|
+
import type { AgentHarnessCompileOptions, AgentHarnessCompileResult } from "./types.js";
|
|
13
|
+
/**
 * Compile an agent harness task definition into Promptfoo configuration.
 *
 * Validation problems do not throw; they are reported through the
 * `warnings` list of the result.
 *
 * @param task - The agent harness task to compile.
 * @param options - Optional compile options, forwarded to test-case
 *   assembly.
 * @returns Providers, tests, prompts, lifecycle extensions, sandbox
 *   configuration and collected warnings.
 */
export declare function compileAgentHarnessTask(task: AgentHarnessTaskDefinition, options?: AgentHarnessCompileOptions): AgentHarnessCompileResult;
|
|
17
|
+
/**
 * Build the Promptfoo provider list for an agent harness task.
 *
 * @param task - Task whose `id`, `title`, `tools` and `sandbox` are read.
 * @param _warnings - Warning collector; not used by the bundled
 *   implementation.
 * @returns A single-element array holding the provider with id
 *   `agent:<task.id>`.
 */
export declare function buildAgentProvider(task: AgentHarnessTaskDefinition, _warnings: string[]): PromptfooProvider[];
|
|
18
|
+
/**
 * Build the Promptfoo prompt list for an agent harness task.
 *
 * @param task - Task whose prompt text is taken from `prompt.text`, then
 *   `prompt.vars.task`, then `description`, then a title-based fallback.
 * @returns A single-element array holding the `agent-harness` prompt.
 */
export declare function buildAgentPrompts(task: AgentHarnessTaskDefinition): PromptfooPrompt[];
|
|
19
|
+
/**
 * Build the Promptfoo test cases for an agent harness task.
 *
 * @param task - Task whose assertions, prompt vars, sandbox type, fixtures
 *   and multi-turn turns are read.
 * @param options - Compile options forwarded to assertion mapping.
 * @param warnings - Collector that assertion mapping may append to.
 * @returns One test case, plus a second `[multi-turn]` test case when the
 *   task defines at least one turn.
 */
export declare function buildAgentTestCases(task: AgentHarnessTaskDefinition, options: AgentHarnessCompileOptions | undefined, warnings: string[]): PromptfooTestCase[];
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent harness task compilation — core compilation logic.
|
|
3
|
+
*
|
|
4
|
+
* Maps agent harness task definitions to Promptfoo configuration with:
|
|
5
|
+
* - Claude Agent SDK / OpenAI Codex SDK providers
|
|
6
|
+
* - Tool permission configuration (preset/allowed/disallowed)
|
|
7
|
+
* - Sandbox setup/teardown via Promptfoo extensions
|
|
8
|
+
* - Fixture provisioning into sandbox working directory
|
|
9
|
+
*/
|
|
10
|
+
import { mapAgentAssertion } from "./assertions.js";
|
|
11
|
+
import { buildLifecycleExtensions, buildSandboxConfig } from "./sandbox.js";
|
|
12
|
+
import { TOOL_PRESETS } from "./tool-presets.js";
|
|
13
|
+
import { validateAgentHarnessTask } from "./validation.js";
|
|
14
|
+
/**
 * Compile an agent harness task definition into Promptfoo configuration.
 *
 * Validation errors are not thrown; each becomes an entry in `warnings`.
 *
 * @param task - The agent harness task definition.
 * @param options - Optional compile options, forwarded to test-case assembly.
 * @returns `{ providers, tests, prompts, extensions, sandboxConfig, warnings }`.
 */
export function compileAgentHarnessTask(task, options) {
    // Seed the warning list with one message per validation error.
    const warnings = [...validateAgentHarnessTask(task)].map(
        (issue) => `Agent harness task "${task.id}": ${issue.field} — ${issue.message}`,
    );

    // Provider, prompt and test-case assembly (the builders may add warnings).
    const providers = buildAgentProvider(task, warnings);
    const prompts = buildAgentPrompts(task);
    const tests = buildAgentTestCases(task, options, warnings);

    // Sandbox metadata and the lifecycle extensions derived from it.
    const sandboxConfig = buildSandboxConfig(task);
    const extensions = buildLifecycleExtensions(task, sandboxConfig);

    return { providers, tests, prompts, extensions, sandboxConfig, warnings };
}
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
// Provider assembly
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
/**
 * Assemble the Promptfoo provider entry for an agent harness task.
 *
 * The provider config carries `allowedTools` only when the task resolves to
 * at least one tool, and `sandbox` only when the task declares one.
 *
 * @param task - Task whose `id`, `title`, `tools` and `sandbox` are read.
 * @param _warnings - Warning collector; currently unused.
 * @returns A single-element array with the provider for this task.
 */
export function buildAgentProvider(task, _warnings) {
    const allowedTools = resolveToolPermissions(task.tools);
    const sandbox = task.sandbox;
    const config = {
        ...(allowedTools.length > 0 ? { allowedTools } : {}),
        ...(sandbox
            ? {
                sandbox: {
                    type: sandbox.type,
                    ...(sandbox.image ? { image: sandbox.image } : {}),
                },
            }
            : {}),
    };
    // Default to Claude Agent SDK provider
    const provider = {
        id: `agent:${task.id}`,
        label: `Agent Harness: ${task.title}`,
        config,
    };
    return [provider];
}
|
|
60
|
+
/**
 * Expand a task's `tools` list into a de-duplicated list of tool names.
 *
 * Each entry is looked up in TOOL_PRESETS:
 * - a preset name (e.g. "coding", "read-only", "full-access") contributes
 *   every tool of that preset;
 * - any other entry (e.g. "Bash", "Read", "Write") is kept as-is;
 * - mixed arrays such as ["coding", "WebSearch"] combine both.
 *
 * First-seen order is preserved. A missing or empty list yields [].
 */
function resolveToolPermissions(tools) {
    const hasTools = Boolean(tools) && tools.length !== 0;
    if (!hasTools) {
        return [];
    }
    const names = new Set();
    [...tools].forEach((entry) => {
        const presetTools = TOOL_PRESETS[entry];
        const members = presetTools ? [...presetTools] : [entry];
        members.forEach((name) => names.add(name));
    });
    return Array.from(names);
}
|
|
84
|
+
// ---------------------------------------------------------------------------
|
|
85
|
+
// Prompt assembly
|
|
86
|
+
// ---------------------------------------------------------------------------
|
|
87
|
+
/**
 * Build the single Promptfoo prompt for an agent harness task.
 *
 * The prompt text is the first value that is neither `null` nor
 * `undefined` among `task.prompt.text`, `task.prompt.vars.task` and
 * `task.description`; if none is set, a title-based fallback is used.
 *
 * @param task - The agent harness task definition.
 * @returns A single-element array with the `agent-harness` prompt.
 */
export function buildAgentPrompts(task) {
    const candidates = [
        task.prompt?.text,
        task.prompt?.vars?.task,
        task.description,
    ];
    const firstSet = candidates.find(
        (candidate) => candidate !== undefined && candidate !== null,
    );
    const hasCandidate = firstSet !== undefined;
    const raw = String(hasCandidate ? firstSet : `Agent task: ${task.title}`);
    const prompt = {
        id: "agent-harness",
        label: `Agent: ${task.title}`,
        raw,
    };
    return [prompt];
}
|
|
100
|
+
// ---------------------------------------------------------------------------
|
|
101
|
+
// Test case assembly
|
|
102
|
+
// ---------------------------------------------------------------------------
|
|
103
|
+
/**
 * Build the Promptfoo test cases for an agent harness task.
 *
 * Every task yields one base test case. When the task defines at least one
 * multi-turn turn, a second `[multi-turn]` test case is added that carries
 * the turns in `vars.__multiTurn`. Both share the same mapped assertions.
 *
 * @param task - The agent harness task definition.
 * @param options - Compile options forwarded to assertion mapping.
 * @param warnings - Collector that assertion mapping may append to.
 * @returns The list of test cases (one or two entries).
 */
export function buildAgentTestCases(task, options, warnings) {
    // Map every task assertion, dropping any that map to a falsy value.
    const assertions = [...(task.assertions || [])]
        .map((entry) => mapAgentAssertion(entry, options, warnings))
        .filter(Boolean);
    // `assert` is only attached when there is something to check.
    const assertPart = assertions.length > 0 ? { assert: assertions } : {};

    const promptVars = task.prompt?.vars;
    const vars = {
        task: promptVars?.task ?? task.description ?? `Complete: ${task.title}`,
        ...(promptVars ?? {}),
        // Internal metadata for sandbox lifecycle hooks
        __sandboxType: task.sandbox?.type ?? "tempdir",
        __fixtures: task.fixtures ?? [],
    };

    const baseName = `${task.id} — ${task.title}`;
    const tests = [{ description: baseName, vars, ...assertPart }];

    // Multi-turn support
    const turns = task.multiTurn?.turns;
    if (turns && turns.length > 0) {
        tests.push({
            description: `${baseName} [multi-turn]`,
            vars: { ...vars, __multiTurn: turns },
            ...assertPart,
        });
    }
    return tests;
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent harness mode handler — compiles AgentHarnessTaskDefinition into Promptfoo config.
|
|
3
|
+
*
|
|
4
|
+
* @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
|
|
5
|
+
* @see packages/core/src/types/generalized-task.ts — AgentHarnessTaskDefinition
|
|
6
|
+
*/
|
|
7
|
+
import type { ModeHandler } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
8
|
+
export { AGENT_HARNESS_PROMPT_TEMPLATES } from "./prompts.js";
|
|
9
|
+
export { TOOL_PRESETS } from "./tool-presets.js";
|
|
10
|
+
export { validateAgentHarnessTask } from "./validation.js";
|
|
11
|
+
export { mapAgentAssertion, buildFileExistsAssertion, buildFileContainsAssertion, buildCommandSucceedsAssertion, buildDiffMatchesAssertion, } from "./assertions.js";
|
|
12
|
+
export { buildLifecycleExtensions, buildBeforeEachHook, buildAfterEachHook, buildSandboxConfig, } from "./sandbox.js";
|
|
13
|
+
export { compileAgentHarnessTask, buildAgentProvider, buildAgentPrompts, buildAgentTestCases, } from "./compiler.js";
|
|
14
|
+
export type { AgentHarnessCompileOptions, AgentHarnessCompileResult, AgentHarnessValidationError, PromptfooExtension, SandboxConfigMeta, } from "./types.js";
|
|
15
|
+
/**
 * ModeHandler-conformant export for the agent-harness evaluation mode.
 *
 * NOTE(review): presumably delegates to `compileAgentHarnessTask`; the
 * implementation is not visible from this declaration — confirm.
 */
export declare const handler: ModeHandler;
|