@sanity/ailf 1.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +29 -12
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
- package/dist/_vendor/ailf-core/config-helpers.js +51 -2
- package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
- package/dist/_vendor/ailf-core/examples/index.js +213 -94
- package/dist/_vendor/ailf-core/index.d.ts +3 -2
- package/dist/_vendor/ailf-core/index.js +2 -1
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
- package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +7 -1
- package/dist/adapters/config-sources/ts-config-loader.js +21 -13
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
- package/dist/adapters/task-sources/index.d.ts +3 -4
- package/dist/adapters/task-sources/index.js +3 -4
- package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
- package/dist/adapters/task-sources/repo-schemas.js +228 -20
- package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
- package/dist/adapters/task-sources/repo-task-source.js +81 -122
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
- package/dist/adapters/task-sources/task-file-loader.js +21 -7
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/coverage-audit.js +3 -1
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +5 -4
- package/dist/commands/init.js +190 -25
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +15 -4
- package/dist/composition-root.js +100 -55
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/build-step-sequence.js +4 -2
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +32 -19
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +77 -26
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +51 -31
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
- package/dist/pipeline/compiler/literacy-bridge.js +2 -2
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
- package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
- package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/expand-tasks.d.ts +2 -2
- package/dist/pipeline/expand-tasks.js +2 -2
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +16 -9
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +16 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
- package/dist/pipeline/mirror-repo-tasks.js +10 -10
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +68 -30
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +32 -24
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent harness task compilation — core compilation logic.
|
|
3
|
+
*
|
|
4
|
+
* Maps agent harness task definitions to Promptfoo configuration with:
|
|
5
|
+
* - Claude Agent SDK / OpenAI Codex SDK providers
|
|
6
|
+
* - Tool permission configuration (preset/allowed/disallowed)
|
|
7
|
+
* - Sandbox setup/teardown via Promptfoo extensions
|
|
8
|
+
* - Fixture provisioning into sandbox working directory
|
|
9
|
+
*/
|
|
10
|
+
import { mapAgentAssertion } from "./assertions.js";
|
|
11
|
+
import { buildLifecycleExtensions, buildSandboxConfig } from "./sandbox.js";
|
|
12
|
+
import { TOOL_PRESETS } from "./tool-presets.js";
|
|
13
|
+
import { validateAgentHarnessTask } from "./validation.js";
|
|
14
|
+
/**
 * Compile an agent harness task definition into Promptfoo configuration.
 *
 * Validation problems do not abort compilation: each one is turned into a
 * human-readable warning string and returned alongside the compiled pieces.
 *
 * @param task - Agent harness task definition to compile.
 * @param options - Compilation options, forwarded to test-case assembly.
 * @returns An object holding `providers`, `tests`, `prompts`, `extensions`,
 *   `sandboxConfig` and the accumulated `warnings`.
 */
export function compileAgentHarnessTask(task, options) {
    // Seed the warning list with one entry per validation error.
    const warnings = Array.from(validateAgentHarnessTask(task), (issue) => `Agent harness task "${task.id}": ${issue.field} — ${issue.message}`);
    // The builders below receive the same array so they can append to it.
    const providers = buildAgentProvider(task, warnings);
    const prompts = buildAgentPrompts(task);
    const tests = buildAgentTestCases(task, options, warnings);
    // Lifecycle extensions are derived from the resolved sandbox config.
    const sandboxConfig = buildSandboxConfig(task);
    const extensions = buildLifecycleExtensions(task, sandboxConfig);
    return {
        providers,
        tests,
        prompts,
        extensions,
        sandboxConfig,
        warnings,
    };
}
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
// Provider assembly
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
/**
 * Build the provider list for an agent harness task.
 *
 * @param task - Task definition; `tools`, `sandbox`, `id` and `title` are read.
 * @param _warnings - Warning accumulator; not used by this function.
 * @returns A one-element array with the provider `id`, `label` and `config`.
 */
export function buildAgentProvider(task, _warnings) {
    const config = {};
    // Only emit allowedTools when at least one tool was resolved.
    const allowedTools = resolveToolPermissions(task.tools);
    if (allowedTools.length > 0) {
        config.allowedTools = allowedTools;
    }
    if (task.sandbox) {
        const { type, image } = task.sandbox;
        // `image` is included only when it is truthy.
        config.sandbox = image ? { type, image } : { type };
    }
    const provider = {
        id: `agent:${task.id}`,
        label: `Agent Harness: ${task.title}`,
        config,
    };
    return [provider];
}
|
|
60
|
+
/**
|
|
61
|
+
* Resolve tool permissions from task config.
|
|
62
|
+
*
|
|
63
|
+
* Handles:
|
|
64
|
+
* - Preset names ("coding", "read-only", "full-access")
|
|
65
|
+
* - Explicit tool names ("Bash", "Read", "Write")
|
|
66
|
+
* - Mixed arrays ["coding", "WebSearch"] → preset expansion + extras
|
|
67
|
+
*/
|
|
68
|
+
function resolveToolPermissions(tools) {
|
|
69
|
+
if (!tools || tools.length === 0)
|
|
70
|
+
return [];
|
|
71
|
+
const resolved = new Set();
|
|
72
|
+
for (const tool of tools) {
|
|
73
|
+
const preset = TOOL_PRESETS[tool];
|
|
74
|
+
if (preset) {
|
|
75
|
+
for (const t of preset)
|
|
76
|
+
resolved.add(t);
|
|
77
|
+
}
|
|
78
|
+
else {
|
|
79
|
+
resolved.add(tool);
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
return [...resolved];
|
|
83
|
+
}
|
|
84
|
+
// ---------------------------------------------------------------------------
|
|
85
|
+
// Prompt assembly
|
|
86
|
+
// ---------------------------------------------------------------------------
|
|
87
|
+
export function buildAgentPrompts(task) {
|
|
88
|
+
const promptText = task.prompt?.text ??
|
|
89
|
+
task.prompt?.vars?.task ??
|
|
90
|
+
task.description ??
|
|
91
|
+
`Agent task: ${task.title}`;
|
|
92
|
+
return [
|
|
93
|
+
{
|
|
94
|
+
id: "agent-harness",
|
|
95
|
+
label: `Agent: ${task.title}`,
|
|
96
|
+
raw: String(promptText),
|
|
97
|
+
},
|
|
98
|
+
];
|
|
99
|
+
}
|
|
100
|
+
// ---------------------------------------------------------------------------
|
|
101
|
+
// Test case assembly
|
|
102
|
+
// ---------------------------------------------------------------------------
|
|
103
|
+
export function buildAgentTestCases(task, options, warnings) {
|
|
104
|
+
const assertions = [];
|
|
105
|
+
if (task.assertions) {
|
|
106
|
+
for (const assertion of task.assertions) {
|
|
107
|
+
const mapped = mapAgentAssertion(assertion, options, warnings);
|
|
108
|
+
if (mapped)
|
|
109
|
+
assertions.push(mapped);
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
const vars = {
|
|
113
|
+
task: task.prompt?.vars?.task ?? task.description ?? `Complete: ${task.title}`,
|
|
114
|
+
...(task.prompt?.vars ?? {}),
|
|
115
|
+
// Internal metadata for sandbox lifecycle hooks
|
|
116
|
+
__sandboxType: task.sandbox?.type ?? "tempdir",
|
|
117
|
+
__fixtures: task.fixtures ?? [],
|
|
118
|
+
};
|
|
119
|
+
const tests = [
|
|
120
|
+
{
|
|
121
|
+
description: `${task.id} — ${task.title}`,
|
|
122
|
+
vars,
|
|
123
|
+
...(assertions.length > 0 ? { assert: assertions } : {}),
|
|
124
|
+
},
|
|
125
|
+
];
|
|
126
|
+
// Multi-turn support
|
|
127
|
+
if (task.multiTurn?.turns && task.multiTurn.turns.length > 0) {
|
|
128
|
+
tests.push({
|
|
129
|
+
description: `${task.id} — ${task.title} [multi-turn]`,
|
|
130
|
+
vars: {
|
|
131
|
+
...vars,
|
|
132
|
+
__multiTurn: task.multiTurn.turns,
|
|
133
|
+
},
|
|
134
|
+
...(assertions.length > 0 ? { assert: assertions } : {}),
|
|
135
|
+
});
|
|
136
|
+
}
|
|
137
|
+
return tests;
|
|
138
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent harness mode handler — compiles AgentHarnessTaskDefinition into Promptfoo config.
|
|
3
|
+
*
|
|
4
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
|
|
5
|
+
* @see packages/core/src/types/generalized-task.ts — AgentHarnessTaskDefinition
|
|
6
|
+
*/
|
|
7
|
+
import type { ModeHandler } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
8
|
+
export { AGENT_HARNESS_PROMPT_TEMPLATES } from "./prompts.js";
|
|
9
|
+
export { TOOL_PRESETS } from "./tool-presets.js";
|
|
10
|
+
export { validateAgentHarnessTask } from "./validation.js";
|
|
11
|
+
export { mapAgentAssertion, buildFileExistsAssertion, buildFileContainsAssertion, buildCommandSucceedsAssertion, buildDiffMatchesAssertion, } from "./assertions.js";
|
|
12
|
+
export { buildLifecycleExtensions, buildBeforeEachHook, buildAfterEachHook, buildSandboxConfig, } from "./sandbox.js";
|
|
13
|
+
export { compileAgentHarnessTask, buildAgentProvider, buildAgentPrompts, buildAgentTestCases, } from "./compiler.js";
|
|
14
|
+
export type { AgentHarnessCompileOptions, AgentHarnessCompileResult, AgentHarnessValidationError, PromptfooExtension, SandboxConfigMeta, } from "./types.js";
|
|
15
|
+
/** ModeHandler-conformant export for the agent-harness evaluation mode. */
|
|
16
|
+
export declare const handler: ModeHandler;
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent harness mode handler — compiles AgentHarnessTaskDefinition into Promptfoo config.
|
|
3
|
+
*
|
|
4
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
|
|
5
|
+
* @see packages/core/src/types/generalized-task.ts — AgentHarnessTaskDefinition
|
|
6
|
+
*/
|
|
7
|
+
import { compileAgentHarnessTask } from "./compiler.js";
|
|
8
|
+
import { AGENT_HARNESS_PROMPT_TEMPLATES } from "./prompts.js";
|
|
9
|
+
// Re-export public API
|
|
10
|
+
export { AGENT_HARNESS_PROMPT_TEMPLATES } from "./prompts.js";
|
|
11
|
+
export { TOOL_PRESETS } from "./tool-presets.js";
|
|
12
|
+
export { validateAgentHarnessTask } from "./validation.js";
|
|
13
|
+
export { mapAgentAssertion, buildFileExistsAssertion, buildFileContainsAssertion, buildCommandSucceedsAssertion, buildDiffMatchesAssertion, } from "./assertions.js";
|
|
14
|
+
export { buildLifecycleExtensions, buildBeforeEachHook, buildAfterEachHook, buildSandboxConfig, } from "./sandbox.js";
|
|
15
|
+
export { compileAgentHarnessTask, buildAgentProvider, buildAgentPrompts, buildAgentTestCases, } from "./compiler.js";
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
// ModeHandler adapter — wraps compileAgentHarnessTask for registry dispatch
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
/** ModeHandler-conformant export for the agent-harness evaluation mode. */
|
|
20
|
+
export const handler = {
|
|
21
|
+
getPrompts() {
|
|
22
|
+
return AGENT_HARNESS_PROMPT_TEMPLATES;
|
|
23
|
+
},
|
|
24
|
+
compileTask(task, ctx) {
|
|
25
|
+
if (!("mode" in task) || task.mode !== "agent-harness") {
|
|
26
|
+
throw new Error(`Agent harness handler received task with mode "${task.mode ?? "undefined"}" — expected "agent-harness"`);
|
|
27
|
+
}
|
|
28
|
+
const result = compileAgentHarnessTask(task, {
|
|
29
|
+
graderProvider: ctx.graderProvider,
|
|
30
|
+
rootDir: ctx.rootDir,
|
|
31
|
+
});
|
|
32
|
+
return {
|
|
33
|
+
providers: result.providers,
|
|
34
|
+
tests: result.tests,
|
|
35
|
+
prompts: result.prompts,
|
|
36
|
+
warnings: result.warnings,
|
|
37
|
+
extras: {
|
|
38
|
+
extensions: result.extensions,
|
|
39
|
+
sandboxConfig: result.sandboxConfig,
|
|
40
|
+
},
|
|
41
|
+
};
|
|
42
|
+
},
|
|
43
|
+
};
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Canonical prompt templates for agent-harness-mode evaluations.
|
|
3
|
+
*
|
|
4
|
+
* Handler-owned prompts for agent harness evaluations. Describes the task
|
|
5
|
+
* for autonomous agent execution within a sandboxed environment with file
|
|
6
|
+
* system and tool access.
|
|
7
|
+
*/
|
|
8
|
+
import type { PromptTemplate } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
9
|
+
/**
 * Prompt templates owned by the agent-harness handler, as a map from a
 * string key to its `PromptTemplate`.
 */
export declare const AGENT_HARNESS_PROMPT_TEMPLATES: Record<string, PromptTemplate>;
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Canonical prompt templates for agent-harness-mode evaluations.
|
|
3
|
+
*
|
|
4
|
+
* Handler-owned prompts for agent harness evaluations. Describes the task
|
|
5
|
+
* for autonomous agent execution within a sandboxed environment with file
|
|
6
|
+
* system and tool access.
|
|
7
|
+
*/
|
|
8
|
+
export const AGENT_HARNESS_PROMPT_TEMPLATES = {
|
|
9
|
+
"agent-harness": {
|
|
10
|
+
id: "agent-harness",
|
|
11
|
+
label: "Agent Harness Task",
|
|
12
|
+
template: `You are a coding agent working in a sandboxed environment. You have access to file system tools (read, write, edit) and a shell to complete the following task.
|
|
13
|
+
|
|
14
|
+
## Task
|
|
15
|
+
{{task}}
|
|
16
|
+
|
|
17
|
+
## Instructions
|
|
18
|
+
|
|
19
|
+
1. Read existing files to understand the project structure before making changes
|
|
20
|
+
2. Implement a complete, working solution — no placeholders or TODOs
|
|
21
|
+
3. Ensure all necessary imports and dependencies are included
|
|
22
|
+
4. Verify your implementation compiles and passes any provided test commands
|
|
23
|
+
5. Keep changes minimal and focused on the task
|
|
24
|
+
|
|
25
|
+
Complete the implementation:
|
|
26
|
+
`,
|
|
27
|
+
variables: ["task"],
|
|
28
|
+
},
|
|
29
|
+
};
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Sandbox configuration and lifecycle extensions for agent harness tasks.
|
|
3
|
+
*
|
|
4
|
+
* Builds Promptfoo beforeEach/afterEach hooks for provisioning and
|
|
5
|
+
* tearing down sandbox working directories.
|
|
6
|
+
*/
|
|
7
|
+
import type { AgentHarnessTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
8
|
+
import type { PromptfooExtension, SandboxConfigMeta } from "./types.js";
|
|
9
|
+
/**
 * Derive sandbox metadata from a task definition.
 *
 * The sandbox type falls back to "tempdir" and fixtures to an empty list
 * when the task does not set them; `image` and `limits` are copied from
 * `task.sandbox` when present.
 */
export declare function buildSandboxConfig(task: AgentHarnessTaskDefinition): SandboxConfigMeta;
|
|
10
|
+
/**
 * Build the sandbox lifecycle hooks for a task.
 *
 * Returns two extensions in order: a "beforeEach" hook generated from the
 * task id and `sandboxConfig`, then an "afterEach" hook generated from the
 * task id.
 */
export declare function buildLifecycleExtensions(task: AgentHarnessTaskDefinition, sandboxConfig: SandboxConfigMeta): PromptfooExtension[];
|
|
11
|
+
/**
 * Generate the JavaScript source text of the beforeEach hook.
 *
 * The generated function creates a uniquely named directory under the OS
 * temp directory and records it on the test vars as `__workingDir` and
 * `__sandboxId`. The fixture list from `config` is embedded as a comment.
 */
export declare function buildBeforeEachHook(taskId: string, config: SandboxConfigMeta): string;
|
|
12
|
+
/**
 * Generate the JavaScript source text of the afterEach hook.
 *
 * The generated function lists the files under `vars.__workingDir` into
 * `vars.__artifacts.modifiedFiles`, then removes the directory, guarded by
 * a check against the OS temp directory.
 */
export declare function buildAfterEachHook(taskId: string): string;
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Sandbox configuration and lifecycle extensions for agent harness tasks.
|
|
3
|
+
*
|
|
4
|
+
* Builds Promptfoo beforeEach/afterEach hooks for provisioning and
|
|
5
|
+
* tearing down sandbox working directories.
|
|
6
|
+
*/
|
|
7
|
+
// ---------------------------------------------------------------------------
|
|
8
|
+
// Sandbox configuration
|
|
9
|
+
// ---------------------------------------------------------------------------
|
|
10
|
+
export function buildSandboxConfig(task) {
|
|
11
|
+
return {
|
|
12
|
+
type: task.sandbox?.type ?? "tempdir",
|
|
13
|
+
image: task.sandbox?.image,
|
|
14
|
+
fixtures: task.fixtures ?? [],
|
|
15
|
+
limits: task.sandbox?.limits
|
|
16
|
+
? {
|
|
17
|
+
cpus: task.sandbox.limits.cpus,
|
|
18
|
+
memoryBytes: task.sandbox.limits.memoryBytes,
|
|
19
|
+
networkAccess: task.sandbox.limits.networkAccess,
|
|
20
|
+
}
|
|
21
|
+
: undefined,
|
|
22
|
+
};
|
|
23
|
+
}
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
// Lifecycle extensions
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
export function buildLifecycleExtensions(task, sandboxConfig) {
|
|
28
|
+
const extensions = [];
|
|
29
|
+
// beforeEach: provision sandbox + inject fixtures
|
|
30
|
+
extensions.push({
|
|
31
|
+
type: "beforeEach",
|
|
32
|
+
code: buildBeforeEachHook(task.id, sandboxConfig),
|
|
33
|
+
});
|
|
34
|
+
// afterEach: collect artifacts + teardown
|
|
35
|
+
extensions.push({
|
|
36
|
+
type: "afterEach",
|
|
37
|
+
code: buildAfterEachHook(task.id),
|
|
38
|
+
});
|
|
39
|
+
return extensions;
|
|
40
|
+
}
|
|
41
|
+
export function buildBeforeEachHook(taskId, config) {
|
|
42
|
+
return (`// beforeEach: provision sandbox for ${taskId}\n` +
|
|
43
|
+
`async function({ vars }) {\n` +
|
|
44
|
+
` const { mkdirSync, writeFileSync } = require('fs');\n` +
|
|
45
|
+
` const { tmpdir } = require('os');\n` +
|
|
46
|
+
` const { resolve } = require('path');\n` +
|
|
47
|
+
` const id = 'ailf-${taskId}-' + require('crypto').randomUUID().slice(0, 8);\n` +
|
|
48
|
+
` const workDir = resolve(tmpdir(), id);\n` +
|
|
49
|
+
` mkdirSync(workDir, { recursive: true });\n` +
|
|
50
|
+
` vars.__workingDir = workDir;\n` +
|
|
51
|
+
` vars.__sandboxId = id;\n` +
|
|
52
|
+
` // Fixture list: ${JSON.stringify(config.fixtures)}\n` +
|
|
53
|
+
`}`);
|
|
54
|
+
}
|
|
55
|
+
export function buildAfterEachHook(taskId) {
|
|
56
|
+
return (`// afterEach: collect artifacts + teardown for ${taskId}\n` +
|
|
57
|
+
`async function({ vars }) {\n` +
|
|
58
|
+
` const { rmSync, readdirSync, existsSync } = require('fs');\n` +
|
|
59
|
+
` const workDir = vars.__workingDir;\n` +
|
|
60
|
+
` if (workDir && existsSync(workDir)) {\n` +
|
|
61
|
+
` try {\n` +
|
|
62
|
+
` // Collect modified files list\n` +
|
|
63
|
+
` const files = [];\n` +
|
|
64
|
+
` function collect(dir, prefix) {\n` +
|
|
65
|
+
` for (const e of readdirSync(dir, { withFileTypes: true })) {\n` +
|
|
66
|
+
` const rel = prefix ? prefix + '/' + e.name : e.name;\n` +
|
|
67
|
+
` if (e.isDirectory()) collect(require('path').resolve(dir, e.name), rel);\n` +
|
|
68
|
+
` else files.push(rel);\n` +
|
|
69
|
+
` }\n` +
|
|
70
|
+
` }\n` +
|
|
71
|
+
` collect(workDir, '');\n` +
|
|
72
|
+
` vars.__artifacts = { modifiedFiles: files };\n` +
|
|
73
|
+
` } finally {\n` +
|
|
74
|
+
` // Guard: only delete directories under os.tmpdir()\n` +
|
|
75
|
+
` const tmp = require('os').tmpdir();\n` +
|
|
76
|
+
` if (require('path').resolve(workDir).startsWith(require('path').resolve(tmp))) {\n` +
|
|
77
|
+
` rmSync(workDir, { recursive: true, force: true });\n` +
|
|
78
|
+
` }\n` +
|
|
79
|
+
` }\n` +
|
|
80
|
+
` }\n` +
|
|
81
|
+
`}`);
|
|
82
|
+
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Predefined tool permission sets for agent harness evaluations.
|
|
3
|
+
*/
|
|
4
|
+
export const TOOL_PRESETS = {
|
|
5
|
+
coding: ["Bash", "Read", "Write", "Edit", "Glob", "Grep"],
|
|
6
|
+
"full-access": [
|
|
7
|
+
"Bash",
|
|
8
|
+
"Read",
|
|
9
|
+
"Write",
|
|
10
|
+
"Edit",
|
|
11
|
+
"Glob",
|
|
12
|
+
"Grep",
|
|
13
|
+
"WebSearch",
|
|
14
|
+
"WebFetch",
|
|
15
|
+
"TodoRead",
|
|
16
|
+
"TodoWrite",
|
|
17
|
+
],
|
|
18
|
+
"read-only": ["Read", "Glob", "Grep", "WebSearch"],
|
|
19
|
+
};
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared types for the agent harness mode handler.
|
|
3
|
+
*/
|
|
4
|
+
import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
|
|
5
|
+
import type { SandboxType } from "../../sandbox/sandbox-strategy.js";
|
|
6
|
+
/**
 * Optional settings accepted when compiling an agent harness task.
 * Both fields may be omitted.
 */
export interface AgentHarnessCompileOptions {
  /** Root directory for fixture resolution. */
  rootDir?: string;
  /** Provider used for LLM-graded assertions. */
  graderProvider?: string;
}
|
|
13
|
+
/**
 * Everything produced by compiling a single agent harness task.
 */
export interface AgentHarnessCompileResult {
  /** Promptfoo provider entries for the task. */
  providers: PromptfooProvider[];
  /** Prompts to evaluate. */
  prompts: PromptfooPrompt[];
  /** Compiled Promptfoo test cases. */
  tests: PromptfooTestCase[];
  /** Sandbox configuration metadata derived from the task. */
  sandboxConfig: SandboxConfigMeta;
  /** Promptfoo extension hooks that drive the sandbox lifecycle. */
  extensions: PromptfooExtension[];
  /** Human-readable warnings collected while compiling. */
  warnings: string[];
}
|
|
28
|
+
/**
 * A single Promptfoo extension hook.
 */
export interface PromptfooExtension {
  /** Lifecycle point at which the hook runs. */
  type: "beforeEach" | "afterEach";
  /** JavaScript code or module path for the hook. */
  code: string;
}
|
|
34
|
+
/**
 * Sandbox configuration metadata for one task.
 */
export interface SandboxConfigMeta {
  /** Sandbox strategy to use. */
  type: SandboxType;
  /** Fixture entries associated with the task (may be empty). */
  fixtures: string[];
  /** Optional image identifier — presumably for container-based sandbox types; confirm against the sandbox strategies. */
  image?: string;
  /** Optional resource limits; every field is itself optional. */
  limits?: {
    cpus?: number;
    memoryBytes?: number;
    networkAccess?: boolean;
  };
}
|
|
45
|
+
/**
 * One validation problem found in an agent harness task definition.
 */
export interface AgentHarnessValidationError {
  /** Name of the offending field. */
  field: string;
  /** Human-readable description of the problem. */
  message: string;
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Validation for agent harness task definitions.
|
|
3
|
+
*/
|
|
4
|
+
import type { AgentHarnessTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
5
|
+
import type { AgentHarnessValidationError } from "./types.js";
|
|
6
|
+
/**
|
|
7
|
+
* Validate that an agent harness task definition has all required fields.
|
|
8
|
+
*/
|
|
9
|
+
export declare function validateAgentHarnessTask(task: AgentHarnessTaskDefinition): AgentHarnessValidationError[];
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Validation for agent harness task definitions.
|
|
3
|
+
*/
|
|
4
|
+
/**
|
|
5
|
+
* Validate that an agent harness task definition has all required fields.
|
|
6
|
+
*/
|
|
7
|
+
export function validateAgentHarnessTask(task) {
|
|
8
|
+
const errors = [];
|
|
9
|
+
if (!task.id) {
|
|
10
|
+
errors.push({ field: "id", message: "Task ID is required" });
|
|
11
|
+
}
|
|
12
|
+
if (!task.title) {
|
|
13
|
+
errors.push({ field: "title", message: "Task title is required" });
|
|
14
|
+
}
|
|
15
|
+
return errors;
|
|
16
|
+
}
|
|
@@ -6,11 +6,10 @@
|
|
|
6
6
|
* is the extensibility mechanism for the compiler: adding a new
|
|
7
7
|
* evaluation mode means adding a new handler file.
|
|
8
8
|
*
|
|
9
|
-
* @see docs/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
|
|
10
|
-
* @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
|
|
9
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
|
|
10
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
|
|
11
11
|
*/
|
|
12
|
-
export { compileMCPTask, handler as mcpServerHandler, validateMCPTask, type MCPCompileOptions, type MCPCompileResult, type MCPValidationError, } from "./mcp-server
|
|
13
|
-
export {
|
|
14
|
-
export {
|
|
15
|
-
export {
|
|
16
|
-
export { compileAgentHarnessTask, handler as agentHarnessHandler, validateAgentHarnessTask, type AgentHarnessCompileOptions, type AgentHarnessCompileResult, type AgentHarnessValidationError, type PromptfooExtension, type SandboxConfigMeta, } from "./agent-harness-handler.js";
|
|
12
|
+
export { buildMCPAssertions, compileMCPTask, handler as mcpServerHandler, validateMCPTask, type MCPAssertionContext, type MCPCompileOptions, type MCPCompileResult, type MCPValidationError, } from "./mcp-server/index.js";
|
|
13
|
+
export { compileLiteracyTask, handler as literacyHandler, validateLiteracyTask, type LiteracyCompileOptions, type LiteracyCompileResult, type LiteracyValidationError, type RubricConfig, } from "./literacy/index.js";
|
|
14
|
+
export { compileKnowledgeProbeTask, handler as knowledgeProbeHandler, validateKnowledgeProbeTask, type KnowledgeProbeCompileOptions, type KnowledgeProbeCompileResult, type KnowledgeProbeMetadata, type KnowledgeProbeValidationError, } from "./knowledge-probe/index.js";
|
|
15
|
+
export { compileAgentHarnessTask, handler as agentHarnessHandler, validateAgentHarnessTask, type AgentHarnessCompileOptions, type AgentHarnessCompileResult, type AgentHarnessValidationError, type PromptfooExtension, type SandboxConfigMeta, } from "./agent-harness/index.js";
|
|
@@ -6,16 +6,14 @@
|
|
|
6
6
|
* is the extensibility mechanism for the compiler: adding a new
|
|
7
7
|
* evaluation mode means adding a new handler file.
|
|
8
8
|
*
|
|
9
|
-
* @see docs/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
|
|
10
|
-
* @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
|
|
9
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
|
|
10
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
|
|
11
11
|
*/
|
|
12
12
|
// MCP Server mode
|
|
13
|
-
export { compileMCPTask, handler as mcpServerHandler, validateMCPTask, } from "./mcp-server
|
|
14
|
-
// MCP assertion builders
|
|
15
|
-
export { buildMCPAssertions, } from "./mcp-assertions.js";
|
|
13
|
+
export { buildMCPAssertions, compileMCPTask, handler as mcpServerHandler, validateMCPTask, } from "./mcp-server/index.js";
|
|
16
14
|
// Literacy mode
|
|
17
|
-
export { compileLiteracyTask, handler as literacyHandler, validateLiteracyTask, } from "./literacy
|
|
15
|
+
export { compileLiteracyTask, handler as literacyHandler, validateLiteracyTask, } from "./literacy/index.js";
|
|
18
16
|
// Knowledge Probe mode
|
|
19
|
-
export { compileKnowledgeProbeTask, handler as knowledgeProbeHandler, validateKnowledgeProbeTask, } from "./knowledge-probe
|
|
17
|
+
export { compileKnowledgeProbeTask, handler as knowledgeProbeHandler, validateKnowledgeProbeTask, } from "./knowledge-probe/index.js";
|
|
20
18
|
// Agent Harness mode
|
|
21
|
-
export { compileAgentHarnessTask, handler as agentHarnessHandler, validateAgentHarnessTask, } from "./agent-harness
|
|
19
|
+
export { compileAgentHarnessTask, handler as agentHarnessHandler, validateAgentHarnessTask, } from "./agent-harness/index.js";
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Assertion mapping for knowledge probe evaluations.
|
|
3
|
+
*/
|
|
4
|
+
import type { PromptfooAssertion } from "../../assertion-mapper.js";
|
|
5
|
+
import type { KnowledgeProbeCompileOptions } from "./types.js";
|
|
6
|
+
/**
|
|
7
|
+
* Map a raw knowledge probe assertion to a Promptfoo assertion.
|
|
8
|
+
*
|
|
9
|
+
* Tool-use assertions are rejected (knowledge probes don't use tools).
|
|
10
|
+
* LLM-graded assertions receive the configured grader provider.
|
|
11
|
+
* All other assertions are passed through.
|
|
12
|
+
*/
|
|
13
|
+
export declare function mapKnowledgeProbeAssertion(assertion: {
|
|
14
|
+
type: string;
|
|
15
|
+
[k: string]: unknown;
|
|
16
|
+
}, options: KnowledgeProbeCompileOptions | undefined, warnings: string[]): PromptfooAssertion | null;
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Assertion mapping for knowledge probe evaluations.
|
|
3
|
+
*/
|
|
4
|
+
/**
|
|
5
|
+
* Map a raw knowledge probe assertion to a Promptfoo assertion.
|
|
6
|
+
*
|
|
7
|
+
* Tool-use assertions are rejected (knowledge probes don't use tools).
|
|
8
|
+
* LLM-graded assertions receive the configured grader provider.
|
|
9
|
+
* All other assertions are passed through.
|
|
10
|
+
*/
|
|
11
|
+
export function mapKnowledgeProbeAssertion(assertion, options, warnings) {
|
|
12
|
+
switch (assertion.type) {
|
|
13
|
+
// Standard assertions — pass through
|
|
14
|
+
case "contains":
|
|
15
|
+
case "contains-all":
|
|
16
|
+
case "contains-any":
|
|
17
|
+
case "equals":
|
|
18
|
+
case "is-json":
|
|
19
|
+
case "javascript":
|
|
20
|
+
case "python":
|
|
21
|
+
case "regex":
|
|
22
|
+
case "similar":
|
|
23
|
+
return {
|
|
24
|
+
type: assertion.type,
|
|
25
|
+
...("value" in assertion ? { value: assertion.value } : {}),
|
|
26
|
+
...(typeof assertion.weight === "number"
|
|
27
|
+
? { weight: assertion.weight }
|
|
28
|
+
: {}),
|
|
29
|
+
};
|
|
30
|
+
// LLM-graded assertions — add grader provider
|
|
31
|
+
case "g-eval":
|
|
32
|
+
case "llm-rubric":
|
|
33
|
+
case "model-graded-closedqa":
|
|
34
|
+
case "model-graded-factuality":
|
|
35
|
+
return {
|
|
36
|
+
type: assertion.type,
|
|
37
|
+
...("value" in assertion ? { value: assertion.value } : {}),
|
|
38
|
+
...(typeof assertion.weight === "number"
|
|
39
|
+
? { weight: assertion.weight }
|
|
40
|
+
: {}),
|
|
41
|
+
...(options?.graderProvider
|
|
42
|
+
? { provider: options.graderProvider }
|
|
43
|
+
: {}),
|
|
44
|
+
};
|
|
45
|
+
// Tool-use assertions are NOT valid for knowledge probes
|
|
46
|
+
case "skill-used":
|
|
47
|
+
case "tool-call-f1":
|
|
48
|
+
case "tool-called":
|
|
49
|
+
case "tool-input-matches":
|
|
50
|
+
case "tool-output-matches":
|
|
51
|
+
warnings.push(`Knowledge probe "${assertion.type}" assertion is not applicable — ` +
|
|
52
|
+
"knowledge probes don't use tools. Assertion skipped.");
|
|
53
|
+
return null;
|
|
54
|
+
default:
|
|
55
|
+
warnings.push(`Knowledge probe: unknown assertion type "${assertion.type}" — passed through`);
|
|
56
|
+
return {
|
|
57
|
+
type: assertion.type,
|
|
58
|
+
...("value" in assertion ? { value: assertion.value } : {}),
|
|
59
|
+
};
|
|
60
|
+
}
|
|
61
|
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core compilation logic for knowledge probe tasks.
|
|
3
|
+
*
|
|
4
|
+
* Compiles a knowledge probe task definition into Promptfoo configuration.
|
|
5
|
+
* This is intentionally minimal — knowledge probes map almost 1:1 to
|
|
6
|
+
* basic Promptfoo test cases. The AILF value-add is type-safe authoring,
|
|
7
|
+
* cross-model comparison, and score normalization.
|
|
8
|
+
*/
|
|
9
|
+
import type { KnowledgeProbeTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
10
|
+
import type { KnowledgeProbeCompileOptions, KnowledgeProbeCompileResult } from "./types.js";
|
|
11
|
+
/**
|
|
12
|
+
* Compile a knowledge probe task definition into Promptfoo configuration.
|
|
13
|
+
*
|
|
14
|
+
* This is intentionally minimal — knowledge probes map almost 1:1 to
|
|
15
|
+
* basic Promptfoo test cases. The AILF value-add is type-safe authoring,
|
|
16
|
+
* cross-model comparison, and score normalization.
|
|
17
|
+
*/
|
|
18
|
+
export declare function compileKnowledgeProbeTask(task: KnowledgeProbeTaskDefinition, options?: KnowledgeProbeCompileOptions): KnowledgeProbeCompileResult;
|