npm - @sanity/ailf - Versions diffs - 1.0.0 → 2.0.1 - Mend

@sanity/ailf 1.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (499)

package/README.md +0 -1
package/canonical/grader-references/README.md +2 -2
package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
package/config/features.ts +1 -1
package/config/models.ts +29 -12
package/config/sources.ts +1 -1
package/config/thresholds.ts +1 -1
package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
package/dist/_vendor/ailf-core/config-helpers.js +51 -2
package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
package/dist/_vendor/ailf-core/examples/index.js +213 -94
package/dist/_vendor/ailf-core/index.d.ts +3 -2
package/dist/_vendor/ailf-core/index.js +2 -1
package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
package/dist/_vendor/ailf-core/services/index.js +1 -1
package/dist/_vendor/ailf-core/services/scoring.js +9 -0
package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
package/dist/adapters/api-client/remediation.js +2 -2
package/dist/adapters/config-sources/file-config-adapter.js +7 -1
package/dist/adapters/config-sources/ts-config-loader.js +21 -13
package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
package/dist/adapters/index.d.ts +0 -1
package/dist/adapters/index.js +0 -1
package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
package/dist/adapters/task-sources/composite-task-source.js +1 -1
package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
package/dist/adapters/task-sources/index.d.ts +3 -4
package/dist/adapters/task-sources/index.js +3 -4
package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
package/dist/adapters/task-sources/repo-schemas.js +228 -20
package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
package/dist/adapters/task-sources/repo-task-source.js +81 -122
package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
package/dist/adapters/task-sources/repo-trigger.js +1 -1
package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
package/dist/adapters/task-sources/repo-validation.js +126 -5
package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
package/dist/adapters/task-sources/task-file-loader.js +21 -7
package/dist/agent-observer/test-imports.d.ts +7 -0
package/dist/agent-observer/test-imports.js +185 -0
package/dist/artifact-capture/comparator.d.ts +22 -0
package/dist/artifact-capture/comparator.js +493 -0
package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
package/dist/artifact-capture/filesystem-collector.js +237 -0
package/dist/artifact-capture/redact-artifact.d.ts +20 -0
package/dist/artifact-capture/redact-artifact.js +115 -0
package/dist/assertions/source-isolation.d.ts +1 -1
package/dist/assertions/source-isolation.js +1 -1
package/dist/cli.js +4 -0
package/dist/commands/calculate-scores.js +1 -0
package/dist/commands/capture-compare.d.ts +15 -0
package/dist/commands/capture-compare.js +253 -0
package/dist/commands/capture-list.d.ts +12 -0
package/dist/commands/capture-list.js +147 -0
package/dist/commands/capture.d.ts +9 -0
package/dist/commands/capture.js +16 -0
package/dist/commands/chronic-failures.d.ts +8 -0
package/dist/commands/chronic-failures.js +33 -0
package/dist/commands/coverage-audit.js +3 -1
package/dist/commands/explain-handler.d.ts +1 -1
package/dist/commands/explain-handler.js +37 -8
package/dist/commands/fetch-docs.js +1 -0
package/dist/commands/generate-configs.d.ts +3 -3
package/dist/commands/generate-configs.js +20 -8
package/dist/commands/init.d.ts +5 -4
package/dist/commands/init.js +190 -25
package/dist/commands/pipeline-action.d.ts +7 -1
package/dist/commands/pipeline-action.js +43 -19
package/dist/commands/pipeline.d.ts +6 -1
package/dist/commands/pipeline.js +7 -2
package/dist/commands/pr-comment.js +1 -0
package/dist/commands/publish.js +1 -0
package/dist/commands/shared/help.js +2 -2
package/dist/commands/update-quality-scores.d.ts +5 -0
package/dist/commands/update-quality-scores.js +20 -0
package/dist/commands/validate-tasks.d.ts +2 -2
package/dist/commands/validate-tasks.js +26 -15
package/dist/composition-root.d.ts +15 -4
package/dist/composition-root.js +100 -55
package/dist/config/features.ts +23 -0
package/dist/config/models.ts +100 -0
package/dist/config/prompts.ts +16 -0
package/dist/config/rubrics.ts +225 -0
package/dist/config/schedules.ts +47 -0
package/dist/config/sinks.ts +37 -0
package/dist/config/sources.ts +21 -0
package/dist/config/thresholds.ts +61 -0
package/dist/index.d.ts +41 -0
package/dist/index.js +48 -0
package/dist/lib/agent-behavior-report.d.ts +8 -0
package/dist/lib/agent-behavior-report.js +185 -0
package/dist/lib/baseline.d.ts +19 -0
package/dist/lib/baseline.js +153 -0
package/dist/lib/calculate-scores.d.ts +23 -0
package/dist/lib/calculate-scores.js +42 -0
package/dist/lib/compare.d.ts +18 -0
package/dist/lib/compare.js +170 -0
package/dist/lib/coverage-audit.d.ts +4 -0
package/dist/lib/coverage-audit.js +42 -0
package/dist/lib/discovery-report.d.ts +13 -0
package/dist/lib/discovery-report.js +57 -0
package/dist/lib/fetch-docs.d.ts +30 -0
package/dist/lib/fetch-docs.js +171 -0
package/dist/lib/generate-configs.d.ts +25 -0
package/dist/lib/generate-configs.js +42 -0
package/dist/lib/grader-api.d.ts +21 -0
package/dist/lib/grader-api.js +34 -0
package/dist/lib/grader-compare.d.ts +19 -0
package/dist/lib/grader-compare.js +91 -0
package/dist/lib/grader-consistency.d.ts +27 -0
package/dist/lib/grader-consistency.js +79 -0
package/dist/lib/grader-sensitivity.d.ts +19 -0
package/dist/lib/grader-sensitivity.js +75 -0
package/dist/lib/grader-validate.d.ts +19 -0
package/dist/lib/grader-validate.js +78 -0
package/dist/lib/measure-retrieval.d.ts +14 -0
package/dist/lib/measure-retrieval.js +71 -0
package/dist/lib/pr-comment.d.ts +16 -0
package/dist/lib/pr-comment.js +28 -0
package/dist/lib/readiness-report.d.ts +13 -0
package/dist/lib/readiness-report.js +108 -0
package/dist/lib/webhook-server.d.ts +11 -0
package/dist/lib/webhook-server.js +24 -0
package/dist/lib/weekly-digest.d.ts +24 -0
package/dist/lib/weekly-digest.js +148 -0
package/dist/orchestration/build-app-context.js +13 -0
package/dist/orchestration/build-step-sequence.js +4 -2
package/dist/orchestration/cache-context.d.ts +23 -0
package/dist/orchestration/cache-context.js +43 -0
package/dist/orchestration/env-bridge.d.ts +21 -0
package/dist/orchestration/env-bridge.js +66 -0
package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
package/dist/orchestration/load-pipeline-tasks.js +52 -0
package/dist/orchestration/pipeline-orchestrator.js +75 -5
package/dist/orchestration/step-runner.js +5 -1
package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
package/dist/orchestration/steps/calculate-scores-step.js +13 -0
package/dist/orchestration/steps/callback-step.js +10 -1
package/dist/orchestration/steps/compare-step.js +6 -3
package/dist/orchestration/steps/discovery-report-step.js +6 -2
package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
package/dist/orchestration/steps/fetch-docs-step.js +32 -19
package/dist/orchestration/steps/gap-analysis-step.js +13 -2
package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
package/dist/orchestration/steps/generate-configs-step.js +77 -26
package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
package/dist/orchestration/steps/publish-report-step.js +19 -0
package/dist/orchestration/steps/readiness-step.js +8 -3
package/dist/orchestration/steps/report-step.js +17 -4
package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
package/dist/orchestration/steps/run-eval-step.js +51 -31
package/dist/pipeline/agent-behavior-report.js +6 -0
package/dist/pipeline/attribution.d.ts +1 -1
package/dist/pipeline/attribution.js +1 -1
package/dist/pipeline/cache.js +29 -15
package/dist/pipeline/calculate-scores.d.ts +2 -0
package/dist/pipeline/calculate-scores.js +70 -33
package/dist/pipeline/chronic-failures.d.ts +55 -0
package/dist/pipeline/chronic-failures.js +110 -0
package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
package/dist/pipeline/compiler/assertion-mapper.js +1 -1
package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
package/dist/pipeline/compiler/config-loader.d.ts +14 -0
package/dist/pipeline/compiler/config-loader.js +42 -2
package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
package/dist/pipeline/compiler/fixture-resolver.js +1 -1
package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
package/dist/pipeline/compiler/ignore-fields.js +1 -1
package/dist/pipeline/compiler/index.d.ts +2 -5
package/dist/pipeline/compiler/index.js +2 -5
package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
package/dist/pipeline/compiler/literacy-bridge.js +2 -2
package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
package/dist/pipeline/compiler/mode-bases/index.js +4 -0
package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
package/dist/pipeline/compiler/preset-loader.js +99 -0
package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
package/dist/pipeline/compiler/provider-assembler.js +13 -7
package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
package/dist/pipeline/compiler/sandbox/index.js +1 -1
package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
package/dist/pipeline/compiler/scoring-bridge.js +1 -1
package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
package/dist/pipeline/compiler/task-bridge.js +92 -0
package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
package/dist/pipeline/compiler/task-graph-builder.js +1 -4
package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
package/dist/pipeline/compiler/telemetry/index.js +1 -1
package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
package/dist/pipeline/compiler/variable-resolver.js +1 -1
package/dist/pipeline/coverage-audit.d.ts +1 -1
package/dist/pipeline/coverage-audit.js +1 -1
package/dist/pipeline/degradations.d.ts +1 -1
package/dist/pipeline/degradations.js +1 -1
package/dist/pipeline/expand-tasks.d.ts +2 -2
package/dist/pipeline/expand-tasks.js +2 -2
package/dist/pipeline/failure-modes.d.ts +1 -1
package/dist/pipeline/failure-modes.js +13 -1
package/dist/pipeline/gap-analysis.d.ts +1 -1
package/dist/pipeline/gap-analysis.js +3 -1
package/dist/pipeline/generate-configs.d.ts +2 -2
package/dist/pipeline/generate-configs.js +16 -9
package/dist/pipeline/grader-compare-runner.d.ts +1 -1
package/dist/pipeline/grader-compare-runner.js +7 -1
package/dist/pipeline/grader-comparison.d.ts +1 -1
package/dist/pipeline/grader-comparison.js +1 -1
package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
package/dist/pipeline/grader-consistency-runner.js +7 -1
package/dist/pipeline/grader-consistency.d.ts +1 -1
package/dist/pipeline/grader-consistency.js +1 -1
package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
package/dist/pipeline/grader-sensitivity-runner.js +1 -1
package/dist/pipeline/grader-sensitivity.d.ts +1 -1
package/dist/pipeline/grader-sensitivity.js +1 -1
package/dist/pipeline/grader-validate-runner.d.ts +1 -1
package/dist/pipeline/grader-validate-runner.js +2 -2
package/dist/pipeline/grader-validation.d.ts +1 -1
package/dist/pipeline/grader-validation.js +1 -1
package/dist/pipeline/map-request-to-config.js +16 -2
package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
package/dist/pipeline/mirror-repo-tasks.js +10 -10
package/dist/pipeline/plan-format.d.ts +1 -1
package/dist/pipeline/plan-format.js +1 -1
package/dist/pipeline/plan.d.ts +1 -1
package/dist/pipeline/plan.js +68 -30
package/dist/pipeline/probe.d.ts +1 -1
package/dist/pipeline/probe.js +1 -1
package/dist/pipeline/readiness-report.d.ts +2 -2
package/dist/pipeline/readiness-report.js +2 -2
package/dist/pipeline/release-classification.d.ts +1 -1
package/dist/pipeline/release-classification.js +1 -1
package/dist/pipeline/release-report.d.ts +1 -1
package/dist/pipeline/release-report.js +1 -1
package/dist/pipeline/repo-eval-comment.d.ts +1 -1
package/dist/pipeline/repo-eval-comment.js +1 -1
package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
package/dist/pipeline/repo-threshold-evaluator.js +1 -1
package/dist/pipeline/resolve-mappings.d.ts +6 -6
package/dist/pipeline/resolve-mappings.js +44 -44
package/dist/pipeline/retrieval-metrics.d.ts +3 -3
package/dist/pipeline/retrieval-metrics.js +28 -20
package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
package/dist/pipeline/steps/calculate-scores-step.js +89 -0
package/dist/pipeline/steps/compare-step.d.ts +18 -0
package/dist/pipeline/steps/compare-step.js +90 -0
package/dist/pipeline/steps/eval-step.d.ts +53 -0
package/dist/pipeline/steps/eval-step.js +347 -0
package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
package/dist/pipeline/steps/fetch-docs-step.js +84 -0
package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
package/dist/pipeline/steps/generate-configs-step.js +98 -0
package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
package/dist/pipeline/steps/grader-consistency-step.js +74 -0
package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
package/dist/pipeline/steps/publish-report-step.js +243 -0
package/dist/pipeline/steps/report-step.d.ts +13 -0
package/dist/pipeline/steps/report-step.js +56 -0
package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
package/dist/pipeline/steps/update-scores-step.js +42 -0
package/dist/pipeline/targeted-loo.d.ts +1 -1
package/dist/pipeline/targeted-loo.js +1 -1
package/dist/pipeline/thresholds.d.ts +1 -1
package/dist/pipeline/thresholds.js +1 -1
package/dist/pipeline/validate.js +13 -0
package/dist/report-store.d.ts +17 -0
package/dist/report-store.js +24 -0
package/dist/scripts/agent-behavior-report.d.ts +19 -0
package/dist/scripts/agent-behavior-report.js +315 -0
package/dist/scripts/baseline.d.ts +43 -0
package/dist/scripts/baseline.js +267 -0
package/dist/scripts/calculate-scores.d.ts +166 -0
package/dist/scripts/calculate-scores.js +1296 -0
package/dist/scripts/compare.d.ts +22 -0
package/dist/scripts/compare.js +334 -0
package/dist/scripts/coverage-audit.d.ts +44 -0
package/dist/scripts/coverage-audit.js +209 -0
package/dist/scripts/debug-eval.d.ts +19 -0
package/dist/scripts/debug-eval.js +73 -0
package/dist/scripts/discovery-report.d.ts +58 -0
package/dist/scripts/discovery-report.js +250 -0
package/dist/scripts/fetch-docs.d.ts +35 -0
package/dist/scripts/fetch-docs.js +472 -0
package/dist/scripts/generate-configs.d.ts +66 -0
package/dist/scripts/generate-configs.js +459 -0
package/dist/scripts/grader-api.d.ts +27 -0
package/dist/scripts/grader-api.js +206 -0
package/dist/scripts/grader-compare.d.ts +22 -0
package/dist/scripts/grader-compare.js +368 -0
package/dist/scripts/grader-consistency.d.ts +20 -0
package/dist/scripts/grader-consistency.js +313 -0
package/dist/scripts/grader-sensitivity.d.ts +22 -0
package/dist/scripts/grader-sensitivity.js +354 -0
package/dist/scripts/grader-validate.d.ts +19 -0
package/dist/scripts/grader-validate.js +267 -0
package/dist/scripts/measure-retrieval.d.ts +10 -0
package/dist/scripts/measure-retrieval.js +145 -0
package/dist/scripts/migrate-task-mode.d.ts +1 -1
package/dist/scripts/migrate-task-mode.js +1 -1
package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
package/dist/scripts/pipeline.d.ts +76 -0
package/dist/scripts/pipeline.js +1031 -0
package/dist/scripts/pr-comment.d.ts +10 -0
package/dist/scripts/pr-comment.js +510 -0
package/dist/scripts/readiness-report.d.ts +88 -0
package/dist/scripts/readiness-report.js +342 -0
package/dist/scripts/update-quality-scores.d.ts +15 -0
package/dist/scripts/update-quality-scores.js +184 -0
package/dist/scripts/validate-task-sources.d.ts +1 -1
package/dist/scripts/validate-task-sources.js +1 -1
package/dist/scripts/validate.d.ts +13 -0
package/dist/scripts/validate.js +79 -0
package/dist/scripts/webhook-server.d.ts +26 -0
package/dist/scripts/webhook-server.js +147 -0
package/dist/scripts/weekly-digest.d.ts +24 -0
package/dist/scripts/weekly-digest.js +144 -0
package/dist/sinks/format-slack.d.ts +64 -0
package/dist/sinks/format-slack.js +306 -0
package/dist/sinks/slack-sink.d.ts +27 -0
package/dist/sinks/slack-sink.js +78 -0
package/dist/sinks/types.d.ts +1 -1
package/dist/sinks/types.js +1 -1
package/dist/sinks/webhook-sink.d.ts +19 -0
package/dist/sinks/webhook-sink.js +50 -0
package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
package/dist/tasks/literacy/content-lake.task.ts +181 -0
package/dist/tasks/literacy/frameworks.task.ts +129 -0
package/dist/tasks/literacy/functions.task.ts +70 -0
package/dist/tasks/literacy/groq.task.ts +259 -0
package/dist/tasks/literacy/image-handling.task.ts +95 -0
package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
package/dist/tasks/literacy/portable-text.task.ts +169 -0
package/dist/tasks/literacy/studio-setup.task.ts +134 -0
package/dist/tasks/literacy/visual-editing.task.ts +147 -0
package/package.json +32 -24
package/tasks/.expanded.agentic.yaml +280 -0
package/tasks/.expanded.yaml +565 -0
package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
package/tasks/literacy/content-lake.task.ts +181 -0
package/tasks/literacy/frameworks.task.ts +1 -0
package/tasks/literacy/functions.task.ts +1 -0
package/tasks/literacy/groq.task.ts +1 -0
package/tasks/literacy/image-handling.task.ts +95 -0
package/tasks/literacy/nextjs-live.task.ts +2 -1
package/tasks/literacy/portable-text.task.ts +169 -0
package/tasks/literacy/studio-setup.task.ts +5 -2
package/tasks/literacy/visual-editing.task.ts +1 -0
package/LICENSE +0 -21
package/tasks/frameworks.yaml +0 -98
package/tasks/functions.yaml +0 -51
package/tasks/groq.yaml +0 -216
package/tasks/nextjs-live.yaml +0 -62
package/tasks/studio-setup.yaml +0 -111
package/tasks/visual-editing.yaml +0 -120

package/dist/pipeline/compiler/ignore-fields.js CHANGED Viewed

@@ -17,7 +17,7 @@
  * ]
  * ```
  *
- * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
  */
 // ---------------------------------------------------------------------------
 // Public API

package/dist/pipeline/compiler/index.d.ts CHANGED Viewed

@@ -1,13 +1,10 @@
 /**
- * Config compiler — the heart of the new architecture.
+ * Config compiler — the heart of the compilation architecture.
  *
  * Converts task definitions from any source into a TaskGraph IR,
  * then compiles the graph into Promptfoo YAML configuration.
  *
- * This module coexists with the existing `generate-configs.ts` path.
- * Phase 7 will migrate callers to use the compiler exclusively.
- *
- * @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
  */
 export { buildTaskGraph, detectCycle, type TaskGraphBuildOptions, type TaskGraphBuildResult, } from "./task-graph-builder.js";
 export { compileToPromptfoo, type CompilationResult, type CompiledPromptfooConfig, type PromptfooCompilerOptions, type PromptfooPrompt, type PromptfooProvider, type PromptfooTestCase, } from "./promptfoo-compiler.js";

package/dist/pipeline/compiler/index.js CHANGED Viewed

@@ -1,13 +1,10 @@
 /**
- * Config compiler — the heart of the new architecture.
+ * Config compiler — the heart of the compilation architecture.
  *
  * Converts task definitions from any source into a TaskGraph IR,
  * then compiles the graph into Promptfoo YAML configuration.
  *
- * This module coexists with the existing `generate-configs.ts` path.
- * Phase 7 will migrate callers to use the compiler exclusively.
- *
- * @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
  */
 // TaskGraph builder
 export { buildTaskGraph, detectCycle, } from "./task-graph-builder.js";

package/dist/pipeline/compiler/literacy-bridge.d.ts CHANGED Viewed

@@ -16,10 +16,10 @@
  * - Prompts from config/prompts are integrated
  * - TaskGraphBuilder validates the DAG, deduplicates, and orders tasks
  *
- * @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
  */
 import type { LiteracyTaskDefinition } from "../../_vendor/ailf-core/index.d.ts";
-import { type LiteracyCompileResult } from "./mode-handlers/literacy-handler.js";
+import { type LiteracyCompileResult } from "./mode-handlers/literacy/index.js";
 import { type LiteracyEvalSubMode } from "../normalize-mode.js";
 /** Options for compiling all literacy tasks via the new compiler */
 export interface LiteracyBridgeOptions {

package/dist/pipeline/compiler/literacy-bridge.js CHANGED Viewed

@@ -16,9 +16,9 @@
  * - Prompts from config/prompts are integrated
  * - TaskGraphBuilder validates the DAG, deduplicates, and orders tasks
  *
- * @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
  */
-import { compileLiteracyTask, } from "./mode-handlers/literacy-handler.js";
+import { compileLiteracyTask, } from "./mode-handlers/literacy/index.js";
 import { tryLoadConfigFile } from "./config-loader.js";
 import { buildTaskGraph } from "./task-graph-builder.js";
 // ---------------------------------------------------------------------------

package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts ADDED Viewed

@@ -0,0 +1,10 @@
+/**
+ * Agent Harness mode base — evaluation methodology for autonomous agent testing.
+ *
+ * Tests whether an autonomous agent can complete implementation tasks
+ * end-to-end, including tool use, file creation, and code generation.
+ *
+ * @see docs/modes.md
+ */
+import type { ModeBase } from "../../../_vendor/ailf-core/index.d.ts";
+export declare function createAgentHarnessBase(): ModeBase;

package/dist/pipeline/compiler/mode-bases/agent-harness.js ADDED Viewed

@@ -0,0 +1,21 @@
+/**
+ * Agent Harness mode base — evaluation methodology for autonomous agent testing.
+ *
+ * Tests whether an autonomous agent can complete implementation tasks
+ * end-to-end, including tool use, file creation, and code generation.
+ *
+ * @see docs/modes.md
+ */
+export function createAgentHarnessBase() {
+    return {
+        mode: {
+            id: "agent-harness",
+            label: "Agent Harness",
+            validProviderPatterns: ["^openai:", "^anthropic:", "^file://"],
+            rubricTemplateIds: [],
+            handlerModule: "./mode-handlers/agent-harness/index.js",
+        },
+        // Agent harness rubric templates and scoring profiles will be defined
+        // as the mode matures. The structural registration is in place.
+    };
+}

package/dist/pipeline/compiler/mode-bases/index.d.ts ADDED Viewed

@@ -0,0 +1,4 @@
+export { createAgentHarnessBase } from "./agent-harness.js";
+export { createKnowledgeProbeBase } from "./knowledge-probe.js";
+export { createLiteracyModeBase } from "./literacy.js";
+export { createMcpServerModeBase } from "./mcp-server.js";

package/dist/pipeline/compiler/mode-bases/index.js ADDED Viewed

@@ -0,0 +1,4 @@
+export { createAgentHarnessBase } from "./agent-harness.js";
+export { createKnowledgeProbeBase } from "./knowledge-probe.js";
+export { createLiteracyModeBase } from "./literacy.js";
+export { createMcpServerModeBase } from "./mcp-server.js";

package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts ADDED Viewed

@@ -0,0 +1,10 @@
+/**
+ * Knowledge Probe mode base — evaluation methodology for testing model knowledge.
+ *
+ * Tests what the model knows about a topic without providing documentation,
+ * establishing a baseline of model knowledge.
+ *
+ * @see docs/modes.md
+ */
+import type { ModeBase } from "../../../_vendor/ailf-core/index.d.ts";
+export declare function createKnowledgeProbeBase(): ModeBase;

package/dist/pipeline/compiler/mode-bases/knowledge-probe.js ADDED Viewed

@@ -0,0 +1,22 @@
+/**
+ * Knowledge Probe mode base — evaluation methodology for testing model knowledge.
+ *
+ * Tests what the model knows about a topic without providing documentation,
+ * establishing a baseline of model knowledge.
+ *
+ * @see docs/modes.md
+ */
+export function createKnowledgeProbeBase() {
+    return {
+        mode: {
+            id: "knowledge-probe",
+            label: "Knowledge Probe",
+            validProviderPatterns: ["^openai:", "^anthropic:", "^file://"],
+            rubricTemplateIds: [],
+            handlerModule: "./mode-handlers/knowledge-probe/index.js",
+        },
+        // Knowledge probe uses the same rubric dimensions as literacy
+        // but without doc-coverage (since no docs are provided).
+        // Rubric templates will be inherited or defined as the mode matures.
+    };
+}

package/dist/pipeline/compiler/mode-bases/literacy.d.ts ADDED Viewed

@@ -0,0 +1,23 @@
+/**
+ * Literacy mode base — shared evaluation methodology for documentation literacy.
+ *
+ * Defines HOW literacy evaluations are scored (rubrics, weights, prompts),
+ * independently of WHAT documentation is being evaluated. Domain presets
+ * like `sanity-literacy` target this mode base and add their own sources,
+ * features, and doc fetcher.
+ *
+ * @see docs/modes.md
+ */
+import { type ModeBase, type ModelEntry } from "../../../_vendor/ailf-core/index.d.ts";
+export declare function createLiteracyModeBase(): ModeBase;
+/**
+ * Check whether a model participates in a specific literacy variant.
+ *
+ * A model matches if:
+ * 1. It's enrolled in the `literacy` eval mode (or has no `modes` field)
+ * 2. The variant is in its resolved variant set (defaults to all variants)
+ *
+ * This is the single source of truth for literacy variant matching —
+ * import this instead of reimplementing the pattern.
+ */
+export declare function modelMatchesLiteracyVariant(model: ModelEntry, variant: string): boolean;

package/dist/pipeline/compiler/mode-bases/literacy.js ADDED Viewed

@@ -0,0 +1,132 @@
+/**
+ * Literacy mode base — shared evaluation methodology for documentation literacy.
+ *
+ * Defines HOW literacy evaluations are scored (rubrics, weights, prompts),
+ * independently of WHAT documentation is being evaluated. Domain presets
+ * like `sanity-literacy` target this mode base and add their own sources,
+ * features, and doc fetcher.
+ *
+ * @see docs/modes.md
+ */
+import { modelMatchesMode, resolveModelVariants, } from "../../../_vendor/ailf-core/index.js";
+import { LITERACY_PROMPT_TEMPLATES } from "../mode-handlers/literacy/index.js";
+// ---------------------------------------------------------------------------
+// Mode base factory
+// ---------------------------------------------------------------------------
+export function createLiteracyModeBase() {
+    return {
+        mode: {
+            id: "literacy",
+            label: "Documentation Literacy",
+            validProviderPatterns: ["^openai:", "^anthropic:", "^file://"],
+            rubricTemplateIds: [
+                "task-completion",
+                "code-correctness",
+                "doc-coverage",
+            ],
+            handlerModule: "./mode-handlers/literacy/index.js",
+            variants: [
+                {
+                    id: "baseline",
+                    label: "Standard (baseline)",
+                    description: "Standard with-docs and without-docs evaluation prompts",
+                },
+                {
+                    id: "observed",
+                    label: "Observed (HTTP-instrumented)",
+                    description: "HTTP-instrumented evaluation that records model behavior",
+                },
+                {
+                    id: "agentic-naive",
+                    label: "Agentic (naive)",
+                    description: "Model uses tools to find docs with default system prompt",
+                },
+                {
+                    id: "agentic-optimized",
+                    label: "Agentic (optimized)",
+                    description: "Model uses tools to find docs with optimized system prompt",
+                },
+            ],
+        },
+        rubricTemplates: [
+            {
+                id: "task-completion",
+                dimension: "task-completion",
+                header: "Score task completion from 0 to 100:",
+                scale: [
+                    "0: Couldn't attempt — missing critical information",
+                    "20: Attempted but fundamentally wrong approach",
+                    "50: Partial implementation — major functional gaps",
+                    "80: Mostly complete — minor issues or missing edge cases",
+                    "100: Fully functional code — works as expected",
+                ],
+                criteriaLabel: "Must demonstrate:",
+            },
+            {
+                id: "code-correctness",
+                dimension: "code-correctness",
+                header: "Score code correctness from 0 to 100:",
+                scale: [
+                    "0: Broken code, syntax errors, or deprecated APIs",
+                    "30: Works but uses anti-patterns or inefficient approaches",
+                    "50: Works but not idiomatic",
+                    "80: Follows most best practices",
+                    "100: Follows all best practices, idiomatic implementation",
+                ],
+                criteriaLabel: "Check for:",
+            },
+            {
+                id: "doc-coverage",
+                dimension: "doc-coverage",
+                header: "Score documentation coverage from 0 to 100:",
+                scale: [
+                    "0: Had to hallucinate/guess most implementation details",
+                    "30: Significant gaps — filled with assumptions",
+                    "50: Some gaps — inferred from partial information",
+                    "80: Minor gaps — almost everything was documented",
+                    "100: Complete coverage — all necessary info was in docs",
+                ],
+            },
+        ],
+        scoringProfiles: {
+            default: {
+                "task-completion": 0.5,
+                "code-correctness": 0.25,
+                "doc-coverage": 0.25,
+            },
+            "output-only": {
+                "task-completion": 0.6,
+                "code-correctness": 0.4,
+            },
+        },
+        promptTemplates: LITERACY_PROMPT_TEMPLATES,
+    };
+}
+// ---------------------------------------------------------------------------
+// Shared variant matching helper
+// ---------------------------------------------------------------------------
+/** Lazily initialized mode base — avoids re-creating on every call */
+let _cachedBase;
+function getLiteracyModeBase() {
+    if (!_cachedBase)
+        _cachedBase = createLiteracyModeBase();
+    return _cachedBase;
+}
+/**
+ * Check whether a model participates in a specific literacy variant.
+ *
+ * A model matches if:
+ * 1. It's enrolled in the `literacy` eval mode (or has no `modes` field)
+ * 2. The variant is in its resolved variant set (defaults to all variants)
+ *
+ * This is the single source of truth for literacy variant matching —
+ * import this instead of reimplementing the pattern.
+ */
+export function modelMatchesLiteracyVariant(model, variant) {
+    if (!modelMatchesMode(model, "literacy"))
+        return false;
+    const variants = resolveModelVariants(model, getLiteracyModeBase());
+    if (!variants)
+        return true;
+    return variants.includes(variant);
+}

package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts ADDED Viewed

@@ -0,0 +1,10 @@
+/**
+ * MCP Server mode base — evaluation methodology for MCP server tool-use testing.
+ *
+ * Defines rubric templates and scoring for evaluating how well an LLM can
+ * discover and use MCP server tools correctly.
+ *
+ * @see docs/modes.md
+ */
+import type { ModeBase } from "../../../_vendor/ailf-core/index.d.ts";
+export declare function createMcpServerModeBase(): ModeBase;

package/dist/pipeline/compiler/mode-bases/mcp-server.js ADDED Viewed

@@ -0,0 +1,70 @@
+/**
+ * MCP Server mode base — evaluation methodology for MCP server tool-use testing.
+ *
+ * Defines rubric templates and scoring for evaluating how well an LLM can
+ * discover and use MCP server tools correctly.
+ *
+ * @see docs/modes.md
+ */
+export function createMcpServerModeBase() {
+    return {
+        mode: {
+            id: "mcp-server",
+            label: "MCP Server Testing",
+            validProviderPatterns: ["^mcp:", "^file://"],
+            rubricTemplateIds: [
+                "mcp-input-validation",
+                "mcp-output-correctness",
+                "mcp-error-handling",
+            ],
+            handlerModule: "./mode-handlers/mcp-server/index.js",
+        },
+        rubricTemplates: [
+            {
+                id: "mcp-input-validation",
+                dimension: "mcp-input-validation",
+                header: "Score MCP input validation from 0 to 100:",
+                scale: [
+                    "0: Tool called with completely wrong parameters",
+                    "30: Some parameters correct but critical ones missing or wrong",
+                    "50: Parameters mostly correct but types or formats are off",
+                    "80: All required parameters correct, minor optional parameter issues",
+                    "100: Perfect tool invocation with all parameters correctly formed",
+                ],
+                criteriaLabel: "Check for:",
+            },
+            {
+                id: "mcp-output-correctness",
+                dimension: "mcp-output-correctness",
+                header: "Score MCP output correctness from 0 to 100:",
+                scale: [
+                    "0: Output is completely wrong or tool returned an error",
+                    "30: Output partially correct but missing key data",
+                    "50: Output correct but incomplete or improperly formatted",
+                    "80: Output correct and well-formatted, minor issues",
+                    "100: Output is correct, complete, and properly formatted",
+                ],
+                criteriaLabel: "Check for:",
+            },
+            {
+                id: "mcp-error-handling",
+                dimension: "mcp-error-handling",
+                header: "Score MCP error handling from 0 to 100:",
+                scale: [
+                    "0: No error handling — crashes or returns garbage on bad input",
+                    "30: Basic error detection but poor recovery or messaging",
+                    "50: Errors detected and reported but not gracefully handled",
+                    "80: Good error handling with clear messages and appropriate fallbacks",
+                    "100: Excellent error handling — validates input, provides actionable errors, degrades gracefully",
+                ],
+            },
+        ],
+        scoringProfiles: {
+            default: {
+                "mcp-input-validation": 0.4,
+                "mcp-output-correctness": 0.4,
+                "mcp-error-handling": 0.2,
+            },
+        },
+    };
+}

package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts ADDED Viewed

@@ -0,0 +1,43 @@
+/**
+ * Assertion mapping and builders for agent harness tasks.
+ *
+ * Handles agent-specific assertion types (file-exists, file-contains,
+ * command-succeeds, diff-matches) as well as standard pass-through
+ * assertion types.
+ */
+import type { PromptfooAssertion } from "../../assertion-mapper.js";
+import type { AgentHarnessCompileOptions } from "./types.js";
+export declare function mapAgentAssertion(assertion: {
+    type: string;
+    [k: string]: unknown;
+}, options: AgentHarnessCompileOptions | undefined, warnings: string[]): PromptfooAssertion | null;
+export declare function buildFileExistsAssertion(assertion: {
+    type: string;
+    [k: string]: unknown;
+}): PromptfooAssertion;
+export declare function buildFileContainsAssertion(assertion: {
+    type: string;
+    [k: string]: unknown;
+}): PromptfooAssertion;
+/**
+ * SECURITY: Trusted-input boundary.
+ *
+ * The `command-succeeds` assertion executes an arbitrary shell command
+ * inside the sandbox's working directory. The command string comes from
+ * task definitions (YAML or TypeScript config files), which are authored
+ * by developers — not from user input or LLM output.
+ *
+ * This is intentional: the assertion is designed to verify agent output
+ * by running build/test commands (e.g., "npm test", "tsc --noEmit").
+ *
+ * Task definitions are the trust boundary. If you accept task definitions
+ * from untrusted sources, validate commands against an allowlist first.
+ */
+export declare function buildCommandSucceedsAssertion(assertion: {
+    type: string;
+    [k: string]: unknown;
+}): PromptfooAssertion;
+export declare function buildDiffMatchesAssertion(assertion: {
+    type: string;
+    [k: string]: unknown;
+}): PromptfooAssertion;

package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js ADDED Viewed

@@ -0,0 +1,187 @@
+/**
+ * Assertion mapping and builders for agent harness tasks.
+ *
+ * Handles agent-specific assertion types (file-exists, file-contains,
+ * command-succeeds, diff-matches) as well as standard pass-through
+ * assertion types.
+ */
+// ---------------------------------------------------------------------------
+// Assertion mapping
+// ---------------------------------------------------------------------------
+export function mapAgentAssertion(assertion, options, warnings) {
+    switch (assertion.type) {
+        case "file-exists":
+            return buildFileExistsAssertion(assertion);
+        case "file-contains":
+            return buildFileContainsAssertion(assertion);
+        case "command-succeeds":
+            return buildCommandSucceedsAssertion(assertion);
+        case "diff-matches":
+            return buildDiffMatchesAssertion(assertion);
+        // Standard assertions pass through
+        case "contains":
+        case "equals":
+        case "regex":
+        case "is-json":
+        case "javascript":
+        case "python":
+            return {
+                type: assertion.type,
+                ...("value" in assertion ? { value: assertion.value } : {}),
+                ...(typeof assertion.weight === "number"
+                    ? { weight: assertion.weight }
+                    : {}),
+            };
+        case "llm-rubric":
+            return {
+                type: "llm-rubric",
+                ...("value" in assertion ? { value: assertion.value } : {}),
+                ...(typeof assertion.weight === "number"
+                    ? { weight: assertion.weight }
+                    : {}),
+                ...(options?.graderProvider
+                    ? { provider: options.graderProvider }
+                    : {}),
+            };
+        default:
+            warnings.push(`Agent task: unknown assertion type "${assertion.type}" — passed through`);
+            return {
+                type: assertion.type,
+                ...("value" in assertion ? { value: assertion.value } : {}),
+            };
+    }
+}
+// ---------------------------------------------------------------------------
+// Agent-specific assertion builders
+// ---------------------------------------------------------------------------
+export function buildFileExistsAssertion(assertion) {
+    const filePath = String(assertion.value ?? "");
+    // Use JSON.stringify for all interpolated values in generated JS to
+    // prevent broken strings from filePaths containing quotes/backslashes
+    const safeFilePath = JSON.stringify(filePath);
+    return {
+        type: "javascript",
+        value: `// file-exists: ${filePath}\n` +
+            `(function() {\n` +
+            `  const fs = require('fs');\n` +
+            `  const path = require('path');\n` +
+            `  const workDir = path.resolve(context.vars.__workingDir || '.');\n` +
+            `  const target = path.resolve(workDir, ${safeFilePath});\n` +
+            `  if (!target.startsWith(workDir + path.sep) && target !== workDir) {\n` +
+            `    return { pass: false, score: 0, reason: 'Path traversal: ' + ${safeFilePath} + ' escapes sandbox' };\n` +
+            `  }\n` +
+            `  const exists = fs.existsSync(target);\n` +
+            `  return {\n` +
+            `    pass: exists,\n` +
+            `    score: exists ? 1 : 0,\n` +
+            `    reason: exists\n` +
+            `      ? 'File exists: ' + ${safeFilePath}\n` +
+            `      : 'Expected file not found: ' + ${safeFilePath},\n` +
+            `  };\n` +
+            `})()`,
+        ...(typeof assertion.weight === "number"
+            ? { weight: assertion.weight }
+            : {}),
+    };
+}
+export function buildFileContainsAssertion(assertion) {
+    const config = assertion.value;
+    const filePath = config?.path ?? "";
+    const expectedContent = config?.content ?? "";
+    const safeFilePath = JSON.stringify(filePath);
+    return {
+        type: "javascript",
+        value: `// file-contains: ${filePath}\n` +
+            `(function() {\n` +
+            `  const fs = require('fs');\n` +
+            `  const path = require('path');\n` +
+            `  const workDir = path.resolve(context.vars.__workingDir || '.');\n` +
+            `  const target = path.resolve(workDir, ${safeFilePath});\n` +
+            `  if (!target.startsWith(workDir + path.sep) && target !== workDir) {\n` +
+            `    return { pass: false, score: 0, reason: 'Path traversal: ' + ${safeFilePath} + ' escapes sandbox' };\n` +
+            `  }\n` +
+            `  if (!fs.existsSync(target)) {\n` +
+            `    return { pass: false, score: 0, reason: 'File not found: ' + ${safeFilePath} };\n` +
+            `  }\n` +
+            `  const content = fs.readFileSync(target, 'utf-8');\n` +
+            `  const contains = content.includes(${JSON.stringify(expectedContent)});\n` +
+            `  return {\n` +
+            `    pass: contains,\n` +
+            `    score: contains ? 1 : 0,\n` +
+            `    reason: contains\n` +
+            `      ? 'File contains expected content'\n` +
+            `      : 'File does not contain expected content',\n` +
+            `  };\n` +
+            `})()`,
+        ...(typeof assertion.weight === "number"
+            ? { weight: assertion.weight }
+            : {}),
+    };
+}
+/**
+ * SECURITY: Trusted-input boundary.
+ *
+ * The `command-succeeds` assertion executes an arbitrary shell command
+ * inside the sandbox's working directory. The command string comes from
+ * task definitions (YAML or TypeScript config files), which are authored
+ * by developers — not from user input or LLM output.
+ *
+ * This is intentional: the assertion is designed to verify agent output
+ * by running build/test commands (e.g., "npm test", "tsc --noEmit").
+ *
+ * Task definitions are the trust boundary. If you accept task definitions
+ * from untrusted sources, validate commands against an allowlist first.
+ */
+export function buildCommandSucceedsAssertion(assertion) {
+    const command = String(assertion.value ?? "");
+    return {
+        type: "javascript",
+        value: `// command-succeeds: ${command}\n` +
+            `(function() {\n` +
+            `  const { execSync } = require('child_process');\n` +
+            `  const workDir = context.vars.__workingDir || '.';\n` +
+            `  try {\n` +
+            `    execSync(${JSON.stringify(command)}, { cwd: workDir, timeout: 30000 });\n` +
+            `    return { pass: true, score: 1, reason: 'Command succeeded: ' + ${JSON.stringify(command)} };\n` +
+            `  } catch (err) {\n` +
+            `    return {\n` +
+            `      pass: false,\n` +
+            `      score: 0,\n` +
+            `      reason: 'Command failed: ' + (err.message || err),\n` +
+            `    };\n` +
+            `  }\n` +
+            `})()`,
+        ...(typeof assertion.weight === "number"
+            ? { weight: assertion.weight }
+            : {}),
+    };
+}
+export function buildDiffMatchesAssertion(assertion) {
+    const expected = assertion.value;
+    return {
+        type: "javascript",
+        value: `// diff-matches\n` +
+            `(function() {\n` +
+            `  const { execSync } = require('child_process');\n` +
+            `  const workDir = context.vars.__workingDir || '.';\n` +
+            `  try {\n` +
+            `    const diff = execSync('git diff', { cwd: workDir, encoding: 'utf-8' });\n` +
+            `    const expected = ${JSON.stringify(expected)};\n` +
+            `    if (typeof expected === 'string') {\n` +
+            `      const contains = diff.includes(expected);\n` +
+            `      return {\n` +
+            `        pass: contains,\n` +
+            `        score: contains ? 1 : 0,\n` +
+            `        reason: contains ? 'Diff matches expected pattern' : 'Diff does not match',\n` +
+            `      };\n` +
+            `    }\n` +
+            `    return { pass: diff.length > 0, score: diff.length > 0 ? 1 : 0, reason: 'Diff exists' };\n` +
+            `  } catch (err) {\n` +
+            `    return { pass: false, score: 0, reason: 'Failed to get diff: ' + err.message };\n` +
+            `  }\n` +
+            `})()`,
+        ...(typeof assertion.weight === "number"
+            ? { weight: assertion.weight }
+            : {}),
+    };
+}

package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts ADDED Viewed

@@ -0,0 +1,19 @@
+/**
+ * Agent harness task compilation — core compilation logic.
+ *
+ * Maps agent harness task definitions to Promptfoo configuration with:
+ * - Claude Agent SDK / OpenAI Codex SDK providers
+ * - Tool permission configuration (preset/allowed/disallowed)
+ * - Sandbox setup/teardown via Promptfoo extensions
+ * - Fixture provisioning into sandbox working directory
+ */
+import type { AgentHarnessTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
+import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
+import type { AgentHarnessCompileOptions, AgentHarnessCompileResult } from "./types.js";
+/**
+ * Compile an agent harness task definition into Promptfoo configuration.
+ */
+export declare function compileAgentHarnessTask(task: AgentHarnessTaskDefinition, options?: AgentHarnessCompileOptions): AgentHarnessCompileResult;
+export declare function buildAgentProvider(task: AgentHarnessTaskDefinition, _warnings: string[]): PromptfooProvider[];
+export declare function buildAgentPrompts(task: AgentHarnessTaskDefinition): PromptfooPrompt[];
+export declare function buildAgentTestCases(task: AgentHarnessTaskDefinition, options: AgentHarnessCompileOptions | undefined, warnings: string[]): PromptfooTestCase[];