@sanity/ailf 0.5.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/config/features.ts +23 -0
- package/config/models.ts +95 -0
- package/config/prompts.ts +16 -0
- package/config/rubrics.ts +225 -0
- package/config/schedules.ts +47 -0
- package/config/sinks.ts +37 -0
- package/config/sources.ts +21 -0
- package/config/thresholds.ts +61 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +171 -0
- package/dist/_vendor/ailf-core/config-helpers.js +170 -0
- package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
- package/dist/_vendor/ailf-core/env-helper.js +45 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/examples/index.js +25 -0
- package/dist/_vendor/ailf-core/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/index.js +5 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +17 -2
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
- package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +8 -2
- package/dist/_vendor/ailf-core/schemas/eval-config.js +17 -2
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +9 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +8 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -31
- package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -9
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
- package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/services/index.js +2 -1
- package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
- package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
- package/dist/_vendor/ailf-core/services/scoring.js +25 -15
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +332 -0
- package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +45 -83
- package/dist/_vendor/ailf-core/types/index.js +8 -1
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +257 -0
- package/dist/_vendor/ailf-core/types/plugin-registry.js +185 -0
- package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
- package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
- package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
- package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
- package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
- package/dist/_vendor/ailf-core/types/trace.js +18 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
- package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
- package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
- package/dist/_vendor/ailf-shared/index.d.ts +0 -1
- package/dist/_vendor/ailf-shared/index.js +0 -1
- package/dist/adapters/api-client/build-request.js +14 -13
- package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
- package/dist/adapters/config-sources/file-config-adapter.js +39 -12
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
- package/dist/adapters/config-sources/ts-config-loader.js +141 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
- package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +35 -39
- package/dist/adapters/task-sources/index.d.ts +3 -2
- package/dist/adapters/task-sources/index.js +3 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
- package/dist/adapters/task-sources/repo-schemas.js +227 -19
- package/dist/adapters/task-sources/repo-task-source.d.ts +16 -12
- package/dist/adapters/task-sources/repo-task-source.js +92 -80
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
- package/dist/adapters/task-sources/task-file-loader.js +83 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
- package/dist/adapters/task-sources/yaml-task-source.js +19 -16
- package/dist/cli.js +0 -2
- package/dist/commands/baseline.js +4 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/coverage-audit.js +9 -1
- package/dist/commands/explain-handler.js +25 -23
- package/dist/commands/fetch-docs.js +3 -2
- package/dist/commands/generate-configs.js +1 -1
- package/dist/commands/init.d.ts +6 -4
- package/dist/commands/init.js +302 -23
- package/dist/commands/interactive.js +11 -7
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +16 -6
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +4 -2
- package/dist/commands/pr-comment.js +1 -1
- package/dist/commands/publish.js +2 -2
- package/dist/commands/readiness-report.js +13 -6
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +13 -1
- package/dist/composition-root.js +99 -4
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +28 -8
- package/dist/orchestration/steps/calculate-scores-step.js +24 -11
- package/dist/orchestration/steps/fetch-docs-step.js +8 -7
- package/dist/orchestration/steps/gap-analysis-step.js +8 -7
- package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
- package/dist/orchestration/steps/generate-configs-step.js +261 -51
- package/dist/orchestration/steps/grader-consistency-step.js +7 -4
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/readiness-step.js +5 -6
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
- package/dist/orchestration/steps/run-eval-step.js +8 -7
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/cache.js +36 -8
- package/dist/pipeline/calculate-scores.d.ts +2 -4
- package/dist/pipeline/calculate-scores.js +43 -113
- package/dist/pipeline/checks.js +2 -2
- package/dist/pipeline/compare.js +8 -8
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +392 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +404 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
- package/dist/pipeline/compiler/assertion-mapper.js +175 -0
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
- package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
- package/dist/pipeline/compiler/config-loader.d.ts +56 -0
- package/dist/pipeline/compiler/config-loader.js +111 -0
- package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
- package/dist/pipeline/compiler/fixture-resolver.js +113 -0
- package/dist/pipeline/compiler/hash.d.ts +11 -0
- package/dist/pipeline/compiler/hash.js +18 -0
- package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
- package/dist/pipeline/compiler/ignore-fields.js +113 -0
- package/dist/pipeline/compiler/index.d.ts +29 -0
- package/dist/pipeline/compiler/index.js +45 -0
- package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
- package/dist/pipeline/compiler/literacy-bridge.js +172 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +15 -0
- package/dist/pipeline/compiler/mode-handlers/index.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/index.d.ts +9 -0
- package/dist/pipeline/compiler/presets/index.js +8 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +42 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.js +208 -0
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
- package/dist/pipeline/compiler/provider-assembler.js +137 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
- package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
- package/dist/pipeline/compiler/sandbox/index.js +11 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
- package/dist/pipeline/compiler/scoring-bridge.js +114 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
- package/dist/pipeline/compiler/task-graph-builder.js +291 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
- package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
- package/dist/pipeline/compiler/telemetry/index.js +19 -0
- package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
- package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
- package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
- package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
- package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
- package/dist/pipeline/compiler/variable-resolver.js +115 -0
- package/dist/pipeline/coverage-audit.d.ts +15 -5
- package/dist/pipeline/coverage-audit.js +41 -22
- package/dist/pipeline/eval-constants.d.ts +16 -6
- package/dist/pipeline/eval-constants.js +25 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -9
- package/dist/pipeline/expand-tasks.d.ts +19 -10
- package/dist/pipeline/expand-tasks.js +34 -28
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +2 -2
- package/dist/pipeline/generate-configs.d.ts +22 -4
- package/dist/pipeline/generate-configs.js +53 -24
- package/dist/pipeline/grader-api.d.ts +3 -3
- package/dist/pipeline/grader-api.js +5 -12
- package/dist/pipeline/grader-compare-runner.js +20 -27
- package/dist/pipeline/grader-comparison.d.ts +4 -8
- package/dist/pipeline/grader-comparison.js +11 -17
- package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
- package/dist/pipeline/grader-consistency-runner.js +16 -20
- package/dist/pipeline/grader-consistency.d.ts +6 -10
- package/dist/pipeline/grader-consistency.js +13 -32
- package/dist/pipeline/grader-sensitivity-runner.js +7 -5
- package/dist/pipeline/grader-sensitivity.d.ts +2 -6
- package/dist/pipeline/grader-sensitivity.js +10 -10
- package/dist/pipeline/grader-validate-runner.js +7 -5
- package/dist/pipeline/grader-validation.d.ts +2 -6
- package/dist/pipeline/grader-validation.js +14 -22
- package/dist/pipeline/map-request-to-config.js +7 -1
- package/dist/pipeline/mirror-repo-tasks.d.ts +13 -13
- package/dist/pipeline/mirror-repo-tasks.js +22 -21
- package/dist/pipeline/normalize-mode.d.ts +49 -0
- package/dist/pipeline/normalize-mode.js +64 -0
- package/dist/pipeline/plan.d.ts +5 -2
- package/dist/pipeline/plan.js +134 -78
- package/dist/pipeline/pr-comment.js +2 -0
- package/dist/pipeline/profile-resolution.d.ts +22 -14
- package/dist/pipeline/profile-resolution.js +41 -19
- package/dist/pipeline/provenance.d.ts +2 -2
- package/dist/pipeline/provenance.js +12 -17
- package/dist/pipeline/release-report.js +4 -4
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/rubric-loader.d.ts +20 -0
- package/dist/pipeline/rubric-loader.js +37 -0
- package/dist/pipeline/validate.d.ts +4 -4
- package/dist/pipeline/validate.js +64 -53
- package/dist/schedules/loader.js +18 -8
- package/dist/scripts/migrate-task-mode.d.ts +24 -0
- package/dist/scripts/migrate-task-mode.js +85 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +15 -15
- package/dist/sinks/loader.js +5 -7
- package/dist/sources.d.ts +7 -7
- package/dist/sources.js +22 -24
- package/dist/webhook/dispatch.js +2 -1
- package/package.json +15 -4
- package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
- package/tasks/literacy/frameworks.task.ts +128 -0
- package/tasks/literacy/functions.task.ts +69 -0
- package/tasks/literacy/groq.task.ts +258 -0
- package/tasks/literacy/nextjs-live.task.ts +75 -0
- package/tasks/literacy/studio-setup.task.ts +131 -0
- package/tasks/literacy/visual-editing.task.ts +146 -0
- package/config/features.yaml +0 -116
- package/config/models.yaml +0 -116
- package/config/prompts.yaml +0 -75
- package/config/rubrics.yaml +0 -81
- package/config/schedules.yaml +0 -43
- package/config/sinks.yaml +0 -54
- package/config/sources.yaml +0 -51
- package/config/thresholds.yaml +0 -49
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
package/dist/commands/init.js
CHANGED
|
@@ -5,12 +5,14 @@
|
|
|
5
5
|
* task files. The generated files are ready-to-edit starting points —
|
|
6
6
|
* not live evaluation tasks.
|
|
7
7
|
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
8
|
+
* TypeScript output (default) uses define* helpers from @sanity/ailf-core
|
|
9
|
+
* for full IDE autocomplete and type checking. YAML output preserves
|
|
10
|
+
* inline comments from the source files. JSON output is a plain
|
|
11
|
+
* serialization of the parsed data.
|
|
11
12
|
*
|
|
12
13
|
* Usage:
|
|
13
|
-
* ailf init #
|
|
14
|
+
* ailf init # TypeScript output (default)
|
|
15
|
+
* ailf init --output-format yaml # YAML output
|
|
14
16
|
* ailf init --output-format json # JSON output
|
|
15
17
|
* ailf init --force # overwrite existing files
|
|
16
18
|
* ailf init --path ./my-dir # target a specific directory
|
|
@@ -18,16 +20,17 @@
|
|
|
18
20
|
import { Command } from "commander";
|
|
19
21
|
import { existsSync, mkdirSync, writeFileSync } from "fs";
|
|
20
22
|
import { resolve, relative } from "path";
|
|
21
|
-
import { ailfConfigData, ailfConfigYaml, taskYamlFiles, TASK_FILE_NAMES, allTaskData, workflowYaml, } from "../_vendor/ailf-core/index.js";
|
|
23
|
+
import { ailfConfigData, ailfConfigYaml, ailfConfigTs, taskYamlFiles, taskTsFiles, TASK_FILE_NAMES, TASK_TS_FILE_NAMES, allTaskData, workflowYaml, } from "../_vendor/ailf-core/index.js";
|
|
22
24
|
// ---------------------------------------------------------------------------
|
|
23
25
|
// Command factory
|
|
24
26
|
// ---------------------------------------------------------------------------
|
|
25
27
|
export function createInitCommand() {
|
|
26
28
|
return new Command("init")
|
|
27
29
|
.description("Initialize a directory for AI Literacy Framework evaluation")
|
|
28
|
-
.option("--output-format <fmt>", 'Output format for generated files: "
|
|
30
|
+
.option("--output-format <fmt>", 'Output format for generated files: "ts" (default), "yaml", or "json"', "ts")
|
|
29
31
|
.option("--force", "Overwrite existing files", false)
|
|
30
32
|
.option("--path <dir>", "Target directory (default: current directory)", ".")
|
|
33
|
+
.option("--mode <mode>", "Scaffold for a specific mode: literacy, mcp-server, custom (default: all modes)")
|
|
31
34
|
.action(async (opts) => {
|
|
32
35
|
await runInit(opts);
|
|
33
36
|
});
|
|
@@ -55,8 +58,13 @@ function rel(from, to) {
|
|
|
55
58
|
// Init logic
|
|
56
59
|
// ---------------------------------------------------------------------------
|
|
57
60
|
async function runInit(opts) {
|
|
58
|
-
const
|
|
59
|
-
|
|
61
|
+
const validFormats = new Set(["ts", "yaml", "json"]);
|
|
62
|
+
if (!validFormats.has(opts.outputFormat)) {
|
|
63
|
+
console.error(` ✗ Invalid output format "${opts.outputFormat}". Valid options: ts, yaml, json`);
|
|
64
|
+
process.exitCode = 1;
|
|
65
|
+
return;
|
|
66
|
+
}
|
|
67
|
+
const format = opts.outputFormat;
|
|
60
68
|
const force = opts.force;
|
|
61
69
|
// Resolve target from the caller's actual working directory
|
|
62
70
|
const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
|
|
@@ -72,24 +80,103 @@ async function runInit(opts) {
|
|
|
72
80
|
console.log(` ✓ Created ${rel(targetDir, tasksDir)}/`);
|
|
73
81
|
const written = [];
|
|
74
82
|
const skipped = [];
|
|
75
|
-
// 2. Write
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
83
|
+
// 2. Write project config
|
|
84
|
+
if (format === "ts") {
|
|
85
|
+
// TypeScript: ailf.config.ts with defineConfig helper
|
|
86
|
+
const configPath = resolve(ailfDir, "ailf.config.ts");
|
|
87
|
+
if (writeIfNew(configPath, ailfConfigTs, force)) {
|
|
88
|
+
written.push(rel(targetDir, configPath));
|
|
89
|
+
}
|
|
90
|
+
else {
|
|
91
|
+
skipped.push(rel(targetDir, configPath));
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
else if (format === "yaml") {
|
|
95
|
+
// YAML: raw string passthrough (preserves comments)
|
|
96
|
+
const configPath = resolve(ailfDir, "config.yaml");
|
|
97
|
+
if (writeIfNew(configPath, ailfConfigYaml, force)) {
|
|
98
|
+
written.push(rel(targetDir, configPath));
|
|
99
|
+
}
|
|
100
|
+
else {
|
|
101
|
+
skipped.push(rel(targetDir, configPath));
|
|
102
|
+
}
|
|
84
103
|
}
|
|
85
104
|
else {
|
|
86
|
-
|
|
105
|
+
// JSON: serialize the parsed data
|
|
106
|
+
const configPath = resolve(ailfDir, "config.json");
|
|
107
|
+
const content = JSON.stringify(ailfConfigData, null, 2) + "\n";
|
|
108
|
+
if (writeIfNew(configPath, content, force)) {
|
|
109
|
+
written.push(rel(targetDir, configPath));
|
|
110
|
+
}
|
|
111
|
+
else {
|
|
112
|
+
skipped.push(rel(targetDir, configPath));
|
|
113
|
+
}
|
|
87
114
|
}
|
|
88
115
|
// 3. Write example tasks to .ailf/tasks/
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
if (format === "
|
|
92
|
-
//
|
|
116
|
+
const modeFilter = opts.mode;
|
|
117
|
+
const isCustomMode = modeFilter === "custom";
|
|
118
|
+
if (format === "ts") {
|
|
119
|
+
// TypeScript: *.task.ts files with defineTask helper
|
|
120
|
+
// Default (no --mode): write literacy examples + draft MCP/probe examples
|
|
121
|
+
// --mode literacy: only literacy examples
|
|
122
|
+
// --mode mcp-server: only MCP examples (active, not draft)
|
|
123
|
+
// --mode custom: only a custom example task
|
|
124
|
+
if (!modeFilter || modeFilter === "literacy") {
|
|
125
|
+
for (const stem of TASK_TS_FILE_NAMES) {
|
|
126
|
+
const taskPath = resolve(tasksDir, `${stem}.task.ts`);
|
|
127
|
+
const content = taskTsFiles[stem];
|
|
128
|
+
if (writeIfNew(taskPath, content, force)) {
|
|
129
|
+
written.push(rel(targetDir, taskPath));
|
|
130
|
+
}
|
|
131
|
+
else {
|
|
132
|
+
skipped.push(rel(targetDir, taskPath));
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
// Draft examples for other modes (default init only)
|
|
137
|
+
if (!modeFilter) {
|
|
138
|
+
const mcpPath = resolve(tasksDir, "example-mcp-tool-usage.task.ts");
|
|
139
|
+
if (writeIfNew(mcpPath, MCP_DRAFT_TASK_TS, force)) {
|
|
140
|
+
written.push(rel(targetDir, mcpPath));
|
|
141
|
+
}
|
|
142
|
+
else {
|
|
143
|
+
skipped.push(rel(targetDir, mcpPath));
|
|
144
|
+
}
|
|
145
|
+
const probePath = resolve(tasksDir, "example-knowledge-probe.task.ts");
|
|
146
|
+
if (writeIfNew(probePath, PROBE_DRAFT_TASK_TS, force)) {
|
|
147
|
+
written.push(rel(targetDir, probePath));
|
|
148
|
+
}
|
|
149
|
+
else {
|
|
150
|
+
skipped.push(rel(targetDir, probePath));
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
// MCP-only init
|
|
154
|
+
if (modeFilter === "mcp-server") {
|
|
155
|
+
const mcpContent = MCP_DRAFT_TASK_TS.replace('status: "draft",', '// status: "active", // Activated — this task runs in evaluations');
|
|
156
|
+
const mcpPath = resolve(tasksDir, "example-mcp-tool-usage.task.ts");
|
|
157
|
+
if (writeIfNew(mcpPath, mcpContent, force)) {
|
|
158
|
+
written.push(rel(targetDir, mcpPath));
|
|
159
|
+
}
|
|
160
|
+
else {
|
|
161
|
+
skipped.push(rel(targetDir, mcpPath));
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
// Custom preset scaffold
|
|
165
|
+
if (isCustomMode) {
|
|
166
|
+
const customTaskPath = resolve(tasksDir, "example-custom.task.ts");
|
|
167
|
+
// Reuse the GROQ literacy task as a starting point
|
|
168
|
+
if (taskTsFiles[TASK_TS_FILE_NAMES[0]]) {
|
|
169
|
+
if (writeIfNew(customTaskPath, taskTsFiles[TASK_TS_FILE_NAMES[0]], force)) {
|
|
170
|
+
written.push(rel(targetDir, customTaskPath));
|
|
171
|
+
}
|
|
172
|
+
else {
|
|
173
|
+
skipped.push(rel(targetDir, customTaskPath));
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
else if (format === "yaml") {
|
|
179
|
+
// YAML: raw string passthrough (preserves comments)
|
|
93
180
|
for (const stem of TASK_FILE_NAMES) {
|
|
94
181
|
const taskPath = resolve(tasksDir, `${stem}.yaml`);
|
|
95
182
|
const content = taskYamlFiles[stem];
|
|
@@ -118,6 +205,16 @@ async function runInit(opts) {
|
|
|
118
205
|
}
|
|
119
206
|
}
|
|
120
207
|
}
|
|
208
|
+
// 3b. Write custom preset scaffold (--mode custom only)
|
|
209
|
+
if (isCustomMode && format === "ts") {
|
|
210
|
+
const presetPath = resolve(ailfDir, "preset.ts");
|
|
211
|
+
if (writeIfNew(presetPath, CUSTOM_PRESET_TS, force)) {
|
|
212
|
+
written.push(rel(targetDir, presetPath));
|
|
213
|
+
}
|
|
214
|
+
else {
|
|
215
|
+
skipped.push(rel(targetDir, presetPath));
|
|
216
|
+
}
|
|
217
|
+
}
|
|
121
218
|
// 4. Write .gitignore in .ailf/ (keep results out of version control)
|
|
122
219
|
const gitignorePath = resolve(ailfDir, ".gitignore");
|
|
123
220
|
const gitignoreContent = `# AILF generated files\nresults/\ncontexts/\n`;
|
|
@@ -150,18 +247,25 @@ async function runInit(opts) {
|
|
|
150
247
|
console.log(` ⊘ Skipped ${f} (already exists, use --force to overwrite)`);
|
|
151
248
|
}
|
|
152
249
|
}
|
|
250
|
+
const taskExt = format === "ts" ? ".task.ts" : format === "yaml" ? ".yaml" : ".json";
|
|
153
251
|
console.log();
|
|
154
252
|
console.log(" Next steps:");
|
|
155
253
|
console.log();
|
|
156
254
|
console.log(` 1. Edit the example tasks in ${rel(targetDir, tasksDir)}/ — update`);
|
|
157
255
|
console.log(" slugs and prompts for your documentation");
|
|
158
|
-
console.log(
|
|
256
|
+
console.log(` 2. Validate locally: npx @sanity/ailf@latest validate-tasks .ailf/tasks/`);
|
|
159
257
|
console.log(" 3. Add two GitHub Actions secrets");
|
|
160
258
|
console.log(" (Settings → Secrets and variables → Actions):");
|
|
161
259
|
console.log(" • AILF_API_KEY — your API key (starts with ailf_live_sk_)");
|
|
162
260
|
console.log(" • NPM_TOKEN — npm token with read access to @sanity scope");
|
|
163
261
|
console.log(" 4. Push — the workflow at .github/workflows/ailf-eval.yml runs");
|
|
164
262
|
console.log(" automatically on PRs");
|
|
263
|
+
if (format === "ts") {
|
|
264
|
+
console.log();
|
|
265
|
+
console.log(` 💡 TypeScript tasks (${taskExt}) give you full IDE autocomplete`);
|
|
266
|
+
console.log(" via defineTask() from @sanity/ailf-core. YAML and JSON are");
|
|
267
|
+
console.log(" also supported — re-run with --output-format yaml if preferred.");
|
|
268
|
+
}
|
|
165
269
|
console.log();
|
|
166
270
|
console.log(" 🔑 Retrieve secrets from 1Password (Sanity employees):");
|
|
167
271
|
console.log();
|
|
@@ -177,3 +281,178 @@ async function runInit(opts) {
|
|
|
177
281
|
console.log(" AILF_API_KEY=... npx @sanity/ailf@latest pipeline --remote --debug");
|
|
178
282
|
console.log();
|
|
179
283
|
}
|
|
284
|
+
// ---------------------------------------------------------------------------
|
|
285
|
+
// Draft example templates for non-literacy modes
|
|
286
|
+
// ---------------------------------------------------------------------------
|
|
287
|
+
/**
 * Scaffold text for the example MCP-server task that `runInit` writes to
 * `example-mcp-tool-usage.task.ts` in the user's tasks directory.
 *
 * Notes for maintainers:
 * - The text is emitted verbatim into the *user's* project, so the import
 *   specifier must be the package name. A path into this package's own
 *   `_vendor/` directory does not exist relative to the scaffolded file.
 * - `runInit` activates the task in `--mode mcp-server` by string-replacing
 *   the exact literal `status: "draft",` — keep that line intact.
 */
const MCP_DRAFT_TASK_TS = `/**
 * Example Task: MCP Server tool-use evaluation (DRAFT).
 *
 * Tests whether an LLM can correctly discover and invoke Sanity MCP server
 * tools. Connects to the hosted Sanity MCP server at https://mcp.sanity.io.
 *
 * Prerequisites:
 * - A Sanity API token with read access (for token-based auth)
 * - Or: OAuth authentication will be prompted on first connect
 *
 * Authentication options:
 * 1. Token-based: set SANITY_API_TOKEN env var
 * 2. OAuth: the server prompts for login on first connect
 *
 * Setup: npx sanity@latest mcp configure
 * Docs: https://www.sanity.io/docs/ai/mcp-server
 *
 * This task is a DRAFT — it won't run unless activated or explicitly targeted.
 * To activate: change status to "active" or remove the status field.
 */

import { defineTask } from "@sanity/ailf-core"

export default defineTask({
  mode: "mcp-server",
  id: "example-mcp-tool-usage",
  title: "MCP tool discovery and invocation",
  description: "Example — tests Sanity MCP server tool-use (draft)",
  area: "mcp",

  // ── Server configuration ────────────────────────────────────
  // The Sanity MCP server is hosted remotely at https://mcp.sanity.io.
  // Authentication via API token header or OAuth.
  //
  // For token auth, set SANITY_API_TOKEN in your environment.
  serverConfig: {
    transport: "streamable-http",
    url: "https://mcp.sanity.io",
    env: {
      SANITY_API_TOKEN: process.env.SANITY_API_TOKEN ?? "",
    },
  },

  prompt: {
    text: \`Use the available MCP tools to query all documents of type "article"
in the Sanity dataset. Return the title and slug for each document.
Limit results to 5 documents.\`,
  },

  assertions: [
    {
      type: "llm-rubric",
      template: "mcp-input-validation",
      criteria: [
        "Correctly identifies the query_documents tool",
        "Passes a valid GROQ query to filter by document type",
        "Requests only the needed fields (title, slug)",
      ],
    },
  ],

  status: "draft",
})
`;
|
|
351
|
+
/**
 * Scaffold text for the example knowledge-probe task that `runInit` writes to
 * `example-knowledge-probe.task.ts` in the user's tasks directory.
 *
 * The text is emitted verbatim into the *user's* project, so the import
 * specifier must be the package name. A path into this package's own
 * `_vendor/` directory does not exist relative to the scaffolded file.
 */
const PROBE_DRAFT_TASK_TS = `/**
 * Example Task: Knowledge probe baseline (DRAFT).
 *
 * Tests what the model knows about a topic without providing documentation.
 * Used to establish a baseline for comparison with literacy evaluations.
 * This task is a DRAFT — it won't run unless activated or explicitly targeted.
 *
 * To activate: change status to "active" or remove the status field.
 */

import { defineTask } from "@sanity/ailf-core"

export default defineTask({
  mode: "knowledge-probe",
  id: "example-knowledge-probe",
  title: "Model knowledge of GROQ syntax",
  description: "Example — probes baseline model knowledge (draft)",
  area: "groq",

  prompt: {
    text: \`Explain the GROQ query language used by Sanity. Cover:
1. Basic query syntax and projections
2. How to filter and sort results
3. Common patterns for fetching related documents
Provide working code examples.\`,
  },

  assertions: [
    {
      type: "llm-rubric",
      template: "task-completion",
      criteria: [
        "Demonstrates understanding of GROQ query syntax",
        "Shows filtering and projection patterns",
        "Code examples use valid GROQ syntax",
      ],
    },
  ],

  status: "draft",
})
`;
|
|
393
|
+
/**
 * Scaffold text for the custom preset that `runInit` writes to `preset.ts`
 * inside the `.ailf` directory when `--mode custom` is combined with the
 * TypeScript output format.
 *
 * The text is emitted verbatim into the *user's* project, so the import
 * specifier must be the package name. A path into this package's own
 * `_vendor/` directory does not exist relative to the scaffolded file.
 */
const CUSTOM_PRESET_TS = `/**
 * Custom preset — your domain-specific evaluation configuration.
 *
 * This preset targets the "literacy" mode base and inherits its evaluation
 * methodology (rubrics, scoring profiles, prompt templates). You only need
 * to provide domain-specific configuration: where your docs live, what
 * features to track, and how to fetch documentation.
 *
 * To use a different mode (e.g., "mcp-server"), change the mode field.
 * Available built-in modes: literacy, mcp-server, knowledge-probe, agent-harness.
 *
 * @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/PRESETS.md
 */

import { definePreset } from "@sanity/ailf-core"

export default definePreset({
  name: "my-docs-evaluation",
  manifest: {
    name: "my-docs-evaluation",
    version: "1.0.0",
    description: "Documentation literacy evaluation for my project.",
    pluginApiVersion: 1,
  },

  // Target the literacy mode base — inherits rubrics, scoring, prompts.
  // Change to "mcp-server" to evaluate MCP tool usage instead.
  mode: "literacy",

  // Source definitions — where your documentation lives.
  sourceDefs: [
    {
      name: "production",
      baseUrl: "https://docs.example.com",
      // projectId: "your-sanity-project-id",
      // dataset: "production",
    },
  ],

  // Feature registry — what product features you're tracking coverage for.
  featureDefs: {
    features: [
      {
        id: "getting-started",
        name: "Getting Started Guide",
        sections: ["guides"],
        status: "covered",
        area: "guides",
        priority: "critical",
      },
      {
        id: "api-reference",
        name: "API Reference",
        sections: ["reference"],
        status: "uncovered",
        priority: "high",
      },
    ],
  },

  // Optional: override mode base rubrics, scoring, or prompts here.
  // rubricTemplates: [{ ... }],
  // scoringProfiles: { ... },
  // promptTemplates: { ... },
})
`;
|
|
@@ -9,6 +9,10 @@
|
|
|
9
9
|
* Uses @inquirer/prompts for a clean, modern terminal UI.
|
|
10
10
|
*/
|
|
11
11
|
import { Command } from "commander";
|
|
12
|
+
import { LiteracyVariant } from "../pipeline/normalize-mode.js";
|
|
13
|
+
// CLI command name for the baseline snapshot management subcommand.
|
|
14
|
+
// Defined as a constant to avoid scattering the literal string across routing code.
|
|
15
|
+
const BASELINE_CMD = "baseline";
|
|
12
16
|
export function createInteractiveCommand() {
|
|
13
17
|
return new Command("interactive")
|
|
14
18
|
.description("Guided wizard for common evaluation workflows")
|
|
@@ -65,7 +69,7 @@ async function runInteractiveWizard() {
|
|
|
65
69
|
{
|
|
66
70
|
description: "Save, compare, or list historical score snapshots",
|
|
67
71
|
name: "Manage baselines",
|
|
68
|
-
value:
|
|
72
|
+
value: BASELINE_CMD,
|
|
69
73
|
},
|
|
70
74
|
{
|
|
71
75
|
description: "Weekly evaluation trends and area summaries",
|
|
@@ -93,7 +97,7 @@ async function runInteractiveWizard() {
|
|
|
93
97
|
});
|
|
94
98
|
return { args: dryRun ? ["--dry-run"] : [], command: "weekly-digest" };
|
|
95
99
|
}
|
|
96
|
-
if (workflow ===
|
|
100
|
+
if (workflow === BASELINE_CMD) {
|
|
97
101
|
const subcommand = await select({
|
|
98
102
|
choices: [
|
|
99
103
|
{ name: "Save current scores", value: "save" },
|
|
@@ -102,7 +106,7 @@ async function runInteractiveWizard() {
|
|
|
102
106
|
],
|
|
103
107
|
message: "Baseline operation:",
|
|
104
108
|
});
|
|
105
|
-
return { args: [subcommand], command:
|
|
109
|
+
return { args: [subcommand], command: BASELINE_CMD };
|
|
106
110
|
}
|
|
107
111
|
if (workflow === "grader") {
|
|
108
112
|
const subcommand = await select({
|
|
@@ -140,22 +144,22 @@ async function runInteractiveWizard() {
|
|
|
140
144
|
{
|
|
141
145
|
description: "Evaluate with pre-fetched documentation context",
|
|
142
146
|
name: "Baseline (with docs vs without docs)",
|
|
143
|
-
value:
|
|
147
|
+
value: LiteracyVariant.STANDARD,
|
|
144
148
|
},
|
|
145
149
|
{
|
|
146
150
|
description: "Baseline + record HTTP request patterns",
|
|
147
151
|
name: "Observed (instrumented)",
|
|
148
|
-
value:
|
|
152
|
+
value: LiteracyVariant.OBSERVED,
|
|
149
153
|
},
|
|
150
154
|
{
|
|
151
155
|
description: "Agent searches for docs itself via web tools",
|
|
152
156
|
name: "Agentic (agent-driven retrieval)",
|
|
153
|
-
value:
|
|
157
|
+
value: LiteracyVariant.AGENTIC,
|
|
154
158
|
},
|
|
155
159
|
],
|
|
156
160
|
message: "Evaluation mode:",
|
|
157
161
|
});
|
|
158
|
-
if (mode !==
|
|
162
|
+
if (mode !== LiteracyVariant.STANDARD) {
|
|
159
163
|
args.push("--mode", mode);
|
|
160
164
|
}
|
|
161
165
|
// Step 3: Area scoping
|
|
@@ -31,6 +31,8 @@ export interface ResolvedOptions {
|
|
|
31
31
|
headerArgs: string[];
|
|
32
32
|
impactSummary?: ImpactSummary;
|
|
33
33
|
mode: EvalMode;
|
|
34
|
+
/** Literacy variant — taken from an explicit --variant flag, otherwise inferred when the user passes a legacy mode name */
|
|
35
|
+
variant?: string;
|
|
34
36
|
noAutoScope: boolean;
|
|
35
37
|
noCache: boolean;
|
|
36
38
|
noRemoteCache: boolean;
|
|
@@ -14,6 +14,7 @@ import { existsSync, readFileSync, writeFileSync } from "fs";
|
|
|
14
14
|
import { dirname, resolve } from "path";
|
|
15
15
|
import { fileURLToPath } from "url";
|
|
16
16
|
import { classifyUrls } from "../pipeline/classify-url.js";
|
|
17
|
+
import { normalizeMode } from "../pipeline/normalize-mode.js";
|
|
17
18
|
import { assessImpact, buildReverseMapping, } from "../pipeline/reverse-mapping.js";
|
|
18
19
|
import { buildAppContext } from "../orchestration/build-app-context.js";
|
|
19
20
|
import { buildStepSequence } from "../orchestration/build-step-sequence.js";
|
|
@@ -23,9 +24,8 @@ import { parseRepoConfig, } from "../adapters/task-sources/repo-schemas.js";
|
|
|
23
24
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
24
25
|
const ROOT = resolve(__dirname, "..", "..");
|
|
25
26
|
// ---------------------------------------------------------------------------
|
|
26
|
-
// Valid
|
|
27
|
+
// Valid search modes
|
|
27
28
|
// ---------------------------------------------------------------------------
|
|
28
|
-
const VALID_MODES = ["baseline", "observed", "agentic", "full"];
|
|
29
29
|
const VALID_SEARCH_MODES = ["open", "origin-only", "off"];
|
|
30
30
|
/**
|
|
31
31
|
* Pure option resolution — computes ResolvedOptions from CLI flags without
|
|
@@ -36,10 +36,19 @@ const VALID_SEARCH_MODES = ["open", "origin-only", "off"];
|
|
|
36
36
|
export function computeResolvedOptions(opts) {
|
|
37
37
|
// Resolve paths relative to the caller's cwd, not the eval package root
|
|
38
38
|
const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
|
|
39
|
-
// Validate mode
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
39
|
+
// Validate + normalize mode via the single boundary function.
|
|
40
|
+
// normalizeMode() maps legacy variant names (baseline, agentic, etc.)
|
|
41
|
+
// to canonical mode "literacy" + variant, and throws on invalid input.
|
|
42
|
+
let mode;
|
|
43
|
+
let variant;
|
|
44
|
+
try {
|
|
45
|
+
const normalized = normalizeMode(opts.mode);
|
|
46
|
+
mode = normalized.mode;
|
|
47
|
+
// Explicit --variant flag takes precedence over what normalizeMode inferred
|
|
48
|
+
variant = opts.variant ?? normalized.variant;
|
|
49
|
+
}
|
|
50
|
+
catch (err) {
|
|
51
|
+
console.error(`❌ ${err instanceof Error ? err.message : String(err)}`);
|
|
43
52
|
process.exit(1);
|
|
44
53
|
}
|
|
45
54
|
// Debug options — any sub-flag (--debug-n, --debug-pattern, --debug-sample)
|
|
@@ -220,6 +229,7 @@ export function computeResolvedOptions(opts) {
|
|
|
220
229
|
headerArgs,
|
|
221
230
|
impactSummary,
|
|
222
231
|
mode,
|
|
232
|
+
variant,
|
|
223
233
|
noAutoScope: opts.autoScope === false,
|
|
224
234
|
noCache: !opts.cache,
|
|
225
235
|
noRemoteCache: opts.remoteCache === false,
|
|
@@ -8,11 +8,13 @@
|
|
|
8
8
|
* @see docs/CLI.md for the full flag reference.
|
|
9
9
|
*/
|
|
10
10
|
import { Command } from "commander";
|
|
11
|
+
import { LiteracyVariant } from "../pipeline/normalize-mode.js";
|
|
11
12
|
import { addAgenticOptions, addDebugOptions, addSanitySourceOptions, } from "./shared/options.js";
|
|
12
13
|
export function createPipelineCommand() {
|
|
13
14
|
const cmd = new Command("pipeline")
|
|
14
15
|
.description("Run the full evaluation pipeline")
|
|
15
|
-
.option("-m, --mode <mode>", "Evaluation mode:
|
|
16
|
+
.option("-m, --mode <mode>", "Evaluation mode: literacy (default), mcp-server, agent-harness, knowledge-probe, custom. Legacy aliases (baseline, agentic, observed, full) are accepted and normalized to literacy + variant.", LiteracyVariant.FULL)
|
|
17
|
+
.option("--variant <variant>", "Literacy variant: full (default — standard + agentic), baseline (standard only), agentic (agentic only), observed. Only applies to --mode literacy.")
|
|
16
18
|
.option("-s, --source <name>", "Documentation source name (from sources.yaml)")
|
|
17
19
|
.option("-n, --dry-run", "Validate configuration only, no execution", false)
|
|
18
20
|
.option("--skip-fetch", "Reuse cached documentation contexts", false)
|
|
@@ -44,7 +46,7 @@ export function createPipelineCommand() {
|
|
|
44
46
|
.option("--publish-tag <tag>", "Label for published report")
|
|
45
47
|
.option("--report-dataset <name>", "Sanity dataset for report store")
|
|
46
48
|
.option("--report-project <id>", "Sanity project ID for report store")
|
|
47
|
-
.option("--config <path>", "Load pipeline config from a
|
|
49
|
+
.option("--config <path>", "Load pipeline config from a TS/JS/YAML/JSON file (overrides most CLI flags)")
|
|
48
50
|
.option("-o, --output <path>", "Write PR comment markdown to file")
|
|
49
51
|
.option("--promptfoo-url <url>", "Promptfoo share URL for report")
|
|
50
52
|
.option("--task-source <type>", "Task definition source: content-lake (default — Sanity Content Lake), repo (repo tasks only, no Content Lake merge), yaml (tasks/*.yaml files, legacy)", "content-lake")
|
package/dist/commands/publish.js
CHANGED
|
@@ -52,7 +52,7 @@ export function createPublishCommand() {
|
|
|
52
52
|
*/
|
|
53
53
|
function buildProvenanceFromSummary(summary) {
|
|
54
54
|
const areas = summary.scores.map((s) => s.feature);
|
|
55
|
-
const mode = (process.env.EVAL_MODE ?? "
|
|
55
|
+
const mode = (process.env.EVAL_MODE ?? "literacy");
|
|
56
56
|
const source = {
|
|
57
57
|
baseUrl: summary.source?.baseUrl ?? "https://www.sanity.io/docs",
|
|
58
58
|
dataset: summary.source?.dataset ?? process.env.SANITY_DATASET ?? "next",
|
|
@@ -83,7 +83,7 @@ async function runPublishCommand(summaryPath, opts) {
|
|
|
83
83
|
compareEnabled: false,
|
|
84
84
|
discoveryReportEnabled: false,
|
|
85
85
|
gapAnalysisEnabled: false,
|
|
86
|
-
mode: "
|
|
86
|
+
mode: "literacy",
|
|
87
87
|
noAutoScope: false,
|
|
88
88
|
noCache: true,
|
|
89
89
|
noRemoteCache: true,
|
|
@@ -10,14 +10,14 @@ import { Command } from "commander";
|
|
|
10
10
|
import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
|
|
11
11
|
import { dirname, join, resolve } from "path";
|
|
12
12
|
import { fileURLToPath } from "url";
|
|
13
|
-
import {
|
|
13
|
+
import { ConfigNotFoundError, loadConfigFile, } from "../pipeline/compiler/config-loader.js";
|
|
14
14
|
import { formatReadinessMarkdown, generateReadinessReport, } from "../pipeline/readiness-report.js";
|
|
15
15
|
import { ThresholdConfigSchema, } from "../pipeline/schemas.js";
|
|
16
16
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
17
17
|
const ROOT = resolve(__dirname, "..", "..");
|
|
18
18
|
const SCORE_SUMMARY_PATH = join(ROOT, "results", "latest", "score-summary.json");
|
|
19
19
|
const GAP_ANALYSIS_PATH = join(ROOT, "results", "latest", "gap-analysis.json");
|
|
20
|
-
|
|
20
|
+
// thresholds loaded via loadConfigFile below
|
|
21
21
|
const BASELINES_DIR = join(ROOT, "results", "baselines");
|
|
22
22
|
export function createReadinessReportCommand() {
|
|
23
23
|
return new Command("readiness-report")
|
|
@@ -33,12 +33,19 @@ export function createReadinessReportCommand() {
|
|
|
33
33
|
}
|
|
34
34
|
const scoreSummary = JSON.parse(readFileSync(SCORE_SUMMARY_PATH, "utf-8"));
|
|
35
35
|
// Load threshold config
|
|
36
|
-
|
|
37
|
-
|
|
36
|
+
let parsedThresholds;
|
|
37
|
+
try {
|
|
38
|
+
parsedThresholds = loadConfigFile("thresholds", ROOT).data;
|
|
39
|
+
}
|
|
40
|
+
catch (err) {
|
|
41
|
+
if (err instanceof ConfigNotFoundError) {
|
|
42
|
+
console.error("❌ Threshold config not found in config/.");
|
|
43
|
+
}
|
|
44
|
+
else {
|
|
45
|
+
console.error(`❌ Failed to load threshold config: ${err instanceof Error ? err.message : err}`);
|
|
46
|
+
}
|
|
38
47
|
process.exit(1);
|
|
39
48
|
}
|
|
40
|
-
const rawThresholds = readFileSync(THRESHOLDS_PATH, "utf-8");
|
|
41
|
-
const parsedThresholds = load(rawThresholds);
|
|
42
49
|
const thresholdResult = ThresholdConfigSchema.safeParse(parsedThresholds);
|
|
43
50
|
if (!thresholdResult.success) {
|
|
44
51
|
const messages = thresholdResult.error.issues
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* validate-tasks command — standalone validation of
|
|
2
|
+
* validate-tasks command — standalone validation of task files.
|
|
3
3
|
*
|
|
4
|
-
* Validates .ailf/tasks/*.yaml files against the
|
|
4
|
+
* Validates .ailf/tasks/*.yaml files against the CanonicalTaskSchema without
|
|
5
5
|
* running the full pipeline. Useful for pre-commit hooks and CI checks
|
|
6
6
|
* in external repos.
|
|
7
7
|
*
|