@sanity/ailf 0.5.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/config/features.ts +23 -0
- package/config/models.ts +95 -0
- package/config/prompts.ts +16 -0
- package/config/rubrics.ts +225 -0
- package/config/schedules.ts +47 -0
- package/config/sinks.ts +37 -0
- package/config/sources.ts +21 -0
- package/config/thresholds.ts +61 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +171 -0
- package/dist/_vendor/ailf-core/config-helpers.js +170 -0
- package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
- package/dist/_vendor/ailf-core/env-helper.js +45 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/examples/index.js +25 -0
- package/dist/_vendor/ailf-core/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/index.js +5 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +17 -2
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
- package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +8 -2
- package/dist/_vendor/ailf-core/schemas/eval-config.js +17 -2
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +9 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +8 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -31
- package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -9
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
- package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/services/index.js +2 -1
- package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
- package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
- package/dist/_vendor/ailf-core/services/scoring.js +25 -15
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +332 -0
- package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +45 -83
- package/dist/_vendor/ailf-core/types/index.js +8 -1
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +257 -0
- package/dist/_vendor/ailf-core/types/plugin-registry.js +185 -0
- package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
- package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
- package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
- package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
- package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
- package/dist/_vendor/ailf-core/types/trace.js +18 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
- package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
- package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
- package/dist/_vendor/ailf-shared/index.d.ts +0 -1
- package/dist/_vendor/ailf-shared/index.js +0 -1
- package/dist/adapters/api-client/build-request.js +14 -13
- package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
- package/dist/adapters/config-sources/file-config-adapter.js +39 -12
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
- package/dist/adapters/config-sources/ts-config-loader.js +141 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
- package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +35 -39
- package/dist/adapters/task-sources/index.d.ts +3 -2
- package/dist/adapters/task-sources/index.js +3 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
- package/dist/adapters/task-sources/repo-schemas.js +227 -19
- package/dist/adapters/task-sources/repo-task-source.d.ts +16 -12
- package/dist/adapters/task-sources/repo-task-source.js +92 -80
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
- package/dist/adapters/task-sources/task-file-loader.js +83 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
- package/dist/adapters/task-sources/yaml-task-source.js +19 -16
- package/dist/cli.js +0 -2
- package/dist/commands/baseline.js +4 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/coverage-audit.js +9 -1
- package/dist/commands/explain-handler.js +25 -23
- package/dist/commands/fetch-docs.js +3 -2
- package/dist/commands/generate-configs.js +1 -1
- package/dist/commands/init.d.ts +6 -4
- package/dist/commands/init.js +302 -23
- package/dist/commands/interactive.js +11 -7
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +16 -6
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +4 -2
- package/dist/commands/pr-comment.js +1 -1
- package/dist/commands/publish.js +2 -2
- package/dist/commands/readiness-report.js +13 -6
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +13 -1
- package/dist/composition-root.js +99 -4
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +28 -8
- package/dist/orchestration/steps/calculate-scores-step.js +24 -11
- package/dist/orchestration/steps/fetch-docs-step.js +8 -7
- package/dist/orchestration/steps/gap-analysis-step.js +8 -7
- package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
- package/dist/orchestration/steps/generate-configs-step.js +261 -51
- package/dist/orchestration/steps/grader-consistency-step.js +7 -4
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/readiness-step.js +5 -6
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
- package/dist/orchestration/steps/run-eval-step.js +8 -7
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/cache.js +36 -8
- package/dist/pipeline/calculate-scores.d.ts +2 -4
- package/dist/pipeline/calculate-scores.js +43 -113
- package/dist/pipeline/checks.js +2 -2
- package/dist/pipeline/compare.js +8 -8
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +392 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +404 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
- package/dist/pipeline/compiler/assertion-mapper.js +175 -0
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
- package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
- package/dist/pipeline/compiler/config-loader.d.ts +56 -0
- package/dist/pipeline/compiler/config-loader.js +111 -0
- package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
- package/dist/pipeline/compiler/fixture-resolver.js +113 -0
- package/dist/pipeline/compiler/hash.d.ts +11 -0
- package/dist/pipeline/compiler/hash.js +18 -0
- package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
- package/dist/pipeline/compiler/ignore-fields.js +113 -0
- package/dist/pipeline/compiler/index.d.ts +29 -0
- package/dist/pipeline/compiler/index.js +45 -0
- package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
- package/dist/pipeline/compiler/literacy-bridge.js +172 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +15 -0
- package/dist/pipeline/compiler/mode-handlers/index.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/index.d.ts +9 -0
- package/dist/pipeline/compiler/presets/index.js +8 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +42 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.js +208 -0
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
- package/dist/pipeline/compiler/provider-assembler.js +137 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
- package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
- package/dist/pipeline/compiler/sandbox/index.js +11 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
- package/dist/pipeline/compiler/scoring-bridge.js +114 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
- package/dist/pipeline/compiler/task-graph-builder.js +291 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
- package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
- package/dist/pipeline/compiler/telemetry/index.js +19 -0
- package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
- package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
- package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
- package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
- package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
- package/dist/pipeline/compiler/variable-resolver.js +115 -0
- package/dist/pipeline/coverage-audit.d.ts +15 -5
- package/dist/pipeline/coverage-audit.js +41 -22
- package/dist/pipeline/eval-constants.d.ts +16 -6
- package/dist/pipeline/eval-constants.js +25 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -9
- package/dist/pipeline/expand-tasks.d.ts +19 -10
- package/dist/pipeline/expand-tasks.js +34 -28
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +2 -2
- package/dist/pipeline/generate-configs.d.ts +22 -4
- package/dist/pipeline/generate-configs.js +53 -24
- package/dist/pipeline/grader-api.d.ts +3 -3
- package/dist/pipeline/grader-api.js +5 -12
- package/dist/pipeline/grader-compare-runner.js +20 -27
- package/dist/pipeline/grader-comparison.d.ts +4 -8
- package/dist/pipeline/grader-comparison.js +11 -17
- package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
- package/dist/pipeline/grader-consistency-runner.js +16 -20
- package/dist/pipeline/grader-consistency.d.ts +6 -10
- package/dist/pipeline/grader-consistency.js +13 -32
- package/dist/pipeline/grader-sensitivity-runner.js +7 -5
- package/dist/pipeline/grader-sensitivity.d.ts +2 -6
- package/dist/pipeline/grader-sensitivity.js +10 -10
- package/dist/pipeline/grader-validate-runner.js +7 -5
- package/dist/pipeline/grader-validation.d.ts +2 -6
- package/dist/pipeline/grader-validation.js +14 -22
- package/dist/pipeline/map-request-to-config.js +7 -1
- package/dist/pipeline/mirror-repo-tasks.d.ts +13 -13
- package/dist/pipeline/mirror-repo-tasks.js +22 -21
- package/dist/pipeline/normalize-mode.d.ts +49 -0
- package/dist/pipeline/normalize-mode.js +64 -0
- package/dist/pipeline/plan.d.ts +5 -2
- package/dist/pipeline/plan.js +134 -78
- package/dist/pipeline/pr-comment.js +2 -0
- package/dist/pipeline/profile-resolution.d.ts +22 -14
- package/dist/pipeline/profile-resolution.js +41 -19
- package/dist/pipeline/provenance.d.ts +2 -2
- package/dist/pipeline/provenance.js +12 -17
- package/dist/pipeline/release-report.js +4 -4
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/rubric-loader.d.ts +20 -0
- package/dist/pipeline/rubric-loader.js +37 -0
- package/dist/pipeline/validate.d.ts +4 -4
- package/dist/pipeline/validate.js +64 -53
- package/dist/schedules/loader.js +18 -8
- package/dist/scripts/migrate-task-mode.d.ts +24 -0
- package/dist/scripts/migrate-task-mode.js +85 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +15 -15
- package/dist/sinks/loader.js +5 -7
- package/dist/sources.d.ts +7 -7
- package/dist/sources.js +22 -24
- package/dist/webhook/dispatch.js +2 -1
- package/package.json +15 -4
- package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
- package/tasks/literacy/frameworks.task.ts +128 -0
- package/tasks/literacy/functions.task.ts +69 -0
- package/tasks/literacy/groq.task.ts +258 -0
- package/tasks/literacy/nextjs-live.task.ts +75 -0
- package/tasks/literacy/studio-setup.task.ts +131 -0
- package/tasks/literacy/visual-editing.task.ts +146 -0
- package/config/features.yaml +0 -116
- package/config/models.yaml +0 -116
- package/config/prompts.yaml +0 -75
- package/config/rubrics.yaml +0 -81
- package/config/schedules.yaml +0 -43
- package/config/sinks.yaml +0 -54
- package/config/sources.yaml +0 -51
- package/config/thresholds.yaml +0 -49
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
|
@@ -0,0 +1,404 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* scoring-and-presets.test.ts — Tests for 4-tier scoring engine,
|
|
3
|
+
* storage schema, and plugin registry / presets.
|
|
4
|
+
*
|
|
5
|
+
* Run: npx tsx --test src/pipeline/compiler/__tests__/scoring-and-presets.test.ts
|
|
6
|
+
*/
|
|
7
|
+
import assert from "node:assert/strict";
|
|
8
|
+
import { dirname, resolve } from "node:path";
|
|
9
|
+
import { describe, it } from "node:test";
|
|
10
|
+
import { fileURLToPath } from "node:url";
|
|
11
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
12
|
+
import { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, } from "../../../_vendor/ailf-core/index.js";
|
|
13
|
+
import { CURRENT_SCHEMA_VERSION, InMemoryPluginRegistry, isSchemaVersioned, migrateDocument, } from "../../../_vendor/ailf-core/index.js";
|
|
14
|
+
import { createSanityLiteracyPreset, sanityLiteracyPreset, } from "../presets/sanity-literacy.js";
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// Helpers
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
/**
 * Builds an assertion-result fixture for the scoring tests.
 *
 * Every field has a default value; any key present in `overrides` replaces
 * the corresponding default (later spread wins).
 *
 * @param {object} [overrides] - Fields to replace on the default fixture.
 * @returns {object} A new assertion-result object.
 */
function makeAssertion(overrides) {
    return {
        pass: true,
        score: 0.8,
        reason: "Good",
        assertionType: "llm-rubric",
        dimension: "task-completion",
        latencyMs: 100,
        weight: 1.0,
        ...overrides,
    };
}
|
|
30
|
+
/**
 * Builds a dimension-score fixture for the scoring tests.
 *
 * Every field has a default value; any key present in `overrides` replaces
 * the corresponding default. A fresh `assertions` array is created on each
 * call, so fixtures never share that array.
 *
 * @param {object} [overrides] - Fields to replace on the default fixture.
 * @returns {object} A new dimension-score object.
 */
function makeDimension(overrides) {
    return {
        dimensionId: "task-completion",
        label: "Task Completion",
        score: 0.8,
        assertionCount: 2,
        passCount: 2,
        aggregation: "weighted-mean",
        assertions: [],
        ...overrides,
    };
}
|
|
42
|
+
// ---------------------------------------------------------------------------
|
|
43
|
+
// Tier 1 → Tier 2: Assertion → Dimension aggregation
|
|
44
|
+
// ---------------------------------------------------------------------------
|
|
45
|
+
// Suite: assertion results are rolled up into per-dimension scores.
describe("aggregateDimensions", () => {
    it("groups assertions by dimension", () => {
        const assertions = [
            makeAssertion({ dimension: "code-correctness", score: 0.9 }),
            makeAssertion({ dimension: "code-correctness", score: 0.7 }),
            makeAssertion({ dimension: "task-completion", score: 0.8 }),
        ];
        const dims = aggregateDimensions(assertions);
        // Two distinct dimension ids were supplied above.
        assert.equal(dims.length, 2);
        const cc = dims.find((d) => d.dimensionId === "code-correctness");
        assert.ok(cc);
        assert.equal(cc.assertionCount, 2);
    });
    it("uses weighted-mean by default", () => {
        const assertions = [
            makeAssertion({ score: 0.6, weight: 1.0 }),
            makeAssertion({ score: 0.8, weight: 3.0 }),
        ];
        const dims = aggregateDimensions(assertions);
        // Weighted mean: (0.6*1 + 0.8*3) / (1+3) = 3.0/4 = 0.75
        // Compared with a tolerance because the inputs are floats.
        assert.ok(Math.abs(dims[0].score - 0.75) < 0.01);
    });
    it("falls back to pass rate when no numeric scores", () => {
        const assertions = [
            makeAssertion({ score: null, pass: true }),
            makeAssertion({ score: null, pass: false }),
        ];
        const dims = aggregateDimensions(assertions);
        // One pass out of two assertions.
        assert.equal(dims[0].score, 0.5);
    });
    it("applies custom dimension labels", () => {
        const assertions = [makeAssertion({ dimension: "tc" })];
        const dims = aggregateDimensions(assertions, {
            dimensionLabels: { tc: "Task Completion" },
        });
        assert.equal(dims[0].label, "Task Completion");
    });
});
|
|
83
|
+
// ---------------------------------------------------------------------------
|
|
84
|
+
// Tier 2 → Tier 3: Dimension → Task scoring
|
|
85
|
+
// ---------------------------------------------------------------------------
|
|
86
|
+
// Suite: dimension scores are combined into a single task score.
describe("computeTaskScore", () => {
    it("computes weighted score from dimensions", () => {
        const dims = [
            makeDimension({ dimensionId: "tc", score: 0.8 }),
            makeDimension({ dimensionId: "cc", score: 0.6 }),
        ];
        const task = computeTaskScore(dims, {
            taskId: "test-task",
            weights: { tc: 0.6, cc: 0.4 },
        });
        // 0.8*0.6 + 0.6*0.4 = 0.48 + 0.24 = 0.72
        assert.ok(Math.abs(task.score - 0.72) < 0.01);
    });
    it("normalizes weights that don't sum to 1", () => {
        const dims = [
            makeDimension({ dimensionId: "tc", score: 1.0 }),
            makeDimension({ dimensionId: "cc", score: 0.0 }),
        ];
        const task = computeTaskScore(dims, {
            taskId: "test-task",
            weights: { tc: 2, cc: 2 },
        });
        // (1.0*2 + 0.0*2) / (2+2) = 2/4 = 0.5
        assert.ok(Math.abs(task.score - 0.5) < 0.01);
    });
    it("checks against threshold", () => {
        // The same single dimension (score 0.6) is checked against a
        // threshold below it and a threshold above it.
        const dims = [makeDimension({ dimensionId: "tc", score: 0.6 })];
        const passing = computeTaskScore(dims, {
            taskId: "t1",
            weights: { tc: 1.0 },
            threshold: 0.5,
        });
        assert.equal(passing.passesThreshold, true);
        const failing = computeTaskScore(dims, {
            taskId: "t2",
            weights: { tc: 1.0 },
            threshold: 0.7,
        });
        assert.equal(failing.passesThreshold, false);
    });
    it("records weight source", () => {
        const task = computeTaskScore([makeDimension()], {
            taskId: "t1",
            weights: { "task-completion": 1.0 },
            weightSource: "rubrics.yaml:default",
        });
        assert.equal(task.weightSource, "rubrics.yaml:default");
    });
});
|
|
135
|
+
// ---------------------------------------------------------------------------
|
|
136
|
+
// Tier 3 → Tier 4: Task → Area aggregation
|
|
137
|
+
// ---------------------------------------------------------------------------
|
|
138
|
+
// Suite: task scores are grouped into area-level scores.
describe("aggregateAreas", () => {
    it("groups tasks by area prefix", () => {
        // Two task ids start with "groq-" and one with "studio-".
        const tasks = [
            computeTaskScore([makeDimension({ score: 0.8 })], {
                taskId: "groq-basic",
                weights: { "task-completion": 1.0 },
            }),
            computeTaskScore([makeDimension({ score: 0.6 })], {
                taskId: "groq-advanced",
                weights: { "task-completion": 1.0 },
            }),
            computeTaskScore([makeDimension({ score: 0.9 })], {
                taskId: "studio-schema",
                weights: { "task-completion": 1.0 },
            }),
        ];
        const areas = aggregateAreas(tasks);
        assert.equal(areas.length, 2);
        const groq = areas.find((a) => a.areaId === "groq");
        assert.ok(groq);
        assert.equal(groq.taskCount, 2);
        assert.ok(Math.abs(groq.score - 0.7) < 0.01); // (0.8+0.6)/2
        const studio = areas.find((a) => a.areaId === "studio");
        assert.ok(studio);
        assert.equal(studio.taskCount, 1);
    });
    it("computes delta from previous scores", () => {
        const tasks = [
            computeTaskScore([makeDimension({ score: 0.8 })], {
                taskId: "groq-basic",
                weights: { "task-completion": 1.0 },
            }),
        ];
        // Previous score for the "groq" area is 0.6; the current one is 0.8.
        const areas = aggregateAreas(tasks, { groq: 0.6 });
        assert.ok(areas[0].delta !== null);
        assert.ok(Math.abs(areas[0].delta - 0.2) < 0.01);
    });
});
|
|
176
|
+
// ---------------------------------------------------------------------------
|
|
177
|
+
// Score normalization
|
|
178
|
+
// ---------------------------------------------------------------------------
|
|
179
|
+
/**
 * Score normalization: every assertion type is expected to end up on a
 * 0–1 scale regardless of the scale the raw score arrived in.
 */
describe("normalizeScore", () => {
    // Tolerance-based comparison for floating-point results.
    const isClose = (actual, expected) => Math.abs(actual - expected) < 0.01;
    it("normalizes LLM rubric scores (0-100 → 0-1)", () => {
        const normalized = normalizeScore(75, "llm-rubric");
        assert.ok(isClose(normalized, 0.75));
    });
    it("passes through already-normalized scores", () => {
        const normalized = normalizeScore(0.75, "llm-rubric");
        assert.ok(isClose(normalized, 0.75));
    });
    it("normalizes boolean assertions to 0 or 1", () => {
        for (const raw of [1, 0]) {
            assert.equal(normalizeScore(raw, "contains"), raw);
        }
    });
    it("clamps similarity scores to [0, 1]", () => {
        const aboveRange = normalizeScore(1.5, "similar");
        assert.equal(aboveRange, 1);
        const belowRange = normalizeScore(-0.1, "similar");
        assert.equal(belowRange, 0);
    });
});
|
|
195
|
+
// ---------------------------------------------------------------------------
|
|
196
|
+
// Ensemble grading
|
|
197
|
+
// ---------------------------------------------------------------------------
|
|
198
|
+
/**
 * Ensemble grading: several grader scores are combined with a named
 * strategy, and an agreement figure is reported alongside the score.
 */
describe("computeEnsembleScore", () => {
    // Tolerance-based comparison for floating-point results.
    const isClose = (actual, expected) => Math.abs(actual - expected) < 0.01;
    it("computes mean ensemble score", () => {
        const ensemble = computeEnsembleScore([0.8, 0.6, 0.7], "mean");
        assert.ok(isClose(ensemble.score, 0.7));
        assert.ok(ensemble.agreement > 0);
    });
    it("computes median ensemble score", () => {
        const ensemble = computeEnsembleScore([0.9, 0.5, 0.7], "median");
        assert.ok(isClose(ensemble.score, 0.7));
    });
    it("computes max ensemble score", () => {
        const ensemble = computeEnsembleScore([0.9, 0.5, 0.7], "max");
        assert.ok(isClose(ensemble.score, 0.9));
    });
    it("agreement is 1 for identical scores", () => {
        // No strategy argument: exercises the function's default strategy.
        const ensemble = computeEnsembleScore([0.8, 0.8, 0.8]);
        assert.ok(isClose(ensemble.agreement, 1.0));
    });
    it("agreement decreases with divergent scores", () => {
        const ensemble = computeEnsembleScore([0.0, 1.0]);
        assert.ok(ensemble.agreement < 0.6);
    });
});
|
|
221
|
+
// ---------------------------------------------------------------------------
|
|
222
|
+
// Storage schema
|
|
223
|
+
// ---------------------------------------------------------------------------
|
|
224
|
+
/**
 * Storage schema: version constant, version detection and migration of
 * documents that are already at the current version.
 */
describe("storage schema", () => {
    it("CURRENT_SCHEMA_VERSION is 1", () => {
        assert.equal(CURRENT_SCHEMA_VERSION, 1);
    });
    it("isSchemaVersioned detects versioned docs", () => {
        const versionedDoc = { schemaVersion: 1 };
        assert.equal(isSchemaVersioned(versionedDoc), true);
        // Neither an empty object nor null carries a schema version.
        for (const unversioned of [{}, null]) {
            assert.equal(isSchemaVersioned(unversioned), false);
        }
    });
    it("migrateDocument is no-op for current version", () => {
        const result = migrateDocument({ schemaVersion: 1, _type: "ailf.run" });
        assert.equal(result.schemaVersion, 1);
    });
});
|
|
239
|
+
// ---------------------------------------------------------------------------
|
|
240
|
+
// Plugin registry
|
|
241
|
+
// ---------------------------------------------------------------------------
|
|
242
|
+
/**
 * Plugin registry: modes, assertions and presets can be registered on an
 * in-memory registry and read back through its getters.
 */
describe("InMemoryPluginRegistry", () => {
    it("registers and retrieves modes", () => {
        const registry = new InMemoryPluginRegistry();
        registry.registerMode({
            id: "custom",
            label: "Custom Mode",
            validProviderPatterns: [".*"],
            rubricTemplateIds: [],
            handlerModule: "./custom.js",
        });
        assert.equal(registry.getModes().length, 1);
        assert.equal(registry.getMode("custom")?.label, "Custom Mode");
    });
    it("registers and retrieves assertions", () => {
        const registry = new InMemoryPluginRegistry();
        registry.registerAssertion({
            type: "api-match",
            label: "API Match",
            compatibleModes: ["custom"],
            handlerModule: "./api-match.js",
        });
        assert.equal(registry.getAssertions().length, 1);
    });
    it("registers a complete preset with mode base", () => {
        const registry = new InMemoryPluginRegistry();
        // Must register mode base first
        const { createLiteracyModeBase } = require("../mode-bases/literacy.js");
        registry.registerModeBase(createLiteracyModeBase());
        registry.registerPreset(sanityLiteracyPreset);
        // Mode + rubrics from mode base, domain config from preset
        assert.ok(registry.getMode("literacy"));
        assert.ok(registry.getRubricTemplates().length > 0);
        // assert.equal (rather than assert.ok on a === comparison) so that a
        // failure reports the actual preset count.
        assert.equal(registry.getPresets().length, 1);
    });
});
|
|
277
|
+
// ---------------------------------------------------------------------------
|
|
278
|
+
// sanity-literacy preset
|
|
279
|
+
// ---------------------------------------------------------------------------
|
|
280
|
+
/**
 * sanity-literacy preset: the exported preset object carries domain
 * configuration only (sources, features, fixture resolvers, doc fetcher).
 */
describe("sanityLiteracyPreset", () => {
    it("has correct manifest", () => {
        const preset = sanityLiteracyPreset;
        assert.equal(preset.name, "sanity-literacy");
        assert.equal(preset.manifest.pluginApiVersion, 1);
    });
    it("targets literacy mode base", () => {
        const preset = sanityLiteracyPreset;
        assert.equal(preset.mode, "literacy");
    });
    it("does not bundle assertions (now framework built-ins)", () => {
        const preset = sanityLiteracyPreset;
        assert.equal(preset.assertions, undefined);
    });
    it("does not bundle rubrics/scoring/prompts (now in literacy mode base)", () => {
        // Evaluation methodology moved to mode-bases/literacy.ts
        const methodologyKeys = ["rubricTemplates", "scoringProfiles", "promptTemplates"];
        for (const key of methodologyKeys) {
            assert.equal(sanityLiteracyPreset[key], undefined);
        }
    });
    it("includes sanity:// fixture resolver", () => {
        const hasSanityScheme = sanityLiteracyPreset.fixtureResolvers?.some((resolver) => resolver.scheme === "sanity://");
        assert.ok(hasSanityScheme);
    });
    it("includes 3 source definitions", () => {
        const sourceDefs = sanityLiteracyPreset.sourceDefs;
        assert.ok(sourceDefs);
        assert.equal(sourceDefs.length, 3);
        const sourceNames = sourceDefs.map((source) => source.name);
        for (const expectedName of ["production", "branch", "local"]) {
            assert.ok(sourceNames.includes(expectedName));
        }
    });
    it("production source has correct baseUrl", () => {
        const productionSource = sanityLiteracyPreset.sourceDefs.find((source) => source.name === "production");
        assert.ok(productionSource);
        assert.equal(productionSource.baseUrl, "https://www.sanity.io/docs");
    });
    it("includes feature registry with all features", () => {
        const featureDefs = sanityLiteracyPreset.featureDefs;
        assert.ok(featureDefs);
        assert.equal(featureDefs.features.length, 14);
        const featureIds = featureDefs.features.map((feature) => feature.id);
        for (const expectedId of ["groq", "visual-editing", "portable-text", "ai-assist"]) {
            assert.ok(featureIds.includes(expectedId));
        }
    });
    it("includes a docFetcher factory", () => {
        assert.equal(typeof sanityLiteracyPreset.docFetcher, "function");
        const docFetcher = sanityLiteracyPreset.docFetcher();
        assert.ok(docFetcher);
        assert.equal(typeof docFetcher.fetch, "function");
    });
});
|
|
331
|
+
// ---------------------------------------------------------------------------
|
|
332
|
+
// createSanityLiteracyPreset factory
|
|
333
|
+
// ---------------------------------------------------------------------------
|
|
334
|
+
/**
 * createSanityLiteracyPreset factory: the produced preset holds domain
 * config only, and registering it after the literacy mode base exposes
 * every extension point on the registry.
 */
describe("createSanityLiteracyPreset", () => {
    it("returns a domain-only preset targeting literacy mode", () => {
        const preset = createSanityLiteracyPreset({ rootDir: "/tmp/test" });
        assert.equal(preset.name, "sanity-literacy");
        assert.equal(preset.mode, "literacy");
        // Domain config present
        for (const domainKey of ["fixtureResolvers", "docFetcher", "sourceDefs", "featureDefs"]) {
            assert.ok(preset[domainKey]);
        }
        // Methodology inherited from mode base, not on preset
        for (const methodologyKey of ["rubricTemplates", "scoringProfiles", "promptTemplates"]) {
            assert.equal(preset[methodologyKey], undefined);
        }
    });
    it("registers all extension points via mode base + domain config", () => {
        const registry = new InMemoryPluginRegistry();
        // Must register mode base first (composition root does this)
        const { createLiteracyModeBase } = require("../mode-bases/literacy.js");
        registry.registerModeBase(createLiteracyModeBase());
        registry.registerPreset(createSanityLiteracyPreset({ rootDir: "/tmp/test" }));
        // Mode from mode base
        assert.ok(registry.getMode("literacy"));
        // Rubrics, scoring, prompts inherited from mode base
        const rubricTemplates = registry.getRubricTemplates();
        assert.equal(rubricTemplates.length, 3);
        const promptTemplateNames = Object.keys(registry.getPromptTemplates());
        assert.equal(promptTemplateNames.length, 3);
        const scoringProfileNames = Object.keys(registry.getScoringProfiles());
        assert.equal(scoringProfileNames.length, 2);
        // Domain config from preset
        assert.ok(registry.getDocFetcherFactory());
        assert.equal(registry.getSourceDefs().length, 3);
        assert.ok(registry.getFeatureDefs());
        assert.equal(registry.getFeatureDefs().features.length, 14);
    });
});
|
|
369
|
+
// ---------------------------------------------------------------------------
|
|
370
|
+
// Preset is single source of truth for sources and features
|
|
371
|
+
// ---------------------------------------------------------------------------
|
|
372
|
+
/**
 * The preset is the single source of truth for sources and features: the
 * config/ files are expected to exist but export empty collections.
 */
describe("preset is single source of truth for Sanity config", () => {
    // Loads one config file by name, resolving the root four directories
    // above this test file. The loader is imported lazily on every call.
    const loadRootConfig = async (configName) => {
        const { tryLoadConfigFile } = await import("../../compiler/config-loader.js");
        const rootDir = resolve(__dirname, "..", "..", "..", "..");
        return tryLoadConfigFile(configName, rootDir);
    };
    it("config/sources.ts exports an empty array", async () => {
        const loaded = await loadRootConfig("sources");
        assert.ok(loaded, "config/sources.ts should exist");
        const configuredSources = loaded.data;
        assert.ok(Array.isArray(configuredSources), "should export an array");
        assert.equal(configuredSources.length, 0, "config/sources should be empty (preset provides sources)");
    });
    it("config/features.ts exports an empty features array", async () => {
        const loaded = await loadRootConfig("features");
        assert.ok(loaded, "config/features.ts should exist");
        assert.ok(Array.isArray(loaded.data.features), "should have a features array");
        assert.equal(loaded.data.features.length, 0, "config/features should be empty (preset provides features)");
    });
    it("preset contains all 3 source entries", () => {
        const sourceDefs = sanityLiteracyPreset.sourceDefs;
        assert.equal(sourceDefs.length, 3);
        const sortedNames = sourceDefs.map((source) => source.name).sort();
        assert.deepEqual(sortedNames, ["branch", "local", "production"]);
    });
    it("preset contains all 14 feature entries", () => {
        const allFeatures = sanityLiteracyPreset.featureDefs.features;
        assert.equal(allFeatures.length, 14);
        const countByStatus = (status) => allFeatures.filter((feature) => feature.status === status).length;
        const coveredCount = countByStatus("covered");
        const uncoveredCount = countByStatus("uncovered");
        assert.equal(coveredCount, 6, "should have 6 covered features");
        assert.equal(uncoveredCount, 8, "should have 8 uncovered features");
    });
});
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* scoring-bridge.test.ts — Tests for the 4-tier scoring engine bridge.
|
|
3
|
+
*
|
|
4
|
+
* Verifies that `scoreTestGroup` produces the same 0–100 output as the
|
|
5
|
+
* legacy `accumulateDimensions → averageDimensions → weightedComposite`
|
|
6
|
+
* chain when given identical inputs.
|
|
7
|
+
*
|
|
8
|
+
* Run: npx tsx --test src/pipeline/compiler/__tests__/scoring-bridge.test.ts
|
|
9
|
+
*/
|
|
10
|
+
export {};
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* scoring-bridge.test.ts — Tests for the 4-tier scoring engine bridge.
|
|
3
|
+
*
|
|
4
|
+
* Verifies that `scoreTestGroup` produces the same 0–100 output as the
|
|
5
|
+
* legacy `accumulateDimensions → averageDimensions → weightedComposite`
|
|
6
|
+
* chain when given identical inputs.
|
|
7
|
+
*
|
|
8
|
+
* Run: npx tsx --test src/pipeline/compiler/__tests__/scoring-bridge.test.ts
|
|
9
|
+
*/
|
|
10
|
+
import assert from "node:assert/strict";
|
|
11
|
+
import { describe, it } from "node:test";
|
|
12
|
+
import { scoreTestGroup } from "../scoring-bridge.js";
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
// Helpers
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
/**
 * Builds a mock test result carrying one `llm-rubric` component per
 * supplied dimension score.
 *
 * Dimension scores are given on a 0–100 scale; each component stores the
 * raw value inside its JSON `reason` and the value divided by 100 as
 * `score`. Components are emitted in a fixed order: task-completion,
 * code-correctness, doc-coverage. Dimensions left `undefined` are omitted.
 *
 * @param {object} [overrides] - Optional overrides.
 * @param {{taskCompletion?: number, codeCorrectness?: number, docCoverage?: number}} [overrides.dimensions]
 *   Dimension scores on a 0–100 scale.
 * @param {number} [overrides.cost] - Cost of the test; defaults to 0.01.
 * @param {string} [overrides.description] - Description; defaults to "test".
 * @param {object} [overrides.vars] - Vars; defaults to `{ task: "test", docs: "" }`.
 * @returns {object} Result with `cost`, `description`, `gradingResult`,
 *   `response` and `vars` fields.
 */
function makeTestResult(overrides) {
    // Maps each override key to the dimension id written into assertion
    // metadata. The array order fixes the order of the emitted components.
    const rubricDimensions = [
        ["taskCompletion", "task-completion"],
        ["codeCorrectness", "code-correctness"],
        ["docCoverage", "doc-coverage"],
    ];
    const dims = overrides?.dimensions ?? {};
    const componentResults = [];
    for (const [key, dimension] of rubricDimensions) {
        const rawScore = dims[key];
        // Only `undefined` means "absent"; a score of 0 is still emitted.
        if (rawScore === undefined) {
            continue;
        }
        componentResults.push({
            assertion: {
                type: "llm-rubric",
                metadata: { dimension },
            },
            pass: true,
            reason: JSON.stringify({ score: rawScore }),
            score: rawScore / 100,
        });
    }
    return {
        cost: overrides?.cost ?? 0.01,
        description: overrides?.description ?? "test",
        gradingResult: {
            componentResults,
            pass: true,
        },
        response: { output: "mock output" },
        vars: overrides?.vars ?? { task: "test", docs: "" },
    };
}
|
|
63
|
+
/**
 * Scoring profile weighting all three rubric dimensions.
 * Keys are dimension ids; values are the weights passed to scoreTestGroup.
 */
const DEFAULT_PROFILE = {
    "code-correctness": 0.35,
    "doc-coverage": 0.25,
    "task-completion": 0.4,
};
/**
 * Scoring profile that weights only output-related dimensions; it has no
 * entry for doc-coverage.
 */
const OUTPUT_ONLY_PROFILE = {
    "code-correctness": 0.55,
    "task-completion": 0.45,
};
|
|
72
|
+
// ---------------------------------------------------------------------------
|
|
73
|
+
// Tests
|
|
74
|
+
// ---------------------------------------------------------------------------
|
|
75
|
+
/**
 * Basic scoring: empty input, a single test, averaging over several tests
 * and cost accumulation.
 */
describe("scoreTestGroup — basic scoring", () => {
    it("returns zeroes for empty test array", () => {
        const { composite, totalCost, dimensions } = scoreTestGroup([], DEFAULT_PROFILE);
        assert.equal(composite, 0);
        assert.equal(totalCost, 0);
        assert.deepEqual(dimensions, {});
    });
    it("scores a single test with all dimensions", () => {
        const singleTest = makeTestResult({
            dimensions: {
                taskCompletion: 80,
                codeCorrectness: 70,
                docCoverage: 60,
            },
        });
        const scored = scoreTestGroup([singleTest], DEFAULT_PROFILE);
        // Expected: 80*0.4 + 70*0.35 + 60*0.25 = 32 + 24.5 + 15 = 71.5 → 72
        assert.equal(scored.dimensions.taskCompletion, 80);
        assert.equal(scored.dimensions.codeCorrectness, 70);
        assert.equal(scored.dimensions.docCoverage, 60);
        assert.equal(scored.composite, 72);
    });
    it("averages across multiple tests", () => {
        const firstTest = makeTestResult({
            dimensions: { taskCompletion: 80, codeCorrectness: 60 },
        });
        const secondTest = makeTestResult({
            dimensions: { taskCompletion: 60, codeCorrectness: 80 },
        });
        const scored = scoreTestGroup([firstTest, secondTest], OUTPUT_ONLY_PROFILE);
        // taskCompletion avg = 70, codeCorrectness avg = 70
        // Expected: 70*0.45 + 70*0.55 = 31.5 + 38.5 = 70
        assert.equal(scored.dimensions.taskCompletion, 70);
        assert.equal(scored.dimensions.codeCorrectness, 70);
        assert.equal(scored.composite, 70);
    });
    it("accumulates cost across tests", () => {
        const pricedTests = [
            makeTestResult({ cost: 0.05, dimensions: { taskCompletion: 80 } }),
            makeTestResult({ cost: 0.03, dimensions: { taskCompletion: 70 } }),
        ];
        const { totalCost } = scoreTestGroup(pricedTests, DEFAULT_PROFILE);
        // Tolerance comparison: 0.05 + 0.03 is not exactly 0.08 in floating point.
        assert.ok(Math.abs(totalCost - 0.08) < 0.001);
    });
});
|
|
124
|
+
/**
 * Profile handling: only dimensions named in the profile contribute to the
 * composite, while other dimensions are still reported.
 */
describe("scoreTestGroup — profile handling", () => {
    it("uses output-only profile (excludes doc-coverage)", () => {
        const fullyGradedTest = makeTestResult({
            dimensions: {
                taskCompletion: 80,
                codeCorrectness: 60,
                docCoverage: 100,
            },
        });
        const scored = scoreTestGroup([fullyGradedTest], OUTPUT_ONLY_PROFILE);
        // doc-coverage should be present in dimensions but NOT affect composite
        // Expected: 80*0.45 + 60*0.55 = 36 + 33 = 69
        assert.equal(scored.dimensions.docCoverage, 100);
        assert.equal(scored.composite, 69);
    });
    it("handles profile with only one dimension", () => {
        const taskOnlyProfile = { "task-completion": 1.0 };
        const gradedTest = makeTestResult({
            dimensions: { taskCompletion: 90, codeCorrectness: 50 },
        });
        const { composite } = scoreTestGroup([gradedTest], taskOnlyProfile);
        // Only taskCompletion should count
        assert.equal(composite, 90);
    });
});
|
|
152
|
+
/**
 * Edge cases: results without any llm-rubric component, and access to the
 * raw per-dimension score objects.
 */
describe("scoreTestGroup — edge cases", () => {
    it("handles tests with no rubric components", () => {
        const javascriptOnlyComponent = { assertion: { type: "javascript" }, pass: true, score: 1 };
        const rubriclessTest = {
            cost: 0.01,
            description: "no rubrics",
            gradingResult: {
                componentResults: [javascriptOnlyComponent],
                pass: true,
            },
            response: { output: "mock" },
            vars: { task: "test", docs: "" },
        };
        const scored = scoreTestGroup([rubriclessTest], DEFAULT_PROFILE);
        // No llm-rubric components → 0 composite
        assert.equal(scored.composite, 0);
        assert.equal(scored.totalCost, 0.01);
    });
    it("provides raw DimensionScore objects for advanced consumers", () => {
        const gradedTest = makeTestResult({
            dimensions: { taskCompletion: 80, codeCorrectness: 60 },
        });
        const { rawDimensions } = scoreTestGroup([gradedTest], DEFAULT_PROFILE);
        assert.ok(rawDimensions.length >= 2);
        const taskCompletionDim = rawDimensions.find((dim) => dim.dimensionId === "task-completion");
        assert.ok(taskCompletionDim);
        // Raw dimension scores are on the 0–1 scale
        assert.ok(taskCompletionDim.score >= 0 && taskCompletionDim.score <= 1);
        assert.equal(taskCompletionDim.assertionCount, 1);
    });
});
|