@sanity/ailf 0.5.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/config/features.ts +23 -0
- package/config/models.ts +95 -0
- package/config/prompts.ts +16 -0
- package/config/rubrics.ts +225 -0
- package/config/schedules.ts +47 -0
- package/config/sinks.ts +37 -0
- package/config/sources.ts +21 -0
- package/config/thresholds.ts +61 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +171 -0
- package/dist/_vendor/ailf-core/config-helpers.js +170 -0
- package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
- package/dist/_vendor/ailf-core/env-helper.js +45 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/examples/index.js +25 -0
- package/dist/_vendor/ailf-core/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/index.js +5 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +17 -2
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
- package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +8 -2
- package/dist/_vendor/ailf-core/schemas/eval-config.js +17 -2
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +9 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +8 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -31
- package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -9
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
- package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/services/index.js +2 -1
- package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
- package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
- package/dist/_vendor/ailf-core/services/scoring.js +25 -15
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +332 -0
- package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +45 -83
- package/dist/_vendor/ailf-core/types/index.js +8 -1
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +257 -0
- package/dist/_vendor/ailf-core/types/plugin-registry.js +185 -0
- package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
- package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
- package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
- package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
- package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
- package/dist/_vendor/ailf-core/types/trace.js +18 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
- package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
- package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
- package/dist/_vendor/ailf-shared/index.d.ts +0 -1
- package/dist/_vendor/ailf-shared/index.js +0 -1
- package/dist/adapters/api-client/build-request.js +14 -13
- package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
- package/dist/adapters/config-sources/file-config-adapter.js +39 -12
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
- package/dist/adapters/config-sources/ts-config-loader.js +141 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
- package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +35 -39
- package/dist/adapters/task-sources/index.d.ts +3 -2
- package/dist/adapters/task-sources/index.js +3 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
- package/dist/adapters/task-sources/repo-schemas.js +227 -19
- package/dist/adapters/task-sources/repo-task-source.d.ts +16 -12
- package/dist/adapters/task-sources/repo-task-source.js +92 -80
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
- package/dist/adapters/task-sources/task-file-loader.js +83 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
- package/dist/adapters/task-sources/yaml-task-source.js +19 -16
- package/dist/cli.js +0 -2
- package/dist/commands/baseline.js +4 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/coverage-audit.js +9 -1
- package/dist/commands/explain-handler.js +25 -23
- package/dist/commands/fetch-docs.js +3 -2
- package/dist/commands/generate-configs.js +1 -1
- package/dist/commands/init.d.ts +6 -4
- package/dist/commands/init.js +302 -23
- package/dist/commands/interactive.js +11 -7
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +16 -6
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +4 -2
- package/dist/commands/pr-comment.js +1 -1
- package/dist/commands/publish.js +2 -2
- package/dist/commands/readiness-report.js +13 -6
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +13 -1
- package/dist/composition-root.js +99 -4
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +28 -8
- package/dist/orchestration/steps/calculate-scores-step.js +24 -11
- package/dist/orchestration/steps/fetch-docs-step.js +8 -7
- package/dist/orchestration/steps/gap-analysis-step.js +8 -7
- package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
- package/dist/orchestration/steps/generate-configs-step.js +261 -51
- package/dist/orchestration/steps/grader-consistency-step.js +7 -4
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/readiness-step.js +5 -6
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
- package/dist/orchestration/steps/run-eval-step.js +8 -7
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/cache.js +36 -8
- package/dist/pipeline/calculate-scores.d.ts +2 -4
- package/dist/pipeline/calculate-scores.js +43 -113
- package/dist/pipeline/checks.js +2 -2
- package/dist/pipeline/compare.js +8 -8
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +392 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +404 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
- package/dist/pipeline/compiler/assertion-mapper.js +175 -0
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
- package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
- package/dist/pipeline/compiler/config-loader.d.ts +56 -0
- package/dist/pipeline/compiler/config-loader.js +111 -0
- package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
- package/dist/pipeline/compiler/fixture-resolver.js +113 -0
- package/dist/pipeline/compiler/hash.d.ts +11 -0
- package/dist/pipeline/compiler/hash.js +18 -0
- package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
- package/dist/pipeline/compiler/ignore-fields.js +113 -0
- package/dist/pipeline/compiler/index.d.ts +29 -0
- package/dist/pipeline/compiler/index.js +45 -0
- package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
- package/dist/pipeline/compiler/literacy-bridge.js +172 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +15 -0
- package/dist/pipeline/compiler/mode-handlers/index.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/index.d.ts +9 -0
- package/dist/pipeline/compiler/presets/index.js +8 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +42 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.js +208 -0
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
- package/dist/pipeline/compiler/provider-assembler.js +137 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
- package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
- package/dist/pipeline/compiler/sandbox/index.js +11 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
- package/dist/pipeline/compiler/scoring-bridge.js +114 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
- package/dist/pipeline/compiler/task-graph-builder.js +291 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
- package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
- package/dist/pipeline/compiler/telemetry/index.js +19 -0
- package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
- package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
- package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
- package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
- package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
- package/dist/pipeline/compiler/variable-resolver.js +115 -0
- package/dist/pipeline/coverage-audit.d.ts +15 -5
- package/dist/pipeline/coverage-audit.js +41 -22
- package/dist/pipeline/eval-constants.d.ts +16 -6
- package/dist/pipeline/eval-constants.js +25 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -9
- package/dist/pipeline/expand-tasks.d.ts +19 -10
- package/dist/pipeline/expand-tasks.js +34 -28
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +2 -2
- package/dist/pipeline/generate-configs.d.ts +22 -4
- package/dist/pipeline/generate-configs.js +53 -24
- package/dist/pipeline/grader-api.d.ts +3 -3
- package/dist/pipeline/grader-api.js +5 -12
- package/dist/pipeline/grader-compare-runner.js +20 -27
- package/dist/pipeline/grader-comparison.d.ts +4 -8
- package/dist/pipeline/grader-comparison.js +11 -17
- package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
- package/dist/pipeline/grader-consistency-runner.js +16 -20
- package/dist/pipeline/grader-consistency.d.ts +6 -10
- package/dist/pipeline/grader-consistency.js +13 -32
- package/dist/pipeline/grader-sensitivity-runner.js +7 -5
- package/dist/pipeline/grader-sensitivity.d.ts +2 -6
- package/dist/pipeline/grader-sensitivity.js +10 -10
- package/dist/pipeline/grader-validate-runner.js +7 -5
- package/dist/pipeline/grader-validation.d.ts +2 -6
- package/dist/pipeline/grader-validation.js +14 -22
- package/dist/pipeline/map-request-to-config.js +7 -1
- package/dist/pipeline/mirror-repo-tasks.d.ts +13 -13
- package/dist/pipeline/mirror-repo-tasks.js +22 -21
- package/dist/pipeline/normalize-mode.d.ts +49 -0
- package/dist/pipeline/normalize-mode.js +64 -0
- package/dist/pipeline/plan.d.ts +5 -2
- package/dist/pipeline/plan.js +134 -78
- package/dist/pipeline/pr-comment.js +2 -0
- package/dist/pipeline/profile-resolution.d.ts +22 -14
- package/dist/pipeline/profile-resolution.js +41 -19
- package/dist/pipeline/provenance.d.ts +2 -2
- package/dist/pipeline/provenance.js +12 -17
- package/dist/pipeline/release-report.js +4 -4
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/rubric-loader.d.ts +20 -0
- package/dist/pipeline/rubric-loader.js +37 -0
- package/dist/pipeline/validate.d.ts +4 -4
- package/dist/pipeline/validate.js +64 -53
- package/dist/schedules/loader.js +18 -8
- package/dist/scripts/migrate-task-mode.d.ts +24 -0
- package/dist/scripts/migrate-task-mode.js +85 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +15 -15
- package/dist/sinks/loader.js +5 -7
- package/dist/sources.d.ts +7 -7
- package/dist/sources.js +22 -24
- package/dist/webhook/dispatch.js +2 -1
- package/package.json +15 -4
- package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
- package/tasks/literacy/frameworks.task.ts +128 -0
- package/tasks/literacy/functions.task.ts +69 -0
- package/tasks/literacy/groq.task.ts +258 -0
- package/tasks/literacy/nextjs-live.task.ts +75 -0
- package/tasks/literacy/studio-setup.task.ts +131 -0
- package/tasks/literacy/visual-editing.task.ts +146 -0
- package/config/features.yaml +0 -116
- package/config/models.yaml +0 -116
- package/config/prompts.yaml +0 -75
- package/config/rubrics.yaml +0 -81
- package/config/schedules.yaml +0 -43
- package/config/sinks.yaml +0 -54
- package/config/sources.yaml +0 -51
- package/config/thresholds.yaml +0 -49
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
|
@@ -47,7 +47,7 @@ export type WeightProfile = z.infer<typeof WeightProfileSchema>;
|
|
|
47
47
|
*/
|
|
48
48
|
export declare const RubricConfigSchema: z.ZodObject<{
|
|
49
49
|
footer: z.ZodString;
|
|
50
|
-
"mode-profiles": z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodRecord<z.ZodString, z.ZodString>>>;
|
|
50
|
+
"mode-profiles": z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnion<readonly [z.ZodString, z.ZodRecord<z.ZodString, z.ZodString>]>>>>;
|
|
51
51
|
profiles: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodRecord<z.ZodString, z.ZodNumber>>>;
|
|
52
52
|
templates: z.ZodRecord<z.ZodString, z.ZodObject<{
|
|
53
53
|
criteria_label: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
@@ -67,19 +67,18 @@ export declare const FeatureSchema: z.ZodObject<{
|
|
|
67
67
|
id: z.ZodString;
|
|
68
68
|
name: z.ZodString;
|
|
69
69
|
priority: z.ZodEnum<{
|
|
70
|
+
critical: "critical";
|
|
70
71
|
high: "high";
|
|
71
|
-
low: "low";
|
|
72
72
|
medium: "medium";
|
|
73
|
-
|
|
73
|
+
low: "low";
|
|
74
74
|
}>;
|
|
75
75
|
sections: z.ZodArray<z.ZodString>;
|
|
76
76
|
status: z.ZodEnum<{
|
|
77
77
|
covered: "covered";
|
|
78
|
-
"out-of-scope": "out-of-scope";
|
|
79
|
-
planned: "planned";
|
|
80
78
|
uncovered: "uncovered";
|
|
79
|
+
planned: "planned";
|
|
80
|
+
"out-of-scope": "out-of-scope";
|
|
81
81
|
}>;
|
|
82
|
-
taskCount: z.ZodOptional<z.ZodNumber>;
|
|
83
82
|
}, z.core.$strip>;
|
|
84
83
|
/** Inferred TypeScript type for a product feature. */
|
|
85
84
|
export type Feature = z.infer<typeof FeatureSchema>;
|
|
@@ -92,19 +91,18 @@ export declare const FeatureRegistrySchema: z.ZodObject<{
|
|
|
92
91
|
id: z.ZodString;
|
|
93
92
|
name: z.ZodString;
|
|
94
93
|
priority: z.ZodEnum<{
|
|
94
|
+
critical: "critical";
|
|
95
95
|
high: "high";
|
|
96
|
-
low: "low";
|
|
97
96
|
medium: "medium";
|
|
98
|
-
|
|
97
|
+
low: "low";
|
|
99
98
|
}>;
|
|
100
99
|
sections: z.ZodArray<z.ZodString>;
|
|
101
100
|
status: z.ZodEnum<{
|
|
102
101
|
covered: "covered";
|
|
103
|
-
"out-of-scope": "out-of-scope";
|
|
104
|
-
planned: "planned";
|
|
105
102
|
uncovered: "uncovered";
|
|
103
|
+
planned: "planned";
|
|
104
|
+
"out-of-scope": "out-of-scope";
|
|
106
105
|
}>;
|
|
107
|
-
taskCount: z.ZodOptional<z.ZodNumber>;
|
|
108
106
|
}, z.core.$strip>>;
|
|
109
107
|
}, z.core.$strip>;
|
|
110
108
|
/** Inferred TypeScript type for the feature registry. */
|
|
@@ -440,14 +438,11 @@ export declare const TaskFileSchema: z.ZodArray<z.ZodUnion<readonly [z.ZodObject
|
|
|
440
438
|
export type TaskFile = z.infer<typeof TaskFileSchema>;
|
|
441
439
|
/**
|
|
442
440
|
* Schema for per-dimension threshold values.
|
|
441
|
+
* Uses a dynamic record to support all evaluation modes, not just literacy.
|
|
443
442
|
* Keys use kebab-case to match YAML convention; the threshold engine
|
|
444
443
|
* normalizes to camelCase for comparison against FeatureScore fields.
|
|
445
444
|
*/
|
|
446
|
-
export declare const ThresholdDimensionsSchema: z.ZodObject<{
|
|
447
|
-
"code-correctness": z.ZodOptional<z.ZodNumber>;
|
|
448
|
-
"doc-coverage": z.ZodOptional<z.ZodNumber>;
|
|
449
|
-
"task-completion": z.ZodOptional<z.ZodNumber>;
|
|
450
|
-
}, z.core.$strip>;
|
|
445
|
+
export declare const ThresholdDimensionsSchema: z.ZodRecord<z.ZodString, z.ZodNumber>;
|
|
451
446
|
/** Inferred TypeScript type for threshold dimension overrides. */
|
|
452
447
|
export type ThresholdDimensions = z.infer<typeof ThresholdDimensionsSchema>;
|
|
453
448
|
/**
|
|
@@ -457,11 +452,7 @@ export type ThresholdDimensions = z.infer<typeof ThresholdDimensionsSchema>;
|
|
|
457
452
|
export declare const ThresholdDefaultsSchema: z.ZodObject<{
|
|
458
453
|
ceiling: z.ZodOptional<z.ZodNumber>;
|
|
459
454
|
composite: z.ZodNumber;
|
|
460
|
-
dimensions: z.ZodOptional<z.ZodObject<{
|
|
461
|
-
"code-correctness": z.ZodOptional<z.ZodNumber>;
|
|
462
|
-
"doc-coverage": z.ZodOptional<z.ZodNumber>;
|
|
463
|
-
"task-completion": z.ZodOptional<z.ZodNumber>;
|
|
464
|
-
}, z.core.$strip>>;
|
|
455
|
+
dimensions: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
|
|
465
456
|
"doc-lift": z.ZodOptional<z.ZodNumber>;
|
|
466
457
|
}, z.core.$strip>;
|
|
467
458
|
/** Inferred TypeScript type for threshold defaults. */
|
|
@@ -501,21 +492,13 @@ export declare const ThresholdConfigSchema: z.ZodObject<{
|
|
|
501
492
|
areas: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodObject<{
|
|
502
493
|
ceiling: z.ZodOptional<z.ZodOptional<z.ZodNumber>>;
|
|
503
494
|
composite: z.ZodOptional<z.ZodNumber>;
|
|
504
|
-
dimensions: z.ZodOptional<z.ZodOptional<z.ZodObject<{
|
|
505
|
-
"code-correctness": z.ZodOptional<z.ZodNumber>;
|
|
506
|
-
"doc-coverage": z.ZodOptional<z.ZodNumber>;
|
|
507
|
-
"task-completion": z.ZodOptional<z.ZodNumber>;
|
|
508
|
-
}, z.core.$strip>>>;
|
|
495
|
+
dimensions: z.ZodOptional<z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>>;
|
|
509
496
|
"doc-lift": z.ZodOptional<z.ZodOptional<z.ZodNumber>>;
|
|
510
497
|
}, z.core.$strip>>>;
|
|
511
498
|
defaults: z.ZodObject<{
|
|
512
499
|
ceiling: z.ZodOptional<z.ZodNumber>;
|
|
513
500
|
composite: z.ZodNumber;
|
|
514
|
-
dimensions: z.ZodOptional<z.ZodObject<{
|
|
515
|
-
"code-correctness": z.ZodOptional<z.ZodNumber>;
|
|
516
|
-
"doc-coverage": z.ZodOptional<z.ZodNumber>;
|
|
517
|
-
"task-completion": z.ZodOptional<z.ZodNumber>;
|
|
518
|
-
}, z.core.$strip>>;
|
|
501
|
+
dimensions: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
|
|
519
502
|
"doc-lift": z.ZodOptional<z.ZodNumber>;
|
|
520
503
|
}, z.core.$strip>;
|
|
521
504
|
regression: z.ZodOptional<z.ZodObject<{
|
|
@@ -43,10 +43,22 @@ const WeightProfileSchema = z
|
|
|
43
43
|
return Math.abs(sum - 1.0) < 0.001;
|
|
44
44
|
}, { message: "profile weights must sum to 1.0" });
|
|
45
45
|
/**
|
|
46
|
-
* Mode-to-profile bindings — maps (mode,
|
|
47
|
-
*
|
|
46
|
+
* Mode-to-profile bindings — maps (mode, perspective) pairs to profile names.
|
|
47
|
+
*
|
|
48
|
+
* Flat form (most modes):
|
|
49
|
+
* { "mcp-server": { gold: "mcp-behavior" } }
|
|
50
|
+
*
|
|
51
|
+
* Nested form (literacy mode with variant sub-keys):
|
|
52
|
+
* { literacy: { baseline: { gold: "default", baseline: "output-only" }, agentic: { gold: "default" } } }
|
|
53
|
+
*
|
|
54
|
+
* The nested form adds a variant level between mode and perspective,
|
|
55
|
+
* allowing a single canonical mode to host multiple scoring variants.
|
|
48
56
|
*/
|
|
49
|
-
const
|
|
57
|
+
const ModeProfileEntrySchema = z.union([
|
|
58
|
+
z.string(),
|
|
59
|
+
z.record(z.string(), z.string()),
|
|
60
|
+
]);
|
|
61
|
+
const ModeProfilesSchema = z.record(z.string(), z.record(z.string(), ModeProfileEntrySchema));
|
|
50
62
|
/**
|
|
51
63
|
* Schema for the full config/rubrics.yaml config file.
|
|
52
64
|
*
|
|
@@ -96,7 +108,6 @@ export const FeatureSchema = z.object({
|
|
|
96
108
|
priority: z.enum(["critical", "high", "medium", "low"]),
|
|
97
109
|
sections: z.array(z.string().min(1)).min(1),
|
|
98
110
|
status: z.enum(["covered", "uncovered", "planned", "out-of-scope"]),
|
|
99
|
-
taskCount: z.number().int().min(0).optional(),
|
|
100
111
|
});
|
|
101
112
|
/**
|
|
102
113
|
* Schema for the full config/features.yaml config file.
|
|
@@ -277,14 +288,11 @@ export const TaskFileSchema = z
|
|
|
277
288
|
// ---------------------------------------------------------------------------
|
|
278
289
|
/**
|
|
279
290
|
* Schema for per-dimension threshold values.
|
|
291
|
+
* Uses a dynamic record to support all evaluation modes, not just literacy.
|
|
280
292
|
* Keys use kebab-case to match YAML convention; the threshold engine
|
|
281
293
|
* normalizes to camelCase for comparison against FeatureScore fields.
|
|
282
294
|
*/
|
|
283
|
-
export const ThresholdDimensionsSchema = z.object({
|
|
284
|
-
"code-correctness": z.number().min(0).max(100).optional(),
|
|
285
|
-
"doc-coverage": z.number().min(0).max(100).optional(),
|
|
286
|
-
"task-completion": z.number().min(0).max(100).optional(),
|
|
287
|
-
});
|
|
295
|
+
export const ThresholdDimensionsSchema = z.record(z.string(), z.number().min(0).max(100));
|
|
288
296
|
/**
|
|
289
297
|
* Schema for threshold defaults (and per-area overrides).
|
|
290
298
|
* All fields are optional in per-area overrides; defaults must have composite.
|
|
@@ -18,10 +18,15 @@ export declare const ScheduleEntrySchema: z.ZodObject<{
|
|
|
18
18
|
cron: z.ZodString;
|
|
19
19
|
enabled: z.ZodDefault<z.ZodBoolean>;
|
|
20
20
|
mode: z.ZodDefault<z.ZodEnum<{
|
|
21
|
-
|
|
21
|
+
custom: "custom";
|
|
22
|
+
literacy: "literacy";
|
|
23
|
+
"mcp-server": "mcp-server";
|
|
24
|
+
"agent-harness": "agent-harness";
|
|
25
|
+
"knowledge-probe": "knowledge-probe";
|
|
22
26
|
baseline: "baseline";
|
|
23
|
-
|
|
27
|
+
agentic: "agentic";
|
|
24
28
|
observed: "observed";
|
|
29
|
+
full: "full";
|
|
25
30
|
}>>;
|
|
26
31
|
name: z.ZodString;
|
|
27
32
|
publish: z.ZodDefault<z.ZodBoolean>;
|
|
@@ -53,10 +58,15 @@ export declare const SchedulesFileSchema: z.ZodObject<{
|
|
|
53
58
|
cron: z.ZodString;
|
|
54
59
|
enabled: z.ZodDefault<z.ZodBoolean>;
|
|
55
60
|
mode: z.ZodDefault<z.ZodEnum<{
|
|
56
|
-
|
|
61
|
+
custom: "custom";
|
|
62
|
+
literacy: "literacy";
|
|
63
|
+
"mcp-server": "mcp-server";
|
|
64
|
+
"agent-harness": "agent-harness";
|
|
65
|
+
"knowledge-probe": "knowledge-probe";
|
|
57
66
|
baseline: "baseline";
|
|
58
|
-
|
|
67
|
+
agentic: "agentic";
|
|
59
68
|
observed: "observed";
|
|
69
|
+
full: "full";
|
|
60
70
|
}>>;
|
|
61
71
|
name: z.ZodString;
|
|
62
72
|
publish: z.ZodDefault<z.ZodBoolean>;
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
* @see docs/design-docs/report-store/implementation.md — Phase 5
|
|
12
12
|
*/
|
|
13
13
|
import { z } from "zod";
|
|
14
|
+
import { RAW_EVAL_MODES } from "../../ailf-shared/index.js";
|
|
14
15
|
// ---------------------------------------------------------------------------
|
|
15
16
|
// Cron expression validation
|
|
16
17
|
// ---------------------------------------------------------------------------
|
|
@@ -34,8 +35,11 @@ export const ScheduleEntrySchema = z.object({
|
|
|
34
35
|
cron: CronSchema,
|
|
35
36
|
/** Whether this schedule is active */
|
|
36
37
|
enabled: z.boolean().default(true),
|
|
37
|
-
/**
|
|
38
|
-
|
|
38
|
+
/**
|
|
39
|
+
* Evaluation mode — accepts both canonical and legacy names.
|
|
40
|
+
* Legacy names must pass through normalizeMode() before entering typed pipeline code.
|
|
41
|
+
*/
|
|
42
|
+
mode: z.enum(RAW_EVAL_MODES).default("baseline"),
|
|
39
43
|
/** Human-readable schedule name (used as report tag) */
|
|
40
44
|
name: z
|
|
41
45
|
.string()
|
|
@@ -17,10 +17,10 @@
|
|
|
17
17
|
import { z } from "zod";
|
|
18
18
|
/** All supported sink types as a Zod union. */
|
|
19
19
|
export declare const SinkTypeSchema: z.ZodEnum<{
|
|
20
|
-
webhook: "webhook";
|
|
21
20
|
bigquery: "bigquery";
|
|
22
21
|
"github-comment": "github-comment";
|
|
23
22
|
slack: "slack";
|
|
23
|
+
webhook: "webhook";
|
|
24
24
|
}>;
|
|
25
25
|
/** Supported sink type string literal union. */
|
|
26
26
|
export type SinkType = z.infer<typeof SinkTypeSchema>;
|
|
@@ -25,12 +25,21 @@ export function formatComparisonMarkdown(report) {
|
|
|
25
25
|
lines.push("");
|
|
26
26
|
lines.push(`**Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)}** (${overallIcon} ${deltaStr(overall)})`);
|
|
27
27
|
lines.push("");
|
|
28
|
-
//
|
|
29
|
-
|
|
30
|
-
|
|
28
|
+
// Derive dimension columns from the first area's keys (all areas share the
|
|
29
|
+
// same scoring profile, so the key set is uniform).
|
|
30
|
+
const dimKeys = report.areas.length > 0
|
|
31
|
+
? Object.keys(report.areas[0].dimensions)
|
|
32
|
+
: Object.keys(report.deltas.perDimension);
|
|
33
|
+
// Per-area table — columns are dynamic
|
|
34
|
+
const dimHeaders = dimKeys.map(kebabToTitleCase);
|
|
35
|
+
const headerRow = ["Feature", "Baseline", "Current", "Delta", ...dimHeaders];
|
|
36
|
+
const separatorRow = headerRow.map(() => "------");
|
|
37
|
+
lines.push(`| ${headerRow.join(" | ")} |`);
|
|
38
|
+
lines.push(`|${separatorRow.join("|")}|`);
|
|
31
39
|
for (const a of report.areas) {
|
|
32
40
|
const icon = changeIcon(a.change);
|
|
33
|
-
|
|
41
|
+
const dimCells = dimKeys.map((k) => deltaStr(a.dimensions[k]?.delta ?? 0));
|
|
42
|
+
lines.push(`| ${a.area} | ${a.baseline} | ${a.experiment} | ${icon} ${deltaStr(a.delta)} | ${dimCells.join(" | ")} |`);
|
|
34
43
|
}
|
|
35
44
|
lines.push("");
|
|
36
45
|
// Summary
|
|
@@ -55,9 +64,9 @@ export function formatComparisonMarkdown(report) {
|
|
|
55
64
|
const dim = report.deltas.perDimension;
|
|
56
65
|
lines.push("| Dimension | Delta |");
|
|
57
66
|
lines.push("|-----------|-------|");
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
67
|
+
for (const k of Object.keys(dim)) {
|
|
68
|
+
lines.push(`| ${kebabToTitleCase(k)} | ${deltaStr(dim[k])} |`);
|
|
69
|
+
}
|
|
61
70
|
lines.push(`| Doc Lift | ${deltaStr(report.deltas.docLift)} |`);
|
|
62
71
|
if (report.deltas.cost !== undefined) {
|
|
63
72
|
const costStr = report.deltas.cost > 0
|
|
@@ -91,29 +100,51 @@ export function formatComparisonTable(report) {
|
|
|
91
100
|
: "unchanged");
|
|
92
101
|
lines.push(` Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)} (${overallIcon} ${deltaStr(overall)})`);
|
|
93
102
|
lines.push("");
|
|
94
|
-
// Per-dimension averages
|
|
103
|
+
// Per-dimension averages — derived dynamically from the report
|
|
95
104
|
const dim = report.deltas.perDimension;
|
|
105
|
+
const dimKeys = report.areas.length > 0
|
|
106
|
+
? Object.keys(report.areas[0].dimensions)
|
|
107
|
+
: Object.keys(dim);
|
|
96
108
|
lines.push(" Dimension averages:");
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
109
|
+
// Pad labels to the longest dimension label for alignment
|
|
110
|
+
const dimLabels = dimKeys.map(kebabToTitleCase);
|
|
111
|
+
// +1 for the colon appended to each label
|
|
112
|
+
const maxLabelLen = Math.max(...dimLabels.map((l) => l.length + 1), "Doc Lift:".length);
|
|
113
|
+
for (let i = 0; i < dimKeys.length; i++) {
|
|
114
|
+
lines.push(` ${(dimLabels[i] + ":").padEnd(maxLabelLen)} ${deltaStr(dim[dimKeys[i]] ?? 0)}`);
|
|
115
|
+
}
|
|
116
|
+
lines.push(` ${"Doc Lift:".padEnd(maxLabelLen)} ${deltaStr(report.deltas.docLift)}`);
|
|
101
117
|
if (report.deltas.cost !== undefined) {
|
|
102
|
-
lines.push(` Cost:
|
|
118
|
+
lines.push(` ${"Cost:".padEnd(maxLabelLen)} ${report.deltas.cost > 0 ? "+" : ""}$${report.deltas.cost.toFixed(4)}`);
|
|
103
119
|
}
|
|
104
120
|
lines.push("");
|
|
105
|
-
// Per-area table
|
|
121
|
+
// Per-area table — columns are dynamic
|
|
106
122
|
lines.push("-".repeat(80));
|
|
107
123
|
lines.push("PER-AREA BREAKDOWN");
|
|
108
124
|
lines.push("-".repeat(80));
|
|
109
125
|
lines.push("");
|
|
110
|
-
const
|
|
111
|
-
const
|
|
112
|
-
|
|
113
|
-
|
|
126
|
+
const dimHeaders = dimKeys.map(kebabToTitleCase);
|
|
127
|
+
const colWidths = dimHeaders.map((h) => Math.max(h.length, 4));
|
|
128
|
+
const hCols = [
|
|
129
|
+
"Feature Area".padEnd(19),
|
|
130
|
+
"Baseline".padStart(8),
|
|
131
|
+
"Experiment".padStart(10),
|
|
132
|
+
"Delta".padStart(5),
|
|
133
|
+
...dimHeaders.map((h, i) => h.padStart(colWidths[i])),
|
|
134
|
+
];
|
|
135
|
+
const sepCols = [
|
|
136
|
+
"-".repeat(21),
|
|
137
|
+
"-".repeat(10),
|
|
138
|
+
"-".repeat(12),
|
|
139
|
+
"-".repeat(7),
|
|
140
|
+
...colWidths.map((w) => "-".repeat(w + 2)),
|
|
141
|
+
];
|
|
142
|
+
lines.push(`| ${hCols.join(" | ")} |`);
|
|
143
|
+
lines.push(`|${sepCols.join("|")}|`);
|
|
114
144
|
for (const a of report.areas) {
|
|
115
145
|
const icon = changeIcon(a.change);
|
|
116
|
-
|
|
146
|
+
const dimCells = dimKeys.map((k, i) => deltaStr(a.dimensions[k]?.delta ?? 0).padStart(colWidths[i]));
|
|
147
|
+
lines.push(`| ${icon} ${a.area.padEnd(17)} | ${String(a.baseline).padStart(8)} | ${String(a.experiment).padStart(10)} | ${deltaStr(a.delta).padStart(5)} | ${dimCells.join(" | ")} |`);
|
|
117
148
|
}
|
|
118
149
|
lines.push("");
|
|
119
150
|
// Classification summary
|
|
@@ -187,3 +218,10 @@ function deltaStr(d) {
|
|
|
187
218
|
return `${Math.round(d)}`;
|
|
188
219
|
return "0";
|
|
189
220
|
}
|
|
221
|
+
/**
 * Convert a kebab-case dimension name to title case
 * (e.g. 'task-completion' → 'Task Completion').
 *
 * Empty segments produced by leading, trailing, or doubled hyphens are
 * dropped so the resulting label never contains stray or repeated spaces.
 *
 * @param name - Kebab-case identifier.
 * @returns The words joined by single spaces, each with its first character upper-cased.
 */
function kebabToTitleCase(name) {
    return name
        .split("-")
        // "a--b" or "-a" would otherwise yield empty words and extra spaces.
        .filter((word) => word.length > 0)
        .map((word) => word.charAt(0).toUpperCase() + word.slice(1))
        .join(" ");
}
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
* Extracted from packages/eval/src/lib/ during the Ports & Adapters
|
|
8
8
|
* migration (Phase 4e).
|
|
9
9
|
*/
|
|
10
|
-
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
|
|
10
|
+
export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
|
|
11
11
|
export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
|
|
12
|
+
export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, type AggregationStrategy, type AreaScore, type AssertionScore, type DimensionScore, type EnsembleGradingConfig, type GraderTransitionConfig, type TaskScore, type TaskScoreOptions, } from "./scoring-engine.js";
|
|
12
13
|
export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "./config-helpers.js";
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
* Extracted from packages/eval/src/lib/ during the Ports & Adapters
|
|
8
8
|
* migration (Phase 4e).
|
|
9
9
|
*/
|
|
10
|
-
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
|
|
10
|
+
export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
|
|
11
11
|
export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
|
|
12
|
+
export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, } from "./scoring-engine.js";
|
|
12
13
|
export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "./config-helpers.js";
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 4-tier scoring engine — unified scoring across all evaluation modes.
|
|
3
|
+
*
|
|
4
|
+
* Tier 1: Assertion-level (atomic pass/fail + optional numeric score)
|
|
5
|
+
* Tier 2: Dimension-level (aggregated per scoring dimension)
|
|
6
|
+
* Tier 3: Task-level (weighted composite of dimensions)
|
|
7
|
+
* Tier 4: Suite/Area-level (aggregated across tasks)
|
|
8
|
+
*
|
|
9
|
+
* This engine is mode-agnostic — it works for literacy, MCP server,
|
|
10
|
+
* agent harness, knowledge probe, and custom modes.
|
|
11
|
+
*
|
|
12
|
+
* @see docs/design-docs/architecture-overhaul/scoring-rubrics-assertions.md
|
|
13
|
+
*/
|
|
14
|
+
/**
 * The result of a single assertion evaluation (tier 1 of the scoring engine).
 */
export interface AssertionScore {
    /** Whether the assertion passed. */
    pass: boolean;
    /** Numeric score in [0, 1]; `null` when a numeric score is not applicable. */
    score: number | null;
    /** Human-readable explanation of the outcome. */
    reason: string;
    /** Assertion type that produced this result. */
    assertionType: string;
    /** Identifier of the dimension this assertion contributes to. */
    dimension: string;
    /** Wall-clock grading time in milliseconds. */
    latencyMs: number;
    /** Weight of this assertion (1.0 if unspecified). */
    weight: number;
}
|
|
31
|
+
/** Aggregation strategy used when combining assertion scores into a dimension score. */
export type AggregationStrategy = "max" | "mean" | "min" | "weighted-mean";
|
|
33
|
+
/**
 * Aggregated score for a scoring dimension (tier 2 of the scoring engine).
 */
export interface DimensionScore {
    /** Dimension identifier (e.g., "code-correctness"). */
    dimensionId: string;
    /** Human-readable label. */
    label: string;
    /** Aggregated score in [0, 1]. */
    score: number;
    /** How many assertions contributed to this dimension. */
    assertionCount: number;
    /** How many of the contributing assertions passed. */
    passCount: number;
    /** Aggregation method used to produce `score`. */
    aggregation: AggregationStrategy;
    /** Individual assertion results that were aggregated. */
    assertions: AssertionScore[];
}
|
|
50
|
+
/**
 * Aggregate assertion scores into dimension scores.
 *
 * Groups assertions by dimension, then applies the configured aggregation
 * strategy (default: weighted-mean).
 *
 * @param assertions - Assertion results to aggregate.
 * @param options - Optional settings.
 * @param options.defaultAggregation - Strategy to apply; weighted-mean when omitted.
 * @param options.dimensionLabels - String-to-string map; presumably dimension id →
 *   human-readable label (confirm against the implementation).
 * @returns The aggregated dimension scores.
 */
export declare function aggregateDimensions(assertions: AssertionScore[], options?: {
    defaultAggregation?: AggregationStrategy;
    dimensionLabels?: Record<string, string>;
}): DimensionScore[];
|
|
60
|
+
/**
 * Weighted composite score for a task (tier 3 of the scoring engine).
 */
export interface TaskScore {
    /** Task identifier. */
    taskId: string;
    /**
     * Feature area (e.g., "groq", "studio").
     * When absent, aggregateAreas() falls back to the taskId prefix.
     */
    area?: string;
    /** Weighted composite score in [0, 1]. */
    score: number;
    /** Per-dimension breakdown. */
    dimensions: DimensionScore[];
    /** Weight configuration used to compute `score`. */
    weights: Record<string, number>;
    /** Source of the weights (default profile, task override, etc.). */
    weightSource: string;
    /** Whether the task met its quality threshold. */
    passesThreshold: boolean;
    /** The threshold that `score` was compared against. */
    threshold: number;
    /** Warnings about potential misconfiguration (e.g., no dimensions matched weights). */
    warnings?: string[];
}
|
|
81
|
+
/**
 * Options for computing a task score.
 */
export interface TaskScoreOptions {
    /** Task identifier. */
    taskId: string;
    /** Feature area (e.g., "groq", "studio"). Falls back to the taskId prefix if omitted. */
    area?: string;
    /** Dimension weights (must sum to ~1.0). */
    weights: Record<string, number>;
    /** Where the weights came from (for traceability). */
    weightSource?: string;
    /** Quality threshold (0-1) for the pass/fail gate. */
    threshold?: number;
}
|
|
94
|
+
/**
 * Compute a weighted task score from dimension scores.
 *
 * @param dimensions - Per-dimension scores for the task.
 * @param options - Task identity, dimension weights, and optional threshold.
 * @returns The composite task score.
 */
export declare function computeTaskScore(dimensions: DimensionScore[], options: TaskScoreOptions): TaskScore;
|
|
98
|
+
/**
 * Aggregated score across the tasks in a feature area (tier 4 of the scoring engine).
 */
export interface AreaScore {
    /** Area identifier (e.g., "groq", "studio"). */
    areaId: string;
    /** Mean task score. */
    score: number;
    /** Number of tasks evaluated. */
    taskCount: number;
    /** Number of tasks passing their threshold. */
    passingTaskCount: number;
    /** Per-task breakdown. */
    tasks: TaskScore[];
    /** Trend versus the previous evaluation; typed as nullable. */
    delta: number | null;
}
|
|
113
|
+
/**
 * Aggregate task scores into area scores.
 *
 * @param tasks - Task scores to group by area.
 * @param previousScores - Optional string-to-number map; presumably the prior
 *   score per area id, used for the `delta` trend (confirm against the implementation).
 * @returns One aggregated score per area.
 */
export declare function aggregateAreas(tasks: TaskScore[], previousScores?: Record<string, number>): AreaScore[];
|
|
117
|
+
/**
 * Normalize an assertion score to the [0, 1] range.
 *
 * Different assertion types produce scores in different ranges:
 * - Boolean (contains, equals, regex): 0 or 1
 * - LLM rubric: 0-100 (needs /100)
 * - similar: 0-1 (already normalized)
 * - javascript/python: user-defined (assumed 0-1)
 *
 * @param rawScore - Score as produced by the assertion.
 * @param assertionType - Assertion type that produced `rawScore`.
 * @returns The score mapped into [0, 1].
 */
export declare function normalizeScore(rawScore: number, assertionType: string): number;
|
|
127
|
+
/**
 * Grader transition configuration for gradual migration between grader models.
 */
export interface GraderTransitionConfig {
    /** Current (old) grader model. */
    old: string;
    /** New grader model to transition to. */
    new_: string;
    /** ISO date after which the old grader is retired. */
    expiration: string;
    /** Whether to run both graders in parallel. */
    parallel: boolean;
}
|
|
138
|
+
/**
 * Ensemble grading configuration.
 */
export interface EnsembleGradingConfig {
    /** Whether ensemble grading is enabled. */
    enabled: boolean;
    /** Grader models to use. */
    models: string[];
    /** Aggregation strategy for ensemble scores. */
    aggregation: "max" | "mean" | "median";
}
/**
 * Compute an ensemble score from multiple grader outputs.
 *
 * The `aggregation` parameter is derived from
 * `EnsembleGradingConfig["aggregation"]` so the accepted strategies cannot
 * drift from the configuration type.
 *
 * @param scores - One score per grader.
 * @param aggregation - How to combine the scores (optional).
 * @returns The combined `score` together with an `agreement` number.
 */
export declare function computeEnsembleScore(scores: number[], aggregation?: EnsembleGradingConfig["aggregation"]): {
    score: number;
    agreement: number;
};
|