@sanity/ailf 0.5.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/config/features.ts +23 -0
- package/config/models.ts +95 -0
- package/config/prompts.ts +16 -0
- package/config/rubrics.ts +225 -0
- package/config/schedules.ts +47 -0
- package/config/sinks.ts +37 -0
- package/config/sources.ts +21 -0
- package/config/thresholds.ts +61 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +171 -0
- package/dist/_vendor/ailf-core/config-helpers.js +170 -0
- package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
- package/dist/_vendor/ailf-core/env-helper.js +45 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/examples/index.js +25 -0
- package/dist/_vendor/ailf-core/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/index.js +5 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +17 -2
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
- package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +8 -2
- package/dist/_vendor/ailf-core/schemas/eval-config.js +17 -2
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +9 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +8 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -31
- package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -9
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
- package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/services/index.js +2 -1
- package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
- package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
- package/dist/_vendor/ailf-core/services/scoring.js +25 -15
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +332 -0
- package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +45 -83
- package/dist/_vendor/ailf-core/types/index.js +8 -1
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +257 -0
- package/dist/_vendor/ailf-core/types/plugin-registry.js +185 -0
- package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
- package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
- package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
- package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
- package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
- package/dist/_vendor/ailf-core/types/trace.js +18 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
- package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
- package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
- package/dist/_vendor/ailf-shared/index.d.ts +0 -1
- package/dist/_vendor/ailf-shared/index.js +0 -1
- package/dist/adapters/api-client/build-request.js +14 -13
- package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
- package/dist/adapters/config-sources/file-config-adapter.js +39 -12
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
- package/dist/adapters/config-sources/ts-config-loader.js +141 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
- package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +35 -39
- package/dist/adapters/task-sources/index.d.ts +3 -2
- package/dist/adapters/task-sources/index.js +3 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
- package/dist/adapters/task-sources/repo-schemas.js +227 -19
- package/dist/adapters/task-sources/repo-task-source.d.ts +16 -12
- package/dist/adapters/task-sources/repo-task-source.js +92 -80
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
- package/dist/adapters/task-sources/task-file-loader.js +83 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
- package/dist/adapters/task-sources/yaml-task-source.js +19 -16
- package/dist/cli.js +0 -2
- package/dist/commands/baseline.js +4 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/coverage-audit.js +9 -1
- package/dist/commands/explain-handler.js +25 -23
- package/dist/commands/fetch-docs.js +3 -2
- package/dist/commands/generate-configs.js +1 -1
- package/dist/commands/init.d.ts +6 -4
- package/dist/commands/init.js +302 -23
- package/dist/commands/interactive.js +11 -7
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +16 -6
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +4 -2
- package/dist/commands/pr-comment.js +1 -1
- package/dist/commands/publish.js +2 -2
- package/dist/commands/readiness-report.js +13 -6
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +13 -1
- package/dist/composition-root.js +99 -4
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +28 -8
- package/dist/orchestration/steps/calculate-scores-step.js +24 -11
- package/dist/orchestration/steps/fetch-docs-step.js +8 -7
- package/dist/orchestration/steps/gap-analysis-step.js +8 -7
- package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
- package/dist/orchestration/steps/generate-configs-step.js +261 -51
- package/dist/orchestration/steps/grader-consistency-step.js +7 -4
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/readiness-step.js +5 -6
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
- package/dist/orchestration/steps/run-eval-step.js +8 -7
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/cache.js +36 -8
- package/dist/pipeline/calculate-scores.d.ts +2 -4
- package/dist/pipeline/calculate-scores.js +43 -113
- package/dist/pipeline/checks.js +2 -2
- package/dist/pipeline/compare.js +8 -8
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +392 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +404 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
- package/dist/pipeline/compiler/assertion-mapper.js +175 -0
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
- package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
- package/dist/pipeline/compiler/config-loader.d.ts +56 -0
- package/dist/pipeline/compiler/config-loader.js +111 -0
- package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
- package/dist/pipeline/compiler/fixture-resolver.js +113 -0
- package/dist/pipeline/compiler/hash.d.ts +11 -0
- package/dist/pipeline/compiler/hash.js +18 -0
- package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
- package/dist/pipeline/compiler/ignore-fields.js +113 -0
- package/dist/pipeline/compiler/index.d.ts +29 -0
- package/dist/pipeline/compiler/index.js +45 -0
- package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
- package/dist/pipeline/compiler/literacy-bridge.js +172 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +15 -0
- package/dist/pipeline/compiler/mode-handlers/index.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/index.d.ts +9 -0
- package/dist/pipeline/compiler/presets/index.js +8 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +42 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.js +208 -0
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
- package/dist/pipeline/compiler/provider-assembler.js +137 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
- package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
- package/dist/pipeline/compiler/sandbox/index.js +11 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
- package/dist/pipeline/compiler/scoring-bridge.js +114 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
- package/dist/pipeline/compiler/task-graph-builder.js +291 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
- package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
- package/dist/pipeline/compiler/telemetry/index.js +19 -0
- package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
- package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
- package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
- package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
- package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
- package/dist/pipeline/compiler/variable-resolver.js +115 -0
- package/dist/pipeline/coverage-audit.d.ts +15 -5
- package/dist/pipeline/coverage-audit.js +41 -22
- package/dist/pipeline/eval-constants.d.ts +16 -6
- package/dist/pipeline/eval-constants.js +25 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -9
- package/dist/pipeline/expand-tasks.d.ts +19 -10
- package/dist/pipeline/expand-tasks.js +34 -28
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +2 -2
- package/dist/pipeline/generate-configs.d.ts +22 -4
- package/dist/pipeline/generate-configs.js +53 -24
- package/dist/pipeline/grader-api.d.ts +3 -3
- package/dist/pipeline/grader-api.js +5 -12
- package/dist/pipeline/grader-compare-runner.js +20 -27
- package/dist/pipeline/grader-comparison.d.ts +4 -8
- package/dist/pipeline/grader-comparison.js +11 -17
- package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
- package/dist/pipeline/grader-consistency-runner.js +16 -20
- package/dist/pipeline/grader-consistency.d.ts +6 -10
- package/dist/pipeline/grader-consistency.js +13 -32
- package/dist/pipeline/grader-sensitivity-runner.js +7 -5
- package/dist/pipeline/grader-sensitivity.d.ts +2 -6
- package/dist/pipeline/grader-sensitivity.js +10 -10
- package/dist/pipeline/grader-validate-runner.js +7 -5
- package/dist/pipeline/grader-validation.d.ts +2 -6
- package/dist/pipeline/grader-validation.js +14 -22
- package/dist/pipeline/map-request-to-config.js +7 -1
- package/dist/pipeline/mirror-repo-tasks.d.ts +13 -13
- package/dist/pipeline/mirror-repo-tasks.js +22 -21
- package/dist/pipeline/normalize-mode.d.ts +49 -0
- package/dist/pipeline/normalize-mode.js +64 -0
- package/dist/pipeline/plan.d.ts +5 -2
- package/dist/pipeline/plan.js +134 -78
- package/dist/pipeline/pr-comment.js +2 -0
- package/dist/pipeline/profile-resolution.d.ts +22 -14
- package/dist/pipeline/profile-resolution.js +41 -19
- package/dist/pipeline/provenance.d.ts +2 -2
- package/dist/pipeline/provenance.js +12 -17
- package/dist/pipeline/release-report.js +4 -4
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/rubric-loader.d.ts +20 -0
- package/dist/pipeline/rubric-loader.js +37 -0
- package/dist/pipeline/validate.d.ts +4 -4
- package/dist/pipeline/validate.js +64 -53
- package/dist/schedules/loader.js +18 -8
- package/dist/scripts/migrate-task-mode.d.ts +24 -0
- package/dist/scripts/migrate-task-mode.js +85 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +15 -15
- package/dist/sinks/loader.js +5 -7
- package/dist/sources.d.ts +7 -7
- package/dist/sources.js +22 -24
- package/dist/webhook/dispatch.js +2 -1
- package/package.json +15 -4
- package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
- package/tasks/literacy/frameworks.task.ts +128 -0
- package/tasks/literacy/functions.task.ts +69 -0
- package/tasks/literacy/groq.task.ts +258 -0
- package/tasks/literacy/nextjs-live.task.ts +75 -0
- package/tasks/literacy/studio-setup.task.ts +131 -0
- package/tasks/literacy/visual-editing.task.ts +146 -0
- package/config/features.yaml +0 -116
- package/config/models.yaml +0 -116
- package/config/prompts.yaml +0 -75
- package/config/rubrics.yaml +0 -81
- package/config/schedules.yaml +0 -43
- package/config/sinks.yaml +0 -54
- package/config/sources.yaml +0 -51
- package/config/thresholds.yaml +0 -49
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
|
@@ -9,12 +9,23 @@
|
|
|
9
9
|
* Ports & Adapters migration (Phase 0c). The original file is now a
|
|
10
10
|
* re-export barrel that preserves backward compatibility.
|
|
11
11
|
*/
|
|
12
|
-
import type {
|
|
12
|
+
import type { DocumentRef as _DocumentRef, EvalMode as _EvalMode } from "../../ailf-shared/index.d.ts";
|
|
13
13
|
export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "./scoring-input.js";
|
|
14
14
|
export type { DocumentRef } from "../../ailf-shared/index.d.ts";
|
|
15
|
+
export type { StoredBaseline, StoredReport, StoredRun, StoredTaskResult, StoredTrace, SchemaVersioned, } from "./storage-schema.js";
|
|
16
|
+
export { CURRENT_SCHEMA_VERSION, isSchemaVersioned, migrateDocument, } from "./storage-schema.js";
|
|
17
|
+
export type { AssertionRegistration, FixtureResolverRegistration, ModeBase, ModeRegistration, PluginManifest, PluginRegistry, PresetDefinition, ReportSinkRegistration, RubricTemplateRegistration, } from "./plugin-registry.js";
|
|
18
|
+
export { InMemoryPluginRegistry } from "./plugin-registry.js";
|
|
19
|
+
export type { AgentHarnessConfig, AgentHarnessModeConfig, CustomModeConfig, EvalModeConfig, EvalModeType, KnowledgeBaseRef, KnowledgeProbeModeConfig, LiteracyModeConfig, MCPServerConfig, MCPServerModeConfig, ProbeStrategy, SandboxConfig, ToolDef, } from "./eval-mode-config.js";
|
|
20
|
+
export { evalModeType } from "./eval-mode-config.js";
|
|
21
|
+
export type { DependencyEdge, ResolvedFixture, TaskGraph, TaskNode, } from "./task-graph.js";
|
|
22
|
+
export type { VariableDeclaration, VariableEnvelope, VariableProvenance, VariableSource, } from "./variable-envelope.js";
|
|
23
|
+
export type { EvalTrace, ToolCallCategory, ToolCallRecord, TraceEvent, TraceSpan, TraceTokenUsage, } from "./trace.js";
|
|
24
|
+
export type { ArtifactId, Brand, Err, FixtureId, IdValidationError, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
|
|
25
|
+
export { err, fixtureId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
26
|
+
export type { AgentHarnessTaskDefinition, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PathDocRef, PerspectiveDocRef, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
|
|
15
27
|
type DocumentRef = _DocumentRef;
|
|
16
28
|
type EvalMode = _EvalMode;
|
|
17
|
-
type ConcreteEvalMode = _ConcreteEvalMode;
|
|
18
29
|
/** Aggregated retrieval metrics for a feature area */
|
|
19
30
|
export interface AreaRetrievalMetrics {
|
|
20
31
|
area: string;
|
|
@@ -76,8 +87,7 @@ export interface DebugOptions {
|
|
|
76
87
|
/** Random sample of N tests */
|
|
77
88
|
sample?: number;
|
|
78
89
|
}
|
|
79
|
-
export type {
|
|
80
|
-
export { FULL_MODE_SUBMODES } from "../../ailf-shared/index.d.ts";
|
|
90
|
+
export type { EvalMode } from "../../ailf-shared/index.d.ts";
|
|
81
91
|
/** A classified failure mode with confidence level */
|
|
82
92
|
export interface FailureMode {
|
|
83
93
|
/** How confident we are in this classification */
|
|
@@ -263,21 +273,11 @@ export interface GraderReliability {
|
|
|
263
273
|
totalJudgments: number;
|
|
264
274
|
/** Recommended noise threshold for comparisons (2× max dimension σ) */
|
|
265
275
|
recommendedThreshold: number;
|
|
266
|
-
/** Per-dimension consistency */
|
|
267
|
-
perDimension: {
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
};
|
|
272
|
-
codeCorrectness: {
|
|
273
|
-
avgStdDev: number;
|
|
274
|
-
maxStdDev: number;
|
|
275
|
-
};
|
|
276
|
-
docCoverage: {
|
|
277
|
-
avgStdDev: number;
|
|
278
|
-
maxStdDev: number;
|
|
279
|
-
};
|
|
280
|
-
};
|
|
276
|
+
/** Per-dimension consistency (keyed by dimension name) */
|
|
277
|
+
perDimension: Record<string, {
|
|
278
|
+
avgStdDev: number;
|
|
279
|
+
maxStdDev: number;
|
|
280
|
+
}>;
|
|
281
281
|
};
|
|
282
282
|
/** Grader model used for this evaluation */
|
|
283
283
|
graderModel: string;
|
|
@@ -289,21 +289,11 @@ export interface GraderReliability {
|
|
|
289
289
|
avgSeparation: number;
|
|
290
290
|
/** Total paired comparisons analyzed */
|
|
291
291
|
totalPairs: number;
|
|
292
|
-
/** Per-dimension sensitivity */
|
|
293
|
-
perDimension: {
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
};
|
|
298
|
-
codeCorrectness: {
|
|
299
|
-
concordanceRate: number;
|
|
300
|
-
avgSeparation: number;
|
|
301
|
-
};
|
|
302
|
-
docCoverage: {
|
|
303
|
-
concordanceRate: number;
|
|
304
|
-
avgSeparation: number;
|
|
305
|
-
};
|
|
306
|
-
};
|
|
292
|
+
/** Per-dimension sensitivity (keyed by dimension name) */
|
|
293
|
+
perDimension: Record<string, {
|
|
294
|
+
concordanceRate: number;
|
|
295
|
+
avgSeparation: number;
|
|
296
|
+
}>;
|
|
307
297
|
};
|
|
308
298
|
/** Criterion validity (from human reference grades) — Phase 2 */
|
|
309
299
|
validity?: {
|
|
@@ -313,12 +303,8 @@ export interface GraderReliability {
|
|
|
313
303
|
correlation: number;
|
|
314
304
|
/** Systematic bias (positive = grader scores higher than humans) */
|
|
315
305
|
bias: number;
|
|
316
|
-
/** Per-dimension correlation with human grades */
|
|
317
|
-
perDimension:
|
|
318
|
-
taskCompletion: number;
|
|
319
|
-
codeCorrectness: number;
|
|
320
|
-
docCoverage: number;
|
|
321
|
-
};
|
|
306
|
+
/** Per-dimension correlation with human grades (keyed by dimension name) */
|
|
307
|
+
perDimension: Record<string, number>;
|
|
322
308
|
/** Number of human-graded reference samples */
|
|
323
309
|
sampleSize: number;
|
|
324
310
|
/** Whether the grader passes the MAE threshold */
|
|
@@ -420,7 +406,7 @@ export interface PipelineOptions {
|
|
|
420
406
|
}
|
|
421
407
|
/** A Promptfoo share URL tagged with the evaluation mode that produced it. */
|
|
422
408
|
export interface PromptfooUrlEntry {
|
|
423
|
-
mode:
|
|
409
|
+
mode: string;
|
|
424
410
|
url: string;
|
|
425
411
|
}
|
|
426
412
|
/**
|
|
@@ -589,8 +575,6 @@ export interface ProductFeature {
|
|
|
589
575
|
sections: string[];
|
|
590
576
|
/** Coverage status */
|
|
591
577
|
status: "covered" | "out-of-scope" | "planned" | "uncovered";
|
|
592
|
-
/** Number of evaluation tasks (if covered) */
|
|
593
|
-
taskCount?: number;
|
|
594
578
|
}
|
|
595
579
|
/** Full classification of a content release for evaluation */
|
|
596
580
|
export interface ReleaseClassification {
|
|
@@ -694,14 +678,18 @@ export interface ScoreSummary {
|
|
|
694
678
|
*/
|
|
695
679
|
documentManifest?: DocumentRef[];
|
|
696
680
|
/**
|
|
697
|
-
* Which evaluation
|
|
681
|
+
* Which evaluation variant contributed data to this summary.
|
|
682
|
+
*
|
|
683
|
+
* For literacy mode this is a variant name:
|
|
698
684
|
* - `'full'`: both baseline and agentic data present
|
|
699
685
|
* - `'baseline'`: floor + ceiling only (no agentic data)
|
|
700
686
|
* - `'agentic'`: actual only (no floor/ceiling data)
|
|
701
687
|
* - `'observed'`: observed mode data
|
|
688
|
+
*
|
|
689
|
+
* For non-literacy modes this is the canonical mode name.
|
|
702
690
|
* Absent in legacy summaries — treat as `'baseline'` for backward compat.
|
|
703
691
|
*/
|
|
704
|
-
evaluationMode?:
|
|
692
|
+
evaluationMode?: string;
|
|
705
693
|
/** Failure mode analysis (Phase 3a) — diagnostic breakdown of why scores are low */
|
|
706
694
|
failureModes?: FailureModeReport;
|
|
707
695
|
/**
|
|
@@ -870,24 +858,12 @@ export interface AreaDelta {
|
|
|
870
858
|
costDelta?: number;
|
|
871
859
|
/** Overall score delta (experiment − baseline) */
|
|
872
860
|
delta: number;
|
|
873
|
-
/** Per-dimension deltas */
|
|
874
|
-
dimensions: {
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
};
|
|
880
|
-
codeCorrectness: {
|
|
881
|
-
baseline: number;
|
|
882
|
-
experiment: number;
|
|
883
|
-
delta: number;
|
|
884
|
-
};
|
|
885
|
-
docCoverage: {
|
|
886
|
-
baseline: number;
|
|
887
|
-
experiment: number;
|
|
888
|
-
delta: number;
|
|
889
|
-
};
|
|
890
|
-
};
|
|
861
|
+
/** Per-dimension deltas (keyed by dimension name, e.g. 'taskCompletion') */
|
|
862
|
+
dimensions: Record<string, {
|
|
863
|
+
baseline: number;
|
|
864
|
+
experiment: number;
|
|
865
|
+
delta: number;
|
|
866
|
+
}>;
|
|
891
867
|
/** Doc Lift delta */
|
|
892
868
|
docLiftDelta: number;
|
|
893
869
|
/** Experiment total score */
|
|
@@ -958,11 +934,7 @@ export interface ComparisonReport {
|
|
|
958
934
|
/** Per-area total score deltas */
|
|
959
935
|
perArea: Record<string, number>;
|
|
960
936
|
/** Per-dimension average deltas (across all areas) */
|
|
961
|
-
perDimension:
|
|
962
|
-
taskCompletion: number;
|
|
963
|
-
codeCorrectness: number;
|
|
964
|
-
docCoverage: number;
|
|
965
|
-
};
|
|
937
|
+
perDimension: Record<string, number>;
|
|
966
938
|
/** Doc Lift average delta */
|
|
967
939
|
docLift: number;
|
|
968
940
|
/** Cost delta (if both runs have cost data) */
|
|
@@ -1018,21 +990,11 @@ export interface ConfidenceAnnotation {
|
|
|
1018
990
|
* Matches the shape produced by pipeline/grader-consistency.ts.
|
|
1019
991
|
*/
|
|
1020
992
|
export interface GraderConsistencyData {
|
|
1021
|
-
/** Per-dimension consistency metrics */
|
|
1022
|
-
perDimension: {
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
};
|
|
1027
|
-
docCoverage: {
|
|
1028
|
-
avgStdDev: number;
|
|
1029
|
-
maxStdDev: number;
|
|
1030
|
-
};
|
|
1031
|
-
taskCompletion: {
|
|
1032
|
-
avgStdDev: number;
|
|
1033
|
-
maxStdDev: number;
|
|
1034
|
-
};
|
|
1035
|
-
};
|
|
993
|
+
/** Per-dimension consistency metrics (keyed by dimension name) */
|
|
994
|
+
perDimension: Record<string, {
|
|
995
|
+
avgStdDev: number;
|
|
996
|
+
maxStdDev: number;
|
|
997
|
+
}>;
|
|
1036
998
|
/** Recommended noise threshold for comparisons (2× max dimension σ) */
|
|
1037
999
|
recommendedThreshold: number;
|
|
1038
1000
|
}
|
|
@@ -9,7 +9,14 @@
|
|
|
9
9
|
* Ports & Adapters migration (Phase 0c). The original file is now a
|
|
10
10
|
* re-export barrel that preserves backward compatibility.
|
|
11
11
|
*/
|
|
12
|
-
export {
|
|
12
|
+
export { CURRENT_SCHEMA_VERSION, isSchemaVersioned, migrateDocument, } from "./storage-schema.js";
|
|
13
|
+
export { InMemoryPluginRegistry } from "./plugin-registry.js";
|
|
14
|
+
// Note: DocSourceConfig is NOT re-exported here — it conflicts with the
|
|
15
|
+
// existing DocSourceConfig in ports/doc-fetcher.ts. The eval-mode-config
|
|
16
|
+
// version is used internally by LiteracyModeConfig. If consumers need
|
|
17
|
+
// the mode-specific version, they import from "./eval-mode-config.js".
|
|
18
|
+
export { evalModeType } from "./eval-mode-config.js";
|
|
19
|
+
export { err, fixtureId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
13
20
|
// ---------------------------------------------------------------------------
|
|
14
21
|
// Comparison (Approach 2: structured comparison output)
|
|
15
22
|
// ---------------------------------------------------------------------------
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Plugin registry — typed extension points for AILF evaluation capabilities.
|
|
3
|
+
*
|
|
4
|
+
* Three-tier architecture:
|
|
5
|
+
* - **Mode bases** define evaluation methodology (rubrics, scoring, prompts)
|
|
6
|
+
* - **Domain presets** target a mode base and add domain config (sources,
|
|
7
|
+
* features, doc fetcher)
|
|
8
|
+
* - **Framework assertions** are generic evaluation primitives available to
|
|
9
|
+
* all modes
|
|
10
|
+
*
|
|
11
|
+
* @see docs/design-docs/architecture-overhaul/extensibility-plugins.md
|
|
12
|
+
*/
|
|
13
|
+
import type { PromptTemplate } from "../ports/mode-handler.js";
|
|
14
|
+
import type { DocFetcher } from "../ports/doc-fetcher.js";
|
|
15
|
+
import type { SourceEntry } from "../config-helpers.js";
|
|
16
|
+
import type { FeatureRegistry } from "../schemas/pipeline.js";
|
|
17
|
+
/** A registered evaluation mode handler */
|
|
18
|
+
export interface ModeRegistration {
    /** Identifier that must be unique per mode, e.g. "api-contract". */
    id: string;
    /** Label intended for human readers. */
    label: string;
    /** Regular-expression sources describing the providers valid for this mode. */
    validProviderPatterns: Array<string>;
    /** IDs of the rubric templates this mode makes available. */
    rubricTemplateIds: Array<string>;
    /** Path of the module holding the compile function; it is loaded at runtime. */
    handlerModule: string;
}
|
|
30
|
+
/** A registered assertion type */
|
|
31
|
+
export interface AssertionRegistration {
    /** Name of the assertion type, e.g. "api-contract-match". */
    type: string;
    /** Label intended for human readers. */
    label: string;
    /**
     * Whitelist of mode IDs the assertion may be used with.
     * Leave it out to make the assertion compatible with every mode.
     */
    compatibleModes?: Array<string>;
    /** Path of the module that implements the assertion handler. */
    handlerModule: string;
}
|
|
45
|
+
/** A registered rubric template */
|
|
46
|
+
export interface RubricTemplateRegistration {
    /** ID of the template, e.g. "api-accuracy". */
    id: string;
    /** Name of the scoring dimension the template feeds into. */
    dimension: string;
    /** Text used as the header of the scale. */
    header: string;
    /** The individual entries of the scale. */
    scale: Array<string>;
    /** Optional label for the criteria. */
    criteriaLabel?: string;
}
|
|
58
|
+
/** A registered fixture resolver */
|
|
59
|
+
export interface FixtureResolverRegistration {
    /** URI scheme served by the resolver, e.g. "graphql://". */
    scheme: string;
    /** Path of the module that implements the resolver. */
    handlerModule: string;
}
|
|
65
|
+
/** A registered report sink */
|
|
66
|
+
export interface ReportSinkRegistration {
    /** Identifier of the sink, e.g. "bigquery" or "slack". */
    id: string;
    /** Path of the module that implements the sink. */
    handlerModule: string;
}
|
|
72
|
+
/**
|
|
73
|
+
* ModeBase — shared evaluation methodology for a mode.
|
|
74
|
+
*
|
|
75
|
+
* Defines HOW you evaluate (rubrics, scoring, prompts) independently of
|
|
76
|
+
* WHAT you're evaluating (sources, features, docs). Multiple domain presets
|
|
77
|
+
* can target the same mode base and inherit its defaults.
|
|
78
|
+
*
|
|
79
|
+
* Example: the "literacy" mode base defines rubric templates for
|
|
80
|
+
* task-completion, code-correctness, and doc-coverage. Both a Sanity docs
|
|
81
|
+
* preset and an external docs preset can target "literacy" and inherit
|
|
82
|
+
* these rubrics without redefining them.
|
|
83
|
+
*/
|
|
84
|
+
export interface ModeBase {
    /** Registration of the mode itself: handler, provider patterns, rubric template IDs. */
    mode: ModeRegistration;
    /** Rubric templates used by default for this mode. */
    rubricTemplates?: Array<RubricTemplateRegistration>;
    /** Default scoring profiles: profile name mapped to per-dimension weights. */
    scoringProfiles?: Record<string, Record<string, number>>;
    /** Default prompt templates: template name mapped to the template. */
    promptTemplates?: Record<string, PromptTemplate>;
    /** Assertion types specific to this mode, on top of the framework builtins. */
    assertions?: Array<AssertionRegistration>;
}
|
|
96
|
+
/** Plugin manifest describing a single plugin */
|
|
97
|
+
export interface PluginManifest {
    /** Name of the plugin, written in npm package style. */
    name: string;
    /** Version of the plugin as a semver string. */
    version: string;
    /** Optional description for human readers. */
    description?: string;
    /** Version of the plugin API that the plugin targets. */
    pluginApiVersion: number;
    /** Lowest AILF version the plugin requires. */
    minAILFVersion?: string;
    /** Other plugins this plugin depends on. */
    requires?: Array<string>;
}
|
|
111
|
+
/**
|
|
112
|
+
* A domain preset targets a mode base and adds domain-specific configuration.
|
|
113
|
+
*
|
|
114
|
+
* The preset inherits evaluation methodology (rubrics, scoring, prompts) from
|
|
115
|
+
* its mode base. It can optionally override any inherited values.
|
|
116
|
+
*/
|
|
117
|
+
export interface PresetDefinition {
    /** Name of the preset; serves as its unique identifier. */
    name: string;
    /** Manifest of the plugin that ships this preset. */
    manifest: PluginManifest;
    /**
     * Lifecycle status, with the same semantics as task status:
     * - active:   registered and used in evaluations (the default)
     * - draft:    registered, but skipped unless explicitly targeted
     * - paused:   registered, but skipped (can be resumed)
     * - archived: not registered
     */
    status?: "active" | "draft" | "paused" | "archived";
    /**
     * ID of the mode this preset targets. It links the preset to a registered
     * ModeBase, from which rubrics, scoring profiles, and prompt templates
     * are inherited.
     */
    mode: string;
    /** Fixture resolvers contributed by the preset. */
    fixtureResolvers?: Array<FixtureResolverRegistration>;
    /** Report sinks contributed by the preset. */
    reportSinks?: Array<ReportSinkRegistration>;
    /** Factory that produces a DocFetcher instance. */
    docFetcher?: () => DocFetcher;
    /** Definitions of documentation sources (production, branch, local, etc.). */
    sourceDefs?: Array<SourceEntry>;
    /** Registry of product features, used for coverage tracking. */
    featureDefs?: FeatureRegistry;
    /** Rubric template overrides; merged with the mode base by template ID. */
    rubricTemplates?: Array<RubricTemplateRegistration>;
    /** Scoring profile overrides; merged with the mode base by profile name. */
    scoringProfiles?: Record<string, Record<string, number>>;
    /** Prompt template overrides; merged with the mode base by template name. */
    promptTemplates?: Record<string, PromptTemplate>;
    /** Extra assertions that are specific to the targeted mode. */
    assertions?: Array<AssertionRegistration>;
}
|
|
155
|
+
/**
|
|
156
|
+
* PluginRegistry — central registry for all AILF extensions.
|
|
157
|
+
*
|
|
158
|
+
* Plugins register their capabilities here. The pipeline queries the
|
|
159
|
+
* registry to discover available modes, assertions, templates, etc.
|
|
160
|
+
*/
|
|
161
|
+
export interface PluginRegistry {
    // --- Presets and mode bases ---------------------------------------------
    /** Registers a whole preset, which bundles several extensions. */
    registerPreset(preset: PresetDefinition): void;
    /** Returns every registered preset. */
    getPresets(): Array<PresetDefinition>;
    /** Registers a mode base, i.e. an evaluation methodology. */
    registerModeBase(base: ModeBase): void;
    /** Looks up a mode base by the ID of its mode. */
    getModeBase(modeId: string): ModeBase | undefined;
    /** Returns every registered mode base. */
    getModeBases(): Array<ModeBase>;

    // --- Individual extension points ----------------------------------------
    /** Registers one evaluation mode. */
    registerMode(mode: ModeRegistration): void;
    /** Returns every registered mode. */
    getModes(): Array<ModeRegistration>;
    /** Looks up a mode by ID. */
    getMode(id: string): ModeRegistration | undefined;
    /** Registers one assertion type. */
    registerAssertion(assertion: AssertionRegistration): void;
    /** Returns every registered assertion type. */
    getAssertions(): Array<AssertionRegistration>;
    /** Registers one rubric template. */
    registerRubricTemplate(template: RubricTemplateRegistration): void;
    /** Returns every registered rubric template. */
    getRubricTemplates(): Array<RubricTemplateRegistration>;
    /** Registers one fixture resolver. */
    registerFixtureResolver(resolver: FixtureResolverRegistration): void;
    /** Returns every registered fixture resolver. */
    getFixtureResolvers(): Array<FixtureResolverRegistration>;
    /** Registers one report sink. */
    registerReportSink(sink: ReportSinkRegistration): void;
    /** Returns every registered report sink. */
    getReportSinks(): Array<ReportSinkRegistration>;

    // --- Merged configuration -----------------------------------------------
    /** Registers prompt templates; they are merged with those already present. */
    registerPromptTemplates(templates: Record<string, PromptTemplate>): void;
    /** Returns every registered prompt template. */
    getPromptTemplates(): Record<string, PromptTemplate>;
    /** Registers scoring profiles; they are merged with those already present. */
    registerScoringProfiles(profiles: Record<string, Record<string, number>>): void;
    /** Returns every registered scoring profile. */
    getScoringProfiles(): Record<string, Record<string, number>>;
    /** Registers a doc fetcher factory; the last registration wins. */
    registerDocFetcherFactory(factory: () => DocFetcher): void;
    /** Returns the registered doc fetcher factory, when there is one. */
    getDocFetcherFactory(): (() => DocFetcher) | undefined;
    /** Registers source definitions; they are appended to those already present. */
    registerSourceDefs(sources: Array<SourceEntry>): void;
    /** Returns every registered source definition. */
    getSourceDefs(): Array<SourceEntry>;
    /** Registers a feature registry; it is merged by feature ID with the existing one. */
    registerFeatureDefs(features: FeatureRegistry): void;
    /** Returns the registered feature registry, when there is one. */
    getFeatureDefs(): FeatureRegistry | undefined;
}
|
|
215
|
+
/**
|
|
216
|
+
* In-memory plugin registry implementation.
|
|
217
|
+
*/
|
|
218
|
+
export declare class InMemoryPluginRegistry implements PluginRegistry {
    // Internal state (types are erased for private members in declarations).
    private readonly modes;
    private readonly assertions_;
    private readonly rubricTemplates_;
    private readonly fixtureResolvers_;
    private readonly reportSinks_;
    private readonly modeBases_;
    private readonly presets_;
    private promptTemplates_;
    private scoringProfiles_;
    private docFetcherFactory_;
    private sourceDefs_;
    private featureDefs_;

    // Presets and mode bases.
    registerPreset(preset: PresetDefinition): void;
    getPresets(): Array<PresetDefinition>;
    registerModeBase(base: ModeBase): void;
    getModeBase(modeId: string): ModeBase | undefined;
    getModeBases(): Array<ModeBase>;

    // Individual extension points.
    registerMode(mode: ModeRegistration): void;
    getModes(): Array<ModeRegistration>;
    getMode(id: string): ModeRegistration | undefined;
    registerAssertion(assertion: AssertionRegistration): void;
    getAssertions(): Array<AssertionRegistration>;
    registerRubricTemplate(template: RubricTemplateRegistration): void;
    getRubricTemplates(): Array<RubricTemplateRegistration>;
    registerFixtureResolver(resolver: FixtureResolverRegistration): void;
    getFixtureResolvers(): Array<FixtureResolverRegistration>;
    registerReportSink(sink: ReportSinkRegistration): void;
    getReportSinks(): Array<ReportSinkRegistration>;

    // Merged configuration.
    registerPromptTemplates(templates: Record<string, PromptTemplate>): void;
    getPromptTemplates(): Record<string, PromptTemplate>;
    registerScoringProfiles(profiles: Record<string, Record<string, number>>): void;
    getScoringProfiles(): Record<string, Record<string, number>>;
    registerDocFetcherFactory(factory: () => DocFetcher): void;
    getDocFetcherFactory(): (() => DocFetcher) | undefined;
    registerSourceDefs(sources: Array<SourceEntry>): void;
    getSourceDefs(): Array<SourceEntry>;
    registerFeatureDefs(features: FeatureRegistry): void;
    getFeatureDefs(): FeatureRegistry | undefined;
}
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Plugin registry — typed extension points for AILF evaluation capabilities.
|
|
3
|
+
*
|
|
4
|
+
* Three-tier architecture:
|
|
5
|
+
* - **Mode bases** define evaluation methodology (rubrics, scoring, prompts)
|
|
6
|
+
* - **Domain presets** target a mode base and add domain config (sources,
|
|
7
|
+
* features, doc fetcher)
|
|
8
|
+
* - **Framework assertions** are generic evaluation primitives available to
|
|
9
|
+
* all modes
|
|
10
|
+
*
|
|
11
|
+
* @see docs/design-docs/architecture-overhaul/extensibility-plugins.md
|
|
12
|
+
*/
|
|
13
|
+
/**
|
|
14
|
+
* In-memory plugin registry implementation.
|
|
15
|
+
*/
|
|
16
|
+
export class InMemoryPluginRegistry {
    // Keyed collections. Registering an entry whose key is already present
    // replaces the earlier entry (Map#set semantics).
    /** Mode registrations, keyed by mode ID. */
    modes = new Map();
    /** Assertion registrations, keyed by assertion type. */
    assertions_ = new Map();
    /** Rubric template registrations, keyed by template ID. */
    rubricTemplates_ = new Map();
    /** Fixture resolver registrations, keyed by URI scheme. */
    fixtureResolvers_ = new Map();
    /** Report sink registrations, keyed by sink ID. */
    reportSinks_ = new Map();
    /** Mode bases, keyed by the ID of their mode. */
    modeBases_ = new Map();
    /** Presets (active, draft and paused), keyed by preset name. */
    presets_ = new Map();
    /** Prompt templates by name; later registrations overwrite by name. */
    promptTemplates_ = {};
    /** Scoring profiles by name; later registrations overwrite by name. */
    scoringProfiles_ = {};
    /** Doc fetcher factory; undefined until one is registered. */
    docFetcherFactory_;
    /** Source definitions, in registration order. */
    sourceDefs_ = [];
    /** Feature registry; undefined until one is registered. */
    featureDefs_;

    /**
     * Registers a preset and, when it is active, everything it bundles.
     *
     * - `archived` presets are ignored completely.
     * - `draft` and `paused` presets are only recorded (visible through
     *   `getPresets()`); none of their extensions are registered, so they
     *   cannot overwrite the doc fetcher, scoring profiles, etc.
     * - Any other status (including none) is treated as active: the preset's
     *   values are layered over the defaults of its mode base and registered.
     *
     * The mode base of an active preset is validated before the preset is
     * recorded, so a failed registration leaves the registry untouched.
     *
     * @param preset - The preset definition to register.
     * @throws {Error} When an active preset targets a mode that has no
     *   registered mode base.
     */
    registerPreset(preset) {
        if (preset.status === "archived") {
            return;
        }
        if (preset.status === "draft" || preset.status === "paused") {
            this.presets_.set(preset.name, preset);
            return;
        }
        const base = this.modeBases_.get(preset.mode);
        if (!base) {
            const available = [...this.modeBases_.keys()].join(", ") || "(none)";
            throw new Error(`Preset "${preset.name}" targets mode "${preset.mode}" ` +
                `but no mode base is registered for it. ` +
                `Available mode bases: ${available}`);
        }
        this.presets_.set(preset.name, preset);
        // The mode itself was registered by registerModeBase(); only the
        // layered defaults and the domain-specific fields remain.
        // Rubric templates: base defaults, overridden by the preset per ID.
        const rubricsById = new Map();
        for (const template of [...(base.rubricTemplates ?? []), ...(preset.rubricTemplates ?? [])]) {
            rubricsById.set(template.id, template);
        }
        for (const template of rubricsById.values()) {
            this.registerRubricTemplate(template);
        }
        // Scoring profiles: base defaults, overridden by the preset per name.
        const profiles = { ...base.scoringProfiles, ...preset.scoringProfiles };
        if (Object.keys(profiles).length > 0) {
            this.registerScoringProfiles(profiles);
        }
        // Prompt templates: base defaults, overridden by the preset per name.
        const prompts = { ...base.promptTemplates, ...preset.promptTemplates };
        if (Object.keys(prompts).length > 0) {
            this.registerPromptTemplates(prompts);
        }
        // Assertions: base first, then preset, so the preset wins per type.
        for (const assertion of [...(base.assertions ?? []), ...(preset.assertions ?? [])]) {
            this.registerAssertion(assertion);
        }
        // Domain-specific fields come from the preset only.
        for (const resolver of preset.fixtureResolvers ?? []) {
            this.registerFixtureResolver(resolver);
        }
        for (const sink of preset.reportSinks ?? []) {
            this.registerReportSink(sink);
        }
        if (preset.docFetcher) {
            this.registerDocFetcherFactory(preset.docFetcher);
        }
        if (preset.sourceDefs) {
            this.registerSourceDefs(preset.sourceDefs);
        }
        if (preset.featureDefs) {
            this.registerFeatureDefs(preset.featureDefs);
        }
    }

    /** Registers a mode under its ID, replacing any mode with the same ID. */
    registerMode(mode) {
        this.modes.set(mode.id, mode);
    }

    /** Registers an assertion under its type, replacing any with the same type. */
    registerAssertion(assertion) {
        this.assertions_.set(assertion.type, assertion);
    }

    /** Registers a rubric template under its ID, replacing any with the same ID. */
    registerRubricTemplate(template) {
        this.rubricTemplates_.set(template.id, template);
    }

    /** Registers a fixture resolver under its scheme, replacing any with the same scheme. */
    registerFixtureResolver(resolver) {
        this.fixtureResolvers_.set(resolver.scheme, resolver);
    }

    /** Registers a report sink under its ID, replacing any with the same ID. */
    registerReportSink(sink) {
        this.reportSinks_.set(sink.id, sink);
    }

    /** @returns A new array holding every registered mode. */
    getModes() {
        return [...this.modes.values()];
    }

    /** @returns The mode registered under `id`, or undefined. */
    getMode(id) {
        return this.modes.get(id);
    }

    /** @returns A new array holding every registered assertion. */
    getAssertions() {
        return [...this.assertions_.values()];
    }

    /** @returns A new array holding every registered rubric template. */
    getRubricTemplates() {
        return [...this.rubricTemplates_.values()];
    }

    /** @returns A new array holding every registered fixture resolver. */
    getFixtureResolvers() {
        return [...this.fixtureResolvers_.values()];
    }

    /** @returns A new array holding every registered report sink. */
    getReportSinks() {
        return [...this.reportSinks_.values()];
    }

    /** @returns A new array holding every recorded preset (active, draft, paused). */
    getPresets() {
        return [...this.presets_.values()];
    }

    /** Merges `templates` into the registered prompt templates, overwriting by name. */
    registerPromptTemplates(templates) {
        Object.assign(this.promptTemplates_, templates);
    }

    /**
     * @returns The registry's own prompt template object (not a copy), so
     *   changes made to it are visible to the registry.
     */
    getPromptTemplates() {
        return this.promptTemplates_;
    }

    /** Merges `profiles` into the registered scoring profiles, overwriting by name. */
    registerScoringProfiles(profiles) {
        Object.assign(this.scoringProfiles_, profiles);
    }

    /**
     * @returns The registry's own scoring profile object (not a copy), so
     *   changes made to it are visible to the registry.
     */
    getScoringProfiles() {
        return this.scoringProfiles_;
    }

    /** Stores the doc fetcher factory; a later call replaces an earlier one. */
    registerDocFetcherFactory(factory) {
        this.docFetcherFactory_ = factory;
    }

    /** @returns The registered doc fetcher factory, or undefined. */
    getDocFetcherFactory() {
        return this.docFetcherFactory_;
    }

    /** Appends `sources` to the registered source definitions. */
    registerSourceDefs(sources) {
        this.sourceDefs_ = [...this.sourceDefs_, ...sources];
    }

    /** @returns The registry's current source definition array (not a copy). */
    getSourceDefs() {
        return this.sourceDefs_;
    }

    /**
     * Registers a feature registry. The first one is stored as given; later
     * ones are merged into it by feature ID: an incoming feature replaces an
     * existing feature with the same ID, and existing features that are not
     * in the incoming set are kept.
     */
    registerFeatureDefs(features) {
        if (!this.featureDefs_) {
            this.featureDefs_ = features;
            return;
        }
        const byId = new Map(this.featureDefs_.features.map((feature) => [feature.id, feature]));
        for (const feature of features.features) {
            byId.set(feature.id, feature);
        }
        // NOTE(review): only `features` is carried into the merged registry;
        // confirm FeatureRegistry has no other top-level fields to preserve.
        this.featureDefs_ = { features: [...byId.values()] };
    }

    /** @returns The registered feature registry, or undefined. */
    getFeatureDefs() {
        return this.featureDefs_;
    }

    /**
     * Registers a mode base under the ID of its mode and also registers the
     * mode itself, so that `getMode()` finds it.
     */
    registerModeBase(base) {
        this.modeBases_.set(base.mode.id, base);
        this.registerMode(base.mode);
    }

    /** @returns The mode base registered for `modeId`, or undefined. */
    getModeBase(modeId) {
        return this.modeBases_.get(modeId);
    }

    /** @returns A new array holding every registered mode base. */
    getModeBases() {
        return [...this.modeBases_.values()];
    }
}
|