@sanity/ailf 2.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +28 -23
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
- package/dist/_vendor/ailf-core/config-helpers.js +29 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
- package/dist/_vendor/ailf-core/examples/index.js +208 -114
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
- package/dist/_vendor/ailf-tasks/cli.js +61 -0
- package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
- package/dist/_vendor/ailf-tasks/index.js +16 -0
- package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
- package/dist/_vendor/ailf-tasks/parser.js +73 -0
- package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
- package/dist/_vendor/ailf-tasks/schemas.js +180 -0
- package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
- package/dist/_vendor/ailf-tasks/validation.js +162 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +6 -1
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
- package/dist/adapters/task-sources/index.d.ts +1 -2
- package/dist/adapters/task-sources/index.js +1 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
- package/dist/adapters/task-sources/repo-schemas.js +2 -2
- package/dist/adapters/task-sources/repo-task-source.js +1 -1
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
- package/dist/adapters/task-sources/task-file-loader.js +20 -6
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +2 -3
- package/dist/commands/init.js +56 -170
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/composition-root.d.ts +2 -3
- package/dist/composition-root.js +27 -14
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +30 -16
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +50 -15
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +51 -31
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
- package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
- package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
- package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +15 -8
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +15 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/pipeline/mirror-repo-tasks.js +1 -1
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +67 -29
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +24 -24
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
*
|
|
19
19
|
* @see packages/core/src/services/scoring-engine.ts — the 4-tier engine
|
|
20
20
|
* @see packages/eval/src/pipeline/calculate-scores.ts — the consumer
|
|
21
|
-
* @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
|
|
21
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
|
|
22
22
|
*/
|
|
23
23
|
import { aggregateDimensions, computeTaskScore, normalizeScore, } from "../../_vendor/ailf-core/index.js";
|
|
24
24
|
import { classifyRubric, parseRubricScore } from "../../_vendor/ailf-core/index.js";
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* task-bridge.ts — Bidirectional bridge between old TaskDefinition and new LiteracyTaskDefinition.
|
|
3
|
+
*
|
|
4
|
+
* Enables incremental migration: consumers can convert between the two types
|
|
5
|
+
* without changing their internal logic. Once all consumers use
|
|
6
|
+
* GeneralizedTaskDefinition, this module is deleted (Wave 3 task 6).
|
|
7
|
+
*
|
|
8
|
+
* Field mapping (TaskDefinition ↔ LiteracyTaskDefinition):
|
|
9
|
+
* id ↔ id
|
|
10
|
+
* description ↔ title
|
|
11
|
+
* featureArea ↔ area
|
|
12
|
+
* taskPrompt ↔ prompt.text (fallback: prompt.template)
|
|
13
|
+
* canonicalDocs ↔ context.docs
|
|
14
|
+
* referenceSolution ↔ referenceSolution
|
|
15
|
+
* docCoverage ↔ docCoverage
|
|
16
|
+
* assertions ↔ assertions (structurally identical)
|
|
17
|
+
* baseline ↔ baseline (structurally identical)
|
|
18
|
+
* tags ↔ tags
|
|
19
|
+
* status ↔ status
|
|
20
|
+
* extraVars ↔ prompt.vars
|
|
21
|
+
*
|
|
22
|
+
* The assertion and doc-ref sub-types are structurally identical between
|
|
23
|
+
* the old and new type systems, so no field-level remapping is needed
|
|
24
|
+
* for those — only a TypeScript-level cast.
|
|
25
|
+
*/
|
|
26
|
+
import type { LiteracyTaskDefinition, TaskDefinition } from "../../_vendor/ailf-core/index.d.ts";
|
|
27
|
+
/**
|
|
28
|
+
* Convert an old-style TaskDefinition to the new LiteracyTaskDefinition.
|
|
29
|
+
*
|
|
30
|
+
* Every field of TaskDefinition has a corresponding field in LiteracyTaskDefinition,
|
|
31
|
+
* so this conversion is lossless.
|
|
32
|
+
*/
|
|
33
|
+
export declare function toGeneralized(task: TaskDefinition): LiteracyTaskDefinition;
|
|
34
|
+
/**
|
|
35
|
+
* Convert a new LiteracyTaskDefinition to the old TaskDefinition shape.
|
|
36
|
+
*
|
|
37
|
+
* Fields that only exist on LiteracyTaskDefinition (description, difficulty,
|
|
38
|
+
* metadata, rubric, providers, options, context.fixtures, prompt.systemMessage)
|
|
39
|
+
* are dropped — the old type has no place for them.
|
|
40
|
+
*/
|
|
41
|
+
export declare function toLiteracyTask(task: LiteracyTaskDefinition): TaskDefinition;
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* task-bridge.ts — Bidirectional bridge between old TaskDefinition and new LiteracyTaskDefinition.
|
|
3
|
+
*
|
|
4
|
+
* Enables incremental migration: consumers can convert between the two types
|
|
5
|
+
* without changing their internal logic. Once all consumers use
|
|
6
|
+
* GeneralizedTaskDefinition, this module is deleted (Wave 3 task 6).
|
|
7
|
+
*
|
|
8
|
+
* Field mapping (TaskDefinition ↔ LiteracyTaskDefinition):
|
|
9
|
+
* id ↔ id
|
|
10
|
+
* description ↔ title
|
|
11
|
+
* featureArea ↔ area
|
|
12
|
+
* taskPrompt ↔ prompt.text (fallback: prompt.template)
|
|
13
|
+
* canonicalDocs ↔ context.docs
|
|
14
|
+
* referenceSolution ↔ referenceSolution
|
|
15
|
+
* docCoverage ↔ docCoverage
|
|
16
|
+
* assertions ↔ assertions (structurally identical)
|
|
17
|
+
* baseline ↔ baseline (structurally identical)
|
|
18
|
+
* tags ↔ tags
|
|
19
|
+
* status ↔ status
|
|
20
|
+
* extraVars ↔ prompt.vars
|
|
21
|
+
*
|
|
22
|
+
* The assertion and doc-ref sub-types are structurally identical between
|
|
23
|
+
* the old and new type systems, so no field-level remapping is needed
|
|
24
|
+
* for those — only a TypeScript-level cast.
|
|
25
|
+
*/
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
// toGeneralized — old TaskDefinition → LiteracyTaskDefinition
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
/**
 * Map a legacy TaskDefinition onto the LiteracyTaskDefinition shape.
 *
 * Required legacy fields are copied one-to-one under their new names
 * (description → title, featureArea → area, taskPrompt → prompt.text,
 * canonicalDocs → context.docs). Values are passed through by reference;
 * nothing is cloned.
 *
 * @param task - Legacy task object to convert.
 * @returns A new object tagged with `mode: "literacy"`.
 */
export function toGeneralized(task) {
    // `vars` is only attached when the legacy task actually carries extraVars,
    // so the key is absent (not undefined) otherwise.
    const prompt = { text: task.taskPrompt };
    if (task.extraVars != null) {
        prompt.vars = task.extraVars;
    }
    const literacyTask = {
        mode: "literacy",
        id: task.id,
        title: task.description,
        area: task.featureArea,
        prompt,
        context: { docs: task.canonicalDocs },
        referenceSolution: task.referenceSolution,
        docCoverage: task.docCoverage,
        assertions: task.assertions,
    };
    // Optional fields keep the same name on both sides; copy them only when
    // they are neither null nor undefined so a round trip adds no new keys.
    for (const key of ["baseline", "tags", "status"]) {
        if (task[key] != null) {
            literacyTask[key] = task[key];
        }
    }
    return literacyTask;
}
|
|
61
|
+
// ---------------------------------------------------------------------------
|
|
62
|
+
// toLiteracyTask — LiteracyTaskDefinition → old TaskDefinition
|
|
63
|
+
// ---------------------------------------------------------------------------
|
|
64
|
+
/**
 * Map a LiteracyTaskDefinition back onto the legacy TaskDefinition shape.
 *
 * Only the fields the legacy shape knows about are carried over; anything
 * else on the input is ignored. Missing required values fall back to empty
 * defaults ("" / [] / false). The prompt text is taken from `prompt.text`,
 * falling back to `prompt.template` only when `text` is null or undefined.
 *
 * @param task - Literacy task object to convert.
 * @returns A new legacy-shaped task object.
 */
export function toLiteracyTask(task) {
    const prompt = task.prompt;
    const context = task.context;
    const legacyTask = {
        id: task.id,
        description: task.title,
        featureArea: task.area ?? "",
        taskPrompt: prompt?.text ?? prompt?.template ?? "",
        canonicalDocs: context?.docs ?? [],
        referenceSolution: task.referenceSolution ?? "",
        docCoverage: task.docCoverage ?? false,
        assertions: task.assertions ?? [],
    };
    // Optional fields keep the same name on both sides; copy them only when
    // they are neither null nor undefined so a round trip adds no new keys.
    for (const key of ["baseline", "tags", "status"]) {
        if (task[key] != null) {
            legacyTask[key] = task[key];
        }
    }
    // prompt.vars is the new home of the legacy extraVars bag.
    if (prompt?.vars != null) {
        legacyTask.extraVars = prompt.vars;
    }
    return legacyTask;
}
|
|
@@ -11,11 +11,8 @@
|
|
|
11
11
|
* - Validate the graph is a DAG (reject cycles)
|
|
12
12
|
* - Assign execution priority via topological sort
|
|
13
13
|
*
|
|
14
|
-
* This module exists alongside `generate-configs.ts` — it does NOT replace
|
|
15
|
-
* the existing codegen path. Phase 7 will swap callers over to the compiler.
|
|
16
|
-
*
|
|
17
14
|
* @see packages/core/src/types/task-graph.ts — TaskGraph types
|
|
18
|
-
* @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
15
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
19
16
|
*/
|
|
20
17
|
import type { DependencyEdge, FilterOptions, GeneralizedTaskDefinition, TaskGraph, TaskNode } from "../../_vendor/ailf-core/index.d.ts";
|
|
21
18
|
/** Options for building a task graph */
|
|
@@ -11,11 +11,8 @@
|
|
|
11
11
|
* - Validate the graph is a DAG (reject cycles)
|
|
12
12
|
* - Assign execution priority via topological sort
|
|
13
13
|
*
|
|
14
|
-
* This module exists alongside `generate-configs.ts` — it does NOT replace
|
|
15
|
-
* the existing codegen path. Phase 7 will swap callers over to the compiler.
|
|
16
|
-
*
|
|
17
14
|
* @see packages/core/src/types/task-graph.ts — TaskGraph types
|
|
18
|
-
* @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
15
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
19
16
|
*/
|
|
20
17
|
// ---------------------------------------------------------------------------
|
|
21
18
|
// Public API
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* Captures tool calls, token usage, cost, and timing for every evaluation.
|
|
5
5
|
* Full traces go to blob storage; sanitized summaries to Content Lake.
|
|
6
6
|
*
|
|
7
|
-
* @see docs/exec-plans/architecture-overhaul/phase-6-observability.md
|
|
7
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-6-observability.md
|
|
8
8
|
* @see docs/design-docs/architecture-overhaul/observability-telemetry.md
|
|
9
9
|
*/
|
|
10
10
|
export { collectTrace, mergeTraces, type ProviderResponse, type RawToolCall, type TraceCollectorOptions, } from "./trace-collector.js";
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* Captures tool calls, token usage, cost, and timing for every evaluation.
|
|
5
5
|
* Full traces go to blob storage; sanitized summaries to Content Lake.
|
|
6
6
|
*
|
|
7
|
-
* @see docs/exec-plans/architecture-overhaul/phase-6-observability.md
|
|
7
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-6-observability.md
|
|
8
8
|
* @see docs/design-docs/architecture-overhaul/observability-telemetry.md
|
|
9
9
|
*/
|
|
10
10
|
// Trace collection
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
* recorded in provenance for reproducibility tracking.
|
|
13
13
|
*
|
|
14
14
|
* @see docs/design-docs/architecture-overhaul/domain-model.md (VariableEnvelope)
|
|
15
|
-
* @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
15
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
16
16
|
*/
|
|
17
17
|
import type { VariableDeclaration, VariableEnvelope } from "../../_vendor/ailf-core/index.d.ts";
|
|
18
18
|
/** Options for variable resolution */
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
* recorded in provenance for reproducibility tracking.
|
|
13
13
|
*
|
|
14
14
|
* @see docs/design-docs/architecture-overhaul/domain-model.md (VariableEnvelope)
|
|
15
|
-
* @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
15
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
16
16
|
*/
|
|
17
17
|
import { simpleHash } from "./hash.js";
|
|
18
18
|
// ---------------------------------------------------------------------------
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
*
|
|
8
8
|
* Phase 3c of the Scenario Matrix implementation.
|
|
9
9
|
*
|
|
10
|
-
* @see docs/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
10
|
+
* @see docs/archive/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
11
11
|
*/
|
|
12
12
|
import type { Logger, PluginRegistry } from "../_vendor/ailf-core/index.d.ts";
|
|
13
13
|
import type { CoverageAuditReport, ProductFeature } from "./types.js";
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
*
|
|
8
8
|
* Phase 3c of the Scenario Matrix implementation.
|
|
9
9
|
*
|
|
10
|
-
* @see docs/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
10
|
+
* @see docs/archive/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
11
11
|
*/
|
|
12
12
|
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
13
13
|
import { tryLoadConfigFile } from "./compiler/config-loader.js";
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
*
|
|
13
13
|
* These are deterministic, pure functions — no randomness, no side effects.
|
|
14
14
|
*
|
|
15
|
-
* @see docs/exec-plans/grader-reliability.md — Phase 4
|
|
15
|
+
* @see docs/archive/exec-plans/grader-reliability.md — Phase 4
|
|
16
16
|
*/
|
|
17
17
|
/** A degradation targeting a specific scoring dimension */
|
|
18
18
|
export interface Degradation {
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
*
|
|
13
13
|
* These are deterministic, pure functions — no randomness, no side effects.
|
|
14
14
|
*
|
|
15
|
-
* @see docs/exec-plans/grader-reliability.md — Phase 4
|
|
15
|
+
* @see docs/archive/exec-plans/grader-reliability.md — Phase 4
|
|
16
16
|
*/
|
|
17
17
|
// ---------------------------------------------------------------------------
|
|
18
18
|
// Task Completion degradations
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
* When both sources agree, confidence is boosted. When only ceiling
|
|
14
14
|
* signals are available, they serve as a fallback for unclassified cases.
|
|
15
15
|
*
|
|
16
|
-
* @see docs/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
16
|
+
* @see docs/archive/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
17
17
|
*/
|
|
18
18
|
import type { FailureMode, FailureModeReport, FeatureScore, GraderJudgment } from "./types.js";
|
|
19
19
|
/**
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
* When both sources agree, confidence is boosted. When only ceiling
|
|
14
14
|
* signals are available, they serve as a fallback for unclassified cases.
|
|
15
15
|
*
|
|
16
|
-
* @see docs/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
16
|
+
* @see docs/archive/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
17
17
|
*/
|
|
18
18
|
import { detectFeatureArea } from "../_vendor/ailf-core/index.js";
|
|
19
19
|
// ---------------------------------------------------------------------------
|
|
@@ -23,6 +23,7 @@ import { detectFeatureArea } from "../_vendor/ailf-core/index.js";
|
|
|
23
23
|
const CLASSIFICATION_THRESHOLD = 60;
|
|
24
24
|
/** All failure mode types for initializing empty counts */
|
|
25
25
|
const ALL_MODES = [
|
|
26
|
+
"api-error",
|
|
26
27
|
"incorrect-docs",
|
|
27
28
|
"missing-docs",
|
|
28
29
|
"model-limitation",
|
|
@@ -33,6 +34,9 @@ const ALL_MODES = [
|
|
|
33
34
|
// ---------------------------------------------------------------------------
|
|
34
35
|
// Keyword patterns
|
|
35
36
|
// ---------------------------------------------------------------------------
|
|
37
|
+
/** API error pattern — checked FIRST to prevent timeout errors containing
|
|
38
|
+
* "deprecated" from being misclassified as outdated-docs. */
|
|
39
|
+
// Matches transport/provider failures in a grader reason string.
// The HTTP status codes 429 and 503 are guarded by digit-boundary lookarounds
// so that longer numbers ("14290 tokens", "#15031", "1.503 seconds") are not
// mistaken for a rate-limit / service-unavailable status.
const API_ERROR_PATTERN = /\[api-error\]|timeout|timed out|rate limit|(?<![\d.])(?:429|503)(?!\d)|ECONNRESET|ETIMEDOUT|socket hang up|fetch failed/i;
|
|
36
40
|
// Phrases in a grader reason that point at stale documentation.
// Alternatives are listed one per line and joined into a single
// case-insensitive alternation.
const OUTDATED_PATTERN = new RegExp([
    "deprecated",
    "old api",
    "v[0-9]+ syntax",
    "no longer supported",
    "legacy",
    "previous version",
    "outdated",
    "superseded",
    "replaced by",
].join("|"), "i");
|
|
37
41
|
// Phrases in a grader reason that point at absent documentation.
// Alternatives are listed one per line and joined into a single
// case-insensitive alternation.
const MISSING_PATTERN = new RegExp([
    "no documentation",
    "not covered",
    "had to guess",
    "not found",
    "missing.*doc",
    "no.*information",
    "undocumented",
    "couldn't find",
    "without.*documentation",
].join("|"), "i");
|
|
38
42
|
// Phrases in a grader reason that point at factually wrong documentation.
// Alternatives are listed one per line and joined into a single
// case-insensitive alternation.
const INCORRECT_PATTERN = new RegExp([
    "contradicts",
    "incorrect.*doc",
    "doc.*incorrect",
    "wrong.*doc",
    "doc.*wrong",
    "documentation says.*but",
    "factual error",
    "inaccurate",
    "misleading.*doc",
].join("|"), "i");
|
|
@@ -226,6 +230,11 @@ function classifyByCeiling(score, ceilingScore, floorScore) {
|
|
|
226
230
|
}
|
|
227
231
|
/** Classify by keyword matching on the reason text */
|
|
228
232
|
function classifyByKeyword(reason) {
|
|
233
|
+
// API errors checked first — prevents timeout messages containing
|
|
234
|
+
// "deprecated" from being misclassified as outdated-docs.
|
|
235
|
+
if (API_ERROR_PATTERN.test(reason)) {
|
|
236
|
+
return { confidence: "high", mode: "api-error", source: "keyword" };
|
|
237
|
+
}
|
|
229
238
|
if (OUTDATED_PATTERN.test(reason)) {
|
|
230
239
|
return { confidence: "high", mode: "outdated-docs", source: "keyword" };
|
|
231
240
|
}
|
|
@@ -321,6 +330,7 @@ function findTopMode(modes) {
|
|
|
321
330
|
/** Initialize mode counts to zero */
|
|
322
331
|
function initModeCounts() {
|
|
323
332
|
return {
|
|
333
|
+
"api-error": 0,
|
|
324
334
|
"incorrect-docs": 0,
|
|
325
335
|
"missing-docs": 0,
|
|
326
336
|
"model-limitation": 0,
|
|
@@ -332,6 +342,8 @@ function initModeCounts() {
|
|
|
332
342
|
/** Get icon for a failure mode */
|
|
333
343
|
function modeIcon(mode) {
|
|
334
344
|
switch (mode) {
|
|
345
|
+
case "api-error":
|
|
346
|
+
return "⚡";
|
|
335
347
|
case "incorrect-docs":
|
|
336
348
|
return "❌";
|
|
337
349
|
case "missing-docs":
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
* bottleneck dimension to the median of non-bottlenecked dimensions (not 100).
|
|
14
14
|
* This produces realistic estimates rather than theoretical maximums.
|
|
15
15
|
*
|
|
16
|
-
* @see docs/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
16
|
+
* @see docs/archive/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
17
17
|
*/
|
|
18
18
|
import type { FailureModeReport, FeatureScore, GapAnalysisReport, GapEstimate } from "./types.js";
|
|
19
19
|
/**
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
* bottleneck dimension to the median of non-bottlenecked dimensions (not 100).
|
|
14
14
|
* This produces realistic estimates rather than theoretical maximums.
|
|
15
15
|
*
|
|
16
|
-
* @see docs/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
16
|
+
* @see docs/archive/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
17
17
|
*/
|
|
18
18
|
// ---------------------------------------------------------------------------
|
|
19
19
|
// Constants
|
|
@@ -26,6 +26,7 @@ const DEFAULT_WEIGHTS = {
|
|
|
26
26
|
};
|
|
27
27
|
/** Map failure modes to the dimensions they typically bottleneck */
|
|
28
28
|
const MODE_BOTTLENECKS = {
|
|
29
|
+
"api-error": [], // Infrastructure issue, not a docs problem
|
|
29
30
|
"incorrect-docs": ["code-correctness", "task-completion"],
|
|
30
31
|
"missing-docs": ["task-completion", "doc-coverage"],
|
|
31
32
|
"model-limitation": [], // Not a docs problem
|
|
@@ -35,6 +36,7 @@ const MODE_BOTTLENECKS = {
|
|
|
35
36
|
};
|
|
36
37
|
/** Remediation descriptions by failure mode */
|
|
37
38
|
const REMEDIATION_MAP = {
|
|
39
|
+
"api-error": "Check model provider config (timeoutMs, maxRetries), API quotas, and task complexity",
|
|
38
40
|
"incorrect-docs": "Fix factual errors in existing documentation",
|
|
39
41
|
"missing-docs": "Write new documentation for uncovered functionality",
|
|
40
42
|
"model-limitation": "Not a documentation problem — track for model improvement",
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
* the new compiler has been validated in production.
|
|
13
13
|
*
|
|
14
14
|
* @see packages/eval/src/pipeline/compiler/ — the new compiler pipeline
|
|
15
|
-
* @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
|
|
15
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
|
|
16
16
|
*
|
|
17
17
|
* ---
|
|
18
18
|
*
|
|
@@ -31,7 +31,7 @@
|
|
|
31
31
|
* No process.argv parsing. No env var fallbacks. Callers provide typed options.
|
|
32
32
|
*
|
|
33
33
|
* @see config/models.yaml — the central model registry
|
|
34
|
-
* @see docs/exec-plans/eliminate-lib-layer.md
|
|
34
|
+
* @see docs/archive/exec-plans/eliminate-lib-layer.md
|
|
35
35
|
*/
|
|
36
36
|
import { type LiteracyTaskDefinition, type Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
37
37
|
import type { FilterOptions } from "./types.js";
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
* the new compiler has been validated in production.
|
|
13
13
|
*
|
|
14
14
|
* @see packages/eval/src/pipeline/compiler/ — the new compiler pipeline
|
|
15
|
-
* @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
|
|
15
|
+
* @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
|
|
16
16
|
*
|
|
17
17
|
* ---
|
|
18
18
|
*
|
|
@@ -31,14 +31,15 @@
|
|
|
31
31
|
* No process.argv parsing. No env var fallbacks. Callers provide typed options.
|
|
32
32
|
*
|
|
33
33
|
* @see config/models.yaml — the central model registry
|
|
34
|
-
* @see docs/exec-plans/eliminate-lib-layer.md
|
|
34
|
+
* @see docs/archive/exec-plans/eliminate-lib-layer.md
|
|
35
35
|
*/
|
|
36
|
-
import { extractModelName, extractProvider, mergeConfig,
|
|
36
|
+
import { extractModelName, extractProvider, mergeConfig, } from "../_vendor/ailf-core/index.js";
|
|
37
37
|
import { existsSync, readdirSync, writeFileSync } from "fs";
|
|
38
38
|
import { resolve } from "path";
|
|
39
39
|
import { dump } from "js-yaml";
|
|
40
40
|
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
41
41
|
import { loadConfigFile } from "./compiler/config-loader.js";
|
|
42
|
+
import { modelMatchesLiteracyVariant } from "./compiler/mode-bases/literacy.js";
|
|
42
43
|
import { LITERACY_PROMPT_TEMPLATES } from "./compiler/mode-handlers/literacy/index.js";
|
|
43
44
|
import { expandTaskDefinitions, loadAndExpandTasks } from "./expand-tasks.js";
|
|
44
45
|
import { validateModelsYaml } from "./validate.js";
|
|
@@ -135,8 +136,8 @@ const SOURCE_ISOLATION_ASSERT = {
|
|
|
135
136
|
// Config generators
|
|
136
137
|
// ---------------------------------------------------------------------------
|
|
137
138
|
function generateAgenticConfig(models, tests, prompts, source, searchMode, allowedOrigins) {
|
|
138
|
-
const naiveModels = models.models.filter((m) =>
|
|
139
|
-
const optimizedModels = models.models.filter((m) =>
|
|
139
|
+
const naiveModels = models.models.filter((m) => modelMatchesLiteracyVariant(m, "agentic-naive"));
|
|
140
|
+
const optimizedModels = models.models.filter((m) => modelMatchesLiteracyVariant(m, "agentic-optimized"));
|
|
140
141
|
const providers = [];
|
|
141
142
|
// Build doc source config to inject into providers
|
|
142
143
|
const resolvedSearchMode = searchMode ?? "open";
|
|
@@ -170,6 +171,7 @@ function generateAgenticConfig(models, tests, prompts, source, searchMode, allow
|
|
|
170
171
|
model: modelName,
|
|
171
172
|
provider,
|
|
172
173
|
}),
|
|
174
|
+
...(model.timeoutMs ? { timeoutMs: model.timeoutMs } : {}),
|
|
173
175
|
...sourceConfig,
|
|
174
176
|
observe: true,
|
|
175
177
|
observerOptions: models.defaults.observerOptions ?? {},
|
|
@@ -189,6 +191,7 @@ function generateAgenticConfig(models, tests, prompts, source, searchMode, allow
|
|
|
189
191
|
model: modelName,
|
|
190
192
|
provider,
|
|
191
193
|
}),
|
|
194
|
+
...(model.timeoutMs ? { timeoutMs: model.timeoutMs } : {}),
|
|
192
195
|
...sourceConfig,
|
|
193
196
|
observe: true,
|
|
194
197
|
observerOptions: models.defaults.observerOptions ?? {},
|
|
@@ -220,9 +223,12 @@ function generateAgenticConfig(models, tests, prompts, source, searchMode, allow
|
|
|
220
223
|
};
|
|
221
224
|
}
|
|
222
225
|
function generateBaselineConfig(models, tests, prompts) {
|
|
223
|
-
const baselineModels = models.models.filter((m) =>
|
|
226
|
+
const baselineModels = models.models.filter((m) => modelMatchesLiteracyVariant(m, "baseline"));
|
|
224
227
|
const providers = baselineModels.map((model) => ({
|
|
225
|
-
config:
|
|
228
|
+
config: {
|
|
229
|
+
...mergeConfig(models.defaults, model.config),
|
|
230
|
+
...(model.timeoutMs ? { timeoutMs: model.timeoutMs } : {}),
|
|
231
|
+
},
|
|
226
232
|
id: model.id,
|
|
227
233
|
label: model.label,
|
|
228
234
|
}));
|
|
@@ -246,12 +252,13 @@ function generateBaselineConfig(models, tests, prompts) {
|
|
|
246
252
|
};
|
|
247
253
|
}
|
|
248
254
|
function generateObservedConfig(models, tests, prompts) {
|
|
249
|
-
const observedModels = models.models.filter((m) =>
|
|
255
|
+
const observedModels = models.models.filter((m) => modelMatchesLiteracyVariant(m, "observed"));
|
|
250
256
|
const providers = observedModels.map((model) => {
|
|
251
257
|
const modelName = extractModelName(model.id);
|
|
252
258
|
return {
|
|
253
259
|
config: {
|
|
254
260
|
...mergeConfig(models.defaults, model.config),
|
|
261
|
+
...(model.timeoutMs ? { timeoutMs: model.timeoutMs } : {}),
|
|
255
262
|
modelName,
|
|
256
263
|
observe: true,
|
|
257
264
|
recordOptions: models.defaults.observerOptions ?? {},
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* Migrated from lib/grader-compare.ts — no process.argv, no process.exit(),
|
|
11
11
|
* no module-level constants. Accepts rootDir as parameter.
|
|
12
12
|
*
|
|
13
|
-
* @see docs/exec-plans/grader-reliability.md — Phase 3
|
|
13
|
+
* @see docs/archive/exec-plans/grader-reliability.md — Phase 3
|
|
14
14
|
*/
|
|
15
15
|
import type { Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
16
16
|
import { type GraderComparison } from "./grader-comparison.js";
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* Migrated from lib/grader-compare.ts — no process.argv, no process.exit(),
|
|
11
11
|
* no module-level constants. Accepts rootDir as parameter.
|
|
12
12
|
*
|
|
13
|
-
* @see docs/exec-plans/grader-reliability.md — Phase 3
|
|
13
|
+
* @see docs/archive/exec-plans/grader-reliability.md — Phase 3
|
|
14
14
|
*/
|
|
15
15
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
16
16
|
import { join } from "path";
|
|
@@ -41,6 +41,12 @@ function classifyDimension(component) {
|
|
|
41
41
|
}
|
|
42
42
|
function detectFeatureArea(description) {
|
|
43
43
|
const desc = description.toLowerCase();
|
|
44
|
+
if (desc.includes("portable text"))
|
|
45
|
+
return "portable-text";
|
|
46
|
+
if (desc.includes("content lake"))
|
|
47
|
+
return "content-lake";
|
|
48
|
+
if (desc.includes("image handling") || desc.includes("image asset"))
|
|
49
|
+
return "image-handling";
|
|
44
50
|
if (desc.includes("studio"))
|
|
45
51
|
return "studio-setup";
|
|
46
52
|
if (desc.includes("visual") ||
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* This module has NO side effects — no file I/O, no API calls.
|
|
11
11
|
* It operates on pre-collected data only.
|
|
12
12
|
*
|
|
13
|
-
* @see docs/exec-plans/grader-reliability.md — Phase 3
|
|
13
|
+
* @see docs/archive/exec-plans/grader-reliability.md — Phase 3
|
|
14
14
|
*/
|
|
15
15
|
import type { ComparisonReport, ScoreSummary } from "./types.js";
|
|
16
16
|
/** Per-dimension comparison between two graders */
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* This module has NO side effects — no file I/O, no API calls.
|
|
11
11
|
* It operates on pre-collected data only.
|
|
12
12
|
*
|
|
13
|
-
* @see docs/exec-plans/grader-reliability.md — Phase 3
|
|
13
|
+
* @see docs/archive/exec-plans/grader-reliability.md — Phase 3
|
|
14
14
|
*/
|
|
15
15
|
import { compare } from "./compare.js";
|
|
16
16
|
import { pearsonCorrelation } from "./grader-validation.js";
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
* Migrated from lib/grader-consistency.ts — no process.argv, no process.exit(),
|
|
13
13
|
* no module-level constants.
|
|
14
14
|
*
|
|
15
|
-
* @see docs/exec-plans/grader-reliability.md — Phase 1
|
|
15
|
+
* @see docs/archive/exec-plans/grader-reliability.md — Phase 1
|
|
16
16
|
*/
|
|
17
17
|
import { type Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
18
18
|
import type { RawPromptfooFile } from "./calculate-scores.js";
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
* Migrated from lib/grader-consistency.ts — no process.argv, no process.exit(),
|
|
13
13
|
* no module-level constants.
|
|
14
14
|
*
|
|
15
|
-
* @see docs/exec-plans/grader-reliability.md — Phase 1
|
|
15
|
+
* @see docs/archive/exec-plans/grader-reliability.md — Phase 1
|
|
16
16
|
*/
|
|
17
17
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
18
18
|
import { join } from "path";
|
|
@@ -44,6 +44,12 @@ function classifyDimension(component) {
|
|
|
44
44
|
// ---------------------------------------------------------------------------
|
|
45
45
|
function detectFeatureArea(description) {
|
|
46
46
|
const desc = description.toLowerCase();
|
|
47
|
+
if (desc.includes("portable text"))
|
|
48
|
+
return "portable-text";
|
|
49
|
+
if (desc.includes("content lake"))
|
|
50
|
+
return "content-lake";
|
|
51
|
+
if (desc.includes("image handling") || desc.includes("image asset"))
|
|
52
|
+
return "image-handling";
|
|
47
53
|
if (desc.includes("studio"))
|
|
48
54
|
return "studio-setup";
|
|
49
55
|
if (desc.includes("visual") ||
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* This module has NO side effects — no file I/O, no API calls.
|
|
11
11
|
* It operates on pre-collected data only.
|
|
12
12
|
*
|
|
13
|
-
* @see docs/exec-plans/grader-reliability.md — Phase 1
|
|
13
|
+
* @see docs/archive/exec-plans/grader-reliability.md — Phase 1
|
|
14
14
|
*/
|
|
15
15
|
/** Per-dimension consistency aggregates */
|
|
16
16
|
export interface DimensionConsistency {
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* This module has NO side effects — no file I/O, no API calls.
|
|
11
11
|
* It operates on pre-collected data only.
|
|
12
12
|
*
|
|
13
|
-
* @see docs/exec-plans/grader-reliability.md — Phase 1
|
|
13
|
+
* @see docs/archive/exec-plans/grader-reliability.md — Phase 1
|
|
14
14
|
*/
|
|
15
15
|
// ---------------------------------------------------------------------------
|
|
16
16
|
// Pure computation
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
* Migrated from lib/grader-sensitivity.ts — no process.argv, no process.exit(),
|
|
12
12
|
* no module-level constants. Accepts rootDir as parameter.
|
|
13
13
|
*
|
|
14
|
-
* @see docs/exec-plans/grader-reliability.md — Phase 4
|
|
14
|
+
* @see docs/archive/exec-plans/grader-reliability.md — Phase 4
|
|
15
15
|
*/
|
|
16
16
|
import type { Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
17
17
|
import { type GraderSensitivityResult } from "./grader-sensitivity.js";
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
* Migrated from lib/grader-sensitivity.ts — no process.argv, no process.exit(),
|
|
12
12
|
* no module-level constants. Accepts rootDir as parameter.
|
|
13
13
|
*
|
|
14
|
-
* @see docs/exec-plans/grader-reliability.md — Phase 4
|
|
14
|
+
* @see docs/archive/exec-plans/grader-reliability.md — Phase 4
|
|
15
15
|
*/
|
|
16
16
|
import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync, } from "fs";
|
|
17
17
|
import { basename, join } from "path";
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
*
|
|
12
12
|
* This module has NO side effects — no file I/O, no API calls.
|
|
13
13
|
*
|
|
14
|
-
* @see docs/exec-plans/grader-reliability.md — Phase 4
|
|
14
|
+
* @see docs/archive/exec-plans/grader-reliability.md — Phase 4
|
|
15
15
|
*/
|
|
16
16
|
/** Sensitivity broken down by degradation type */
|
|
17
17
|
export interface DegradationSensitivity {
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
*
|
|
12
12
|
* This module has NO side effects — no file I/O, no API calls.
|
|
13
13
|
*
|
|
14
|
-
* @see docs/exec-plans/grader-reliability.md — Phase 4
|
|
14
|
+
* @see docs/archive/exec-plans/grader-reliability.md — Phase 4
|
|
15
15
|
*/
|
|
16
16
|
// ---------------------------------------------------------------------------
|
|
17
17
|
// Pure computation
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
* Migrated from lib/grader-validate.ts — no process.argv, no process.exit(),
|
|
12
12
|
* no module-level constants. Accepts rootDir as parameter.
|
|
13
13
|
*
|
|
14
|
-
* @see docs/exec-plans/grader-reliability.md — Phase 2
|
|
14
|
+
* @see docs/archive/exec-plans/grader-reliability.md — Phase 2
|
|
15
15
|
*/
|
|
16
16
|
import type { Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
17
17
|
import { type GraderValidation } from "./grader-validation.js";
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
* Migrated from lib/grader-validate.ts — no process.argv, no process.exit(),
|
|
12
12
|
* no module-level constants. Accepts rootDir as parameter.
|
|
13
13
|
*
|
|
14
|
-
* @see docs/exec-plans/grader-reliability.md — Phase 2
|
|
14
|
+
* @see docs/archive/exec-plans/grader-reliability.md — Phase 2
|
|
15
15
|
*/
|
|
16
16
|
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
|
|
17
17
|
import { join } from "path";
|
|
@@ -43,7 +43,7 @@ function loadReferenceGrades(rootDir) {
|
|
|
43
43
|
if (!existsSync(refsDir)) {
|
|
44
44
|
throw new Error(`Reference grades directory not found: ${refsDir}. ` +
|
|
45
45
|
"Create canonical/grader-references/ with YAML reference files. " +
|
|
46
|
-
"See docs/exec-plans/grader-reliability.md — Phase 2.");
|
|
46
|
+
"See docs/archive/exec-plans/grader-reliability.md — Phase 2.");
|
|
47
47
|
}
|
|
48
48
|
const files = readdirSync(refsDir)
|
|
49
49
|
.filter((f) => f.endsWith(".yaml") || f.endsWith(".yml"))
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
*
|
|
12
12
|
* This module has NO side effects — no file I/O, no API calls.
|
|
13
13
|
*
|
|
14
|
-
* @see docs/exec-plans/grader-reliability.md — Phase 2
|
|
14
|
+
* @see docs/archive/exec-plans/grader-reliability.md — Phase 2
|
|
15
15
|
*/
|
|
16
16
|
/** Quality label for a correlation value */
|
|
17
17
|
export type CorrelationQuality = "excellent" | "good" | "moderate" | "poor" | "very-poor";
|