npm - @sanity/ailf - Versions diffs - 0.4.1 → 1.0.0 - Mend

@sanity/ailf 0.4.1 → 1.0.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (290)

package/config/features.ts +23 -0
package/config/models.ts +83 -0
package/config/prompts.ts +16 -0
package/config/rubrics.ts +225 -0
package/config/schedules.ts +47 -0
package/config/sinks.ts +37 -0
package/config/sources.ts +21 -0
package/config/thresholds.ts +61 -0
package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
package/dist/_vendor/ailf-core/config-helpers.js +150 -0
package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
package/dist/_vendor/ailf-core/env-helper.js +45 -0
package/dist/_vendor/ailf-core/examples/index.d.ts +10 -10
package/dist/_vendor/ailf-core/examples/index.js +10 -10
package/dist/_vendor/ailf-core/index.d.ts +3 -0
package/dist/_vendor/ailf-core/index.js +5 -0
package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +32 -31
package/dist/_vendor/ailf-core/schemas/pipeline.js +52 -12
package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
package/dist/_vendor/ailf-core/services/index.js +2 -1
package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
package/dist/_vendor/ailf-core/services/scoring.js +25 -15
package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
package/dist/_vendor/ailf-core/types/index.js +8 -1
package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
package/dist/_vendor/ailf-core/types/trace.js +18 -0
package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
package/dist/_vendor/ailf-shared/index.d.ts +0 -1
package/dist/_vendor/ailf-shared/index.js +0 -1
package/dist/adapters/api-client/build-request.js +14 -13
package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
package/dist/adapters/config-sources/file-config-adapter.js +38 -12
package/dist/adapters/config-sources/index.d.ts +2 -0
package/dist/adapters/config-sources/index.js +1 -0
package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
package/dist/adapters/config-sources/ts-config-loader.js +133 -0
package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
package/dist/adapters/task-sources/composite-task-source.js +1 -1
package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
package/dist/adapters/task-sources/index.d.ts +1 -0
package/dist/adapters/task-sources/index.js +1 -0
package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
package/dist/adapters/task-sources/repo-task-source.js +69 -16
package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
package/dist/adapters/task-sources/task-file-loader.js +83 -0
package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
package/dist/adapters/task-sources/yaml-task-source.js +19 -16
package/dist/cli.js +0 -2
package/dist/commands/baseline.js +4 -1
package/dist/commands/calculate-scores.js +1 -1
package/dist/commands/coverage-audit.js +7 -1
package/dist/commands/explain-handler.js +25 -23
package/dist/commands/fetch-docs.js +3 -2
package/dist/commands/generate-configs.js +1 -1
package/dist/commands/interactive.js +11 -7
package/dist/commands/pipeline-action.d.ts +2 -0
package/dist/commands/pipeline-action.js +16 -6
package/dist/commands/pipeline.d.ts +1 -0
package/dist/commands/pipeline.js +4 -2
package/dist/commands/pr-comment.js +1 -1
package/dist/commands/publish.js +2 -2
package/dist/commands/readiness-report.js +13 -6
package/dist/composition-root.d.ts +1 -1
package/dist/composition-root.js +67 -4
package/dist/orchestration/build-app-context.js +1 -0
package/dist/orchestration/build-step-sequence.js +24 -6
package/dist/orchestration/steps/calculate-scores-step.js +24 -11
package/dist/orchestration/steps/fetch-docs-step.js +6 -4
package/dist/orchestration/steps/gap-analysis-step.js +8 -7
package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
package/dist/orchestration/steps/generate-configs-step.js +245 -51
package/dist/orchestration/steps/grader-consistency-step.js +7 -4
package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
package/dist/orchestration/steps/readiness-step.js +5 -6
package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
package/dist/orchestration/steps/run-eval-step.js +8 -7
package/dist/pipeline/cache.d.ts +1 -1
package/dist/pipeline/cache.js +36 -8
package/dist/pipeline/calculate-scores.d.ts +5 -7
package/dist/pipeline/calculate-scores.js +74 -153
package/dist/pipeline/checks.js +2 -2
package/dist/pipeline/compare.js +8 -8
package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
package/dist/pipeline/compiler/assertion-mapper.js +175 -0
package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
package/dist/pipeline/compiler/config-loader.d.ts +56 -0
package/dist/pipeline/compiler/config-loader.js +111 -0
package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
package/dist/pipeline/compiler/fixture-resolver.js +113 -0
package/dist/pipeline/compiler/hash.d.ts +11 -0
package/dist/pipeline/compiler/hash.js +18 -0
package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
package/dist/pipeline/compiler/ignore-fields.js +113 -0
package/dist/pipeline/compiler/index.d.ts +29 -0
package/dist/pipeline/compiler/index.js +45 -0
package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
package/dist/pipeline/compiler/literacy-bridge.js +172 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
package/dist/pipeline/compiler/presets/index.d.ts +9 -0
package/dist/pipeline/compiler/presets/index.js +8 -0
package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
package/dist/pipeline/compiler/provider-assembler.js +137 -0
package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
package/dist/pipeline/compiler/sandbox/index.js +11 -0
package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
package/dist/pipeline/compiler/scoring-bridge.js +114 -0
package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
package/dist/pipeline/compiler/task-graph-builder.js +291 -0
package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
package/dist/pipeline/compiler/telemetry/index.js +19 -0
package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
package/dist/pipeline/compiler/variable-resolver.js +115 -0
package/dist/pipeline/coverage-audit.d.ts +15 -5
package/dist/pipeline/coverage-audit.js +41 -22
package/dist/pipeline/eval-constants.d.ts +16 -6
package/dist/pipeline/eval-constants.js +25 -4
package/dist/pipeline/eval-fingerprint.d.ts +2 -2
package/dist/pipeline/eval-fingerprint.js +8 -9
package/dist/pipeline/expand-tasks.d.ts +23 -14
package/dist/pipeline/expand-tasks.js +37 -31
package/dist/pipeline/gap-analysis.d.ts +1 -1
package/dist/pipeline/gap-analysis.js +2 -2
package/dist/pipeline/generate-configs.d.ts +22 -4
package/dist/pipeline/generate-configs.js +53 -24
package/dist/pipeline/grader-api.d.ts +3 -3
package/dist/pipeline/grader-api.js +5 -12
package/dist/pipeline/grader-compare-runner.js +20 -27
package/dist/pipeline/grader-comparison.d.ts +4 -8
package/dist/pipeline/grader-comparison.js +11 -17
package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
package/dist/pipeline/grader-consistency-runner.js +18 -21
package/dist/pipeline/grader-consistency.d.ts +6 -10
package/dist/pipeline/grader-consistency.js +13 -32
package/dist/pipeline/grader-sensitivity-runner.js +7 -5
package/dist/pipeline/grader-sensitivity.d.ts +2 -6
package/dist/pipeline/grader-sensitivity.js +10 -10
package/dist/pipeline/grader-validate-runner.js +7 -5
package/dist/pipeline/grader-validation.d.ts +2 -6
package/dist/pipeline/grader-validation.js +14 -22
package/dist/pipeline/map-request-to-config.js +6 -1
package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
package/dist/pipeline/mirror-repo-tasks.js +16 -15
package/dist/pipeline/normalize-mode.d.ts +49 -0
package/dist/pipeline/normalize-mode.js +64 -0
package/dist/pipeline/plan.d.ts +5 -2
package/dist/pipeline/plan.js +134 -78
package/dist/pipeline/pr-comment.js +2 -0
package/dist/pipeline/profile-resolution.d.ts +47 -0
package/dist/pipeline/profile-resolution.js +91 -0
package/dist/pipeline/provenance.d.ts +2 -2
package/dist/pipeline/provenance.js +12 -17
package/dist/pipeline/release-report.js +4 -4
package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
package/dist/pipeline/repo-threshold-evaluator.js +1 -1
package/dist/pipeline/rubric-loader.d.ts +20 -0
package/dist/pipeline/rubric-loader.js +37 -0
package/dist/pipeline/validate.d.ts +4 -4
package/dist/pipeline/validate.js +64 -53
package/dist/schedules/loader.js +18 -8
package/dist/scripts/migrate-task-mode.d.ts +24 -0
package/dist/scripts/migrate-task-mode.js +85 -0
package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
package/dist/scripts/validate-task-sources.d.ts +1 -1
package/dist/scripts/validate-task-sources.js +15 -15
package/dist/sinks/loader.js +5 -7
package/dist/sources.d.ts +7 -7
package/dist/sources.js +22 -24
package/dist/webhook/dispatch.js +2 -1
package/package.json +6 -3
package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
package/tasks/literacy/frameworks.task.ts +128 -0
package/tasks/literacy/functions.task.ts +69 -0
package/tasks/literacy/groq.task.ts +258 -0
package/tasks/literacy/nextjs-live.task.ts +75 -0
package/tasks/literacy/studio-setup.task.ts +131 -0
package/tasks/literacy/visual-editing.task.ts +146 -0
package/config/features.yaml +0 -116
package/config/models.yaml +0 -116
package/config/prompts.yaml +0 -75
package/config/rubrics.yaml +0 -62
package/config/schedules.yaml +0 -43
package/config/sinks.yaml +0 -54
package/config/sources.yaml +0 -51
package/config/thresholds.yaml +0 -49
package/dist/agent-observer/test-imports.d.ts +0 -7
package/dist/agent-observer/test-imports.js +0 -185

package/dist/pipeline/cache.d.ts CHANGED Viewed

@@ -10,7 +10,7 @@
  *
  * Cache invalidation triggers:
  *   - Content change: any input file's content changes → hash changes → miss
- *   - Config change: config/models.yaml, config/sources.yaml, tasks/*.yaml changes → miss
+ *   - Config change: config/models, config/sources, tasks/*.yaml changes → miss
  *   - Manual bypass: --no-cache flag skips all cache lookups
  *   - Cache clear: delete results/cache/ to start fresh
  */

package/dist/pipeline/cache.js CHANGED Viewed

@@ -10,7 +10,7 @@
  *
  * Cache invalidation triggers:
  *   - Content change: any input file's content changes → hash changes → miss
- *   - Config change: config/models.yaml, config/sources.yaml, tasks/*.yaml changes → miss
+ *   - Config change: config/models, config/sources, tasks/*.yaml changes → miss
  *   - Manual bypass: --no-cache flag skips all cache lookups
  *   - Cache clear: delete results/cache/ to start fresh
  */
@@ -18,6 +18,19 @@ import { createHash } from "crypto";
 import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync, } from "fs";
 import { join, resolve } from "path";
 // ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+/** Resolve first existing config file (matches loadConfigFile priority chain) */
+function resolveConfig(rootDir, name) {
+    const r = (f) => resolve(rootDir, f);
+    for (const ext of [".ts", ".js", ".yaml", ".yml", ".json"]) {
+        const p = r(`config/${name}${ext}`);
+        if (existsSync(p))
+            return p;
+    }
+    return undefined;
+}
+// ---------------------------------------------------------------------------
 // Constants
 // ---------------------------------------------------------------------------
 const CACHE_DIR_NAME = "cache";
@@ -79,7 +92,10 @@ export function getStepInputPaths(rootDir, step) {
             const isBaseline = step === "eval-baseline" || step === "eval";
             const isAgentic = step === "eval-agentic" || step === "eval";
             const isObserved = step === "eval-observed" || step === "eval";
-            const paths = [r("config/models.yaml")];
+            const paths = [];
+            const modelsPath = resolveConfig(rootDir, "models");
+            if (modelsPath)
+                paths.push(modelsPath);
             // Config files — only the relevant ones for this mode
             if (isBaseline) {
                 paths.push(r("promptfooconfig.yaml"));
@@ -130,25 +146,37 @@ export function getStepInputPaths(rootDir, step) {
             return paths;
         }
         case "fetch-docs": {
-            // Inputs: config/sources.yaml, config/models.yaml, task files (which contain inline mappings)
-            const paths = [r("config/sources.yaml"), r("config/models.yaml")];
+            // Inputs: config sources + models, task files
+            const paths = [];
+            const sourcesPath = resolveConfig(rootDir, "sources");
+            const modelsPath2 = resolveConfig(rootDir, "models");
+            if (sourcesPath)
+                paths.push(sourcesPath);
+            if (modelsPath2)
+                paths.push(modelsPath2);
             // Include all task files (they define feature areas)
             const tasksDir = r("tasks");
             if (existsSync(tasksDir)) {
                 const taskFiles = readdirSync(tasksDir)
-                    .filter((f) => f.endsWith(".yaml") || f.endsWith(".yml"))
+                    .filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
                     .map((f) => join(tasksDir, f));
                 paths.push(...taskFiles);
             }
             return paths;
         }
         case "generate-configs": {
-            // Inputs: config/models.yaml, config/sources.yaml, all task files
-            const paths = [r("config/models.yaml"), r("config/sources.yaml")];
+            // Inputs: config models + sources, all task files
+            const paths = [];
+            const modelsPath3 = resolveConfig(rootDir, "models");
+            const sourcesPath2 = resolveConfig(rootDir, "sources");
+            if (modelsPath3)
+                paths.push(modelsPath3);
+            if (sourcesPath2)
+                paths.push(sourcesPath2);
             const tasksDir = r("tasks");
             if (existsSync(tasksDir)) {
                 const taskFiles = readdirSync(tasksDir)
-                    .filter((f) => f.endsWith(".yaml") || f.endsWith(".yml"))
+                    .filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
                     .map((f) => join(tasksDir, f));
                 paths.push(...taskFiles);
             }

package/dist/pipeline/calculate-scores.d.ts CHANGED Viewed

@@ -1,9 +1,7 @@
-import type { Logger, TestSummary } from "../_vendor/ailf-core/index.d.ts";
+import { type ActualScoreEntry, type ComponentResult, type Logger, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
 import { type ResolvedSourceConfig } from "../sources.js";
-import { type ActualScoreEntry, type ComponentResult } from "../_vendor/ailf-core/index.d.ts";
 import type { GraderJudgment, PerModelEntry } from "./types.js";
-export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.d.ts";
-export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
+export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
 export interface PromptfooResultsWrapper {
     results: RawTestResult[];
     stats: {
@@ -64,7 +62,7 @@ export interface RawTestResult {
  * @returns Record keyed by model ID, or null if only one model was used
  *          (per-model breakdown is redundant when there's only one model).
  */
-export declare function calculateScoresPerModel(resultsPath: string, weights: Record<string, number>): null | PerModelEntry[];
+export declare function calculateScoresPerModel(resultsPath: string, goldProfile: Record<string, number>, baselineProfile: Record<string, number>): null | PerModelEntry[];
 /**
  * Extract grader judgments (reason text + scores) from evaluation results.
  *
@@ -82,7 +80,7 @@ export declare function extractGraderJudgments(resultsPath: string): GraderJudgm
  *
  * Returns a record keyed by feature area with the composite actual score.
  */
-export declare function scoreAgenticResults(resultsPath: string, weights: Record<string, number>): Record<string, ActualScoreEntry>;
+export declare function scoreAgenticResults(resultsPath: string, profile: Record<string, number>): Record<string, ActualScoreEntry>;
 /**
  * Score agentic results broken down by model.
  *
@@ -90,7 +88,7 @@ export declare function scoreAgenticResults(resultsPath: string, weights: Record
  * producing a map of model → feature → ActualScoreEntry.
  * Used to enrich the per-model breakdown with actual scores in full mode.
  */
-export declare function scoreAgenticResultsPerModel(resultsPath: string, weights: Record<string, number>): Record<string, Record<string, ActualScoreEntry>>;
+export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number>): Record<string, Record<string, ActualScoreEntry>>;
 /** Options for the calculate-scores main() function. */
 export interface CalculateScoresOptions {
     /** Allowed origins for source isolation reporting */

package/dist/pipeline/calculate-scores.js CHANGED Viewed

@@ -8,8 +8,11 @@
  *   Code Correctness   (0–100)  — Is the code idiomatic and correct?
  *   Doc Coverage       (0–100)  — Did docs provide the needed info?
  *
- * Dimensions are combined into a weighted composite (0–100) using weights
- * from config/rubrics.yaml (default: Task×0.50 + Code×0.25 + Docs×0.25).
+ * Dimensions are combined into a weighted composite (0–100) using named
+ * scoring profiles from config/rubrics. Gold (with-docs) entries use
+ * the "default" profile; baseline (without-docs) entries use "output-only"
+ * which excludes doc-coverage (undefined without docs).
+ * See docs/design-docs/named-scoring-profiles.md.
  *
  * Additionally compares with-docs vs without-docs scores to calculate
  * the "Doc Lift" — how much documentation helps vs parametric knowledge.
@@ -26,14 +29,17 @@
  */
 import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
 import { join } from "path";
+import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
 import { calculateCost } from "../agent-observer/pricing.js";
 import { ConsoleLogger } from "../adapters/loggers/index.js";
+import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
 import { checkResultsExist } from "./checks.js";
-import { loadRubricTemplates } from "./expand-tasks.js";
+import { loadRubricTemplates } from "./rubric-loader.js";
+import { resolveProfile } from "./profile-resolution.js";
 import { loadSource } from "../sources.js";
-import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
-import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
-// Re-export pure functions from core for backward compatibility.
+import { LiteracyVariant } from "./normalize-mode.js";
+import { scoreTestGroup } from "./compiler/scoring-bridge.js";
+// Re-export from core for backward compatibility.
 // Existing imports from this file continue to work unchanged.
 export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
 /**
@@ -46,7 +52,7 @@ export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, par
  * @returns Record keyed by model ID, or null if only one model was used
  *          (per-model breakdown is redundant when there's only one model).
  */
-export function calculateScoresPerModel(resultsPath, weights) {
+export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfile) {
     const results = readAndNormalizeResults(resultsPath);
     // Group results by provider
     const byModel = {};
@@ -66,7 +72,7 @@ export function calculateScoresPerModel(resultsPath, weights) {
     }
     const perModel = [];
     for (const [modelId, { label, results: modelResults }] of Object.entries(byModel)) {
-        const scores = scoreResults(modelResults, weights, modelId);
+        const scores = scoreResults(modelResults, goldProfile, baselineProfile, modelId);
         const totalTests = scores.reduce((s, sc) => s + sc.testCount, 0);
         const totalCost = scores.reduce((s, sc) => s + sc.totalCost, 0);
         const avgScore = scores.length > 0
@@ -133,14 +139,8 @@ export function extractGraderJudgments(resultsPath) {
                     // Not JSON — use raw reason string
                 }
             }
-            // Map internal dimension names to hyphenated form
-            const dimensionMap = {
-                codeCorrectness: "code-correctness",
-                docCoverage: "doc-coverage",
-                taskCompletion: "task-completion",
-            };
             judgments.push({
-                dimension: dimensionMap[kind] ?? kind,
+                dimension: kind,
                 modelId,
                 reason,
                 score,
@@ -277,7 +277,7 @@ function aggregateUrlReferences(resultsPath) {
  * verification report.
  */
 function buildSourceVerification(root, source, verificationCtx) {
-    const mode = verificationCtx?.mode ?? "baseline";
+    const mode = verificationCtx?.mode ?? LiteracyVariant.STANDARD;
     const sourceUrl = source?.baseUrl ?? "default";
     const searchMode = verificationCtx?.searchMode;
     const allowedOrigins = verificationCtx?.allowedOrigins;
@@ -318,9 +318,9 @@ function buildSourceVerification(root, source, verificationCtx) {
  * Calculate overall scores (all models combined).
  * This is the original scoring path — backward compatible.
  */
-function calculateScores(resultsPath, weights) {
+function calculateScores(resultsPath, goldProfile, baselineProfile) {
     const results = readAndNormalizeResults(resultsPath);
-    return scoreResults(results, weights);
+    return scoreResults(results, goldProfile, baselineProfile);
 }
 /**
  * Extracts agent behavior summary from a test result's metadata.
@@ -495,13 +495,11 @@ function readAndNormalizeResults(resultsPath, log) {
  * used by both the overall scoring and per-model scoring paths.
  *
  * @param results  Pre-filtered (valid) test results
- * @param weights  Dimension weights from rubrics.yaml
- * @param modelId  Optional model identifier to tag each FeatureScore
+ * @param goldProfile     Weight profile for gold (with-docs) entries
+ * @param baselineProfile Weight profile for baseline (without-docs) entries
+ * @param modelId         Optional model identifier to tag each FeatureScore
  */
-function scoreResults(results, weights, modelId) {
-    const wTask = weights["task-completion"] ?? 0.5;
-    const wCode = weights["code-correctness"] ?? 0.25;
-    const wDoc = weights["doc-coverage"] ?? 0.25;
+function scoreResults(results, goldProfile, baselineProfile, modelId) {
     // Group by feature + docs/no-docs
     const byFeature = {};
     for (const result of results) {
@@ -519,65 +517,28 @@ function scoreResults(results, weights, modelId) {
     }
     const scores = [];
     for (const [feature, data] of Object.entries(byFeature)) {
-        // --- With docs ---
-        let totalTask = 0;
-        let totalCode = 0;
-        let totalDoc = 0;
-        let featureCost = 0;
-        const countWithDocs = data.withDocs.length || 1;
-        for (const test of data.withDocs) {
-            featureCost += test.cost;
-            for (const comp of test.gradingResult.componentResults) {
-                if (comp.assertion?.type !== "llm-rubric") {
-                    continue;
-                }
-                const score = parseRubricScore(comp);
-                const kind = classifyRubric(comp);
-                if (kind === "taskCompletion") {
-                    totalTask += score;
-                }
-                else if (kind === "codeCorrectness") {
-                    totalCode += score;
-                }
-                else if (kind === "docCoverage") {
-                    totalDoc += score;
-                }
-            }
-        }
-        // Per-dimension averages (each 0–100)
-        const avgTask = totalTask / countWithDocs;
-        const avgCode = totalCode / countWithDocs;
-        const avgDoc = totalDoc / countWithDocs;
-        // Weighted composite (0–100)
-        const withDocsTotal = avgTask * wTask + avgCode * wCode + avgDoc * wDoc;
-        // --- Without docs (baseline) ---
-        let baselineTotal = 0;
-        let baselineCount = 0;
-        for (const test of data.withoutDocs) {
-            featureCost += test.cost;
-            for (const comp of test.gradingResult.componentResults) {
-                if (comp.assertion?.type !== "llm-rubric") {
-                    continue;
-                }
-                baselineTotal += parseRubricScore(comp);
-                baselineCount++;
-            }
-        }
-        const withoutDocsScore = baselineCount > 0 ? baselineTotal / baselineCount : 0;
-        const ceilingScore = Math.round(withDocsTotal);
-        const floorScore = Math.round(withoutDocsScore);
+        // --- With docs (gold / ceiling) — scored via 4-tier engine ---
+        const gold = scoreTestGroup(data.withDocs, goldProfile, feature);
+        // --- Without docs (baseline / floor) ---
+        // Uses the baseline profile (e.g. "output-only") which may exclude
+        // dimensions like doc-coverage that are undefined without docs.
+        // See docs/design-docs/named-scoring-profiles.md.
+        const baseline = scoreTestGroup(data.withoutDocs, baselineProfile, feature);
+        const featureCost = gold.totalCost + baseline.totalCost;
+        const ceilingScore = gold.composite;
+        const floorScore = baseline.composite;
         const docLift = ceilingScore - floorScore;
         const featureScore = {
             ceilingScore,
-            codeCorrectness: Math.round(avgCode),
-            docCoverage: Math.round(avgDoc),
+            codeCorrectness: gold.dimensions.codeCorrectness ?? 0,
+            docCoverage: gold.dimensions.docCoverage ?? 0,
             docLift,
             docQualityGap: 100 - ceilingScore,
             feature,
             floorScore,
             ...(modelId && { modelId }),
             negativeDocLift: docLift < 0,
-            taskCompletion: Math.round(avgTask),
+            taskCompletion: gold.dimensions.taskCompletion ?? 0,
             testCount: data.withDocs.length,
             totalCost: featureCost,
             totalScore: ceilingScore,
@@ -597,11 +558,8 @@ function scoreResults(results, weights, modelId) {
  * Returns a record keyed by feature area with the composite actual score.
  */
 // ActualScoreEntry — imported from @sanity/ailf-core via pipeline/types.js
-export function scoreAgenticResults(resultsPath, weights) {
+export function scoreAgenticResults(resultsPath, profile) {
     const results = readAndNormalizeResults(resultsPath);
-    const wTask = weights["task-completion"] ?? 0.5;
-    const wCode = weights["code-correctness"] ?? 0.25;
-    const wDoc = weights["doc-coverage"] ?? 0.25;
     // Group by feature area
     const byFeature = {};
     for (const result of results) {
@@ -613,37 +571,14 @@ export function scoreAgenticResults(resultsPath, weights) {
     }
     const entries = {};
     for (const [feature, featureResults] of Object.entries(byFeature)) {
-        let totalTask = 0;
-        let totalCode = 0;
-        let totalDoc = 0;
-        let featureCost = 0;
-        const count = featureResults.length || 1;
-        for (const test of featureResults) {
-            featureCost += test.cost;
-            for (const comp of test.gradingResult.componentResults) {
-                if (comp.assertion?.type !== "llm-rubric")
-                    continue;
-                const score = parseRubricScore(comp);
-                const kind = classifyRubric(comp);
-                if (kind === "taskCompletion")
-                    totalTask += score;
-                else if (kind === "codeCorrectness")
-                    totalCode += score;
-                else if (kind === "docCoverage")
-                    totalDoc += score;
-            }
-        }
-        const avgTask = totalTask / count;
-        const avgCode = totalCode / count;
-        const avgDoc = totalDoc / count;
-        const actualScore = Math.round(avgTask * wTask + avgCode * wCode + avgDoc * wDoc);
+        const scored = scoreTestGroup(featureResults, profile, feature);
         entries[feature] = {
-            actualScore,
-            codeCorrectness: Math.round(avgCode),
-            docCoverage: Math.round(avgDoc),
-            taskCompletion: Math.round(avgTask),
+            actualScore: scored.composite,
+            codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
+            docCoverage: scored.dimensions.docCoverage ?? 0,
+            taskCompletion: scored.dimensions.taskCompletion ?? 0,
             testCount: featureResults.length,
-            totalCost: featureCost,
+            totalCost: scored.totalCost,
         };
     }
     return entries;
@@ -655,11 +590,8 @@ export function scoreAgenticResults(resultsPath, weights) {
  * producing a map of model → feature → ActualScoreEntry.
  * Used to enrich the per-model breakdown with actual scores in full mode.
  */
-export function scoreAgenticResultsPerModel(resultsPath, weights) {
+export function scoreAgenticResultsPerModel(resultsPath, profile) {
     const results = readAndNormalizeResults(resultsPath);
-    const wTask = weights["task-completion"] ?? 0.5;
-    const wCode = weights["code-correctness"] ?? 0.25;
-    const wDoc = weights["doc-coverage"] ?? 0.25;
     // Group by model, then feature
     const byModel = {};
     for (const result of results) {
@@ -675,37 +607,14 @@ export function scoreAgenticResultsPerModel(resultsPath, weights) {
     for (const [modelId, features] of Object.entries(byModel)) {
         perModel[modelId] = {};
         for (const [feature, featureResults] of Object.entries(features)) {
-            let totalTask = 0;
-            let totalCode = 0;
-            let totalDoc = 0;
-            let featureCost = 0;
-            const count = featureResults.length || 1;
-            for (const test of featureResults) {
-                featureCost += test.cost;
-                for (const comp of test.gradingResult.componentResults) {
-                    if (comp.assertion?.type !== "llm-rubric")
-                        continue;
-                    const score = parseRubricScore(comp);
-                    const kind = classifyRubric(comp);
-                    if (kind === "taskCompletion")
-                        totalTask += score;
-                    else if (kind === "codeCorrectness")
-                        totalCode += score;
-                    else if (kind === "docCoverage")
-                        totalDoc += score;
-                }
-            }
-            const avgTask = totalTask / count;
-            const avgCode = totalCode / count;
-            const avgDoc = totalDoc / count;
-            const actualScore = Math.round(avgTask * wTask + avgCode * wCode + avgDoc * wDoc);
+            const scored = scoreTestGroup(featureResults, profile, feature);
             perModel[modelId][feature] = {
-                actualScore,
-                codeCorrectness: Math.round(avgCode),
-                docCoverage: Math.round(avgDoc),
-                taskCompletion: Math.round(avgTask),
+                actualScore: scored.composite,
+                codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
+                docCoverage: scored.dimensions.docCoverage ?? 0,
+                taskCompletion: scored.dimensions.taskCompletion ?? 0,
                 testCount: featureResults.length,
-                totalCost: featureCost,
+                totalCost: scored.totalCost,
             };
         }
     }
@@ -743,7 +652,7 @@ export function calculateAndWriteScores(options) {
         }
     }
     // Determine mode — controls which result files are read
-    const mode = options.mode ?? "baseline";
+    const mode = options.mode ?? LiteracyVariant.STANDARD;
     const baselineResultsPath = options.resultsPath ?? join(ROOT, "results", "latest", "eval-results.json");
     // Agentic results path (only used in full mode)
     const agenticResultsPath = join(ROOT, "results", "latest", "eval-results-agentic.json");
@@ -760,10 +669,18 @@ export function calculateAndWriteScores(options) {
     if (source) {
         log.info(`Source: ${sourceName} (${source.baseUrl})`);
     }
-    // Load dimension weights from rubrics.yaml
+    // Load rubric config and resolve scoring profiles per variant.
+    // Gold (with-docs) entries use the "default" profile (3 dimensions).
+    // Baseline (without-docs) entries use "output-only" (2 dimensions,
+    // doc-coverage excluded). See docs/design-docs/named-scoring-profiles.md.
     const rubricConfig = loadRubricTemplates(ROOT);
-    log.debug("Loaded rubric weights", { weights: rubricConfig.weights });
-    const baselineScores = calculateScores(baselineResultsPath, rubricConfig.weights);
+    const goldProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.STANDARD);
+    const baselineProfileWeights = resolveProfile("literacy", LiteracyVariant.STANDARD, rubricConfig, LiteracyVariant.STANDARD);
+    log.debug("Loaded scoring profiles", {
+        gold: goldProfile,
+        baseline: baselineProfileWeights,
+    });
+    const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights);
     log.debug("Baseline scores calculated", {
         featureCount: baselineScores.length,
         features: baselineScores.map((s) => ({
@@ -773,7 +690,7 @@ export function calculateAndWriteScores(options) {
             docLift: s.docLift,
         })),
     });
-    const perModel = calculateScoresPerModel(baselineResultsPath, rubricConfig.weights);
+    const perModel = calculateScoresPerModel(baselineResultsPath, goldProfile, baselineProfileWeights);
     const urlRefs = aggregateUrlReferences(baselineResultsPath);
     const sourceVerification = buildSourceVerification(ROOT, source, {
         allowedOrigins: options.allowedOrigins,
@@ -786,9 +703,10 @@ export function calculateAndWriteScores(options) {
     let agentBehavior = null;
     let sourceIsolation = null;
     let evaluationMode;
-    if (mode === "full" && existsSync(agenticResultsPath)) {
+    if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
         log.info(`\nReading agentic results from: ${agenticResultsPath}`);
-        const agenticScores = scoreAgenticResults(agenticResultsPath, rubricConfig.weights);
+        const agenticProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC);
+        const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile);
         log.debug("Agentic scores calculated", {
             featureCount: Object.keys(agenticScores).length,
             features: Object.entries(agenticScores).map(([f, s]) => ({
@@ -798,10 +716,10 @@ export function calculateAndWriteScores(options) {
             })),
         });
         scores = mergeScores(baselineScores, agenticScores);
-        evaluationMode = "full";
+        evaluationMode = LiteracyVariant.FULL;
         // Merge agentic actual scores into the per-model breakdown
         if (perModel) {
-            const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, rubricConfig.weights);
+            const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, agenticProfile);
             for (const entry of perModel) {
                 const modelAgentic = agenticPerModel[entry.modelId];
                 if (modelAgentic) {
@@ -821,17 +739,20 @@ export function calculateAndWriteScores(options) {
             graderCost.completionTokens += agenticGraderCost.completionTokens;
         }
     }
-    else if (mode === "agentic") {
+    else if (mode === LiteracyVariant.AGENTIC) {
         scores = baselineScores;
         agentBehavior = aggregateAgentBehavior(baselineResultsPath);
         sourceIsolation = aggregateSourceIsolation(baselineResultsPath, options?.allowedOrigins);
-        evaluationMode = "agentic";
+        evaluationMode = LiteracyVariant.AGENTIC;
     }
     else {
         scores = baselineScores;
         agentBehavior = aggregateAgentBehavior(baselineResultsPath);
         sourceIsolation = aggregateSourceIsolation(baselineResultsPath, options?.allowedOrigins);
-        evaluationMode = mode === "observed" ? "observed" : "baseline";
+        evaluationMode =
+            mode === LiteracyVariant.OBSERVED
+                ? LiteracyVariant.OBSERVED
+                : LiteracyVariant.STANDARD;
     }
     const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log);
     // Persist
@@ -842,7 +763,7 @@ export function calculateAndWriteScores(options) {
     // Extract and persist grader judgments (Phase 3a: failure mode extraction)
     const judgments = extractGraderJudgments(baselineResultsPath);
     // In full mode, also extract judgments from agentic results
-    if (mode === "full" && existsSync(agenticResultsPath)) {
+    if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
         const agenticJudgments = extractGraderJudgments(agenticResultsPath);
         judgments.push(...agenticJudgments);
     }

package/dist/pipeline/checks.js CHANGED Viewed

@@ -117,7 +117,7 @@ export function checkGeneratedConfigsExist(rootDir) {
     const baselinePath = resolve(rootDir, "promptfooconfig.yaml");
     if (!existsSync(baselinePath)) {
         issues.push({
-            message: "Baseline config 'promptfooconfig.yaml' not found. Run 'pnpm generate-configs'.",
+            message: "Baseline config 'promptfooconfig.yaml' not found. Run the pipeline to generate it.",
             path: baselinePath,
             severity: "error",
             source: "checkGeneratedConfigsExist",
@@ -131,7 +131,7 @@ export function checkGeneratedConfigsExist(rootDir) {
         const configPath = resolve(rootDir, name);
         if (!existsSync(configPath)) {
             issues.push({
-                message: `Optional config \`${name}\` not found. Run \`pnpm generate-configs\` to create it.`,
+                message: `Optional config \`${name}\` not found. Run the pipeline to generate it.`,
                 path: configPath,
                 severity: "warning",
                 source: "checkGeneratedConfigsExist",

package/dist/pipeline/compare.js CHANGED Viewed

@@ -79,14 +79,14 @@ export function compare(baseline, experiment, options) {
     // Per-dimension average deltas (only for areas present in both summaries)
     const commonAreas = areas.filter((a) => baselineAreas.has(a.area) && experimentAreas.has(a.area));
     const commonCount = commonAreas.length || 1;
-    const perDimension = {
-        codeCorrectness: commonAreas.reduce((s, a) => s + a.dimensions.codeCorrectness.delta, 0) /
-            commonCount,
-        docCoverage: commonAreas.reduce((s, a) => s + a.dimensions.docCoverage.delta, 0) /
-            commonCount,
-        taskCompletion: commonAreas.reduce((s, a) => s + a.dimensions.taskCompletion.delta, 0) /
-            commonCount,
-    };
+    // Collect all dimension keys from area deltas and average each
+    const allDimKeys = new Set(commonAreas.flatMap((a) => Object.keys(a.dimensions)));
+    const perDimension = {};
+    for (const dim of allDimKeys) {
+        perDimension[dim] =
+            commonAreas.reduce((s, a) => s + (a.dimensions[dim]?.delta ?? 0), 0) /
+                commonCount;
+    }
     // Doc Lift average delta (common areas only)
     const docLift = commonAreas.reduce((s, a) => s + a.docLiftDelta, 0) / commonCount;
     // Cost delta (if both summaries have cost data)

package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts ADDED Viewed

@@ -0,0 +1,10 @@
+/**
+ * agent-harness-handler.test.ts — Tests for agent harness mode compilation.
+ *
+ * Tests validation, provider assembly, tool permission resolution,
+ * assertion mapping, sandbox config, lifecycle extensions, and
+ * end-to-end compilation of example tasks.
+ *
+ * Run: npx tsx --test src/pipeline/compiler/__tests__/agent-harness-handler.test.ts
+ */
+export {};