npm - @sanity/ailf - Versions diffs - 0.5.0 → 1.0.0 - Mend

@sanity/ailf 0.5.0 → 1.0.0

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.

Files changed (288)

package/config/features.ts +23 -0
package/config/models.ts +83 -0
package/config/prompts.ts +16 -0
package/config/rubrics.ts +225 -0
package/config/schedules.ts +47 -0
package/config/sinks.ts +37 -0
package/config/sources.ts +21 -0
package/config/thresholds.ts +61 -0
package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
package/dist/_vendor/ailf-core/config-helpers.js +150 -0
package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
package/dist/_vendor/ailf-core/env-helper.js +45 -0
package/dist/_vendor/ailf-core/index.d.ts +3 -0
package/dist/_vendor/ailf-core/index.js +5 -0
package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -29
package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -8
package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
package/dist/_vendor/ailf-core/services/index.js +2 -1
package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
package/dist/_vendor/ailf-core/services/scoring.js +25 -15
package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
package/dist/_vendor/ailf-core/types/index.js +8 -1
package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
package/dist/_vendor/ailf-core/types/trace.js +18 -0
package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
package/dist/_vendor/ailf-shared/index.d.ts +0 -1
package/dist/_vendor/ailf-shared/index.js +0 -1
package/dist/adapters/api-client/build-request.js +14 -13
package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
package/dist/adapters/config-sources/file-config-adapter.js +38 -12
package/dist/adapters/config-sources/index.d.ts +2 -0
package/dist/adapters/config-sources/index.js +1 -0
package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
package/dist/adapters/config-sources/ts-config-loader.js +133 -0
package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
package/dist/adapters/task-sources/composite-task-source.js +1 -1
package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
package/dist/adapters/task-sources/index.d.ts +1 -0
package/dist/adapters/task-sources/index.js +1 -0
package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
package/dist/adapters/task-sources/repo-task-source.js +69 -16
package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
package/dist/adapters/task-sources/task-file-loader.js +83 -0
package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
package/dist/adapters/task-sources/yaml-task-source.js +19 -16
package/dist/cli.js +0 -2
package/dist/commands/baseline.js +4 -1
package/dist/commands/calculate-scores.js +1 -1
package/dist/commands/coverage-audit.js +7 -1
package/dist/commands/explain-handler.js +25 -23
package/dist/commands/fetch-docs.js +3 -2
package/dist/commands/generate-configs.js +1 -1
package/dist/commands/interactive.js +11 -7
package/dist/commands/pipeline-action.d.ts +2 -0
package/dist/commands/pipeline-action.js +16 -6
package/dist/commands/pipeline.d.ts +1 -0
package/dist/commands/pipeline.js +4 -2
package/dist/commands/pr-comment.js +1 -1
package/dist/commands/publish.js +2 -2
package/dist/commands/readiness-report.js +13 -6
package/dist/composition-root.d.ts +1 -1
package/dist/composition-root.js +67 -4
package/dist/orchestration/build-app-context.js +1 -0
package/dist/orchestration/build-step-sequence.js +24 -6
package/dist/orchestration/steps/calculate-scores-step.js +24 -11
package/dist/orchestration/steps/fetch-docs-step.js +6 -4
package/dist/orchestration/steps/gap-analysis-step.js +8 -7
package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
package/dist/orchestration/steps/generate-configs-step.js +245 -51
package/dist/orchestration/steps/grader-consistency-step.js +7 -4
package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
package/dist/orchestration/steps/readiness-step.js +5 -6
package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
package/dist/orchestration/steps/run-eval-step.js +8 -7
package/dist/pipeline/cache.d.ts +1 -1
package/dist/pipeline/cache.js +36 -8
package/dist/pipeline/calculate-scores.d.ts +2 -4
package/dist/pipeline/calculate-scores.js +43 -113
package/dist/pipeline/checks.js +2 -2
package/dist/pipeline/compare.js +8 -8
package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
package/dist/pipeline/compiler/assertion-mapper.js +175 -0
package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
package/dist/pipeline/compiler/config-loader.d.ts +56 -0
package/dist/pipeline/compiler/config-loader.js +111 -0
package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
package/dist/pipeline/compiler/fixture-resolver.js +113 -0
package/dist/pipeline/compiler/hash.d.ts +11 -0
package/dist/pipeline/compiler/hash.js +18 -0
package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
package/dist/pipeline/compiler/ignore-fields.js +113 -0
package/dist/pipeline/compiler/index.d.ts +29 -0
package/dist/pipeline/compiler/index.js +45 -0
package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
package/dist/pipeline/compiler/literacy-bridge.js +172 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
package/dist/pipeline/compiler/presets/index.d.ts +9 -0
package/dist/pipeline/compiler/presets/index.js +8 -0
package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
package/dist/pipeline/compiler/provider-assembler.js +137 -0
package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
package/dist/pipeline/compiler/sandbox/index.js +11 -0
package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
package/dist/pipeline/compiler/scoring-bridge.js +114 -0
package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
package/dist/pipeline/compiler/task-graph-builder.js +291 -0
package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
package/dist/pipeline/compiler/telemetry/index.js +19 -0
package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
package/dist/pipeline/compiler/variable-resolver.js +115 -0
package/dist/pipeline/coverage-audit.d.ts +15 -5
package/dist/pipeline/coverage-audit.js +41 -22
package/dist/pipeline/eval-constants.d.ts +16 -6
package/dist/pipeline/eval-constants.js +25 -4
package/dist/pipeline/eval-fingerprint.d.ts +2 -2
package/dist/pipeline/eval-fingerprint.js +8 -9
package/dist/pipeline/expand-tasks.d.ts +19 -10
package/dist/pipeline/expand-tasks.js +34 -28
package/dist/pipeline/gap-analysis.d.ts +1 -1
package/dist/pipeline/gap-analysis.js +2 -2
package/dist/pipeline/generate-configs.d.ts +22 -4
package/dist/pipeline/generate-configs.js +53 -24
package/dist/pipeline/grader-api.d.ts +3 -3
package/dist/pipeline/grader-api.js +5 -12
package/dist/pipeline/grader-compare-runner.js +20 -27
package/dist/pipeline/grader-comparison.d.ts +4 -8
package/dist/pipeline/grader-comparison.js +11 -17
package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
package/dist/pipeline/grader-consistency-runner.js +16 -20
package/dist/pipeline/grader-consistency.d.ts +6 -10
package/dist/pipeline/grader-consistency.js +13 -32
package/dist/pipeline/grader-sensitivity-runner.js +7 -5
package/dist/pipeline/grader-sensitivity.d.ts +2 -6
package/dist/pipeline/grader-sensitivity.js +10 -10
package/dist/pipeline/grader-validate-runner.js +7 -5
package/dist/pipeline/grader-validation.d.ts +2 -6
package/dist/pipeline/grader-validation.js +14 -22
package/dist/pipeline/map-request-to-config.js +6 -1
package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
package/dist/pipeline/mirror-repo-tasks.js +16 -15
package/dist/pipeline/normalize-mode.d.ts +49 -0
package/dist/pipeline/normalize-mode.js +64 -0
package/dist/pipeline/plan.d.ts +5 -2
package/dist/pipeline/plan.js +134 -78
package/dist/pipeline/pr-comment.js +2 -0
package/dist/pipeline/profile-resolution.d.ts +22 -14
package/dist/pipeline/profile-resolution.js +41 -19
package/dist/pipeline/provenance.d.ts +2 -2
package/dist/pipeline/provenance.js +12 -17
package/dist/pipeline/release-report.js +4 -4
package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
package/dist/pipeline/repo-threshold-evaluator.js +1 -1
package/dist/pipeline/rubric-loader.d.ts +20 -0
package/dist/pipeline/rubric-loader.js +37 -0
package/dist/pipeline/validate.d.ts +4 -4
package/dist/pipeline/validate.js +64 -53
package/dist/schedules/loader.js +18 -8
package/dist/scripts/migrate-task-mode.d.ts +24 -0
package/dist/scripts/migrate-task-mode.js +85 -0
package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
package/dist/scripts/validate-task-sources.d.ts +1 -1
package/dist/scripts/validate-task-sources.js +15 -15
package/dist/sinks/loader.js +5 -7
package/dist/sources.d.ts +7 -7
package/dist/sources.js +22 -24
package/dist/webhook/dispatch.js +2 -1
package/package.json +6 -3
package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
package/tasks/literacy/frameworks.task.ts +128 -0
package/tasks/literacy/functions.task.ts +69 -0
package/tasks/literacy/groq.task.ts +258 -0
package/tasks/literacy/nextjs-live.task.ts +75 -0
package/tasks/literacy/studio-setup.task.ts +131 -0
package/tasks/literacy/visual-editing.task.ts +146 -0
package/config/features.yaml +0 -116
package/config/models.yaml +0 -116
package/config/prompts.yaml +0 -75
package/config/rubrics.yaml +0 -81
package/config/schedules.yaml +0 -43
package/config/sinks.yaml +0 -54
package/config/sources.yaml +0 -51
package/config/thresholds.yaml +0 -49
package/dist/agent-observer/test-imports.d.ts +0 -7
package/dist/agent-observer/test-imports.js +0 -185

package/dist/pipeline/cache.d.ts CHANGED Viewed

@@ -10,7 +10,7 @@
  *
  * Cache invalidation triggers:
  *   - Content change: any input file's content changes → hash changes → miss
- *   - Config change: config/models.yaml, config/sources.yaml, tasks/*.yaml changes → miss
+ *   - Config change: config/models, config/sources, tasks/*.yaml changes → miss
  *   - Manual bypass: --no-cache flag skips all cache lookups
  *   - Cache clear: delete results/cache/ to start fresh
  */

package/dist/pipeline/cache.js CHANGED Viewed

@@ -10,7 +10,7 @@
  *
  * Cache invalidation triggers:
  *   - Content change: any input file's content changes → hash changes → miss
- *   - Config change: config/models.yaml, config/sources.yaml, tasks/*.yaml changes → miss
+ *   - Config change: config/models, config/sources, tasks/*.yaml changes → miss
  *   - Manual bypass: --no-cache flag skips all cache lookups
  *   - Cache clear: delete results/cache/ to start fresh
  */
@@ -18,6 +18,19 @@ import { createHash } from "crypto";
 import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync, } from "fs";
 import { join, resolve } from "path";
 // ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+/** Resolve first existing config file (matches loadConfigFile priority chain) */
+function resolveConfig(rootDir, name) {
+    const r = (f) => resolve(rootDir, f);
+    for (const ext of [".ts", ".js", ".yaml", ".yml", ".json"]) {
+        const p = r(`config/${name}${ext}`);
+        if (existsSync(p))
+            return p;
+    }
+    return undefined;
+}
+// ---------------------------------------------------------------------------
 // Constants
 // ---------------------------------------------------------------------------
 const CACHE_DIR_NAME = "cache";
@@ -79,7 +92,10 @@ export function getStepInputPaths(rootDir, step) {
             const isBaseline = step === "eval-baseline" || step === "eval";
             const isAgentic = step === "eval-agentic" || step === "eval";
             const isObserved = step === "eval-observed" || step === "eval";
-            const paths = [r("config/models.yaml")];
+            const paths = [];
+            const modelsPath = resolveConfig(rootDir, "models");
+            if (modelsPath)
+                paths.push(modelsPath);
             // Config files — only the relevant ones for this mode
             if (isBaseline) {
                 paths.push(r("promptfooconfig.yaml"));
@@ -130,25 +146,37 @@ export function getStepInputPaths(rootDir, step) {
             return paths;
         }
         case "fetch-docs": {
-            // Inputs: config/sources.yaml, config/models.yaml, task files (which contain inline mappings)
-            const paths = [r("config/sources.yaml"), r("config/models.yaml")];
+            // Inputs: config sources + models, task files
+            const paths = [];
+            const sourcesPath = resolveConfig(rootDir, "sources");
+            const modelsPath2 = resolveConfig(rootDir, "models");
+            if (sourcesPath)
+                paths.push(sourcesPath);
+            if (modelsPath2)
+                paths.push(modelsPath2);
             // Include all task files (they define feature areas)
             const tasksDir = r("tasks");
             if (existsSync(tasksDir)) {
                 const taskFiles = readdirSync(tasksDir)
-                    .filter((f) => f.endsWith(".yaml") || f.endsWith(".yml"))
+                    .filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
                     .map((f) => join(tasksDir, f));
                 paths.push(...taskFiles);
             }
             return paths;
         }
         case "generate-configs": {
-            // Inputs: config/models.yaml, config/sources.yaml, all task files
-            const paths = [r("config/models.yaml"), r("config/sources.yaml")];
+            // Inputs: config models + sources, all task files
+            const paths = [];
+            const modelsPath3 = resolveConfig(rootDir, "models");
+            const sourcesPath2 = resolveConfig(rootDir, "sources");
+            if (modelsPath3)
+                paths.push(modelsPath3);
+            if (sourcesPath2)
+                paths.push(sourcesPath2);
             const tasksDir = r("tasks");
             if (existsSync(tasksDir)) {
                 const taskFiles = readdirSync(tasksDir)
-                    .filter((f) => f.endsWith(".yaml") || f.endsWith(".yml"))
+                    .filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
                     .map((f) => join(tasksDir, f));
                 paths.push(...taskFiles);
             }

package/dist/pipeline/calculate-scores.d.ts CHANGED Viewed

@@ -1,9 +1,7 @@
-import type { Logger, TestSummary } from "../_vendor/ailf-core/index.d.ts";
+import { type ActualScoreEntry, type ComponentResult, type Logger, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
 import { type ResolvedSourceConfig } from "../sources.js";
-import { type ActualScoreEntry, type ComponentResult } from "../_vendor/ailf-core/index.d.ts";
 import type { GraderJudgment, PerModelEntry } from "./types.js";
-export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.d.ts";
-export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
+export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
 export interface PromptfooResultsWrapper {
     results: RawTestResult[];
     stats: {

package/dist/pipeline/calculate-scores.js CHANGED Viewed

@@ -9,7 +9,7 @@
  *   Doc Coverage       (0–100)  — Did docs provide the needed info?
  *
  * Dimensions are combined into a weighted composite (0–100) using named
- * scoring profiles from config/rubrics.yaml. Gold (with-docs) entries use
+ * scoring profiles from config/rubrics. Gold (with-docs) entries use
  * the "default" profile; baseline (without-docs) entries use "output-only"
  * which excludes doc-coverage (undefined without docs).
  * See docs/design-docs/named-scoring-profiles.md.
@@ -29,15 +29,17 @@
  */
 import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
 import { join } from "path";
+import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
 import { calculateCost } from "../agent-observer/pricing.js";
 import { ConsoleLogger } from "../adapters/loggers/index.js";
+import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
 import { checkResultsExist } from "./checks.js";
-import { loadRubricTemplates } from "./expand-tasks.js";
+import { loadRubricTemplates } from "./rubric-loader.js";
 import { resolveProfile } from "./profile-resolution.js";
 import { loadSource } from "../sources.js";
-import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
-import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
-// Re-export pure functions from core for backward compatibility.
+import { LiteracyVariant } from "./normalize-mode.js";
+import { scoreTestGroup } from "./compiler/scoring-bridge.js";
+// Re-export from core for backward compatibility.
 // Existing imports from this file continue to work unchanged.
 export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
 /**
@@ -137,14 +139,8 @@ export function extractGraderJudgments(resultsPath) {
                     // Not JSON — use raw reason string
                 }
             }
-            // Map internal dimension names to hyphenated form
-            const dimensionMap = {
-                codeCorrectness: "code-correctness",
-                docCoverage: "doc-coverage",
-                taskCompletion: "task-completion",
-            };
             judgments.push({
-                dimension: dimensionMap[kind] ?? kind,
+                dimension: kind,
                 modelId,
                 reason,
                 score,
@@ -281,7 +277,7 @@ function aggregateUrlReferences(resultsPath) {
  * verification report.
  */
 function buildSourceVerification(root, source, verificationCtx) {
-    const mode = verificationCtx?.mode ?? "baseline";
+    const mode = verificationCtx?.mode ?? LiteracyVariant.STANDARD;
     const sourceUrl = source?.baseUrl ?? "default";
     const searchMode = verificationCtx?.searchMode;
     const allowedOrigins = verificationCtx?.allowedOrigins;
@@ -493,62 +489,6 @@ function readAndNormalizeResults(resultsPath, log) {
     }
     return valid;
 }
-/**
- * Accumulate raw dimension scores across an array of test results.
- * Dimension-agnostic: any dimension returned by classifyRubric() is tracked.
- */
-function accumulateDimensions(tests) {
-    const dimensions = {};
-    let totalCost = 0;
-    for (const test of tests) {
-        totalCost += test.cost;
-        for (const comp of test.gradingResult.componentResults) {
-            if (comp.assertion?.type !== "llm-rubric")
-                continue;
-            const score = parseRubricScore(comp);
-            const kind = classifyRubric(comp);
-            if (kind) {
-                dimensions[kind] = (dimensions[kind] ?? 0) + score;
-            }
-        }
-    }
-    return { dimensions, totalCost };
-}
-/**
- * Average accumulated dimension scores by a count.
- * Returns a dimension → average score map.
- */
-function averageDimensions(accumulated, count) {
-    const avg = {};
-    for (const [dim, total] of Object.entries(accumulated.dimensions)) {
-        avg[dim] = total / count;
-    }
-    return avg;
-}
-/**
- * Compute a weighted composite score from dimension averages and a profile.
- * Only dimensions present in the profile contribute to the composite.
- * Dimensions not in the profile are ignored (e.g., doc-coverage on baseline).
- *
- * The profile maps camelCase dimension names (as returned by classifyRubric)
- * to kebab-case keys (as used in rubrics.yaml). This function handles the
- * mapping internally.
- */
-function weightedComposite(dimensionAverages, profile) {
-    // Map profile keys (kebab-case: "task-completion") to classifyRubric
-    // output (camelCase: "taskCompletion")
-    const kebabToCamel = {
-        "code-correctness": "codeCorrectness",
-        "doc-coverage": "docCoverage",
-        "task-completion": "taskCompletion",
-    };
-    let total = 0;
-    for (const [profileKey, weight] of Object.entries(profile)) {
-        const dimKey = kebabToCamel[profileKey] ?? profileKey;
-        total += (dimensionAverages[dimKey] ?? 0) * weight;
-    }
-    return total;
-}
 /**
  * Core scoring logic: takes a pre-filtered array of TestResult and produces
  * FeatureScore[] grouped by feature area. This is the shared implementation
@@ -577,35 +517,28 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
     }
     const scores = [];
     for (const [feature, data] of Object.entries(byFeature)) {
-        // --- With docs (gold / ceiling) ---
-        const goldDims = accumulateDimensions(data.withDocs);
-        let featureCost = goldDims.totalCost;
-        const countWithDocs = data.withDocs.length || 1;
-        const avgGold = averageDimensions(goldDims, countWithDocs);
-        const withDocsTotal = weightedComposite(avgGold, goldProfile);
+        // --- With docs (gold / ceiling) — scored via 4-tier engine ---
+        const gold = scoreTestGroup(data.withDocs, goldProfile, feature);
         // --- Without docs (baseline / floor) ---
         // Uses the baseline profile (e.g. "output-only") which may exclude
         // dimensions like doc-coverage that are undefined without docs.
         // See docs/design-docs/named-scoring-profiles.md.
-        const baselineDims = accumulateDimensions(data.withoutDocs);
-        featureCost += baselineDims.totalCost;
-        const countWithoutDocs = data.withoutDocs.length || 1;
-        const avgBaseline = averageDimensions(baselineDims, countWithoutDocs);
-        const withoutDocsScore = weightedComposite(avgBaseline, baselineProfile);
-        const ceilingScore = Math.round(withDocsTotal);
-        const floorScore = Math.round(withoutDocsScore);
+        const baseline = scoreTestGroup(data.withoutDocs, baselineProfile, feature);
+        const featureCost = gold.totalCost + baseline.totalCost;
+        const ceilingScore = gold.composite;
+        const floorScore = baseline.composite;
         const docLift = ceilingScore - floorScore;
         const featureScore = {
             ceilingScore,
-            codeCorrectness: Math.round(avgGold.codeCorrectness ?? 0),
-            docCoverage: Math.round(avgGold.docCoverage ?? 0),
+            codeCorrectness: gold.dimensions.codeCorrectness ?? 0,
+            docCoverage: gold.dimensions.docCoverage ?? 0,
             docLift,
             docQualityGap: 100 - ceilingScore,
             feature,
             floorScore,
             ...(modelId && { modelId }),
             negativeDocLift: docLift < 0,
-            taskCompletion: Math.round(avgGold.taskCompletion ?? 0),
+            taskCompletion: gold.dimensions.taskCompletion ?? 0,
             testCount: data.withDocs.length,
             totalCost: featureCost,
             totalScore: ceilingScore,
@@ -638,17 +571,14 @@ export function scoreAgenticResults(resultsPath, profile) {
     }
     const entries = {};
     for (const [feature, featureResults] of Object.entries(byFeature)) {
-        const count = featureResults.length || 1;
-        const accumulated = accumulateDimensions(featureResults);
-        const avg = averageDimensions(accumulated, count);
-        const actualScore = Math.round(weightedComposite(avg, profile));
+        const scored = scoreTestGroup(featureResults, profile, feature);
         entries[feature] = {
-            actualScore,
-            codeCorrectness: Math.round(avg.codeCorrectness ?? 0),
-            docCoverage: Math.round(avg.docCoverage ?? 0),
-            taskCompletion: Math.round(avg.taskCompletion ?? 0),
+            actualScore: scored.composite,
+            codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
+            docCoverage: scored.dimensions.docCoverage ?? 0,
+            taskCompletion: scored.dimensions.taskCompletion ?? 0,
             testCount: featureResults.length,
-            totalCost: accumulated.totalCost,
+            totalCost: scored.totalCost,
         };
     }
     return entries;
@@ -677,17 +607,14 @@ export function scoreAgenticResultsPerModel(resultsPath, profile) {
     for (const [modelId, features] of Object.entries(byModel)) {
         perModel[modelId] = {};
         for (const [feature, featureResults] of Object.entries(features)) {
-            const count = featureResults.length || 1;
-            const accumulated = accumulateDimensions(featureResults);
-            const avg = averageDimensions(accumulated, count);
-            const actualScore = Math.round(weightedComposite(avg, profile));
+            const scored = scoreTestGroup(featureResults, profile, feature);
             perModel[modelId][feature] = {
-                actualScore,
-                codeCorrectness: Math.round(avg.codeCorrectness ?? 0),
-                docCoverage: Math.round(avg.docCoverage ?? 0),
-                taskCompletion: Math.round(avg.taskCompletion ?? 0),
+                actualScore: scored.composite,
+                codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
+                docCoverage: scored.dimensions.docCoverage ?? 0,
+                taskCompletion: scored.dimensions.taskCompletion ?? 0,
                 testCount: featureResults.length,
-                totalCost: accumulated.totalCost,
+                totalCost: scored.totalCost,
             };
         }
     }
@@ -725,7 +652,7 @@ export function calculateAndWriteScores(options) {
         }
     }
     // Determine mode — controls which result files are read
-    const mode = options.mode ?? "baseline";
+    const mode = options.mode ?? LiteracyVariant.STANDARD;
     const baselineResultsPath = options.resultsPath ?? join(ROOT, "results", "latest", "eval-results.json");
     // Agentic results path (only used in full mode)
     const agenticResultsPath = join(ROOT, "results", "latest", "eval-results-agentic.json");
@@ -747,8 +674,8 @@ export function calculateAndWriteScores(options) {
     // Baseline (without-docs) entries use "output-only" (2 dimensions,
     // doc-coverage excluded). See docs/design-docs/named-scoring-profiles.md.
     const rubricConfig = loadRubricTemplates(ROOT);
-    const goldProfile = resolveProfile("baseline", "gold", rubricConfig);
-    const baselineProfileWeights = resolveProfile("baseline", "baseline", rubricConfig);
+    const goldProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.STANDARD);
+    const baselineProfileWeights = resolveProfile("literacy", LiteracyVariant.STANDARD, rubricConfig, LiteracyVariant.STANDARD);
     log.debug("Loaded scoring profiles", {
         gold: goldProfile,
         baseline: baselineProfileWeights,
@@ -776,9 +703,9 @@ export function calculateAndWriteScores(options) {
     let agentBehavior = null;
     let sourceIsolation = null;
     let evaluationMode;
-    if (mode === "full" && existsSync(agenticResultsPath)) {
+    if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
         log.info(`\nReading agentic results from: ${agenticResultsPath}`);
-        const agenticProfile = resolveProfile("agentic", "gold", rubricConfig);
+        const agenticProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC);
         const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile);
         log.debug("Agentic scores calculated", {
             featureCount: Object.keys(agenticScores).length,
@@ -789,7 +716,7 @@ export function calculateAndWriteScores(options) {
             })),
         });
         scores = mergeScores(baselineScores, agenticScores);
-        evaluationMode = "full";
+        evaluationMode = LiteracyVariant.FULL;
         // Merge agentic actual scores into the per-model breakdown
         if (perModel) {
             const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, agenticProfile);
@@ -812,17 +739,20 @@ export function calculateAndWriteScores(options) {
             graderCost.completionTokens += agenticGraderCost.completionTokens;
         }
     }
-    else if (mode === "agentic") {
+    else if (mode === LiteracyVariant.AGENTIC) {
         scores = baselineScores;
         agentBehavior = aggregateAgentBehavior(baselineResultsPath);
         sourceIsolation = aggregateSourceIsolation(baselineResultsPath, options?.allowedOrigins);
-        evaluationMode = "agentic";
+        evaluationMode = LiteracyVariant.AGENTIC;
     }
     else {
         scores = baselineScores;
         agentBehavior = aggregateAgentBehavior(baselineResultsPath);
         sourceIsolation = aggregateSourceIsolation(baselineResultsPath, options?.allowedOrigins);
-        evaluationMode = mode === "observed" ? "observed" : "baseline";
+        evaluationMode =
+            mode === LiteracyVariant.OBSERVED
+                ? LiteracyVariant.OBSERVED
+                : LiteracyVariant.STANDARD;
     }
     const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log);
     // Persist
@@ -833,7 +763,7 @@ export function calculateAndWriteScores(options) {
     // Extract and persist grader judgments (Phase 3a: failure mode extraction)
     const judgments = extractGraderJudgments(baselineResultsPath);
     // In full mode, also extract judgments from agentic results
-    if (mode === "full" && existsSync(agenticResultsPath)) {
+    if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
         const agenticJudgments = extractGraderJudgments(agenticResultsPath);
         judgments.push(...agenticJudgments);
     }

package/dist/pipeline/checks.js CHANGED Viewed

@@ -117,7 +117,7 @@ export function checkGeneratedConfigsExist(rootDir) {
     const baselinePath = resolve(rootDir, "promptfooconfig.yaml");
     if (!existsSync(baselinePath)) {
         issues.push({
-            message: "Baseline config 'promptfooconfig.yaml' not found. Run 'pnpm generate-configs'.",
+            message: "Baseline config 'promptfooconfig.yaml' not found. Run the pipeline to generate it.",
             path: baselinePath,
             severity: "error",
             source: "checkGeneratedConfigsExist",
@@ -131,7 +131,7 @@ export function checkGeneratedConfigsExist(rootDir) {
         const configPath = resolve(rootDir, name);
         if (!existsSync(configPath)) {
             issues.push({
-                message: `Optional config \`${name}\` not found. Run \`pnpm generate-configs\` to create it.`,
+                message: `Optional config \`${name}\` not found. Run the pipeline to generate it.`,
                 path: configPath,
                 severity: "warning",
                 source: "checkGeneratedConfigsExist",

package/dist/pipeline/compare.js CHANGED Viewed

@@ -79,14 +79,14 @@ export function compare(baseline, experiment, options) {
     // Per-dimension average deltas (only for areas present in both summaries)
     const commonAreas = areas.filter((a) => baselineAreas.has(a.area) && experimentAreas.has(a.area));
     const commonCount = commonAreas.length || 1;
-    const perDimension = {
-        codeCorrectness: commonAreas.reduce((s, a) => s + a.dimensions.codeCorrectness.delta, 0) /
-            commonCount,
-        docCoverage: commonAreas.reduce((s, a) => s + a.dimensions.docCoverage.delta, 0) /
-            commonCount,
-        taskCompletion: commonAreas.reduce((s, a) => s + a.dimensions.taskCompletion.delta, 0) /
-            commonCount,
-    };
+    // Collect all dimension keys from area deltas and average each
+    const allDimKeys = new Set(commonAreas.flatMap((a) => Object.keys(a.dimensions)));
+    const perDimension = {};
+    for (const dim of allDimKeys) {
+        perDimension[dim] =
+            commonAreas.reduce((s, a) => s + (a.dimensions[dim]?.delta ?? 0), 0) /
+                commonCount;
+    }
     // Doc Lift average delta (common areas only)
     const docLift = commonAreas.reduce((s, a) => s + a.docLiftDelta, 0) / commonCount;
     // Cost delta (if both summaries have cost data)

package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts ADDED Viewed

@@ -0,0 +1,10 @@
+/**
+ * agent-harness-handler.test.ts — Tests for agent harness mode compilation.
+ *
+ * Tests validation, provider assembly, tool permission resolution,
+ * assertion mapping, sandbox config, lifecycle extensions, and
+ * end-to-end compilation of example tasks.
+ *
+ * Run: npx tsx --test src/pipeline/compiler/__tests__/agent-harness-handler.test.ts
+ */
+export {};