@sanity/ailf 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/features.ts +23 -0
- package/config/models.ts +83 -0
- package/config/prompts.ts +16 -0
- package/config/rubrics.ts +225 -0
- package/config/schedules.ts +47 -0
- package/config/sinks.ts +37 -0
- package/config/sources.ts +21 -0
- package/config/thresholds.ts +61 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
- package/dist/_vendor/ailf-core/config-helpers.js +150 -0
- package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
- package/dist/_vendor/ailf-core/env-helper.js +45 -0
- package/dist/_vendor/ailf-core/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/index.js +5 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
- package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
- package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -29
- package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -8
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
- package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/services/index.js +2 -1
- package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
- package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
- package/dist/_vendor/ailf-core/services/scoring.js +25 -15
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
- package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
- package/dist/_vendor/ailf-core/types/index.js +8 -1
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
- package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
- package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
- package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
- package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
- package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
- package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
- package/dist/_vendor/ailf-core/types/trace.js +18 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
- package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
- package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
- package/dist/_vendor/ailf-shared/index.d.ts +0 -1
- package/dist/_vendor/ailf-shared/index.js +0 -1
- package/dist/adapters/api-client/build-request.js +14 -13
- package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
- package/dist/adapters/config-sources/file-config-adapter.js +38 -12
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
- package/dist/adapters/config-sources/ts-config-loader.js +133 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
- package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
- package/dist/adapters/task-sources/index.d.ts +1 -0
- package/dist/adapters/task-sources/index.js +1 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
- package/dist/adapters/task-sources/repo-task-source.js +69 -16
- package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
- package/dist/adapters/task-sources/task-file-loader.js +83 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
- package/dist/adapters/task-sources/yaml-task-source.js +19 -16
- package/dist/cli.js +0 -2
- package/dist/commands/baseline.js +4 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/coverage-audit.js +7 -1
- package/dist/commands/explain-handler.js +25 -23
- package/dist/commands/fetch-docs.js +3 -2
- package/dist/commands/generate-configs.js +1 -1
- package/dist/commands/interactive.js +11 -7
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +16 -6
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +4 -2
- package/dist/commands/pr-comment.js +1 -1
- package/dist/commands/publish.js +2 -2
- package/dist/commands/readiness-report.js +13 -6
- package/dist/composition-root.d.ts +1 -1
- package/dist/composition-root.js +67 -4
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +24 -6
- package/dist/orchestration/steps/calculate-scores-step.js +24 -11
- package/dist/orchestration/steps/fetch-docs-step.js +6 -4
- package/dist/orchestration/steps/gap-analysis-step.js +8 -7
- package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
- package/dist/orchestration/steps/generate-configs-step.js +245 -51
- package/dist/orchestration/steps/grader-consistency-step.js +7 -4
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/readiness-step.js +5 -6
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
- package/dist/orchestration/steps/run-eval-step.js +8 -7
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/cache.js +36 -8
- package/dist/pipeline/calculate-scores.d.ts +2 -4
- package/dist/pipeline/calculate-scores.js +43 -113
- package/dist/pipeline/checks.js +2 -2
- package/dist/pipeline/compare.js +8 -8
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
- package/dist/pipeline/compiler/assertion-mapper.js +175 -0
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
- package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
- package/dist/pipeline/compiler/config-loader.d.ts +56 -0
- package/dist/pipeline/compiler/config-loader.js +111 -0
- package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
- package/dist/pipeline/compiler/fixture-resolver.js +113 -0
- package/dist/pipeline/compiler/hash.d.ts +11 -0
- package/dist/pipeline/compiler/hash.js +18 -0
- package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
- package/dist/pipeline/compiler/ignore-fields.js +113 -0
- package/dist/pipeline/compiler/index.d.ts +29 -0
- package/dist/pipeline/compiler/index.js +45 -0
- package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
- package/dist/pipeline/compiler/literacy-bridge.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
- package/dist/pipeline/compiler/presets/index.d.ts +9 -0
- package/dist/pipeline/compiler/presets/index.js +8 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
- package/dist/pipeline/compiler/provider-assembler.js +137 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
- package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
- package/dist/pipeline/compiler/sandbox/index.js +11 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
- package/dist/pipeline/compiler/scoring-bridge.js +114 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
- package/dist/pipeline/compiler/task-graph-builder.js +291 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
- package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
- package/dist/pipeline/compiler/telemetry/index.js +19 -0
- package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
- package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
- package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
- package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
- package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
- package/dist/pipeline/compiler/variable-resolver.js +115 -0
- package/dist/pipeline/coverage-audit.d.ts +15 -5
- package/dist/pipeline/coverage-audit.js +41 -22
- package/dist/pipeline/eval-constants.d.ts +16 -6
- package/dist/pipeline/eval-constants.js +25 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -9
- package/dist/pipeline/expand-tasks.d.ts +19 -10
- package/dist/pipeline/expand-tasks.js +34 -28
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +2 -2
- package/dist/pipeline/generate-configs.d.ts +22 -4
- package/dist/pipeline/generate-configs.js +53 -24
- package/dist/pipeline/grader-api.d.ts +3 -3
- package/dist/pipeline/grader-api.js +5 -12
- package/dist/pipeline/grader-compare-runner.js +20 -27
- package/dist/pipeline/grader-comparison.d.ts +4 -8
- package/dist/pipeline/grader-comparison.js +11 -17
- package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
- package/dist/pipeline/grader-consistency-runner.js +16 -20
- package/dist/pipeline/grader-consistency.d.ts +6 -10
- package/dist/pipeline/grader-consistency.js +13 -32
- package/dist/pipeline/grader-sensitivity-runner.js +7 -5
- package/dist/pipeline/grader-sensitivity.d.ts +2 -6
- package/dist/pipeline/grader-sensitivity.js +10 -10
- package/dist/pipeline/grader-validate-runner.js +7 -5
- package/dist/pipeline/grader-validation.d.ts +2 -6
- package/dist/pipeline/grader-validation.js +14 -22
- package/dist/pipeline/map-request-to-config.js +6 -1
- package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
- package/dist/pipeline/mirror-repo-tasks.js +16 -15
- package/dist/pipeline/normalize-mode.d.ts +49 -0
- package/dist/pipeline/normalize-mode.js +64 -0
- package/dist/pipeline/plan.d.ts +5 -2
- package/dist/pipeline/plan.js +134 -78
- package/dist/pipeline/pr-comment.js +2 -0
- package/dist/pipeline/profile-resolution.d.ts +22 -14
- package/dist/pipeline/profile-resolution.js +41 -19
- package/dist/pipeline/provenance.d.ts +2 -2
- package/dist/pipeline/provenance.js +12 -17
- package/dist/pipeline/release-report.js +4 -4
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/rubric-loader.d.ts +20 -0
- package/dist/pipeline/rubric-loader.js +37 -0
- package/dist/pipeline/validate.d.ts +4 -4
- package/dist/pipeline/validate.js +64 -53
- package/dist/schedules/loader.js +18 -8
- package/dist/scripts/migrate-task-mode.d.ts +24 -0
- package/dist/scripts/migrate-task-mode.js +85 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +15 -15
- package/dist/sinks/loader.js +5 -7
- package/dist/sources.d.ts +7 -7
- package/dist/sources.js +22 -24
- package/dist/webhook/dispatch.js +2 -1
- package/package.json +6 -3
- package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
- package/tasks/literacy/frameworks.task.ts +128 -0
- package/tasks/literacy/functions.task.ts +69 -0
- package/tasks/literacy/groq.task.ts +258 -0
- package/tasks/literacy/nextjs-live.task.ts +75 -0
- package/tasks/literacy/studio-setup.task.ts +131 -0
- package/tasks/literacy/visual-editing.task.ts +146 -0
- package/config/features.yaml +0 -116
- package/config/models.yaml +0 -116
- package/config/prompts.yaml +0 -75
- package/config/rubrics.yaml +0 -81
- package/config/schedules.yaml +0 -43
- package/config/sinks.yaml +0 -54
- package/config/sources.yaml +0 -51
- package/config/thresholds.yaml +0 -49
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
package/dist/pipeline/plan.js
CHANGED
|
@@ -9,13 +9,15 @@
|
|
|
9
9
|
*
|
|
10
10
|
* @see docs/exec-plans/execution-preview.md
|
|
11
11
|
*/
|
|
12
|
-
import { existsSync,
|
|
12
|
+
import { existsSync, readdirSync, statSync } from "fs";
|
|
13
13
|
import { resolve } from "path";
|
|
14
|
-
import { load } from "js-yaml";
|
|
15
14
|
import { lookupPricing } from "../agent-observer/pricing.js";
|
|
16
15
|
import { RepoTaskSource } from "../adapters/task-sources/repo-task-source.js";
|
|
16
|
+
import { loadAllTsTaskFiles } from "../adapters/task-sources/task-file-loader.js";
|
|
17
17
|
import { lookupCache } from "./cache.js";
|
|
18
|
-
import {
|
|
18
|
+
import { compileLiteracyTasks } from "./compiler/literacy-bridge.js";
|
|
19
|
+
import { tryLoadConfigFile } from "./compiler/config-loader.js";
|
|
20
|
+
import { LiteracyVariant } from "./normalize-mode.js";
|
|
19
21
|
import { validateConfiguration } from "./validate.js";
|
|
20
22
|
/**
|
|
21
23
|
* Known promptfoo provider prefixes — stripped to get the raw model name.
|
|
@@ -39,38 +41,37 @@ function extractModelName(id) {
|
|
|
39
41
|
return parts.length > 1 ? parts.slice(1).join(":") : id;
|
|
40
42
|
}
|
|
41
43
|
function loadModelsFile(rootDir) {
|
|
42
|
-
const
|
|
43
|
-
|
|
44
|
-
return null;
|
|
45
|
-
try {
|
|
46
|
-
const raw = readFileSync(modelsPath, "utf-8");
|
|
47
|
-
return load(raw);
|
|
48
|
-
}
|
|
49
|
-
catch {
|
|
50
|
-
return null;
|
|
51
|
-
}
|
|
44
|
+
const result = tryLoadConfigFile("models", rootDir);
|
|
45
|
+
return result?.data ?? null;
|
|
52
46
|
}
|
|
53
47
|
/**
|
|
54
|
-
* Map eval mode to the model "modes" array values from models.
|
|
55
|
-
*
|
|
48
|
+
* Map eval mode + variant to the model "modes" array values from models config.
|
|
49
|
+
*
|
|
50
|
+
* Literacy mode uses the variant to determine which model sub-modes match.
|
|
51
|
+
* Non-literacy modes accept all models by default (filtering is done
|
|
52
|
+
* elsewhere for those modes).
|
|
56
53
|
*/
|
|
57
|
-
function modeMatchesModelModes(mode, modelModes) {
|
|
54
|
+
function modeMatchesModelModes(mode, modelModes, variant) {
|
|
58
55
|
if (!modelModes || modelModes.length === 0)
|
|
59
56
|
return true;
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
modelModes.includes("agentic-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
57
|
+
if (mode === "literacy") {
|
|
58
|
+
switch (variant) {
|
|
59
|
+
case LiteracyVariant.AGENTIC:
|
|
60
|
+
return (modelModes.includes("agentic-naive") ||
|
|
61
|
+
modelModes.includes("agentic-optimized"));
|
|
62
|
+
case LiteracyVariant.OBSERVED:
|
|
63
|
+
return modelModes.includes(LiteracyVariant.OBSERVED);
|
|
64
|
+
case LiteracyVariant.FULL:
|
|
65
|
+
return (modelModes.includes(LiteracyVariant.STANDARD) ||
|
|
66
|
+
modelModes.includes("agentic-naive") ||
|
|
67
|
+
modelModes.includes("agentic-optimized"));
|
|
68
|
+
case LiteracyVariant.STANDARD:
|
|
69
|
+
default:
|
|
70
|
+
return modelModes.includes(LiteracyVariant.STANDARD);
|
|
71
|
+
}
|
|
73
72
|
}
|
|
73
|
+
// Non-literacy modes accept all models by default
|
|
74
|
+
return true;
|
|
74
75
|
}
|
|
75
76
|
// ---------------------------------------------------------------------------
|
|
76
77
|
// Cost estimation
|
|
@@ -131,46 +132,103 @@ export async function buildPipelinePlan(opts, rootDir) {
|
|
|
131
132
|
let totalTests = 0;
|
|
132
133
|
let tasks = [];
|
|
133
134
|
let repoTaskCount;
|
|
135
|
+
// -----------------------------------------------------------------------
|
|
136
|
+
// Load and compile tasks — unified path for all modes
|
|
137
|
+
// -----------------------------------------------------------------------
|
|
134
138
|
try {
|
|
135
|
-
const
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
139
|
+
const modelsForCompile = loadModelsFile(rootDir);
|
|
140
|
+
const graderProvider = modelsForCompile?.grader?.id ?? "openai:chat:gpt-4o";
|
|
141
|
+
const modelEntries = (modelsForCompile?.models ?? []).map((m) => ({ id: m.id, label: m.label }));
|
|
142
|
+
// Load *.task.ts files from tasks/<mode>/
|
|
143
|
+
const modeTasksDir = resolve(rootDir, "tasks", opts.mode);
|
|
144
|
+
if (existsSync(modeTasksDir)) {
|
|
145
|
+
const rawTasks = await loadAllTsTaskFiles(modeTasksDir);
|
|
146
|
+
if (rawTasks.length > 0) {
|
|
147
|
+
// Dynamic import of the handler module
|
|
148
|
+
const handlerModulePath = `./compiler/mode-handlers/${opts.mode}-handler.js`;
|
|
149
|
+
const mod = await import(handlerModulePath);
|
|
150
|
+
const handler = mod.handler;
|
|
151
|
+
for (const rawFile of rawTasks) {
|
|
152
|
+
for (const taskDef of rawFile.tasks) {
|
|
153
|
+
const task = taskDef;
|
|
154
|
+
// Apply area/task/tag filter
|
|
155
|
+
if (filter) {
|
|
156
|
+
if (filter.areas?.length &&
|
|
157
|
+
!filter.areas
|
|
158
|
+
.map((a) => a.toLowerCase())
|
|
159
|
+
.includes((task.area ?? "").toLowerCase()))
|
|
160
|
+
continue;
|
|
161
|
+
if (filter.taskIds?.length && !filter.taskIds.includes(task.id))
|
|
162
|
+
continue;
|
|
163
|
+
if (filter.tags?.length &&
|
|
164
|
+
(!task.tags || !task.tags.some((t) => filter.tags.includes(t))))
|
|
165
|
+
continue;
|
|
166
|
+
}
|
|
167
|
+
const result = handler.compileTask(task, {
|
|
168
|
+
rootDir,
|
|
169
|
+
graderProvider,
|
|
170
|
+
models: modelEntries,
|
|
171
|
+
// For literacy mode, pass the variant as evalMode
|
|
172
|
+
...(opts.mode === "literacy"
|
|
173
|
+
? {
|
|
174
|
+
evalMode: opts.variant === LiteracyVariant.AGENTIC
|
|
175
|
+
? LiteracyVariant.AGENTIC
|
|
176
|
+
: LiteracyVariant.STANDARD,
|
|
177
|
+
}
|
|
178
|
+
: {}),
|
|
179
|
+
});
|
|
180
|
+
totalTests += result.tests.length;
|
|
181
|
+
for (const test of result.tests) {
|
|
182
|
+
const desc = typeof test.description === "string"
|
|
183
|
+
? test.description
|
|
184
|
+
: (taskDef.id ?? "unknown");
|
|
185
|
+
const isBaseline = desc.includes("[Baseline]") || desc.endsWith("(baseline)");
|
|
186
|
+
tasks.push({
|
|
187
|
+
description: desc,
|
|
188
|
+
variant: isBaseline
|
|
189
|
+
? LiteracyVariant.STANDARD
|
|
190
|
+
: "gold",
|
|
191
|
+
});
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
}
|
|
147
197
|
}
|
|
148
|
-
catch {
|
|
149
|
-
|
|
198
|
+
catch (err) {
|
|
199
|
+
const detail = err instanceof Error ? err.message : String(err);
|
|
200
|
+
errors.push(`Failed to compile tasks: ${detail}`);
|
|
150
201
|
}
|
|
151
202
|
// Scan repo tasks path for additional task count (preview only)
|
|
152
203
|
if (opts.repoTasksPath) {
|
|
153
204
|
try {
|
|
154
205
|
const repoSource = new RepoTaskSource(opts.repoTasksPath);
|
|
155
|
-
|
|
206
|
+
// Type-narrow to literacy tasks — compileLiteracyTasks accepts LiteracyTaskDefinition[]
|
|
207
|
+
const repoTasks = (await repoSource.loadTasks(filter)).filter((t) => t.mode === "literacy");
|
|
156
208
|
repoTaskCount = repoTasks.length;
|
|
157
209
|
if (repoTaskCount > 0) {
|
|
158
|
-
|
|
159
|
-
const
|
|
160
|
-
const
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
description:
|
|
172
|
-
|
|
173
|
-
|
|
210
|
+
const modelsForCompile = loadModelsFile(rootDir);
|
|
211
|
+
const graderProvider = modelsForCompile?.grader?.id ?? "openai:chat:gpt-4o";
|
|
212
|
+
const compileResult = compileLiteracyTasks(repoTasks, {
|
|
213
|
+
rootDir,
|
|
214
|
+
evalMode: opts.variant === LiteracyVariant.AGENTIC
|
|
215
|
+
? LiteracyVariant.AGENTIC
|
|
216
|
+
: LiteracyVariant.STANDARD,
|
|
217
|
+
graderProvider,
|
|
218
|
+
models: (modelsForCompile?.models ?? []).map((m) => ({ id: m.id, label: m.label })),
|
|
219
|
+
});
|
|
220
|
+
totalTests += compileResult.totalTests;
|
|
221
|
+
for (const { taskId, result } of compileResult.tasks) {
|
|
222
|
+
for (const test of result.tests) {
|
|
223
|
+
const desc = typeof test.description === "string" ? test.description : taskId;
|
|
224
|
+
const isBaseline = desc.includes("[Baseline]") || desc.endsWith("(baseline)");
|
|
225
|
+
tasks.push({
|
|
226
|
+
description: desc,
|
|
227
|
+
variant: isBaseline
|
|
228
|
+
? LiteracyVariant.STANDARD
|
|
229
|
+
: "gold",
|
|
230
|
+
});
|
|
231
|
+
}
|
|
174
232
|
}
|
|
175
233
|
}
|
|
176
234
|
}
|
|
@@ -186,11 +244,11 @@ export async function buildPipelinePlan(opts, rootDir) {
|
|
|
186
244
|
const models = [];
|
|
187
245
|
let graderModelName = "";
|
|
188
246
|
if (modelsFile) {
|
|
189
|
-
const activeModels = modelsFile.models.filter((m) => modeMatchesModelModes(opts.mode, m.modes));
|
|
247
|
+
const activeModels = modelsFile.models.filter((m) => modeMatchesModelModes(opts.mode, m.modes, opts.variant));
|
|
190
248
|
// For agentic mode, each model appears twice (naive + optimized)
|
|
191
249
|
for (const m of activeModels) {
|
|
192
250
|
const modelName = extractModelName(m.id);
|
|
193
|
-
if (opts.
|
|
251
|
+
if (opts.variant === LiteracyVariant.AGENTIC) {
|
|
194
252
|
if (m.modes?.includes("agentic-naive")) {
|
|
195
253
|
models.push({
|
|
196
254
|
id: m.id,
|
|
@@ -518,16 +576,16 @@ function collectFilesCreated(opts) {
|
|
|
518
576
|
// ---------------------------------------------------------------------------
|
|
519
577
|
function collectFilesRead(rootDir, _mode) {
|
|
520
578
|
const files = [
|
|
521
|
-
"config/models.
|
|
522
|
-
"config/rubrics.
|
|
523
|
-
"config/prompts.
|
|
524
|
-
"config/sources.
|
|
579
|
+
"config/models.ts",
|
|
580
|
+
"config/rubrics.ts",
|
|
581
|
+
"config/prompts.ts",
|
|
582
|
+
"config/sources.ts",
|
|
525
583
|
];
|
|
526
584
|
// Task files
|
|
527
585
|
const tasksDir = resolve(rootDir, "tasks");
|
|
528
586
|
if (existsSync(tasksDir)) {
|
|
529
587
|
const taskFiles = readdirSync(tasksDir)
|
|
530
|
-
.filter((f) => (
|
|
588
|
+
.filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f) && !f.startsWith("."))
|
|
531
589
|
.sort();
|
|
532
590
|
for (const f of taskFiles)
|
|
533
591
|
files.push(`tasks/${f}`);
|
|
@@ -551,11 +609,11 @@ function collectFilesRead(rootDir, _mode) {
|
|
|
551
609
|
files.push(`canonical/reference-solutions/${f}`);
|
|
552
610
|
}
|
|
553
611
|
// Thresholds (if readiness is involved)
|
|
554
|
-
if (existsSync(resolve(rootDir, "config", "thresholds.
|
|
555
|
-
files.push("config/thresholds.
|
|
612
|
+
if (existsSync(resolve(rootDir, "config", "thresholds.ts"))) {
|
|
613
|
+
files.push("config/thresholds.ts");
|
|
556
614
|
}
|
|
557
|
-
if (existsSync(resolve(rootDir, "config", "features.
|
|
558
|
-
files.push("config/features.
|
|
615
|
+
if (existsSync(resolve(rootDir, "config", "features.ts"))) {
|
|
616
|
+
files.push("config/features.ts");
|
|
559
617
|
}
|
|
560
618
|
return [...new Set(files)].sort();
|
|
561
619
|
}
|
|
@@ -616,16 +674,14 @@ function estimateCost(testCount, models, graderModelName, rubricAssertionsPerTas
|
|
|
616
674
|
// Used by the plan builder without importing the full type to avoid circular deps.
|
|
617
675
|
// ---------------------------------------------------------------------------
|
|
618
676
|
function estimateRubricAssertionsPerTask(rootDir) {
|
|
619
|
-
// Load rubrics
|
|
677
|
+
// Load rubrics config and count the default template set.
|
|
620
678
|
// In practice, most tasks have 2-4 rubric assertions.
|
|
621
|
-
const
|
|
622
|
-
if (!
|
|
679
|
+
const result = tryLoadConfigFile("rubrics", rootDir);
|
|
680
|
+
if (!result)
|
|
623
681
|
return 2; // conservative default
|
|
624
682
|
try {
|
|
625
|
-
const
|
|
626
|
-
|
|
627
|
-
const templateCount = data?.templates
|
|
628
|
-
? Object.keys(data.templates).length
|
|
683
|
+
const templateCount = result.data?.templates
|
|
684
|
+
? Object.keys(result.data.templates).length
|
|
629
685
|
: 2;
|
|
630
686
|
// Most tasks use 2-3 of the available templates
|
|
631
687
|
return Math.min(templateCount, 3);
|
|
@@ -320,6 +320,8 @@ function generateComment(summary, options = {}) {
|
|
|
320
320
|
? "📉"
|
|
321
321
|
: "➡️";
|
|
322
322
|
const d = (n) => n > 0 ? `+${Math.round(n)}` : String(Math.round(n));
|
|
323
|
+
// TODO(multi-mode): These dimension keys are literacy-specific.
|
|
324
|
+
// For other modes, iterate Object.entries(a.dimensions) dynamically.
|
|
323
325
|
lines.push(`| ${a.area} | ${a.baseline} | ${a.experiment} | ${icon} ${d(a.delta)} | ${d(a.dimensions.taskCompletion.delta)} | ${d(a.dimensions.codeCorrectness.delta)} | ${d(a.dimensions.docCoverage.delta)} |`);
|
|
324
326
|
}
|
|
325
327
|
}
|
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* pipeline/profile-resolution.ts
|
|
3
3
|
*
|
|
4
|
-
* Resolves the correct weight profile for a given (mode, variant)
|
|
5
|
-
* The scoring engine calls this to determine which dimensions and
|
|
6
|
-
* apply to each test entry's composite score.
|
|
4
|
+
* Resolves the correct weight profile for a given (mode, perspective, variant)
|
|
5
|
+
* tuple. The scoring engine calls this to determine which dimensions and
|
|
6
|
+
* weights apply to each test entry's composite score.
|
|
7
7
|
*
|
|
8
8
|
* Resolution order:
|
|
9
|
-
* 1.
|
|
10
|
-
*
|
|
9
|
+
* 1. Nested binding (variant provided):
|
|
10
|
+
* mode-profiles.<mode>.<variant>.<perspective> → profile name
|
|
11
|
+
* 2. Flat binding (no variant):
|
|
12
|
+
* mode-profiles.<mode>.<perspective> → profile name
|
|
13
|
+
* 3. Fallback: the "default" profile
|
|
11
14
|
*
|
|
12
15
|
* Supports both the new `profiles` format and the legacy flat `weights`
|
|
13
16
|
* format (treated as a single profile named "default").
|
|
@@ -23,17 +26,22 @@ import type { RubricConfig, WeightProfile } from "../_vendor/ailf-core/index.d.t
|
|
|
23
26
|
*/
|
|
24
27
|
export declare function resolveProfiles(config: RubricConfig): Record<string, WeightProfile>;
|
|
25
28
|
/**
|
|
26
|
-
* Resolve the weight profile for a specific (mode, variant)
|
|
29
|
+
* Resolve the weight profile for a specific (mode, perspective, variant) tuple.
|
|
27
30
|
*
|
|
28
|
-
* @param mode
|
|
29
|
-
* @param
|
|
30
|
-
* @param config
|
|
31
|
+
* @param mode - Canonical mode (e.g., "literacy", "mcp-server")
|
|
32
|
+
* @param perspective - Entry perspective: "gold" (with docs) or "baseline" (without docs)
|
|
33
|
+
* @param config - Parsed rubrics config
|
|
34
|
+
* @param variant - Optional variant within the mode (e.g., "baseline", "agentic" for literacy)
|
|
31
35
|
* @returns The resolved weight profile (dimension → weight map)
|
|
32
36
|
*
|
|
33
37
|
* @example
|
|
34
|
-
*
|
|
35
|
-
* resolveProfile("
|
|
36
|
-
* resolveProfile("
|
|
37
|
-
* resolveProfile("
|
|
38
|
+
* // Nested: literacy mode with variant sub-keys
|
|
39
|
+
* resolveProfile("literacy", "gold", config, "baseline") // → default profile
|
|
40
|
+
* resolveProfile("literacy", "baseline", config, "baseline") // → output-only profile
|
|
41
|
+
* resolveProfile("literacy", "gold", config, "agentic") // → default profile
|
|
42
|
+
*
|
|
43
|
+
* // Flat: non-literacy modes
|
|
44
|
+
* resolveProfile("mcp-server", "gold", config) // → mcp-behavior profile
|
|
45
|
+
* resolveProfile("unknown-mode", "gold", config) // → default (fallback)
|
|
38
46
|
*/
|
|
39
|
-
export declare function resolveProfile(mode: string,
|
|
47
|
+
export declare function resolveProfile(mode: string, perspective: string, config: RubricConfig, variant?: string): WeightProfile;
|
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* pipeline/profile-resolution.ts
|
|
3
3
|
*
|
|
4
|
-
* Resolves the correct weight profile for a given (mode, variant)
|
|
5
|
-
* The scoring engine calls this to determine which dimensions and
|
|
6
|
-
* apply to each test entry's composite score.
|
|
4
|
+
* Resolves the correct weight profile for a given (mode, perspective, variant)
|
|
5
|
+
* tuple. The scoring engine calls this to determine which dimensions and
|
|
6
|
+
* weights apply to each test entry's composite score.
|
|
7
7
|
*
|
|
8
8
|
* Resolution order:
|
|
9
|
-
* 1.
|
|
10
|
-
*
|
|
9
|
+
* 1. Nested binding (variant provided):
|
|
10
|
+
* mode-profiles.<mode>.<variant>.<perspective> → profile name
|
|
11
|
+
* 2. Flat binding (no variant):
|
|
12
|
+
* mode-profiles.<mode>.<perspective> → profile name
|
|
13
|
+
* 3. Fallback: the "default" profile
|
|
11
14
|
*
|
|
12
15
|
* Supports both the new `profiles` format and the legacy flat `weights`
|
|
13
16
|
* format (treated as a single profile named "default").
|
|
@@ -29,31 +32,50 @@ export function resolveProfiles(config) {
|
|
|
29
32
|
return { default: config.weights };
|
|
30
33
|
}
|
|
31
34
|
// Schema validation should prevent this, but be defensive
|
|
32
|
-
throw new Error("rubrics
|
|
35
|
+
throw new Error("rubrics config has neither 'profiles' nor 'weights' — cannot resolve scoring profiles");
|
|
33
36
|
}
|
|
34
37
|
/**
|
|
35
|
-
* Resolve the weight profile for a specific (mode, variant)
|
|
38
|
+
* Resolve the weight profile for a specific (mode, perspective, variant) tuple.
|
|
36
39
|
*
|
|
37
|
-
* @param mode
|
|
38
|
-
* @param
|
|
39
|
-
* @param config
|
|
40
|
+
* @param mode - Canonical mode (e.g., "literacy", "mcp-server")
|
|
41
|
+
* @param perspective - Entry perspective: "gold" (with docs) or "baseline" (without docs)
|
|
42
|
+
* @param config - Parsed rubrics config
|
|
43
|
+
* @param variant - Optional variant within the mode (e.g., "baseline", "agentic" for literacy)
|
|
40
44
|
* @returns The resolved weight profile (dimension → weight map)
|
|
41
45
|
*
|
|
42
46
|
* @example
|
|
43
|
-
*
|
|
44
|
-
* resolveProfile("
|
|
45
|
-
* resolveProfile("
|
|
46
|
-
* resolveProfile("
|
|
47
|
+
* // Nested: literacy mode with variant sub-keys
|
|
48
|
+
* resolveProfile("literacy", "gold", config, "baseline") // → default profile
|
|
49
|
+
* resolveProfile("literacy", "baseline", config, "baseline") // → output-only profile
|
|
50
|
+
* resolveProfile("literacy", "gold", config, "agentic") // → default profile
|
|
51
|
+
*
|
|
52
|
+
* // Flat: non-literacy modes
|
|
53
|
+
* resolveProfile("mcp-server", "gold", config) // → mcp-behavior profile
|
|
54
|
+
* resolveProfile("unknown-mode", "gold", config) // → default (fallback)
|
|
47
55
|
*/
|
|
48
|
-
export function resolveProfile(mode,
|
|
56
|
+
export function resolveProfile(mode, perspective, config, variant) {
|
|
49
57
|
const profiles = resolveProfiles(config);
|
|
50
58
|
const modeProfiles = config["mode-profiles"];
|
|
51
|
-
|
|
52
|
-
|
|
59
|
+
const modeEntry = modeProfiles?.[mode];
|
|
60
|
+
let profileName;
|
|
61
|
+
if (modeEntry && variant) {
|
|
62
|
+
// Nested lookup: mode-profiles.<mode>.<variant>.<perspective>
|
|
63
|
+
const variantEntry = modeEntry[variant];
|
|
64
|
+
if (typeof variantEntry === "object" && variantEntry !== null) {
|
|
65
|
+
profileName = variantEntry[perspective];
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
if (!profileName && modeEntry) {
|
|
69
|
+
// Flat lookup: mode-profiles.<mode>.<perspective>
|
|
70
|
+
const directEntry = modeEntry[perspective];
|
|
71
|
+
if (typeof directEntry === "string") {
|
|
72
|
+
profileName = directEntry;
|
|
73
|
+
}
|
|
74
|
+
}
|
|
53
75
|
if (profileName) {
|
|
54
76
|
const profile = profiles[profileName];
|
|
55
77
|
if (!profile) {
|
|
56
|
-
throw new Error(`mode-profiles.${mode}.${variant} references profile "${profileName}" ` +
|
|
78
|
+
throw new Error(`mode-profiles.${mode}.${variant ? variant + "." : ""}${perspective} references profile "${profileName}" ` +
|
|
57
79
|
`which does not exist. Available profiles: ${Object.keys(profiles).join(", ")}`);
|
|
58
80
|
}
|
|
59
81
|
return profile;
|
|
@@ -61,7 +83,7 @@ export function resolveProfile(mode, variant, config) {
|
|
|
61
83
|
// Fall back to "default" profile
|
|
62
84
|
const defaultProfile = profiles["default"];
|
|
63
85
|
if (!defaultProfile) {
|
|
64
|
-
throw new Error(`No scoring profile found for mode="${mode}"
|
|
86
|
+
throw new Error(`No scoring profile found for mode="${mode}" perspective="${perspective}" ` +
|
|
65
87
|
`and no "default" profile exists. ` +
|
|
66
88
|
`Available profiles: ${Object.keys(profiles).join(", ")}`);
|
|
67
89
|
}
|
|
@@ -42,7 +42,7 @@ export interface ProvenanceInput {
|
|
|
42
42
|
promptfooUrl?: string;
|
|
43
43
|
/** Per-mode Promptfoo share URLs */
|
|
44
44
|
promptfooUrls?: PromptfooUrlEntry[];
|
|
45
|
-
/** Path to the package root (for reading models
|
|
45
|
+
/** Path to the package root (for reading config/models) */
|
|
46
46
|
rootDir: string;
|
|
47
47
|
/** Report ID that triggered this re-run (becomes lineage.rerunOf) */
|
|
48
48
|
sourceReportId?: string;
|
|
@@ -58,7 +58,7 @@ export interface ProvenanceInput {
|
|
|
58
58
|
*
|
|
59
59
|
* Assembles provenance from:
|
|
60
60
|
* - Pipeline options (mode, source, areas, tasks)
|
|
61
|
-
* - config/models.
|
|
61
|
+
* - config/models.ts (model list, grader)
|
|
62
62
|
* - Environment variables (CI metadata, trigger detection)
|
|
63
63
|
* - Optional metadata (context hash, Promptfoo URL)
|
|
64
64
|
*/
|
|
@@ -11,16 +11,14 @@
|
|
|
11
11
|
* @see docs/design-docs/report-store/domain-model.md
|
|
12
12
|
* @see docs/design-docs/report-store/architecture.md — Provenance collection
|
|
13
13
|
*/
|
|
14
|
-
import { readFileSync } from "fs";
|
|
15
|
-
import { resolve } from "path";
|
|
16
|
-
import { load } from "js-yaml";
|
|
17
14
|
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
15
|
+
import { tryLoadConfigFile } from "./compiler/config-loader.js";
|
|
18
16
|
/**
|
|
19
17
|
* Build a ReportProvenance object from pipeline context.
|
|
20
18
|
*
|
|
21
19
|
* Assembles provenance from:
|
|
22
20
|
* - Pipeline options (mode, source, areas, tasks)
|
|
23
|
-
* - config/models.
|
|
21
|
+
* - config/models.ts (model list, grader)
|
|
24
22
|
* - Environment variables (CI metadata, trigger detection)
|
|
25
23
|
* - Optional metadata (context hash, Promptfoo URL)
|
|
26
24
|
*/
|
|
@@ -168,20 +166,17 @@ function detectTrigger() {
|
|
|
168
166
|
// Model config loading
|
|
169
167
|
// ---------------------------------------------------------------------------
|
|
170
168
|
/**
|
|
171
|
-
* Load config/models
|
|
169
|
+
* Load config/models to extract model list and grader info.
|
|
172
170
|
* Falls back to a minimal config if the file can't be read.
|
|
173
171
|
*/
|
|
174
172
|
function loadModelsConfig(rootDir, log) {
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
return
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
models: [],
|
|
185
|
-
};
|
|
186
|
-
}
|
|
173
|
+
const result = tryLoadConfigFile("models", rootDir);
|
|
174
|
+
if (result)
|
|
175
|
+
return result.data;
|
|
176
|
+
log.warn("Could not read config/models for provenance");
|
|
177
|
+
return {
|
|
178
|
+
defaults: {},
|
|
179
|
+
grader: { id: "unknown" },
|
|
180
|
+
models: [],
|
|
181
|
+
};
|
|
187
182
|
}
|
|
@@ -133,8 +133,8 @@ export function formatReleaseImpactConsole(report) {
|
|
|
133
133
|
const docs = task.attributedDocs.length > 0
|
|
134
134
|
? task.attributedDocs.join(", ")
|
|
135
135
|
: "(unattributed)";
|
|
136
|
-
const
|
|
137
|
-
lines.push(` ${docs.padEnd(32)} | ${area.area.padEnd(16)} | ${task.taskId.padEnd(23)} | ${
|
|
136
|
+
const taskDeltaStr = task.delta >= 0 ? `+${task.delta.toFixed(1)}` : task.delta.toFixed(1);
|
|
137
|
+
lines.push(` ${docs.padEnd(32)} | ${area.area.padEnd(16)} | ${task.taskId.padEnd(23)} | ${taskDeltaStr}`);
|
|
138
138
|
}
|
|
139
139
|
}
|
|
140
140
|
lines.push("");
|
|
@@ -194,9 +194,9 @@ export function formatReleaseImpactMarkdown(report) {
|
|
|
194
194
|
const docs = task.attributedDocs.length > 0
|
|
195
195
|
? task.attributedDocs.map((d) => `\`${d}\``).join(", ")
|
|
196
196
|
: "—";
|
|
197
|
-
const
|
|
197
|
+
const taskDeltaStr = task.delta >= 0 ? `+${task.delta.toFixed(1)}` : task.delta.toFixed(1);
|
|
198
198
|
const regressIcon = area.regressed ? " ⚠️" : "";
|
|
199
|
-
lines.push(`| ${docs} | ${area.area} | ${task.taskId} | ${
|
|
199
|
+
lines.push(`| ${docs} | ${area.area} | ${task.taskId} | ${taskDeltaStr}${regressIcon} |`);
|
|
200
200
|
}
|
|
201
201
|
}
|
|
202
202
|
lines.push("");
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* definitions (`.ailf/tasks/*.yaml` → `execution.threshold`).
|
|
6
6
|
*
|
|
7
7
|
* This is distinct from the readiness-gate threshold system in
|
|
8
|
-
* `config/thresholds
|
|
8
|
+
* `config/thresholds`. Repo thresholds are per-task, defined by
|
|
9
9
|
* the product team, and drive PR check pass/fail status. Framework
|
|
10
10
|
* thresholds are per-area, defined by the AILF team, and drive
|
|
11
11
|
* readiness reports.
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* definitions (`.ailf/tasks/*.yaml` → `execution.threshold`).
|
|
6
6
|
*
|
|
7
7
|
* This is distinct from the readiness-gate threshold system in
|
|
8
|
-
* `config/thresholds
|
|
8
|
+
* `config/thresholds`. Repo thresholds are per-task, defined by
|
|
9
9
|
* the product team, and drive PR check pass/fail status. Framework
|
|
10
10
|
* thresholds are per-area, defined by the AILF team, and drive
|
|
11
11
|
* readiness reports.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/rubric-loader.ts — Load and validate rubric config.
|
|
3
|
+
*
|
|
4
|
+
* Extracted from the legacy expand-tasks.ts so that callers (e.g.,
|
|
5
|
+
* calculate-scores.ts) can load rubric templates without pulling in
|
|
6
|
+
* the deprecated task expansion machinery.
|
|
7
|
+
*
|
|
8
|
+
* @see packages/eval/config/rubrics.ts — the rubric configuration
|
|
9
|
+
* @see packages/core/src/schemas/pipeline.ts — RubricConfigSchema
|
|
10
|
+
*/
|
|
11
|
+
import { type RubricConfig } from "../_vendor/ailf-core/index.d.ts";
|
|
12
|
+
/**
|
|
13
|
+
* Load and validate config/rubrics from the given root directory.
|
|
14
|
+
* Caches the result for subsequent calls with the same rootDir.
|
|
15
|
+
*/
|
|
16
|
+
export declare function loadRubricTemplates(rootDir: string): RubricConfig;
|
|
17
|
+
/**
|
|
18
|
+
* Reset the rubric config cache. Useful in tests.
|
|
19
|
+
*/
|
|
20
|
+
export declare function resetRubricCache(): void;
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/rubric-loader.ts — Load and validate rubric config.
|
|
3
|
+
*
|
|
4
|
+
* Extracted from the legacy expand-tasks.ts so that callers (e.g.,
|
|
5
|
+
* calculate-scores.ts) can load rubric templates without pulling in
|
|
6
|
+
* the deprecated task expansion machinery.
|
|
7
|
+
*
|
|
8
|
+
* @see packages/eval/config/rubrics.ts — the rubric configuration
|
|
9
|
+
* @see packages/core/src/schemas/pipeline.ts — RubricConfigSchema
|
|
10
|
+
*/
|
|
11
|
+
import { RubricConfigSchema } from "../_vendor/ailf-core/index.js";
|
|
12
|
+
import { loadConfigFile } from "./compiler/config-loader.js";
|
|
13
|
+
let cachedRubricConfig = null;
|
|
14
|
+
/**
|
|
15
|
+
* Load and validate config/rubrics from the given root directory.
|
|
16
|
+
* Caches the result for subsequent calls with the same rootDir.
|
|
17
|
+
*/
|
|
18
|
+
export function loadRubricTemplates(rootDir) {
|
|
19
|
+
if (cachedRubricConfig)
|
|
20
|
+
return cachedRubricConfig;
|
|
21
|
+
const { data } = loadConfigFile("rubrics", rootDir);
|
|
22
|
+
const result = RubricConfigSchema.safeParse(data);
|
|
23
|
+
if (!result.success) {
|
|
24
|
+
const messages = result.error.issues
|
|
25
|
+
.map((i) => ` [${i.path.join(".")}]: ${i.message}`)
|
|
26
|
+
.join("\n");
|
|
27
|
+
throw new Error(`Invalid config/rubrics:\n${messages}`);
|
|
28
|
+
}
|
|
29
|
+
cachedRubricConfig = result.data;
|
|
30
|
+
return result.data;
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* Reset the rubric config cache. Useful in tests.
|
|
34
|
+
*/
|
|
35
|
+
export function resetRubricCache() {
|
|
36
|
+
cachedRubricConfig = null;
|
|
37
|
+
}
|