@sanity/ailf 2.0.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/cli.js +0 -0
- package/package.json +24 -24
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
- package/dist/adapters/task-sources/yaml-task-source.js +0 -139
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
- package/dist/commands/update-quality-scores.d.ts +0 -5
- package/dist/commands/update-quality-scores.js +0 -20
- package/dist/lib/agent-behavior-report.d.ts +0 -8
- package/dist/lib/agent-behavior-report.js +0 -185
- package/dist/lib/baseline.d.ts +0 -19
- package/dist/lib/baseline.js +0 -153
- package/dist/lib/calculate-scores.d.ts +0 -23
- package/dist/lib/calculate-scores.js +0 -42
- package/dist/lib/compare.d.ts +0 -18
- package/dist/lib/compare.js +0 -170
- package/dist/lib/coverage-audit.d.ts +0 -4
- package/dist/lib/coverage-audit.js +0 -42
- package/dist/lib/discovery-report.d.ts +0 -13
- package/dist/lib/discovery-report.js +0 -57
- package/dist/lib/fetch-docs.d.ts +0 -30
- package/dist/lib/fetch-docs.js +0 -171
- package/dist/lib/generate-configs.d.ts +0 -25
- package/dist/lib/generate-configs.js +0 -42
- package/dist/lib/grader-api.d.ts +0 -21
- package/dist/lib/grader-api.js +0 -34
- package/dist/lib/grader-compare.d.ts +0 -19
- package/dist/lib/grader-compare.js +0 -91
- package/dist/lib/grader-consistency.d.ts +0 -27
- package/dist/lib/grader-consistency.js +0 -79
- package/dist/lib/grader-sensitivity.d.ts +0 -19
- package/dist/lib/grader-sensitivity.js +0 -75
- package/dist/lib/grader-validate.d.ts +0 -19
- package/dist/lib/grader-validate.js +0 -78
- package/dist/lib/measure-retrieval.d.ts +0 -14
- package/dist/lib/measure-retrieval.js +0 -71
- package/dist/lib/pr-comment.d.ts +0 -16
- package/dist/lib/pr-comment.js +0 -28
- package/dist/lib/readiness-report.d.ts +0 -13
- package/dist/lib/readiness-report.js +0 -108
- package/dist/lib/webhook-server.d.ts +0 -11
- package/dist/lib/webhook-server.js +0 -24
- package/dist/lib/weekly-digest.d.ts +0 -24
- package/dist/lib/weekly-digest.js +0 -148
- package/dist/orchestration/env-bridge.d.ts +0 -21
- package/dist/orchestration/env-bridge.js +0 -66
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
- package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
- package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
- package/dist/pipeline/compiler/task-bridge.js +0 -92
- package/dist/pipeline/expand-tasks.d.ts +0 -232
- package/dist/pipeline/expand-tasks.js +0 -467
- package/dist/pipeline/generate-configs.d.ts +0 -92
- package/dist/pipeline/generate-configs.js +0 -445
- package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/calculate-scores-step.js +0 -89
- package/dist/pipeline/steps/compare-step.d.ts +0 -18
- package/dist/pipeline/steps/compare-step.js +0 -90
- package/dist/pipeline/steps/eval-step.d.ts +0 -53
- package/dist/pipeline/steps/eval-step.js +0 -347
- package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
- package/dist/pipeline/steps/fetch-docs-step.js +0 -84
- package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
- package/dist/pipeline/steps/generate-configs-step.js +0 -98
- package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
- package/dist/pipeline/steps/grader-consistency-step.js +0 -74
- package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
- package/dist/pipeline/steps/publish-report-step.js +0 -243
- package/dist/pipeline/steps/report-step.d.ts +0 -13
- package/dist/pipeline/steps/report-step.js +0 -56
- package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/update-scores-step.js +0 -42
- package/dist/scripts/agent-behavior-report.d.ts +0 -19
- package/dist/scripts/agent-behavior-report.js +0 -315
- package/dist/scripts/baseline.d.ts +0 -43
- package/dist/scripts/baseline.js +0 -267
- package/dist/scripts/calculate-scores.d.ts +0 -166
- package/dist/scripts/calculate-scores.js +0 -1296
- package/dist/scripts/compare.d.ts +0 -22
- package/dist/scripts/compare.js +0 -334
- package/dist/scripts/coverage-audit.d.ts +0 -44
- package/dist/scripts/coverage-audit.js +0 -209
- package/dist/scripts/debug-eval.d.ts +0 -19
- package/dist/scripts/debug-eval.js +0 -73
- package/dist/scripts/discovery-report.d.ts +0 -58
- package/dist/scripts/discovery-report.js +0 -250
- package/dist/scripts/fetch-docs.d.ts +0 -35
- package/dist/scripts/fetch-docs.js +0 -472
- package/dist/scripts/generate-configs.d.ts +0 -66
- package/dist/scripts/generate-configs.js +0 -459
- package/dist/scripts/grader-api.d.ts +0 -27
- package/dist/scripts/grader-api.js +0 -206
- package/dist/scripts/grader-compare.d.ts +0 -22
- package/dist/scripts/grader-compare.js +0 -368
- package/dist/scripts/grader-consistency.d.ts +0 -20
- package/dist/scripts/grader-consistency.js +0 -313
- package/dist/scripts/grader-sensitivity.d.ts +0 -22
- package/dist/scripts/grader-sensitivity.js +0 -354
- package/dist/scripts/grader-validate.d.ts +0 -19
- package/dist/scripts/grader-validate.js +0 -267
- package/dist/scripts/measure-retrieval.d.ts +0 -10
- package/dist/scripts/measure-retrieval.js +0 -145
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
- package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
- package/dist/scripts/pipeline.d.ts +0 -76
- package/dist/scripts/pipeline.js +0 -1031
- package/dist/scripts/pr-comment.d.ts +0 -10
- package/dist/scripts/pr-comment.js +0 -510
- package/dist/scripts/readiness-report.d.ts +0 -88
- package/dist/scripts/readiness-report.js +0 -342
- package/dist/scripts/update-quality-scores.d.ts +0 -15
- package/dist/scripts/update-quality-scores.js +0 -184
- package/dist/scripts/validate-task-sources.d.ts +0 -21
- package/dist/scripts/validate-task-sources.js +0 -210
- package/dist/scripts/validate.d.ts +0 -13
- package/dist/scripts/validate.js +0 -79
- package/dist/scripts/webhook-server.d.ts +0 -26
- package/dist/scripts/webhook-server.js +0 -147
- package/dist/scripts/weekly-digest.d.ts +0 -24
- package/dist/scripts/weekly-digest.js +0 -144
- package/dist/sinks/format-slack.d.ts +0 -64
- package/dist/sinks/format-slack.js +0 -306
- package/dist/sinks/slack-sink.d.ts +0 -27
- package/dist/sinks/slack-sink.js +0 -78
- package/dist/sinks/webhook-sink.d.ts +0 -19
- package/dist/sinks/webhook-sink.js +0 -50
- package/tasks/.expanded.agentic.yaml +0 -280
- package/tasks/.expanded.yaml +0 -565
|
@@ -1,232 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* pipeline/expand-tasks.ts
|
|
3
|
-
*
|
|
4
|
-
* @deprecated This is part of the LEGACY compilation path. New code should
|
|
5
|
-
* use the literacy handler in the compiler pipeline instead:
|
|
6
|
-
*
|
|
7
|
-
* import { compileLiteracyTask } from "./compiler/mode-handlers/literacy/index.js"
|
|
8
|
-
*
|
|
9
|
-
* @see packages/eval/src/pipeline/compiler/mode-handlers/literacy/index.ts
|
|
10
|
-
*
|
|
11
|
-
* ---
|
|
12
|
-
*
|
|
13
|
-
* Reads task YAML files in the single-definition format and expands each
|
|
14
|
-
* task into gold + baseline Promptfoo test entries. This eliminates the
|
|
15
|
-
* manual duplication where every task had to be written twice.
|
|
16
|
-
*
|
|
17
|
-
* Rubric templates from config/rubrics.yaml are resolved at expansion time:
|
|
18
|
-
* tasks specify `template` + `criteria`, and the expander assembles
|
|
19
|
-
* the full rubric text by injecting criteria into the template.
|
|
20
|
-
*
|
|
21
|
-
* Structured dimension metadata (Approach 5):
|
|
22
|
-
* When a rubric template has a `dimension` field, the resolved assertion
|
|
23
|
-
* includes `metadata.dimension` and `metadata.maxScore`. This flows through
|
|
24
|
-
* Promptfoo into component results, allowing the scoring engine to classify
|
|
25
|
-
* rubrics structurally instead of via heuristic string matching.
|
|
26
|
-
* See docs/design-docs/structured-dimensions.md.
|
|
27
|
-
*
|
|
28
|
-
* Single-definition format:
|
|
29
|
-
* - id: groq-blog-queries
|
|
30
|
-
* description: "GROQ - Blog queries with filtering and pagination"
|
|
31
|
-
* doc_coverage: true
|
|
32
|
-
* vars:
|
|
33
|
-
* task: |
|
|
34
|
-
* Write GROQ queries for a Sanity blog application: ...
|
|
35
|
-
* docs: file://contexts/canonical/groq-blog-queries.md
|
|
36
|
-
* assert:
|
|
37
|
-
* - type: llm-rubric
|
|
38
|
-
* template: task-completion
|
|
39
|
-
* criteria:
|
|
40
|
-
* - GROQ filter with _type == "post"
|
|
41
|
-
* - Projection with aliased slug field
|
|
42
|
-
* - type: contains-any
|
|
43
|
-
* value: ["client.fetch", "createClient"]
|
|
44
|
-
* baseline:
|
|
45
|
-
* enabled: true
|
|
46
|
-
* rubric: full
|
|
47
|
-
*
|
|
48
|
-
* Expands to:
|
|
49
|
-
* 1. Gold entry — uses vars.docs as-is, resolves templates, appends doc-coverage
|
|
50
|
-
* 2. Baseline entry — sets docs: "", uses full rubric (same assertions as gold)
|
|
51
|
-
*/
|
|
52
|
-
import type { LiteracyTaskDefinition, Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
53
|
-
import { type RubricConfig } from "./schemas.js";
|
|
54
|
-
import { type LiteracyEvalSubMode } from "./normalize-mode.js";
|
|
55
|
-
import type { FilterOptions } from "./types.js";
|
|
56
|
-
/** Any assertion entry (templated or value-based). */
|
|
57
|
-
export type AssertEntry = TemplatedAssert | ValueAssert;
|
|
58
|
-
/** The output format — a Promptfoo-compatible test entry. */
|
|
59
|
-
export interface ExpandedTestEntry {
|
|
60
|
-
assert?: ValueAssert[];
|
|
61
|
-
description: string;
|
|
62
|
-
/**
|
|
63
|
-
* Promptfoo prompt filter — restricts which prompts this test runs against.
|
|
64
|
-
* Matches on prompt `id` or `label`. When absent, all prompts are allowed
|
|
65
|
-
* (Promptfoo's default cartesian product behavior).
|
|
66
|
-
*
|
|
67
|
-
* Gold entries use `['with-docs']` (ceiling measurement).
|
|
68
|
-
* Baseline entries use `['without-docs']` (floor measurement).
|
|
69
|
-
*
|
|
70
|
-
* See: evaluation-ceiling.md for the floor/ceiling/actual decomposition.
|
|
71
|
-
*/
|
|
72
|
-
prompts?: string[];
|
|
73
|
-
vars: Record<string, unknown>;
|
|
74
|
-
}
|
|
75
|
-
/** A legacy task entry (the old paired format without an `id` field). */
|
|
76
|
-
export interface LegacyTaskEntry {
|
|
77
|
-
assert?: AssertEntry[];
|
|
78
|
-
description: string;
|
|
79
|
-
transform?: string;
|
|
80
|
-
vars?: Record<string, unknown>;
|
|
81
|
-
}
|
|
82
|
-
/** A single task definition in the new format (input). */
|
|
83
|
-
export interface SingleTaskDefinition {
|
|
84
|
-
/** Grading assertions (applied to both gold and baseline by default). */
|
|
85
|
-
assert: AssertEntry[];
|
|
86
|
-
/** Baseline generation options. */
|
|
87
|
-
baseline?: {
|
|
88
|
-
/** Whether to generate a baseline variant. Default: true. */
|
|
89
|
-
enabled?: boolean;
|
|
90
|
-
/** Rubric mode: 'full' copies all asserts, 'abbreviated' generates a
|
|
91
|
-
* summary rubric, 'none' omits rubric asserts. Default: 'full'. */
|
|
92
|
-
rubric?: "abbreviated" | "full" | "none";
|
|
93
|
-
};
|
|
94
|
-
/** Human-readable description of what this task tests. */
|
|
95
|
-
description: string;
|
|
96
|
-
/** Opt-in: auto-generate a documentation coverage rubric for gold. */
|
|
97
|
-
doc_coverage?: boolean;
|
|
98
|
-
/** Feature area this task belongs to (flows through to scoring). */
|
|
99
|
-
featureArea?: string;
|
|
100
|
-
/** Explicit task ID — determines the canonical context filename. */
|
|
101
|
-
id: string;
|
|
102
|
-
/** Template variables: task prompt and docs path. */
|
|
103
|
-
vars: {
|
|
104
|
-
task: string;
|
|
105
|
-
docs: string;
|
|
106
|
-
[key: string]: unknown;
|
|
107
|
-
};
|
|
108
|
-
}
|
|
109
|
-
/** A templated assertion — references a rubric template. */
|
|
110
|
-
export interface TemplatedAssert {
|
|
111
|
-
criteria: string[];
|
|
112
|
-
template: string;
|
|
113
|
-
type: "llm-rubric";
|
|
114
|
-
weight?: number;
|
|
115
|
-
}
|
|
116
|
-
/** A standard assertion with a value. */
|
|
117
|
-
export interface ValueAssert {
|
|
118
|
-
[key: string]: unknown;
|
|
119
|
-
type: string;
|
|
120
|
-
value?: unknown;
|
|
121
|
-
weight?: number;
|
|
122
|
-
}
|
|
123
|
-
/**
|
|
124
|
-
* Assemble a full rubric text string from a template and criteria.
|
|
125
|
-
*
|
|
126
|
-
* Output format:
|
|
127
|
-
* {header}
|
|
128
|
-
* - {scale[0]}
|
|
129
|
-
* - {scale[1]}
|
|
130
|
-
* ...
|
|
131
|
-
*
|
|
132
|
-
* {criteria_label}
|
|
133
|
-
* - {criteria[0]}
|
|
134
|
-
* - {criteria[1]}
|
|
135
|
-
* ...
|
|
136
|
-
*
|
|
137
|
-
* {footer}
|
|
138
|
-
*/
|
|
139
|
-
export declare function assembleRubric(templateKey: string, criteria: string[], rubricConfig: RubricConfig): string;
|
|
140
|
-
/**
|
|
141
|
-
* Build baseline assertions based on the rubric mode.
|
|
142
|
-
*
|
|
143
|
-
* - 'full': Copy all assertions as-is
|
|
144
|
-
* - 'abbreviated': Keep only the first llm-rubric (task completion) with
|
|
145
|
-
* a shortened summary prompt; all non-rubric assertions are dropped
|
|
146
|
-
* - 'none': No assertions at all
|
|
147
|
-
*/
|
|
148
|
-
export declare function buildBaselineAsserts(goldAsserts: ValueAssert[], mode: "abbreviated" | "full" | "none"): ValueAssert[];
|
|
149
|
-
/**
|
|
150
|
-
* Clear the cached rubric config. Used in tests.
|
|
151
|
-
*/
|
|
152
|
-
export declare function clearRubricCache(): void;
|
|
153
|
-
/**
|
|
154
|
-
* Expand a single task definition into gold + baseline Promptfoo test entries.
|
|
155
|
-
* Returns 1 entry (gold only) if baseline is disabled, or 2 entries otherwise.
|
|
156
|
-
*
|
|
157
|
-
* Resolves templated assertions and appends doc-coverage if opted in.
|
|
158
|
-
*
|
|
159
|
-
* @param mode - Controls which entries are generated and how:
|
|
160
|
-
* - `'baseline'` (default): Gold + baseline entries with `prompts` filter
|
|
161
|
-
* to prevent cartesian product with multiple prompts. Gold entries get
|
|
162
|
-
* `prompts: ['with-docs']`, baseline entries get `prompts: ['without-docs']`.
|
|
163
|
-
* - `'agentic'`: Gold entries only, no `prompts` filter (agentic mode has
|
|
164
|
-
* a single prompt that doesn't use `{{docs}}`; baseline entries would be
|
|
165
|
-
* pure waste — identical prompts, wasted API calls).
|
|
166
|
-
*/
|
|
167
|
-
export declare function expandTask(task: SingleTaskDefinition, rubricConfig: RubricConfig, mode?: LiteracyEvalSubMode): ExpandedTestEntry[];
|
|
168
|
-
/**
|
|
169
|
-
* Expand an array of LiteracyTaskDefinition[] (from any TaskSource adapter)
|
|
170
|
-
* into Promptfoo-compatible test entries. This is the TaskSource-aware
|
|
171
|
-
* counterpart of loadAndExpandTasks() — it skips YAML file I/O and works
|
|
172
|
-
* directly with the domain type.
|
|
173
|
-
*
|
|
174
|
-
* @param tasks - Literacy task definitions from any TaskSource adapter
|
|
175
|
-
* @param rootDir - Eval package root (needed to load rubric templates)
|
|
176
|
-
* @param mode - Expansion mode: 'baseline' (gold + baseline) or 'agentic' (gold only)
|
|
177
|
-
* @returns Expanded test entries and statistics
|
|
178
|
-
*/
|
|
179
|
-
export declare function expandTaskDefinitions(tasks: LiteracyTaskDefinition[], rootDir: string, mode?: LiteracyEvalSubMode): {
|
|
180
|
-
entries: ExpandedTestEntry[];
|
|
181
|
-
stats: {
|
|
182
|
-
totalTasks: number;
|
|
183
|
-
expandedTotal: number;
|
|
184
|
-
};
|
|
185
|
-
};
|
|
186
|
-
/**
|
|
187
|
-
* Extract all task IDs from task files. Only works with the new
|
|
188
|
-
* single-definition format entries (those that have an `id` field).
|
|
189
|
-
*/
|
|
190
|
-
export declare function extractTaskIds(rootDir: string): string[];
|
|
191
|
-
/**
|
|
192
|
-
* Type guard: checks if an entry is in the new single-definition format.
|
|
193
|
-
* The distinguishing feature is the presence of an `id` field.
|
|
194
|
-
*/
|
|
195
|
-
export declare function isSingleTaskDefinition(entry: unknown): entry is SingleTaskDefinition;
|
|
196
|
-
/**
|
|
197
|
-
* Type guard: checks if an assertion uses the templated format.
|
|
198
|
-
*/
|
|
199
|
-
export declare function isTemplatedAssert(entry: AssertEntry): entry is TemplatedAssert;
|
|
200
|
-
/**
|
|
201
|
-
* Load and expand all task files from the tasks/ directory.
|
|
202
|
-
* Supports both the new single-definition format (has `id`) and the legacy
|
|
203
|
-
* paired format (no `id`). Legacy entries pass through unchanged.
|
|
204
|
-
*
|
|
205
|
-
* @param mode - Controls expansion behavior:
|
|
206
|
-
* - `'baseline'` (default): Gold + baseline entries with prompt filters.
|
|
207
|
-
* - `'agentic'`: Gold entries only, no prompt filters.
|
|
208
|
-
*
|
|
209
|
-
* Returns the expanded entries grouped by source file.
|
|
210
|
-
*/
|
|
211
|
-
export declare function loadAndExpandTasks(rootDir: string, filter?: FilterOptions, mode?: LiteracyEvalSubMode, logger?: Logger): {
|
|
212
|
-
/** All expanded test entries, in order. */
|
|
213
|
-
entries: ExpandedTestEntry[];
|
|
214
|
-
/** Statistics about what was processed. */
|
|
215
|
-
stats: {
|
|
216
|
-
totalFiles: number;
|
|
217
|
-
singleDefinitions: number;
|
|
218
|
-
legacyEntries: number;
|
|
219
|
-
expandedTotal: number;
|
|
220
|
-
};
|
|
221
|
-
};
|
|
222
|
-
/**
|
|
223
|
-
* Load and validate config/rubrics.yaml from the given root directory.
|
|
224
|
-
* Caches the result for subsequent calls with the same rootDir.
|
|
225
|
-
*/
|
|
226
|
-
export declare function loadRubricTemplates(rootDir: string): RubricConfig;
|
|
227
|
-
/**
|
|
228
|
-
* Resolve a single assertion: if it's templated, assemble the rubric text
|
|
229
|
-
* and attach structured dimension metadata when the template has a
|
|
230
|
-
* `dimension` field. Otherwise, pass through unchanged.
|
|
231
|
-
*/
|
|
232
|
-
export declare function resolveAssert(entry: AssertEntry, rubricConfig: RubricConfig): ValueAssert;
|
|
@@ -1,467 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* pipeline/expand-tasks.ts
|
|
3
|
-
*
|
|
4
|
-
* @deprecated This is part of the LEGACY compilation path. New code should
|
|
5
|
-
* use the literacy handler in the compiler pipeline instead:
|
|
6
|
-
*
|
|
7
|
-
* import { compileLiteracyTask } from "./compiler/mode-handlers/literacy/index.js"
|
|
8
|
-
*
|
|
9
|
-
* @see packages/eval/src/pipeline/compiler/mode-handlers/literacy/index.ts
|
|
10
|
-
*
|
|
11
|
-
* ---
|
|
12
|
-
*
|
|
13
|
-
* Reads task YAML files in the single-definition format and expands each
|
|
14
|
-
* task into gold + baseline Promptfoo test entries. This eliminates the
|
|
15
|
-
* manual duplication where every task had to be written twice.
|
|
16
|
-
*
|
|
17
|
-
* Rubric templates from config/rubrics.yaml are resolved at expansion time:
|
|
18
|
-
* tasks specify `template` + `criteria`, and the expander assembles
|
|
19
|
-
* the full rubric text by injecting criteria into the template.
|
|
20
|
-
*
|
|
21
|
-
* Structured dimension metadata (Approach 5):
|
|
22
|
-
* When a rubric template has a `dimension` field, the resolved assertion
|
|
23
|
-
* includes `metadata.dimension` and `metadata.maxScore`. This flows through
|
|
24
|
-
* Promptfoo into component results, allowing the scoring engine to classify
|
|
25
|
-
* rubrics structurally instead of via heuristic string matching.
|
|
26
|
-
* See docs/design-docs/structured-dimensions.md.
|
|
27
|
-
*
|
|
28
|
-
* Single-definition format:
|
|
29
|
-
* - id: groq-blog-queries
|
|
30
|
-
* description: "GROQ - Blog queries with filtering and pagination"
|
|
31
|
-
* doc_coverage: true
|
|
32
|
-
* vars:
|
|
33
|
-
* task: |
|
|
34
|
-
* Write GROQ queries for a Sanity blog application: ...
|
|
35
|
-
* docs: file://contexts/canonical/groq-blog-queries.md
|
|
36
|
-
* assert:
|
|
37
|
-
* - type: llm-rubric
|
|
38
|
-
* template: task-completion
|
|
39
|
-
* criteria:
|
|
40
|
-
* - GROQ filter with _type == "post"
|
|
41
|
-
* - Projection with aliased slug field
|
|
42
|
-
* - type: contains-any
|
|
43
|
-
* value: ["client.fetch", "createClient"]
|
|
44
|
-
* baseline:
|
|
45
|
-
* enabled: true
|
|
46
|
-
* rubric: full
|
|
47
|
-
*
|
|
48
|
-
* Expands to:
|
|
49
|
-
* 1. Gold entry — uses vars.docs as-is, resolves templates, appends doc-coverage
|
|
50
|
-
* 2. Baseline entry — sets docs: "", uses full rubric (same assertions as gold)
|
|
51
|
-
*/
|
|
52
|
-
import { existsSync, readFileSync, readdirSync } from "fs";
|
|
53
|
-
import { resolve } from "path";
|
|
54
|
-
import { load } from "js-yaml";
|
|
55
|
-
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
56
|
-
import { loadConfigFile } from "./compiler/config-loader.js";
|
|
57
|
-
import { RubricConfigSchema } from "./schemas.js";
|
|
58
|
-
import { LiteracyVariant } from "./normalize-mode.js";
|
|
59
|
-
// ---------------------------------------------------------------------------
|
|
60
|
-
// Rubric template loading and assembly
|
|
61
|
-
// ---------------------------------------------------------------------------
|
|
62
|
-
/** Cached rubric config — starts as `null` and is reset to `null` by clearRubricCache(). */
|
|
63
|
-
let cachedRubricConfig = null;
|
|
64
|
-
/**
 * Assemble the full rubric text for one assertion from a named template
 * and a list of task-specific criteria.
 *
 * Output layout (lines joined with "\n"):
 *   {header}
 *   - {scale[0]}
 *   - {scale[1]}
 *   ...
 *   (blank line)
 *   {criteria_label}
 *   - {criteria[0]}
 *   - {criteria[1]}
 *   ...
 *   (blank line)
 *   {footer}
 *
 * The criteria section (including its leading blank line) is emitted only
 * when the template defines a `criteria_label` AND `criteria` is non-empty.
 *
 * @param templateKey - Key of the template inside `rubricConfig.templates`.
 * @param criteria - Criteria lines to inject as bullets.
 * @param rubricConfig - Object providing `templates` and a shared `footer`.
 * @returns The assembled rubric text.
 * @throws {Error} When `templateKey` is not an own key of
 *   `rubricConfig.templates`; the message lists the available keys.
 */
export function assembleRubric(templateKey, criteria, rubricConfig) {
    const { templates } = rubricConfig;
    // Own-property check: a plain bracket lookup would also resolve inherited
    // members such as "toString" or "constructor", skip the friendly error
    // below and fail later with an opaque TypeError on `template.scale`.
    const template = Object.prototype.hasOwnProperty.call(templates, templateKey)
        ? templates[templateKey]
        : undefined;
    if (!template) {
        throw new Error(`Unknown rubric template '${templateKey}'. Available: ${Object.keys(templates).join(", ")}`);
    }
    // Header followed by one bullet per scale item.
    const parts = [template.header, ...template.scale.map((item) => `- ${item}`)];
    // Criteria section — only when the template labels it and criteria exist.
    if (template.criteria_label && criteria.length > 0) {
        parts.push("", template.criteria_label, ...criteria.map((c) => `- ${c}`));
    }
    // Blank separator line, then the footer shared by every template.
    parts.push("", rubricConfig.footer);
    return parts.join("\n");
}
|
|
105
|
-
/**
 * Build the assertion list for a baseline entry from the already-resolved
 * gold assertions.
 *
 * - 'none': no assertions at all (empty array).
 * - 'full': a shallow copy of every gold assertion (the assertion objects
 *   themselves are shared with the input array).
 * - any other value (i.e. 'abbreviated'): if the gold list contains at
 *   least one `llm-rubric`, a single generic task-completion summary rubric;
 *   otherwise an empty array. Every other rubric AND every non-rubric
 *   assertion is dropped, and nothing from the original rubric (value,
 *   weight, metadata) is carried over.
 *
 * @param goldAsserts - Resolved gold assertions.
 * @param mode - Rubric mode: "abbreviated" | "full" | "none".
 * @returns A new array of baseline assertions.
 */
export function buildBaselineAsserts(goldAsserts, mode) {
    if (mode === "none") {
        return [];
    }
    if (mode === "full") {
        return [...goldAsserts];
    }
    // Abbreviated: only the presence of a rubric matters, not its content.
    const hasRubric = goldAsserts.some((entry) => entry.type === "llm-rubric");
    if (!hasRubric) {
        return [];
    }
    return [
        {
            type: "llm-rubric",
            value: 'Score task completion from 0 to 100 (same criteria as above).\nReturn ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}',
        },
    ];
}
|
|
136
|
-
/**
 * Reset the module-level `cachedRubricConfig` binding to `null`.
 * Intended for tests that need a clean cache between cases.
 */
export function clearRubricCache() {
    cachedRubricConfig = null;
}
|
|
142
|
-
/**
 * Turn one single-definition task into Promptfoo test entries.
 *
 * A gold entry is always produced. A baseline entry follows it unless
 * `mode` is `LiteracyVariant.AGENTIC` or the task sets
 * `baseline.enabled` to `false`, so the result has one or two elements.
 *
 * Steps:
 *   1. Every entry in `task.assert` is passed through `resolveAssert`.
 *   2. When `task.doc_coverage` is truthy, a "doc-coverage" `llm-rubric`
 *      is appended; it carries `metadata { dimension, maxScore: 100 }` only
 *      when that template defines a `dimension`.
 *   3. Gold entry: a copy of the resolved assertions, the description
 *      suffixed with " (gold)", `task.vars` plus `__featureArea`, and — only
 *      when `mode` is `LiteracyVariant.STANDARD` — `prompts: ["with-docs"]`.
 *   4. Baseline entry: the description suffixed with " (baseline)",
 *      `prompts: ["without-docs"]`, `task.vars` with `docs` blanked to ""
 *      plus `__featureArea`, and the assertions chosen by
 *      `buildBaselineAsserts` (rubric mode defaults to "full"). The `assert`
 *      key is left out entirely when that list is empty.
 *
 * @param task - Single-definition task to expand.
 * @param rubricConfig - Rubric config used to resolve templated assertions.
 * @param mode - Expansion variant; defaults to `LiteracyVariant.STANDARD`.
 * @returns The expanded entries, gold first.
 */
export function expandTask(task, rubricConfig, mode = LiteracyVariant.STANDARD) {
    const featureArea = task.featureArea ?? "";
    const goldAsserts = task.assert.map((entry) => resolveAssert(entry, rubricConfig));
    if (task.doc_coverage) {
        const coverageTemplate = rubricConfig.templates["doc-coverage"];
        const coverageAssert = {
            type: "llm-rubric",
            value: assembleRubric("doc-coverage", [], rubricConfig),
        };
        if (coverageTemplate?.dimension) {
            coverageAssert.metadata = {
                dimension: coverageTemplate.dimension,
                maxScore: 100,
            };
        }
        goldAsserts.push(coverageAssert);
    }
    // Keys are added in the same order the entry is meant to be serialized:
    // assert, description, (prompts), vars.
    const goldEntry = {
        assert: [...goldAsserts],
        description: `${task.description} (gold)`,
    };
    if (mode === LiteracyVariant.STANDARD) {
        goldEntry.prompts = ["with-docs"];
    }
    goldEntry.vars = { ...task.vars, __featureArea: featureArea };
    // Agentic runs never get a baseline; otherwise it is opt-out per task.
    const skipBaseline = mode === LiteracyVariant.AGENTIC || task.baseline?.enabled === false;
    if (skipBaseline) {
        return [goldEntry];
    }
    const baselineEntry = {
        description: `${task.description} (baseline)`,
        prompts: ["without-docs"],
        vars: {
            ...task.vars,
            docs: "",
            __featureArea: featureArea,
        },
    };
    const baselineAsserts = buildBaselineAsserts(goldAsserts, task.baseline?.rubric ?? "full");
    if (baselineAsserts.length > 0) {
        baselineEntry.assert = baselineAsserts;
    }
    return [goldEntry, baselineEntry];
}
|
|
210
|
-
/**
 * Map a LiteracyTaskDefinition onto the SingleTaskDefinition shape that
 * expandTask() consumes.
 *
 * The `docs` var points at `file://contexts/canonical/{id}.md` only when
 * `task.context.docs` has at least one element; otherwise it is the empty
 * string, so no context file path is emitted for tasks without canonical
 * docs. Entries of `task.prompt.vars` are spread last and therefore
 * override the computed `docs` and `task` vars on key collision.
 *
 * @param task - Literacy task definition.
 * @returns The equivalent single-definition task; each assertion is a
 *   shallow copy of the corresponding input assertion.
 */
function literacyTaskToSingle(task) {
    const docCount = task.context?.docs?.length ?? 0;
    const docsVar = docCount > 0 ? `file://contexts/canonical/${task.id}.md` : "";
    const assertCopies = [];
    for (const assertion of task.assertions ?? []) {
        assertCopies.push({ ...assertion });
    }
    const vars = {
        docs: docsVar,
        task: task.prompt?.text ?? "",
        ...task.prompt?.vars,
    };
    return {
        assert: assertCopies,
        baseline: task.baseline,
        description: task.title,
        doc_coverage: task.docCoverage,
        featureArea: task.area ?? "",
        id: task.id,
        vars,
    };
}
|
|
236
|
-
/**
 * Expand LiteracyTaskDefinition objects into Promptfoo-compatible test
 * entries without reading any task YAML files.
 *
 * The rubric config is loaded once via `loadRubricTemplates(rootDir)`; each
 * task is then converted with `literacyTaskToSingle` and expanded with
 * `expandTask`, and the results are concatenated in task order.
 *
 * @param tasks - Literacy task definitions.
 * @param rootDir - Root directory handed to `loadRubricTemplates`.
 * @param mode - Expansion variant forwarded to `expandTask`; defaults to
 *   `LiteracyVariant.STANDARD`.
 * @returns `entries` (all expanded entries) and `stats` with
 *   `expandedTotal` (entry count) and `totalTasks` (input task count).
 */
export function expandTaskDefinitions(tasks, rootDir, mode = LiteracyVariant.STANDARD) {
    const rubricConfig = loadRubricTemplates(rootDir);
    const entries = tasks.flatMap((definition) => expandTask(literacyTaskToSingle(definition), rubricConfig, mode));
    const stats = {
        expandedTotal: entries.length,
        totalTasks: tasks.length,
    };
    return { entries, stats };
}
|
|
262
|
-
/**
 * Collect the `id` of every single-definition task found in the YAML files
 * under `<rootDir>/tasks`. Files are visited in sorted name order; entries
 * that do not satisfy isSingleTaskDefinition() and files that do not parse
 * to an array are skipped.
 *
 * @param rootDir - Directory containing the `tasks/` folder
 * @returns Task IDs in file order; an empty array when `tasks/` is missing
 */
export function extractTaskIds(rootDir) {
    const tasksDir = resolve(rootDir, "tasks");
    if (!existsSync(tasksDir)) {
        return [];
    }
    const isTaskFile = (name) => !name.startsWith(".") &&
        [".yaml", ".yml"].some((extension) => name.endsWith(extension));
    const taskFiles = readdirSync(tasksDir).filter(isTaskFile).sort();
    return taskFiles.flatMap((fileName) => {
        const contents = readFileSync(resolve(tasksDir, fileName), "utf-8");
        const document = load(contents);
        if (!Array.isArray(document)) {
            return [];
        }
        return document
            .filter((candidate) => isSingleTaskDefinition(candidate))
            .map((definition) => definition.id);
    });
}
|
|
288
|
-
// ---------------------------------------------------------------------------
|
|
289
|
-
// Core expansion logic
|
|
290
|
-
// ---------------------------------------------------------------------------
|
|
291
|
-
/**
 * Type guard for the single-definition task format: a non-null object with
 * a string `id` plus `description`, `vars` and `assert` keys.
 */
export function isSingleTaskDefinition(entry) {
    if (typeof entry !== "object" || entry === null) {
        return false;
    }
    if (!("id" in entry) || typeof entry.id !== "string") {
        return false;
    }
    // Only key presence is checked for the remaining fields, not their values.
    return ["description", "vars", "assert"].every((key) => key in entry);
}
|
|
304
|
-
/**
 * Type guard: checks if an assertion uses the templated format, i.e. it is
 * an `llm-rubric` assertion with a string `template` and an array
 * `criteria`.
 *
 * Returns `false` (instead of throwing) for `null`, `undefined` and other
 * non-object values, matching the behaviour of isSingleTaskDefinition().
 *
 * @param entry - Candidate assertion entry (any value)
 * @returns `true` only for templated llm-rubric assertions
 */
export function isTemplatedAssert(entry) {
    // A stray empty list item in YAML parses to null; treat it as
    // "not templated" rather than crashing on the property access below.
    if (typeof entry !== "object" || entry === null) {
        return false;
    }
    return (entry.type === "llm-rubric" &&
        typeof entry.template === "string" &&
        Array.isArray(entry.criteria));
}
|
|
314
|
-
// ---------------------------------------------------------------------------
|
|
315
|
-
// Task file loading
|
|
316
|
-
// ---------------------------------------------------------------------------
|
|
317
|
-
/**
 * Load and expand all task files from the tasks/ directory.
 *
 * Entries that satisfy isSingleTaskDefinition() are expanded through
 * expandTask(); every other entry is counted as a legacy entry and passed
 * through unchanged.
 *
 * @param rootDir - Directory containing the `tasks/` folder; also passed to
 *   loadRubricTemplates().
 * @param filter - Optional `{ areas, taskIds }`. `areas` is matched
 *   case-insensitively against the file name stem (e.g. "groq" matches
 *   "groq.yaml"). `taskIds` keeps only entries whose `id` is listed, so
 *   entries without an `id` are dropped while that filter is active. A
 *   missing or empty list means the filter is not applied.
 * @param mode - Expansion mode forwarded to expandTask(); defaults to
 *   LiteracyVariant.STANDARD.
 * @param logger - Optional logger providing debug() and info(); a
 *   ConsoleLogger is created when omitted.
 * @returns `{ entries, stats }`: `entries` is one flat array in sorted
 *   file-name order; `stats` holds `expandedTotal`, `legacyEntries`,
 *   `singleDefinitions` and `totalFiles` (files left after the area filter).
 * @throws Error when a task file does not parse to an array.
 */
export function loadAndExpandTasks(rootDir, filter, mode = LiteracyVariant.STANDARD, logger) {
    const log = logger ?? new ConsoleLogger();
    const tasksDir = resolve(rootDir, "tasks");
    if (!existsSync(tasksDir)) {
        // tasks/ may not exist when task definitions come from Content Lake
        log.debug("Tasks directory not found, returning empty (tasks may come from Content Lake)", {
            tasksDir,
        });
        return {
            entries: [],
            stats: {
                expandedTotal: 0,
                legacyEntries: 0,
                singleDefinitions: 0,
                totalFiles: 0,
            },
        };
    }
    // Load rubric templates
    const rubricConfig = loadRubricTemplates(rootDir);
    // Decide once which filters are active, so that filtering and the final
    // summary log always agree (an empty list is "no filter").
    const areaFilter = filter?.areas && filter.areas.length > 0 ? filter.areas : null;
    const taskIdFilter = filter?.taskIds && filter.taskIds.length > 0 ? filter.taskIds : null;
    let yamlFiles = readdirSync(tasksDir)
        .filter((f) => (f.endsWith(".yaml") || f.endsWith(".yml")) && !f.startsWith("."))
        .sort();
    log.debug("Discovered task YAML files", {
        directory: tasksDir,
        fileCount: yamlFiles.length,
        files: yamlFiles,
    });
    // Apply area filter — area name = filename stem (e.g., "groq" matches "groq.yaml")
    if (areaFilter) {
        const allowedAreas = new Set(areaFilter.map((a) => a.toLowerCase()));
        const beforeCount = yamlFiles.length;
        yamlFiles = yamlFiles.filter((f) => {
            const stem = f.replace(/\.ya?ml$/, "").toLowerCase();
            return allowedAreas.has(stem);
        });
        log.debug("Applied area filter", {
            allowedAreas: [...allowedAreas],
            beforeCount,
            afterCount: yamlFiles.length,
            matchedFiles: yamlFiles,
        });
    }
    // The ID lookup does not depend on the file, so build it once.
    const allowedIds = taskIdFilter ? new Set(taskIdFilter) : null;
    const entries = [];
    let singleDefinitions = 0;
    let legacyEntries = 0;
    for (const file of yamlFiles) {
        const filePath = resolve(tasksDir, file);
        const raw = readFileSync(filePath, "utf-8");
        let parsed = load(raw);
        if (!Array.isArray(parsed)) {
            throw new Error(`${file} did not parse to an array of tasks`);
        }
        // Apply task ID filter
        if (allowedIds) {
            const beforeCount = parsed.length;
            parsed = parsed.filter((entry) => typeof entry === "object" &&
                entry !== null &&
                "id" in entry &&
                allowedIds.has(entry.id));
            log.debug("Applied task ID filter", {
                file,
                allowedIds: [...allowedIds],
                beforeCount,
                afterCount: parsed.length,
            });
        }
        for (const entry of parsed) {
            if (isSingleTaskDefinition(entry)) {
                singleDefinitions++;
                entries.push(...expandTask(entry, rubricConfig, mode));
            }
            else {
                legacyEntries++;
                entries.push(entry);
            }
        }
    }
    // Summarise only the filters that were actually applied.
    const filterSummary = [];
    if (areaFilter) {
        filterSummary.push(`areas=[${areaFilter.join(", ")}]`);
    }
    if (taskIdFilter) {
        filterSummary.push(`tasks=[${taskIdFilter.join(", ")}]`);
    }
    if (filterSummary.length > 0) {
        log.info(`Filter: ${filterSummary.join(", ")}`);
    }
    log.debug("Task expansion complete", {
        totalFiles: yamlFiles.length,
        singleDefinitions,
        legacyEntries,
        expandedTotal: entries.length,
        mode,
    });
    return {
        entries,
        stats: {
            expandedTotal: entries.length,
            legacyEntries,
            singleDefinitions,
            totalFiles: yamlFiles.length,
        },
    };
}
|
|
427
|
-
/**
 * Load the "rubrics" config via loadConfigFile() and validate it against
 * RubricConfigSchema. The first successfully validated config is stored in
 * the module-level cache and returned by every later call.
 *
 * NOTE(review): the cache is not keyed by `rootDir` — once populated it is
 * returned even when a different `rootDir` is passed. Confirm that callers
 * only ever use a single root per process.
 *
 * @param rootDir - Root directory handed to loadConfigFile()
 * @returns The validated rubric config
 * @throws Error listing every schema issue when validation fails
 */
export function loadRubricTemplates(rootDir) {
    if (!cachedRubricConfig) {
        const { data } = loadConfigFile("rubrics", rootDir);
        const validation = RubricConfigSchema.safeParse(data);
        if (!validation.success) {
            const issueLines = [];
            for (const issue of validation.error.issues) {
                issueLines.push(` [${issue.path.join(".")}]: ${issue.message}`);
            }
            throw new Error(`Invalid config/rubrics:\n${issueLines.join("\n")}`);
        }
        cachedRubricConfig = validation.data;
    }
    return cachedRubricConfig;
}
|
|
445
|
-
/**
 * Resolve one assertion. Non-templated assertions are returned as-is.
 * Templated assertions become a plain `llm-rubric` assertion whose `value`
 * is the text produced by assembleRubric(); `weight` is carried over when
 * set, and `metadata` is attached when the referenced template has a
 * `dimension`.
 *
 * @param entry - Assertion entry from a task definition
 * @param rubricConfig - Rubric config whose `templates` map is consulted
 * @returns The resolved assertion, or `entry` itself when not templated
 */
export function resolveAssert(entry, rubricConfig) {
    if (!isTemplatedAssert(entry)) {
        return entry;
    }
    const template = rubricConfig.templates[entry.template];
    const resolved = {
        type: "llm-rubric",
        value: assembleRubric(entry.template, entry.criteria, rubricConfig),
    };
    if (entry.weight !== undefined) {
        resolved.weight = entry.weight;
    }
    // Structured dimension metadata (Approach 5)
    if (template?.dimension) {
        resolved.metadata = {
            dimension: template.dimension,
            maxScore: 100,
        };
    }
    return resolved;
}
|