@sanity/ailf 2.0.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. package/LICENSE +21 -0
  2. package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
  3. package/dist/_vendor/ailf-core/examples/index.js +66 -1
  4. package/dist/agent-harness/assertions-runtime.d.ts +49 -0
  5. package/dist/agent-harness/assertions-runtime.js +138 -0
  6. package/dist/agent-harness/provider.d.ts +58 -0
  7. package/dist/agent-harness/provider.js +104 -0
  8. package/dist/cli.js +0 -0
  9. package/dist/commands/init.js +3 -0
  10. package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
  11. package/dist/orchestration/steps/generate-configs-step.js +35 -2
  12. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +39 -25
  13. package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
  14. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +9 -0
  15. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +28 -85
  16. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
  17. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
  18. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
  19. package/package.json +25 -24
  20. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  21. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  22. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  23. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  24. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  25. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  26. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  27. package/dist/_vendor/ailf-tasks/index.js +0 -16
  28. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  29. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  30. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  31. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  32. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  33. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  34. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  35. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  36. package/dist/agent-observer/test-imports.d.ts +0 -7
  37. package/dist/agent-observer/test-imports.js +0 -185
  38. package/dist/commands/update-quality-scores.d.ts +0 -5
  39. package/dist/commands/update-quality-scores.js +0 -20
  40. package/dist/lib/agent-behavior-report.d.ts +0 -8
  41. package/dist/lib/agent-behavior-report.js +0 -185
  42. package/dist/lib/baseline.d.ts +0 -19
  43. package/dist/lib/baseline.js +0 -153
  44. package/dist/lib/calculate-scores.d.ts +0 -23
  45. package/dist/lib/calculate-scores.js +0 -42
  46. package/dist/lib/compare.d.ts +0 -18
  47. package/dist/lib/compare.js +0 -170
  48. package/dist/lib/coverage-audit.d.ts +0 -4
  49. package/dist/lib/coverage-audit.js +0 -42
  50. package/dist/lib/discovery-report.d.ts +0 -13
  51. package/dist/lib/discovery-report.js +0 -57
  52. package/dist/lib/fetch-docs.d.ts +0 -30
  53. package/dist/lib/fetch-docs.js +0 -171
  54. package/dist/lib/generate-configs.d.ts +0 -25
  55. package/dist/lib/generate-configs.js +0 -42
  56. package/dist/lib/grader-api.d.ts +0 -21
  57. package/dist/lib/grader-api.js +0 -34
  58. package/dist/lib/grader-compare.d.ts +0 -19
  59. package/dist/lib/grader-compare.js +0 -91
  60. package/dist/lib/grader-consistency.d.ts +0 -27
  61. package/dist/lib/grader-consistency.js +0 -79
  62. package/dist/lib/grader-sensitivity.d.ts +0 -19
  63. package/dist/lib/grader-sensitivity.js +0 -75
  64. package/dist/lib/grader-validate.d.ts +0 -19
  65. package/dist/lib/grader-validate.js +0 -78
  66. package/dist/lib/measure-retrieval.d.ts +0 -14
  67. package/dist/lib/measure-retrieval.js +0 -71
  68. package/dist/lib/pr-comment.d.ts +0 -16
  69. package/dist/lib/pr-comment.js +0 -28
  70. package/dist/lib/readiness-report.d.ts +0 -13
  71. package/dist/lib/readiness-report.js +0 -108
  72. package/dist/lib/webhook-server.d.ts +0 -11
  73. package/dist/lib/webhook-server.js +0 -24
  74. package/dist/lib/weekly-digest.d.ts +0 -24
  75. package/dist/lib/weekly-digest.js +0 -148
  76. package/dist/orchestration/env-bridge.d.ts +0 -21
  77. package/dist/orchestration/env-bridge.js +0 -66
  78. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  79. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  80. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  81. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  82. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  83. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  84. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  85. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  86. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  87. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  88. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  89. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  90. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  91. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  92. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  93. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  94. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  95. package/dist/pipeline/compiler/task-bridge.js +0 -92
  96. package/dist/pipeline/expand-tasks.d.ts +0 -232
  97. package/dist/pipeline/expand-tasks.js +0 -467
  98. package/dist/pipeline/generate-configs.d.ts +0 -92
  99. package/dist/pipeline/generate-configs.js +0 -445
  100. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  101. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  102. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  103. package/dist/pipeline/steps/compare-step.js +0 -90
  104. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  105. package/dist/pipeline/steps/eval-step.js +0 -347
  106. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  107. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  108. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  109. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  110. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  111. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  112. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  113. package/dist/pipeline/steps/publish-report-step.js +0 -243
  114. package/dist/pipeline/steps/report-step.d.ts +0 -13
  115. package/dist/pipeline/steps/report-step.js +0 -56
  116. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  117. package/dist/pipeline/steps/update-scores-step.js +0 -42
  118. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  119. package/dist/scripts/agent-behavior-report.js +0 -315
  120. package/dist/scripts/baseline.d.ts +0 -43
  121. package/dist/scripts/baseline.js +0 -267
  122. package/dist/scripts/calculate-scores.d.ts +0 -166
  123. package/dist/scripts/calculate-scores.js +0 -1296
  124. package/dist/scripts/compare.d.ts +0 -22
  125. package/dist/scripts/compare.js +0 -334
  126. package/dist/scripts/coverage-audit.d.ts +0 -44
  127. package/dist/scripts/coverage-audit.js +0 -209
  128. package/dist/scripts/debug-eval.d.ts +0 -19
  129. package/dist/scripts/debug-eval.js +0 -73
  130. package/dist/scripts/discovery-report.d.ts +0 -58
  131. package/dist/scripts/discovery-report.js +0 -250
  132. package/dist/scripts/fetch-docs.d.ts +0 -35
  133. package/dist/scripts/fetch-docs.js +0 -472
  134. package/dist/scripts/generate-configs.d.ts +0 -66
  135. package/dist/scripts/generate-configs.js +0 -459
  136. package/dist/scripts/grader-api.d.ts +0 -27
  137. package/dist/scripts/grader-api.js +0 -206
  138. package/dist/scripts/grader-compare.d.ts +0 -22
  139. package/dist/scripts/grader-compare.js +0 -368
  140. package/dist/scripts/grader-consistency.d.ts +0 -20
  141. package/dist/scripts/grader-consistency.js +0 -313
  142. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  143. package/dist/scripts/grader-sensitivity.js +0 -354
  144. package/dist/scripts/grader-validate.d.ts +0 -19
  145. package/dist/scripts/grader-validate.js +0 -267
  146. package/dist/scripts/measure-retrieval.d.ts +0 -10
  147. package/dist/scripts/measure-retrieval.js +0 -145
  148. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  149. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  150. package/dist/scripts/pipeline.d.ts +0 -76
  151. package/dist/scripts/pipeline.js +0 -1031
  152. package/dist/scripts/pr-comment.d.ts +0 -10
  153. package/dist/scripts/pr-comment.js +0 -510
  154. package/dist/scripts/readiness-report.d.ts +0 -88
  155. package/dist/scripts/readiness-report.js +0 -342
  156. package/dist/scripts/update-quality-scores.d.ts +0 -15
  157. package/dist/scripts/update-quality-scores.js +0 -184
  158. package/dist/scripts/validate-task-sources.d.ts +0 -21
  159. package/dist/scripts/validate-task-sources.js +0 -210
  160. package/dist/scripts/validate.d.ts +0 -13
  161. package/dist/scripts/validate.js +0 -79
  162. package/dist/scripts/webhook-server.d.ts +0 -26
  163. package/dist/scripts/webhook-server.js +0 -147
  164. package/dist/scripts/weekly-digest.d.ts +0 -24
  165. package/dist/scripts/weekly-digest.js +0 -144
  166. package/dist/sinks/format-slack.d.ts +0 -64
  167. package/dist/sinks/format-slack.js +0 -306
  168. package/dist/sinks/slack-sink.d.ts +0 -27
  169. package/dist/sinks/slack-sink.js +0 -78
  170. package/dist/sinks/webhook-sink.d.ts +0 -19
  171. package/dist/sinks/webhook-sink.js +0 -50
  172. package/tasks/.expanded.agentic.yaml +0 -280
  173. package/tasks/.expanded.yaml +0 -565
@@ -1,232 +0,0 @@
1
- /**
2
- * pipeline/expand-tasks.ts
3
- *
4
- * @deprecated This is part of the LEGACY compilation path. New code should
5
- * use the literacy handler in the compiler pipeline instead:
6
- *
7
- * import { compileLiteracyTask } from "./compiler/mode-handlers/literacy/index.js"
8
- *
9
- * @see packages/eval/src/pipeline/compiler/mode-handlers/literacy/index.ts
10
- *
11
- * ---
12
- *
13
- * Reads task YAML files in the single-definition format and expands each
14
- * task into gold + baseline Promptfoo test entries. This eliminates the
15
- * manual duplication where every task had to be written twice.
16
- *
17
- * Rubric templates from config/rubrics.yaml are resolved at expansion time:
18
- * tasks specify `template` + `criteria`, and the expander assembles
19
- * the full rubric text by injecting criteria into the template.
20
- *
21
- * Structured dimension metadata (Approach 5):
22
- * When a rubric template has a `dimension` field, the resolved assertion
23
- * includes `metadata.dimension` and `metadata.maxScore`. This flows through
24
- * Promptfoo into component results, allowing the scoring engine to classify
25
- * rubrics structurally instead of via heuristic string matching.
26
- * See docs/design-docs/structured-dimensions.md.
27
- *
28
- * Single-definition format:
29
- * - id: groq-blog-queries
30
- * description: "GROQ - Blog queries with filtering and pagination"
31
- * doc_coverage: true
32
- * vars:
33
- * task: |
34
- * Write GROQ queries for a Sanity blog application: ...
35
- * docs: file://contexts/canonical/groq-blog-queries.md
36
- * assert:
37
- * - type: llm-rubric
38
- * template: task-completion
39
- * criteria:
40
- * - GROQ filter with _type == "post"
41
- * - Projection with aliased slug field
42
- * - type: contains-any
43
- * value: ["client.fetch", "createClient"]
44
- * baseline:
45
- * enabled: true
46
- * rubric: full
47
- *
48
- * Expands to:
49
- * 1. Gold entry — uses vars.docs as-is, resolves templates, appends doc-coverage
50
- * 2. Baseline entry — sets docs: "", uses full rubric (same assertions as gold)
51
- */
52
- import type { LiteracyTaskDefinition, Logger } from "../_vendor/ailf-core/index.d.ts";
53
- import { type RubricConfig } from "./schemas.js";
54
- import { type LiteracyEvalSubMode } from "./normalize-mode.js";
55
- import type { FilterOptions } from "./types.js";
56
- /** Any assertion entry (templated or value-based). */
57
- export type AssertEntry = TemplatedAssert | ValueAssert;
58
- /** The output format — a Promptfoo-compatible test entry. */
59
- export interface ExpandedTestEntry {
60
- assert?: ValueAssert[];
61
- description: string;
62
- /**
63
- * Promptfoo prompt filter — restricts which prompts this test runs against.
64
- * Matches on prompt `id` or `label`. When absent, all prompts are allowed
65
- * (Promptfoo's default cartesian product behavior).
66
- *
67
- * Gold entries use `['with-docs']` (ceiling measurement).
68
- * Baseline entries use `['without-docs']` (floor measurement).
69
- *
70
- * See: evaluation-ceiling.md for the floor/ceiling/actual decomposition.
71
- */
72
- prompts?: string[];
73
- vars: Record<string, unknown>;
74
- }
75
- /** A legacy task entry (the old paired format without an `id` field). */
76
- export interface LegacyTaskEntry {
77
- assert?: AssertEntry[];
78
- description: string;
79
- transform?: string;
80
- vars?: Record<string, unknown>;
81
- }
82
- /** A single task definition in the new format (input). */
83
- export interface SingleTaskDefinition {
84
- /** Grading assertions (applied to both gold and baseline by default). */
85
- assert: AssertEntry[];
86
- /** Baseline generation options. */
87
- baseline?: {
88
- /** Whether to generate a baseline variant. Default: true. */
89
- enabled?: boolean;
90
- /** Rubric mode: 'full' copies all asserts, 'abbreviated' generates a
91
- * summary rubric, 'none' omits rubric asserts. Default: 'full'. */
92
- rubric?: "abbreviated" | "full" | "none";
93
- };
94
- /** Human-readable description of what this task tests. */
95
- description: string;
96
- /** Opt-in: auto-generate a documentation coverage rubric for gold. */
97
- doc_coverage?: boolean;
98
- /** Feature area this task belongs to (flows through to scoring). */
99
- featureArea?: string;
100
- /** Explicit task ID — determines the canonical context filename. */
101
- id: string;
102
- /** Template variables: task prompt and docs path. */
103
- vars: {
104
- task: string;
105
- docs: string;
106
- [key: string]: unknown;
107
- };
108
- }
109
- /** A templated assertion — references a rubric template. */
110
- export interface TemplatedAssert {
111
- criteria: string[];
112
- template: string;
113
- type: "llm-rubric";
114
- weight?: number;
115
- }
116
- /** A standard assertion with a value. */
117
- export interface ValueAssert {
118
- [key: string]: unknown;
119
- type: string;
120
- value?: unknown;
121
- weight?: number;
122
- }
123
- /**
124
- * Assemble a full rubric text string from a template and criteria.
125
- *
126
- * Output format:
127
- * {header}
128
- * - {scale[0]}
129
- * - {scale[1]}
130
- * ...
131
- *
132
- * {criteria_label}
133
- * - {criteria[0]}
134
- * - {criteria[1]}
135
- * ...
136
- *
137
- * {footer}
138
- */
139
- export declare function assembleRubric(templateKey: string, criteria: string[], rubricConfig: RubricConfig): string;
140
- /**
141
- * Build baseline assertions based on the rubric mode.
142
- *
143
- * - 'full': Copy all assertions as-is
144
- * - 'abbreviated': Keep only the first llm-rubric (task completion) with
145
- * a shortened prompt; every other assertion, non-rubric included, is dropped
146
- * - 'none': No assertions at all
147
- */
148
- export declare function buildBaselineAsserts(goldAsserts: ValueAssert[], mode: "abbreviated" | "full" | "none"): ValueAssert[];
149
- /**
150
- * Clear the cached rubric config. Used in tests.
151
- */
152
- export declare function clearRubricCache(): void;
153
- /**
154
- * Expand a single task definition into gold + baseline Promptfoo test entries.
155
- * Returns 1 entry (gold only) if baseline is disabled or the mode is agentic, or 2 entries otherwise.
156
- *
157
- * Resolves templated assertions and appends doc-coverage if opted in.
158
- *
159
- * @param mode - Controls which entries are generated and how:
160
- * - `'baseline'` (default): Gold + baseline entries with `prompts` filter
161
- * to prevent cartesian product with multiple prompts. Gold entries get
162
- * `prompts: ['with-docs']`, baseline entries get `prompts: ['without-docs']`.
163
- * - `'agentic'`: Gold entries only, no `prompts` filter (agentic mode has
164
- * a single prompt that doesn't use `{{docs}}`; baseline entries would be
165
- * pure waste — identical prompts, wasted API calls).
166
- */
167
- export declare function expandTask(task: SingleTaskDefinition, rubricConfig: RubricConfig, mode?: LiteracyEvalSubMode): ExpandedTestEntry[];
168
- /**
169
- * Expand an array of LiteracyTaskDefinition[] (from any TaskSource adapter)
170
- * into Promptfoo-compatible test entries. This is the TaskSource-aware
171
- * counterpart of loadAndExpandTasks() — it skips YAML file I/O and works
172
- * directly with the domain type.
173
- *
174
- * @param tasks - Literacy task definitions from any TaskSource adapter
175
- * @param rootDir - Eval package root (needed to load rubric templates)
176
- * @param mode - Expansion mode: 'baseline' (gold + baseline) or 'agentic' (gold only)
177
- * @returns Expanded test entries and statistics
178
- */
179
- export declare function expandTaskDefinitions(tasks: LiteracyTaskDefinition[], rootDir: string, mode?: LiteracyEvalSubMode): {
180
- entries: ExpandedTestEntry[];
181
- stats: {
182
- totalTasks: number;
183
- expandedTotal: number;
184
- };
185
- };
186
- /**
187
- * Extract all task IDs from task files. Only works with the new
188
- * single-definition format entries (those that have an `id` field).
189
- */
190
- export declare function extractTaskIds(rootDir: string): string[];
191
- /**
192
- * Type guard: checks if an entry is in the new single-definition format.
193
- * The distinguishing feature is a string `id` field; `description`, `vars`, and `assert` keys must also be present.
194
- */
195
- export declare function isSingleTaskDefinition(entry: unknown): entry is SingleTaskDefinition;
196
- /**
197
- * Type guard: checks if an assertion uses the templated format.
198
- */
199
- export declare function isTemplatedAssert(entry: AssertEntry): entry is TemplatedAssert;
200
- /**
201
- * Load and expand all task files from the tasks/ directory.
202
- * Supports both the new single-definition format (has `id`) and the legacy
203
- * paired format (no `id`). Legacy entries pass through unchanged.
204
- *
205
- * @param mode - Controls expansion behavior:
206
- * - `'baseline'` (default): Gold + baseline entries with prompt filters.
207
- * - `'agentic'`: Gold entries only, no prompt filters.
208
- *
209
- * Returns all expanded entries as one ordered list, plus processing statistics.
210
- */
211
- export declare function loadAndExpandTasks(rootDir: string, filter?: FilterOptions, mode?: LiteracyEvalSubMode, logger?: Logger): {
212
- /** All expanded test entries, in order. */
213
- entries: ExpandedTestEntry[];
214
- /** Statistics about what was processed. */
215
- stats: {
216
- totalFiles: number;
217
- singleDefinitions: number;
218
- legacyEntries: number;
219
- expandedTotal: number;
220
- };
221
- };
222
- /**
223
- * Load and validate config/rubrics.yaml from the given root directory.
224
- * Caches the result for subsequent calls with the same rootDir.
225
- */
226
- export declare function loadRubricTemplates(rootDir: string): RubricConfig;
227
- /**
228
- * Resolve a single assertion: if it's templated, assemble the rubric text
229
- * and attach structured dimension metadata when the template has a
230
- * `dimension` field. Otherwise, pass through unchanged.
231
- */
232
- export declare function resolveAssert(entry: AssertEntry, rubricConfig: RubricConfig): ValueAssert;
@@ -1,467 +0,0 @@
1
- /**
2
- * pipeline/expand-tasks.ts
3
- *
4
- * @deprecated This is part of the LEGACY compilation path. New code should
5
- * use the literacy handler in the compiler pipeline instead:
6
- *
7
- * import { compileLiteracyTask } from "./compiler/mode-handlers/literacy/index.js"
8
- *
9
- * @see packages/eval/src/pipeline/compiler/mode-handlers/literacy/index.ts
10
- *
11
- * ---
12
- *
13
- * Reads task YAML files in the single-definition format and expands each
14
- * task into gold + baseline Promptfoo test entries. This eliminates the
15
- * manual duplication where every task had to be written twice.
16
- *
17
- * Rubric templates from config/rubrics.yaml are resolved at expansion time:
18
- * tasks specify `template` + `criteria`, and the expander assembles
19
- * the full rubric text by injecting criteria into the template.
20
- *
21
- * Structured dimension metadata (Approach 5):
22
- * When a rubric template has a `dimension` field, the resolved assertion
23
- * includes `metadata.dimension` and `metadata.maxScore`. This flows through
24
- * Promptfoo into component results, allowing the scoring engine to classify
25
- * rubrics structurally instead of via heuristic string matching.
26
- * See docs/design-docs/structured-dimensions.md.
27
- *
28
- * Single-definition format:
29
- * - id: groq-blog-queries
30
- * description: "GROQ - Blog queries with filtering and pagination"
31
- * doc_coverage: true
32
- * vars:
33
- * task: |
34
- * Write GROQ queries for a Sanity blog application: ...
35
- * docs: file://contexts/canonical/groq-blog-queries.md
36
- * assert:
37
- * - type: llm-rubric
38
- * template: task-completion
39
- * criteria:
40
- * - GROQ filter with _type == "post"
41
- * - Projection with aliased slug field
42
- * - type: contains-any
43
- * value: ["client.fetch", "createClient"]
44
- * baseline:
45
- * enabled: true
46
- * rubric: full
47
- *
48
- * Expands to:
49
- * 1. Gold entry — uses vars.docs as-is, resolves templates, appends doc-coverage
50
- * 2. Baseline entry — sets docs: "", uses full rubric (same assertions as gold)
51
- */
52
- import { existsSync, readFileSync, readdirSync } from "fs";
53
- import { resolve } from "path";
54
- import { load } from "js-yaml";
55
- import { ConsoleLogger } from "../adapters/loggers/index.js";
56
- import { loadConfigFile } from "./compiler/config-loader.js";
57
- import { RubricConfigSchema } from "./schemas.js";
58
- import { LiteracyVariant } from "./normalize-mode.js";
59
- // ---------------------------------------------------------------------------
60
- // Rubric template loading and assembly
61
- // ---------------------------------------------------------------------------
62
- /** Cached rubric config — loaded once per process. */
63
- let cachedRubricConfig = null;
64
- /**
65
- * Assemble a full rubric text string from a template and criteria.
66
- *
67
- * Output format:
68
- * {header}
69
- * - {scale[0]}
70
- * - {scale[1]}
71
- * ...
72
- *
73
- * {criteria_label}
74
- * - {criteria[0]}
75
- * - {criteria[1]}
76
- * ...
77
- *
78
- * {footer}
79
- */
80
- export function assembleRubric(templateKey, criteria, rubricConfig) {
81
- const template = rubricConfig.templates[templateKey];
82
- if (!template) {
83
- throw new Error(`Unknown rubric template '${templateKey}'. Available: ${Object.keys(rubricConfig.templates).join(", ")}`);
84
- }
85
- const parts = [];
86
- // Header
87
- parts.push(template.header);
88
- // Scale
89
- for (const item of template.scale) {
90
- parts.push(`- ${item}`);
91
- }
92
- // Criteria (if template has a label and criteria are provided)
93
- if (template.criteria_label && criteria.length > 0) {
94
- parts.push(""); // blank line
95
- parts.push(template.criteria_label);
96
- for (const c of criteria) {
97
- parts.push(`- ${c}`);
98
- }
99
- }
100
- // Footer
101
- parts.push(""); // blank line
102
- parts.push(rubricConfig.footer);
103
- return parts.join("\n");
104
- }
105
- /**
106
- * Build baseline assertions based on the rubric mode.
107
- *
108
- * - 'full': Copy all assertions as-is
109
- * - 'abbreviated': Keep only the first llm-rubric (task completion) with
110
- * a shortened prompt, plus all non-rubric assertions
111
- * - 'none': No assertions at all
112
- */
113
- export function buildBaselineAsserts(goldAsserts, mode) {
114
- if (mode === "none")
115
- return [];
116
- if (mode === "full")
117
- return [...goldAsserts];
118
- // Abbreviated: keep the first llm-rubric with a summary, drop the rest
119
- const abbreviated = [];
120
- let foundFirstRubric = false;
121
- for (const a of goldAsserts) {
122
- if (a.type === "llm-rubric") {
123
- if (!foundFirstRubric) {
124
- foundFirstRubric = true;
125
- abbreviated.push({
126
- type: "llm-rubric",
127
- value: 'Score task completion from 0 to 100 (same criteria as above).\nReturn ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}',
128
- });
129
- }
130
- // Skip subsequent llm-rubrics (code correctness, doc coverage)
131
- }
132
- // Non-rubric assertions are excluded from baseline to match current behavior
133
- }
134
- return abbreviated;
135
- }
136
- /**
137
- * Clear the cached rubric config. Used in tests.
138
- */
139
- export function clearRubricCache() {
140
- cachedRubricConfig = null;
141
- }
142
- /**
143
- * Expand a single task definition into gold + baseline Promptfoo test entries.
144
- * Returns 1 entry (gold only) if baseline is disabled or the mode is agentic, or 2 entries otherwise.
145
- *
146
- * Resolves templated assertions and appends doc-coverage if opted in.
147
- *
148
- * @param mode - Controls which entries are generated and how:
149
- * - `'baseline'` (default): Gold + baseline entries with `prompts` filter
150
- * to prevent cartesian product with multiple prompts. Gold entries get
151
- * `prompts: ['with-docs']`, baseline entries get `prompts: ['without-docs']`.
152
- * - `'agentic'`: Gold entries only, no `prompts` filter (agentic mode has
153
- * a single prompt that doesn't use `{{docs}}`; baseline entries would be
154
- * pure waste — identical prompts, wasted API calls).
155
- */
156
- export function expandTask(task, rubricConfig, mode = LiteracyVariant.STANDARD) {
157
- const entries = [];
158
- // Resolve all templated assertions
159
- const resolvedAsserts = task.assert.map((a) => resolveAssert(a, rubricConfig));
160
- // Append doc-coverage rubric if opted in
161
- if (task.doc_coverage) {
162
- const dcTemplate = rubricConfig.templates["doc-coverage"];
163
- resolvedAsserts.push({
164
- type: "llm-rubric",
165
- value: assembleRubric("doc-coverage", [], rubricConfig),
166
- ...(dcTemplate?.dimension && {
167
- metadata: {
168
- dimension: dcTemplate.dimension,
169
- maxScore: 100,
170
- },
171
- }),
172
- });
173
- }
174
- // Gold entry — ceiling measurement (canonical docs injected directly).
175
- // In baseline/observed mode: restricted to the 'with-docs' prompt to
176
- // prevent cartesian product with the 'without-docs' prompt.
177
- // In agentic mode: no filter needed (single prompt, id = 'agentic').
178
- // See evaluation-ceiling.md for the floor/ceiling/actual decomposition.
179
- entries.push({
180
- assert: [...resolvedAsserts],
181
- description: `${task.description} (gold)`,
182
- ...(mode === LiteracyVariant.STANDARD ? { prompts: ["with-docs"] } : {}),
183
- vars: { ...task.vars, __featureArea: task.featureArea ?? "" },
184
- });
185
- // Baseline entry — floor measurement (no docs, parametric knowledge only).
186
- // Skipped entirely in agentic mode: the agentic prompt doesn't reference
187
- // {{docs}}, so gold and baseline would produce identical prompts — pure
188
- // waste of API calls and cost.
189
- if (mode === LiteracyVariant.AGENTIC) {
190
- return entries;
191
- }
192
- // The baseline entry is generated unless explicitly disabled, and is restricted to the 'without-docs' prompt.
193
- const baselineEnabled = task.baseline?.enabled !== false;
194
- if (baselineEnabled) {
195
- const rubricMode = task.baseline?.rubric ?? "full";
196
- const baselineAsserts = buildBaselineAsserts(resolvedAsserts, rubricMode);
197
- entries.push({
198
- description: `${task.description} (baseline)`,
199
- prompts: ["without-docs"],
200
- vars: {
201
- ...task.vars,
202
- docs: "",
203
- __featureArea: task.featureArea ?? "",
204
- },
205
- ...(baselineAsserts.length > 0 ? { assert: baselineAsserts } : {}),
206
- });
207
- }
208
- return entries;
209
- }
210
- /**
211
- * Convert a LiteracyTaskDefinition to the local SingleTaskDefinition
212
- * format used by expandTask().
213
- *
214
- * When a task has no canonical docs, the `docs` var is set to empty string
215
- * instead of a file path. This prevents Promptfoo from trying to read a
216
- * context file that was never created by FetchDocsStep (which skips tasks
217
- * without canonical docs). The gold entry still runs — it just tests model
218
- * knowledge alone, same as the baseline variant.
219
- */
220
- function literacyTaskToSingle(task) {
221
- const hasDocs = (task.context?.docs?.length ?? 0) > 0;
222
- return {
223
- assert: (task.assertions ?? []).map((a) => ({ ...a })),
224
- baseline: task.baseline,
225
- description: task.title,
226
- doc_coverage: task.docCoverage,
227
- featureArea: task.area ?? "",
228
- id: task.id,
229
- vars: {
230
- docs: hasDocs ? `file://contexts/canonical/${task.id}.md` : "",
231
- task: task.prompt?.text ?? "",
232
- ...task.prompt?.vars,
233
- },
234
- };
235
- }
236
- /**
237
- * Expand an array of LiteracyTaskDefinition[] (from any TaskSource adapter)
238
- * into Promptfoo-compatible test entries. This is the TaskSource-aware
239
- * counterpart of loadAndExpandTasks() — it skips YAML file I/O and works
240
- * directly with the domain type.
241
- *
242
- * @param tasks - Literacy task definitions from any TaskSource adapter
243
- * @param rootDir - Eval package root (needed to load rubric templates)
244
- * @param mode - Expansion mode: 'baseline' (gold + baseline) or 'agentic' (gold only)
245
- * @returns Expanded test entries and statistics
246
- */
247
- export function expandTaskDefinitions(tasks, rootDir, mode = LiteracyVariant.STANDARD) {
248
- const rubricConfig = loadRubricTemplates(rootDir);
249
- const entries = [];
250
- for (const task of tasks) {
251
- const single = literacyTaskToSingle(task);
252
- entries.push(...expandTask(single, rubricConfig, mode));
253
- }
254
- return {
255
- entries,
256
- stats: {
257
- expandedTotal: entries.length,
258
- totalTasks: tasks.length,
259
- },
260
- };
261
- }
262
- /**
263
- * Extract all task IDs from task files. Only works with the new
264
- * single-definition format entries (those that have an `id` field).
265
- */
266
- export function extractTaskIds(rootDir) {
267
- const tasksDir = resolve(rootDir, "tasks");
268
- if (!existsSync(tasksDir))
269
- return [];
270
- const yamlFiles = readdirSync(tasksDir)
271
- .filter((f) => (f.endsWith(".yaml") || f.endsWith(".yml")) && !f.startsWith("."))
272
- .sort();
273
- const ids = [];
274
- for (const file of yamlFiles) {
275
- const filePath = resolve(tasksDir, file);
276
- const raw = readFileSync(filePath, "utf-8");
277
- const parsed = load(raw);
278
- if (!Array.isArray(parsed))
279
- continue;
280
- for (const entry of parsed) {
281
- if (isSingleTaskDefinition(entry)) {
282
- ids.push(entry.id);
283
- }
284
- }
285
- }
286
- return ids;
287
- }
288
- // ---------------------------------------------------------------------------
289
- // Core expansion logic
290
- // ---------------------------------------------------------------------------
291
/**
 * Type guard for the single-definition task format.
 *
 * An entry qualifies when it is a non-null object with a string `id` and
 * with `description`, `vars` and `assert` keys present. Only the presence
 * of those three keys is checked, not their values.
 *
 * @param entry - Any parsed value
 * @returns `true` when the entry has the single-definition shape
 */
export function isSingleTaskDefinition(entry) {
  if (typeof entry !== "object" || entry === null) {
    return false;
  }
  if (!("id" in entry) || typeof entry.id !== "string") {
    return false;
  }
  const requiredKeys = ["description", "vars", "assert"];
  return requiredKeys.every((key) => key in entry);
}
304
/**
 * Type guard for templated assertions.
 *
 * A templated assertion has `type` "llm-rubric", a string `template` and an
 * array `criteria`.
 *
 * @param entry - Assertion entry; must not be null or undefined, because
 *   `entry.type` is read unconditionally
 * @returns `true` when the entry uses the templated format
 */
export function isTemplatedAssert(entry) {
  if (entry.type !== "llm-rubric") {
    return false;
  }
  const hasTemplateName = "template" in entry && typeof entry.template === "string";
  if (!hasTemplateName) {
    return false;
  }
  return "criteria" in entry && Array.isArray(entry.criteria);
}
314
- // ---------------------------------------------------------------------------
315
- // Task file loading
316
- // ---------------------------------------------------------------------------
317
/**
 * Read every task YAML file in `<rootDir>/tasks`, expand the entries and
 * return them together with summary statistics.
 *
 * Entries in the single-definition format (see isSingleTaskDefinition) are
 * expanded through expandTask(); all other entries are treated as legacy
 * and passed through unchanged.
 *
 * Filtering:
 * - `filter.areas` (non-empty) keeps only files whose name without the
 *   `.yaml` / `.yml` extension matches an area, case-insensitively
 *   (e.g. "groq" matches "groq.yaml").
 * - `filter.taskIds` (non-empty) keeps only entries whose `id` is listed,
 *   which also drops legacy entries that have no `id`.
 *
 * @param rootDir - Directory that contains the `tasks` folder
 * @param filter - Optional `{ areas, taskIds }` filter
 * @param mode - Expansion mode forwarded to expandTask(); defaults to
 *   LiteracyVariant.STANDARD
 * @param logger - Optional logger; a ConsoleLogger is created when omitted
 * @returns `{ entries, stats }`; entries is empty when `tasks` is missing
 * @throws Error when a task file does not parse to an array
 */
export function loadAndExpandTasks(rootDir, filter, mode = LiteracyVariant.STANDARD, logger) {
  const log = logger ?? new ConsoleLogger();
  const tasksDir = resolve(rootDir, "tasks");

  if (!existsSync(tasksDir)) {
    // tasks/ may not exist when task definitions come from Content Lake
    log.debug("Tasks directory not found, returning empty (tasks may come from Content Lake)", {
      tasksDir,
    });
    return {
      entries: [],
      stats: {
        expandedTotal: 0,
        legacyEntries: 0,
        singleDefinitions: 0,
        totalFiles: 0,
      },
    };
  }

  // Load rubric templates
  const rubricConfig = loadRubricTemplates(rootDir);

  const discoveredFiles = readdirSync(tasksDir)
    .filter((name) => !name.startsWith(".") && (name.endsWith(".yaml") || name.endsWith(".yml")))
    .sort();
  log.debug("Discovered task YAML files", {
    directory: tasksDir,
    fileCount: discoveredFiles.length,
    files: discoveredFiles,
  });

  // Area filter: area name = filename stem (e.g., "groq" matches "groq.yaml")
  let selectedFiles = discoveredFiles;
  if (filter?.areas && filter.areas.length > 0) {
    const allowedAreas = new Set(filter.areas.map((area) => area.toLowerCase()));
    selectedFiles = discoveredFiles.filter((name) =>
      allowedAreas.has(name.replace(/\.ya?ml$/, "").toLowerCase()),
    );
    log.debug("Applied area filter", {
      allowedAreas: [...allowedAreas],
      beforeCount: discoveredFiles.length,
      afterCount: selectedFiles.length,
      matchedFiles: selectedFiles,
    });
  }

  const entries = [];
  let singleDefinitions = 0;
  let legacyEntries = 0;

  for (const file of selectedFiles) {
    const document = load(readFileSync(resolve(tasksDir, file), "utf-8"));
    if (!Array.isArray(document)) {
      throw new Error(`${file} did not parse to an array of tasks`);
    }

    // Task ID filter
    let candidates = document;
    if (filter?.taskIds && filter.taskIds.length > 0) {
      const allowedIds = new Set(filter.taskIds);
      candidates = document.filter(
        (item) =>
          typeof item === "object" && item !== null && "id" in item && allowedIds.has(item.id),
      );
      log.debug("Applied task ID filter", {
        file,
        allowedIds: [...allowedIds],
        beforeCount: document.length,
        afterCount: candidates.length,
      });
    }

    for (const item of candidates) {
      if (!isSingleTaskDefinition(item)) {
        // Legacy paired format: pass through untouched.
        legacyEntries += 1;
        entries.push(item);
        continue;
      }
      singleDefinitions += 1;
      entries.push(...expandTask(item, rubricConfig, mode));
    }
  }

  if (filter?.areas || filter?.taskIds) {
    const summary = [];
    if (filter.areas) {
      summary.push(`areas=[${filter.areas.join(", ")}]`);
    }
    if (filter.taskIds) {
      summary.push(`tasks=[${filter.taskIds.join(", ")}]`);
    }
    log.info(`Filter: ${summary.join(", ")}`);
  }

  const totalFiles = selectedFiles.length;
  const expandedTotal = entries.length;
  log.debug("Task expansion complete", {
    totalFiles,
    singleDefinitions,
    legacyEntries,
    expandedTotal,
    mode,
  });

  return {
    entries,
    stats: {
      expandedTotal,
      legacyEntries,
      singleDefinitions,
      totalFiles,
    },
  };
}
427
// Root directory that `cachedRubricConfig` was loaded from. Tracked so the
// cache is only reused for the same rootDir instead of leaking one root's
// rubrics into a call for another.
let cachedRubricRootDir;
/**
 * Load and validate the "rubrics" config (config/rubrics) for the given
 * root directory.
 *
 * The validated result is cached and reused by later calls with the same
 * `rootDir`. A call with a different `rootDir` reloads and replaces the
 * cache. Clearing `cachedRubricConfig` elsewhere still forces a reload.
 *
 * @param rootDir - Root directory passed to loadConfigFile()
 * @returns The validated rubric configuration
 * @throws Error listing every schema issue when validation fails
 */
export function loadRubricTemplates(rootDir) {
  if (cachedRubricConfig && cachedRubricRootDir === rootDir) {
    return cachedRubricConfig;
  }
  const { data } = loadConfigFile("rubrics", rootDir);
  const result = RubricConfigSchema.safeParse(data);
  if (!result.success) {
    const messages = result.error.issues
      .map((issue) => ` [${issue.path.join(".")}]: ${issue.message}`)
      .join("\n");
    throw new Error(`Invalid config/rubrics:\n${messages}`);
  }
  // Record the root only after a successful parse so a failed load never
  // makes a stale config look like it belongs to the new root.
  cachedRubricConfig = result.data;
  cachedRubricRootDir = rootDir;
  return result.data;
}
445
/**
 * Resolve one assertion entry.
 *
 * Non-templated entries are returned as-is (same object). A templated entry
 * (see isTemplatedAssert) becomes a plain "llm-rubric" assertion whose
 * `value` is the rubric text built by assembleRubric(). `weight` is carried
 * over when defined. When the referenced template declares a `dimension`,
 * structured metadata `{ dimension, maxScore: 100 }` is attached.
 *
 * @param entry - Assertion entry, templated or not
 * @param rubricConfig - Rubric configuration with a `templates` map
 * @returns The resolved assertion
 */
export function resolveAssert(entry, rubricConfig) {
  if (!isTemplatedAssert(entry)) {
    return entry;
  }

  const templateDef = rubricConfig.templates[entry.template];
  const resolved = {
    type: "llm-rubric",
    value: assembleRubric(entry.template, entry.criteria, rubricConfig),
  };
  if (entry.weight !== undefined) {
    resolved.weight = entry.weight;
  }
  // Structured dimension metadata (Approach 5)
  if (templateDef?.dimension) {
    resolved.metadata = {
      dimension: templateDef.dimension,
      maxScore: 100,
    };
  }
  return resolved;
}