@sanity/ailf 2.0.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. package/LICENSE +21 -0
  2. package/dist/cli.js +0 -0
  3. package/package.json +24 -24
  4. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  5. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  6. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  7. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  8. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  9. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  10. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  11. package/dist/_vendor/ailf-tasks/index.js +0 -16
  12. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  13. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  14. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  15. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  16. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  17. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  18. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  19. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  20. package/dist/agent-observer/test-imports.d.ts +0 -7
  21. package/dist/agent-observer/test-imports.js +0 -185
  22. package/dist/commands/update-quality-scores.d.ts +0 -5
  23. package/dist/commands/update-quality-scores.js +0 -20
  24. package/dist/lib/agent-behavior-report.d.ts +0 -8
  25. package/dist/lib/agent-behavior-report.js +0 -185
  26. package/dist/lib/baseline.d.ts +0 -19
  27. package/dist/lib/baseline.js +0 -153
  28. package/dist/lib/calculate-scores.d.ts +0 -23
  29. package/dist/lib/calculate-scores.js +0 -42
  30. package/dist/lib/compare.d.ts +0 -18
  31. package/dist/lib/compare.js +0 -170
  32. package/dist/lib/coverage-audit.d.ts +0 -4
  33. package/dist/lib/coverage-audit.js +0 -42
  34. package/dist/lib/discovery-report.d.ts +0 -13
  35. package/dist/lib/discovery-report.js +0 -57
  36. package/dist/lib/fetch-docs.d.ts +0 -30
  37. package/dist/lib/fetch-docs.js +0 -171
  38. package/dist/lib/generate-configs.d.ts +0 -25
  39. package/dist/lib/generate-configs.js +0 -42
  40. package/dist/lib/grader-api.d.ts +0 -21
  41. package/dist/lib/grader-api.js +0 -34
  42. package/dist/lib/grader-compare.d.ts +0 -19
  43. package/dist/lib/grader-compare.js +0 -91
  44. package/dist/lib/grader-consistency.d.ts +0 -27
  45. package/dist/lib/grader-consistency.js +0 -79
  46. package/dist/lib/grader-sensitivity.d.ts +0 -19
  47. package/dist/lib/grader-sensitivity.js +0 -75
  48. package/dist/lib/grader-validate.d.ts +0 -19
  49. package/dist/lib/grader-validate.js +0 -78
  50. package/dist/lib/measure-retrieval.d.ts +0 -14
  51. package/dist/lib/measure-retrieval.js +0 -71
  52. package/dist/lib/pr-comment.d.ts +0 -16
  53. package/dist/lib/pr-comment.js +0 -28
  54. package/dist/lib/readiness-report.d.ts +0 -13
  55. package/dist/lib/readiness-report.js +0 -108
  56. package/dist/lib/webhook-server.d.ts +0 -11
  57. package/dist/lib/webhook-server.js +0 -24
  58. package/dist/lib/weekly-digest.d.ts +0 -24
  59. package/dist/lib/weekly-digest.js +0 -148
  60. package/dist/orchestration/env-bridge.d.ts +0 -21
  61. package/dist/orchestration/env-bridge.js +0 -66
  62. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  63. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  64. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  65. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  66. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  67. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  68. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  69. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  70. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  71. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  72. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  73. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  74. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  75. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  76. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  77. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  78. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  79. package/dist/pipeline/compiler/task-bridge.js +0 -92
  80. package/dist/pipeline/expand-tasks.d.ts +0 -232
  81. package/dist/pipeline/expand-tasks.js +0 -467
  82. package/dist/pipeline/generate-configs.d.ts +0 -92
  83. package/dist/pipeline/generate-configs.js +0 -445
  84. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  85. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  86. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  87. package/dist/pipeline/steps/compare-step.js +0 -90
  88. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  89. package/dist/pipeline/steps/eval-step.js +0 -347
  90. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  91. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  92. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  93. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  94. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  95. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  96. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  97. package/dist/pipeline/steps/publish-report-step.js +0 -243
  98. package/dist/pipeline/steps/report-step.d.ts +0 -13
  99. package/dist/pipeline/steps/report-step.js +0 -56
  100. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  101. package/dist/pipeline/steps/update-scores-step.js +0 -42
  102. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  103. package/dist/scripts/agent-behavior-report.js +0 -315
  104. package/dist/scripts/baseline.d.ts +0 -43
  105. package/dist/scripts/baseline.js +0 -267
  106. package/dist/scripts/calculate-scores.d.ts +0 -166
  107. package/dist/scripts/calculate-scores.js +0 -1296
  108. package/dist/scripts/compare.d.ts +0 -22
  109. package/dist/scripts/compare.js +0 -334
  110. package/dist/scripts/coverage-audit.d.ts +0 -44
  111. package/dist/scripts/coverage-audit.js +0 -209
  112. package/dist/scripts/debug-eval.d.ts +0 -19
  113. package/dist/scripts/debug-eval.js +0 -73
  114. package/dist/scripts/discovery-report.d.ts +0 -58
  115. package/dist/scripts/discovery-report.js +0 -250
  116. package/dist/scripts/fetch-docs.d.ts +0 -35
  117. package/dist/scripts/fetch-docs.js +0 -472
  118. package/dist/scripts/generate-configs.d.ts +0 -66
  119. package/dist/scripts/generate-configs.js +0 -459
  120. package/dist/scripts/grader-api.d.ts +0 -27
  121. package/dist/scripts/grader-api.js +0 -206
  122. package/dist/scripts/grader-compare.d.ts +0 -22
  123. package/dist/scripts/grader-compare.js +0 -368
  124. package/dist/scripts/grader-consistency.d.ts +0 -20
  125. package/dist/scripts/grader-consistency.js +0 -313
  126. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  127. package/dist/scripts/grader-sensitivity.js +0 -354
  128. package/dist/scripts/grader-validate.d.ts +0 -19
  129. package/dist/scripts/grader-validate.js +0 -267
  130. package/dist/scripts/measure-retrieval.d.ts +0 -10
  131. package/dist/scripts/measure-retrieval.js +0 -145
  132. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  133. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  134. package/dist/scripts/pipeline.d.ts +0 -76
  135. package/dist/scripts/pipeline.js +0 -1031
  136. package/dist/scripts/pr-comment.d.ts +0 -10
  137. package/dist/scripts/pr-comment.js +0 -510
  138. package/dist/scripts/readiness-report.d.ts +0 -88
  139. package/dist/scripts/readiness-report.js +0 -342
  140. package/dist/scripts/update-quality-scores.d.ts +0 -15
  141. package/dist/scripts/update-quality-scores.js +0 -184
  142. package/dist/scripts/validate-task-sources.d.ts +0 -21
  143. package/dist/scripts/validate-task-sources.js +0 -210
  144. package/dist/scripts/validate.d.ts +0 -13
  145. package/dist/scripts/validate.js +0 -79
  146. package/dist/scripts/webhook-server.d.ts +0 -26
  147. package/dist/scripts/webhook-server.js +0 -147
  148. package/dist/scripts/weekly-digest.d.ts +0 -24
  149. package/dist/scripts/weekly-digest.js +0 -144
  150. package/dist/sinks/format-slack.d.ts +0 -64
  151. package/dist/sinks/format-slack.js +0 -306
  152. package/dist/sinks/slack-sink.d.ts +0 -27
  153. package/dist/sinks/slack-sink.js +0 -78
  154. package/dist/sinks/webhook-sink.d.ts +0 -19
  155. package/dist/sinks/webhook-sink.js +0 -50
  156. package/tasks/.expanded.agentic.yaml +0 -280
  157. package/tasks/.expanded.yaml +0 -565
@@ -1,459 +0,0 @@
1
- /**
2
- * Generate-configs.ts
3
- *
4
- * Reads config/models.yaml (the central model registry) and generates all
5
- * promptfoo config files with the correct provider entries.
6
- *
7
- * This keeps model definitions in one place — add a model to config/models.yaml
8
- * and run `pnpm generate-configs` to propagate it to all eval modes.
9
- *
10
- * Generated configs:
11
- * - promptfooconfig.yaml (baseline: with-docs vs without-docs)
12
- * - promptfooconfig.observed.yaml (instrumented HTTP recording)
13
- * - promptfooconfig.agentic.yaml (agentic tool-calling: naive vs optimized)
14
- *
15
- * Usage:
16
- * pnpm generate-configs
17
- * # or
18
- * tsx src/scripts/generate-configs.ts
19
- */
20
- import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
21
- import { dirname, resolve } from "path";
22
- import { fileURLToPath } from "url";
23
- import { dump, load } from "js-yaml";
24
- import { loadAndExpandTasks } from "../pipeline/expand-tasks.js";
25
- import { validateModelsYaml } from "../pipeline/validate.js";
26
- import { loadSource } from "../sources.js";
// Directory that contains this module, derived from its file URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Two directory levels above this module; every config and task path below
// is resolved against it.
const ROOT = resolve(__dirname, "..", "..");
29
- // ---------------------------------------------------------------------------
30
- // Helpers
31
- // ---------------------------------------------------------------------------
/**
 * List the task YAML files found directly inside `<rootDir>/tasks`.
 *
 * Entries ending in `.yaml` or `.yml` are kept; names starting with a dot
 * are skipped. The result is sorted by file name.
 *
 * @param {string} rootDir - Directory that contains the `tasks/` folder.
 * @returns {string[]} `file://tasks/<name>` references, sorted by name.
 * @throws {Error} If `<rootDir>/tasks` does not exist.
 */
export function discoverTaskFiles(rootDir) {
    const tasksDir = resolve(rootDir, "tasks");
    if (!existsSync(tasksDir)) {
        throw new Error(`tasks/ directory not found at ${tasksDir}`);
    }
    const yamlNames = [];
    for (const entry of readdirSync(tasksDir)) {
        const isYaml = entry.endsWith(".yaml") || entry.endsWith(".yml");
        const isHidden = entry.startsWith(".");
        if (isYaml && !isHidden) {
            yamlNames.push(entry);
        }
    }
    yamlNames.sort();
    return yamlNames.map((name) => `file://tasks/${name}`);
}
/**
 * Promptfoo provider prefixes that precede the model name in a provider ID.
 * The first matching entry wins, so longer prefixes are listed before the
 * shorter ones they start with.
 */
const PROVIDER_PREFIXES = [
    "anthropic:messages:",
    "openai:chat:",
    "openai:responses:",
    "openai:",
    "anthropic:",
    "google:",
];
/**
 * Return the raw API model name encoded in a promptfoo provider ID.
 *
 *   "openai:chat:gpt-5.2"                 → "gpt-5.2"
 *   "anthropic:messages:claude-opus-4-6"  → "claude-opus-4-6"
 *   "openai:gpt-4o"                       → "gpt-4o"
 *   "google:gemini-2.5-pro"               → "gemini-2.5-pro"
 *
 * IDs that match no known prefix lose everything up to and including the
 * first colon (e.g. "openrouter:deepseek/deepseek-r1" → "deepseek/deepseek-r1");
 * IDs without a colon are returned unchanged.
 *
 * @param {string} id - Promptfoo provider ID.
 * @returns {string} The model name portion of the ID.
 */
export function extractModelName(id) {
    const knownPrefix = PROVIDER_PREFIXES.find((candidate) => id.startsWith(candidate));
    if (knownPrefix !== undefined) {
        return id.slice(knownPrefix.length);
    }
    // Unknown provider: drop only the first colon-separated segment.
    const firstColon = id.indexOf(":");
    return firstColon === -1 ? id : id.slice(firstColon + 1);
}
/**
 * Return the provider family, i.e. the text before the first colon of a
 * promptfoo provider ID.
 *
 *   "openai:chat:gpt-5.2"                 → "openai"
 *   "anthropic:messages:claude-opus-4-6"  → "anthropic"
 *   "google:gemini-2.5-pro"               → "google"
 *
 * @param {string} id - Promptfoo provider ID.
 * @returns {string} The provider family, or "openai" when the ID has no
 *   colon or starts with one.
 */
export function extractProvider(id) {
    const separatorAt = id.indexOf(":");
    if (separatorAt <= 0) {
        return "openai";
    }
    return id.slice(0, separatorAt);
}
/**
 * Load the prompt templates defined in `config/prompts.yaml`.
 *
 * The file must be a YAML mapping with the keys `with-docs`, `without-docs`
 * and `agentic`; each entry must provide `id`, `label` and `template`.
 *
 * @param {string} rootDir - Directory that contains the `config/` folder.
 * @returns {{agentic: object, withDocs: object, withoutDocs: object}} Prompt
 *   objects of the form `{ id, label, raw }`, where `raw` is the entry's
 *   `template`.
 * @throws {Error} If the file is missing, is not a mapping, lacks a required
 *   key, or a prompt entry lacks one of its required fields.
 */
export function loadPrompts(rootDir) {
    const promptsPath = resolve(rootDir, "config", "prompts.yaml");
    if (!existsSync(promptsPath)) {
        throw new Error(`config/prompts.yaml not found at ${promptsPath}. This file is required — it defines the prompt templates for all evaluation modes.`);
    }
    const parsed = load(readFileSync(promptsPath, "utf-8"));
    // An empty file parses to a nullish value and a scalar document to a
    // primitive; treat both as a mapping with no keys so the caller gets the
    // descriptive "missing required keys" error instead of a TypeError.
    const data = parsed !== null && typeof parsed === "object" ? parsed : {};
    const missing = ["with-docs", "without-docs", "agentic"].filter((k) => !data[k]);
    if (missing.length > 0) {
        throw new Error(`config/prompts.yaml is missing required keys: ${missing.join(", ")}. Each prompt must have id, label, and template fields.`);
    }
    const toPrompt = (key) => {
        const entry = data[key];
        // Enforce the per-prompt contract the error message above promises,
        // so a missing template cannot silently become `raw: undefined`.
        const absent = ["id", "label", "template"].filter((field) => entry[field] === undefined || entry[field] === null);
        if (absent.length > 0) {
            throw new Error(`config/prompts.yaml prompt "${key}" is missing required fields: ${absent.join(", ")}. Each prompt must have id, label, and template fields.`);
        }
        return {
            id: entry.id,
            label: entry.label,
            raw: entry.template,
        };
    };
    return {
        agentic: toPrompt("agentic"),
        withDocs: toPrompt("with-docs"),
        withoutDocs: toPrompt("without-docs"),
    };
}
111
- // ---------------------------------------------------------------------------
112
- // Config merging
113
- // ---------------------------------------------------------------------------
/**
 * Build a provider config by layering, in increasing precedence, the scalar
 * defaults, the model-specific config and the mode-specific overrides.
 *
 * Only non-object defaults (and `null`) are carried over; nested objects and
 * arrays in `defaults` are ignored. None of the arguments is mutated.
 *
 * @param {object | null | undefined} defaults - Registry-wide defaults; a
 *   missing value is treated as "no defaults".
 * @param {object} [modelConfig] - Per-model settings; win over defaults.
 * @param {object} [overrides] - Per-mode settings; win over everything else.
 * @returns {object} A new merged config object.
 */
export function mergeConfig(defaults, modelConfig, overrides) {
    // `defaults ?? {}` keeps a registry without a defaults section from
    // crashing in Object.entries.
    const scalarDefaults = Object.entries(defaults ?? {}).filter(([, value]) => typeof value !== "object" || value === null);
    return {
        ...Object.fromEntries(scalarDefaults),
        ...modelConfig,
        ...overrides,
    };
}
133
- // ---------------------------------------------------------------------------
134
- // Model selection and loading
135
- // ---------------------------------------------------------------------------
/**
 * Decide whether a model takes part in the given evaluation mode.
 *
 * A model with no `modes` list, or an empty one, matches every mode.
 *
 * @param {{modes?: string[]}} model - Model registry entry.
 * @param {string} mode - Evaluation mode name to test.
 * @returns {boolean} True when the model should be included for `mode`.
 */
export function modelMatchesMode(model, mode) {
    const declaredModes = model.modes;
    const unrestricted = !declaredModes || declaredModes.length === 0;
    return unrestricted || declaredModes.includes(mode);
}
/**
 * Read and parse `config/models.yaml` from the package root.
 *
 * @returns {*} The parsed YAML document, returned as-is.
 */
function loadModels() {
    const modelsPath = resolve(ROOT, "config", "models.yaml");
    return load(readFileSync(modelsPath, "utf-8"));
}
146
- // ---------------------------------------------------------------------------
147
- // Shared components
148
- // ---------------------------------------------------------------------------
/**
 * URL extraction assertion — advisory (weight: 0).
 *
 * The inline snippet always passes with score 1. It collects the distinct
 * http(s) URLs found in `output` and reports them as a JSON string in
 * `reason`, split into URLs that contain `sanity.io` and all others.
 */
const URL_EXTRACTION_ASSERT = {
    type: "javascript",
    value: `const urlPattern = /https?:\\/\\/[^\\s\\)\\"\\'\\\`>]+/g;
const urls = [...new Set((output.match(urlPattern) || []))];
const sanityUrls = urls.filter(u => u.includes('sanity.io'));
return {
pass: true,
score: 1,
reason: JSON.stringify({
sanityUrls,
otherUrls: urls.filter(u => !u.includes('sanity.io')),
totalUrlCount: urls.length,
sanityUrlCount: sanityUrls.length
})
};`,
    weight: 0,
};
/**
 * Source isolation assertion — advisory (weight: 0).
 *
 * Points at the compiled assertion module that verifies the agentic provider
 * only fetched docs from allowed origins. It is added to the agentic config
 * only when origin sandboxing is active (DOC_ALLOWED_ORIGINS is set).
 */
const SOURCE_ISOLATION_ASSERT = {
    metadata: { dimension: "source-isolation" },
    type: "javascript",
    value: "file://dist/assertions/source-isolation.js",
    weight: 0,
};
177
- // ---------------------------------------------------------------------------
178
- // Config generators
179
- // ---------------------------------------------------------------------------
/**
 * Build the doc-source settings that are injected into every agentic provider.
 *
 * @param {object} [source] - Loaded documentation source, if any.
 * @returns {object} Provider config fragment; empty when no source is given.
 */
function buildAgenticSourceConfig(source) {
    if (!source) {
        return {};
    }
    // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string means unset
    const searchMode = process.env.EVAL_SEARCH_MODE || "open";
    return {
        ...(source.allowedOrigins?.length
            ? { allowedOrigins: source.allowedOrigins }
            : {}),
        docBaseUrl: source.baseUrl,
        ...(source.headers && Object.keys(source.headers).length > 0
            ? { customHeaders: source.headers }
            : {}),
        llmsTxtUrl: source.llmsTxt,
        ...(source.priorityDomain
            ? { priorityDomain: source.priorityDomain }
            : {}),
        // Tool access control: search mode for web_search behavior
        ...(searchMode !== "open" ? { searchMode } : {}),
    };
}
/**
 * Build one agentic provider entry per model for a single agent mode.
 *
 * @param {object} models - Parsed model registry (uses `defaults`).
 * @param {object[]} candidates - Models selected for this agent mode.
 * @param {string} agentMode - Value written to `config.agentMode`.
 * @param {string} labelSuffix - Text appended in parentheses to the model label.
 * @param {object} sourceConfig - Doc-source settings shared by all providers.
 * @returns {object[]} Provider entries, in the order of `candidates`.
 */
function buildAgenticProviders(models, candidates, agentMode, labelSuffix, sourceConfig) {
    return candidates.map((model) => ({
        config: {
            ...mergeConfig(models.defaults, model.config, {
                agentMode,
                maxToolRounds: models.defaults.maxToolRounds ?? 5,
                model: extractModelName(model.id),
                provider: extractProvider(model.id),
            }),
            ...sourceConfig,
            observe: true,
            observerOptions: models.defaults.observerOptions ?? {},
        },
        id: "file://dist/agent-observer/agentic-provider.js",
        label: `${model.label} (${labelSuffix})`,
    }));
}
/**
 * Generate the agentic (naive vs optimized) promptfoo config.
 *
 * Every model matching the "agentic-naive" mode yields a naive-agent provider,
 * followed by an optimized-agent provider for every model matching
 * "agentic-optimized". The source isolation assertion is added to the default
 * test only when the DOC_ALLOWED_ORIGINS environment variable is set.
 *
 * @param {object} models - Parsed model registry.
 * @param {string[]} tests - Test file references for promptfoo.
 * @param {object} prompts - Loaded prompts; only `prompts.agentic` is used.
 * @param {object} [source] - Optional documentation source configuration.
 * @returns {object} The promptfoo config object, ready to be dumped to YAML.
 */
function generateAgenticConfig(models, tests, prompts, source) {
    const naiveModels = models.models.filter((m) => modelMatchesMode(m, "agentic-naive"));
    const optimizedModels = models.models.filter((m) => modelMatchesMode(m, "agentic-optimized"));
    const sourceConfig = buildAgenticSourceConfig(source);
    // Naive providers come first, then optimized ones.
    const providers = [
        ...buildAgenticProviders(models, naiveModels, "naive", "Naive Agent", sourceConfig),
        ...buildAgenticProviders(models, optimizedModels, "optimized", "Optimized Agent", sourceConfig),
    ];
    // Inject source isolation assertion when origin sandboxing is active
    const hasOriginSandbox = Boolean(process.env.DOC_ALLOWED_ORIGINS);
    const agenticAssertions = hasOriginSandbox ? [SOURCE_ISOLATION_ASSERT] : [];
    return {
        commandLineOptions: { table: false },
        defaultTest: {
            ...(agenticAssertions.length > 0 ? { assert: agenticAssertions } : {}),
            options: {
                provider: models.grader.id,
                rubricProvider: models.grader.id,
            },
        },
        description: "Sanity AI Literacy Evaluation — Agentic (naive vs optimized)",
        ...(models.maxConcurrency
            ? { evaluateOptions: { maxConcurrency: models.maxConcurrency } }
            : {}),
        outputPath: "results/latest/eval-results-agentic.json",
        prompts: [prompts.agentic],
        providers,
        tests,
    };
}
/**
 * Generate the baseline (with-docs vs without-docs) promptfoo config.
 *
 * Every model matching the "baseline" mode becomes a provider that keeps its
 * original promptfoo ID; the URL extraction assertion is attached to the
 * default test.
 *
 * @param {object} models - Parsed model registry.
 * @param {string[]} tests - Test file references for promptfoo.
 * @param {object} prompts - Loaded prompts (`withDocs`, `withoutDocs`).
 * @returns {object} The promptfoo config object, ready to be dumped to YAML.
 */
function generateBaselineConfig(models, tests, prompts) {
    const providers = [];
    for (const model of models.models) {
        if (!modelMatchesMode(model, "baseline")) {
            continue;
        }
        providers.push({
            config: mergeConfig(models.defaults, model.config),
            id: model.id,
            label: model.label,
        });
    }
    const graderId = models.grader.id;
    // Only emit evaluateOptions when a concurrency limit is configured.
    const concurrency = models.maxConcurrency
        ? { evaluateOptions: { maxConcurrency: models.maxConcurrency } }
        : {};
    return {
        commandLineOptions: { table: false },
        defaultTest: {
            assert: [URL_EXTRACTION_ASSERT],
            options: { provider: graderId, rubricProvider: graderId },
        },
        description: "Sanity AI Literacy Evaluation — Baseline",
        ...concurrency,
        outputPath: "results/latest/eval-results.json",
        prompts: [prompts.withDocs, prompts.withoutDocs],
        providers,
        tests,
    };
}
/**
 * Generate the observed promptfoo config.
 *
 * Every model matching the "observed" mode is routed through the observer
 * provider module, with the raw model name passed as `config.modelName`.
 *
 * @param {object} models - Parsed model registry.
 * @param {string[]} tests - Test file references for promptfoo.
 * @param {object} prompts - Loaded prompts (`withDocs`, `withoutDocs`).
 * @returns {object} The promptfoo config object, ready to be dumped to YAML.
 */
function generateObservedConfig(models, tests, prompts) {
    const providers = [];
    for (const model of models.models) {
        if (!modelMatchesMode(model, "observed")) {
            continue;
        }
        providers.push({
            config: {
                ...mergeConfig(models.defaults, model.config),
                modelName: extractModelName(model.id),
                observe: true,
                recordOptions: models.defaults.observerOptions ?? {},
            },
            id: "file://dist/agent-observer/provider.js",
            label: `${model.label} (Observed)`,
        });
    }
    const graderId = models.grader.id;
    // Only emit evaluateOptions when a concurrency limit is configured.
    const concurrency = models.maxConcurrency
        ? { evaluateOptions: { maxConcurrency: models.maxConcurrency } }
        : {};
    return {
        commandLineOptions: { table: false },
        defaultTest: {
            options: { provider: graderId, rubricProvider: graderId },
        },
        description: "Sanity AI Literacy Evaluation — Observed",
        ...concurrency,
        outputPath: "results/latest/eval-results-observed.json",
        prompts: [prompts.withDocs, prompts.withoutDocs],
        providers,
        tests,
    };
}
323
- // ---------------------------------------------------------------------------
324
- // Main
325
- // ---------------------------------------------------------------------------
/**
 * CLI entry point: validates `config/models.yaml`, expands the task
 * definitions, and writes the generated task files and promptfoo configs.
 *
 * Reads the comma-separated `EVAL_FILTER_AREAS` / `EVAL_FILTER_TASKS`
 * environment variables to scope the expanded tasks, and an optional doc
 * source name from `process.argv[2]` or `DOC_SOURCE`. Exits the process with
 * code 1 when model validation reports errors. A doc source that fails to
 * load only produces a warning; generation continues without it.
 */
function main() {
    // Validate config/models.yaml before generating configs
    const blockingIssues = validateModelsYaml(ROOT).filter((issue) => issue.severity === "error");
    if (blockingIssues.length > 0) {
        console.error("❌ config/models.yaml validation failed:");
        blockingIssues.forEach((issue) => {
            console.error(` ERROR: ${issue.message}`);
            if (issue.path) {
                console.error(` at ${issue.path}`);
            }
        });
        console.error("\nFix config/models.yaml before generating configs. Run 'pnpm validate' for details.");
        process.exit(1);
    }
    console.log("Loading config/models.yaml...");
    const models = loadModels();
    const listedModels = models.models.filter((entry) => entry.id && entry.label);
    console.log(` Found ${listedModels.length} active model(s):`);
    listedModels.forEach((entry) => {
        // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty array join → "all"
        const modeList = entry.modes?.join(", ") || "all";
        console.log(` - ${entry.label} (${entry.id}) → [${modeList}]`);
    });
    // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty label falls through to id
    console.log(` Grader: ${models.grader.label || models.grader.id}`);
    // Build filter options from environment variables (set by pipeline.ts)
    const splitList = (csv) => (csv ? csv.split(",").map((item) => item.trim()) : undefined);
    const areaCsv = process.env.EVAL_FILTER_AREAS;
    const taskCsv = process.env.EVAL_FILTER_TASKS;
    const filter = areaCsv || taskCsv
        ? { areas: splitList(areaCsv), taskIds: splitList(taskCsv) }
        : undefined;
    // Expand tasks for baseline/observed mode (gold + baseline with prompt filters)
    const baselineExpansion = loadAndExpandTasks(ROOT, filter, "baseline");
    const entries = baselineExpansion.entries;
    const stats = baselineExpansion.stats;
    console.log(` Expanded ${stats.singleDefinitions} task(s) → ${stats.expandedTotal} test entries`);
    if (stats.legacyEntries > 0) {
        console.log(` ⚠ ${stats.legacyEntries} legacy (paired) entries passed through unchanged`);
    }
    if (filter) {
        const scope = [
            filter.areas ? `areas: ${filter.areas.join(", ")}` : null,
            filter.taskIds ? `tasks: ${filter.taskIds.join(", ")}` : null,
        ].filter((part) => part !== null);
        console.log(` Scoped to: ${scope.join("; ")}`);
    }
    // Expand tasks for agentic mode (gold entries only, no prompt filters).
    // Agentic mode has a single prompt that doesn't use {{docs}}, so baseline
    // entries would produce identical prompts — pure waste of API calls.
    const agenticExpansion = loadAndExpandTasks(ROOT, filter, "agentic");
    const agenticEntries = agenticExpansion.entries;
    console.log(` Agentic: ${agenticExpansion.stats.expandedTotal} entries (gold only, no baseline)`);
    // Write expanded tasks to generated files for Promptfoo to consume
    const toYaml = (value) => dump(value, {
        forceQuotes: false,
        lineWidth: 120,
        noRefs: true,
        quotingType: "'",
    });
    const expandedYaml = toYaml(entries);
    writeFileSync(resolve(ROOT, "tasks", ".expanded.yaml"), `# .expanded.yaml\n#\n# AUTO-GENERATED — do not edit directly.\n# Source: tasks/*.yaml (single-definition format)\n# Run: pnpm generate-configs\n\n${expandedYaml}`, "utf-8");
    console.log(` ✓ tasks/.expanded.yaml (${entries.length} entries)`);
    const agenticExpandedYaml = toYaml(agenticEntries);
    writeFileSync(resolve(ROOT, "tasks", ".expanded.agentic.yaml"), `# .expanded.agentic.yaml\n#\n# AUTO-GENERATED — do not edit directly.\n# Gold entries only (no baseline) for agentic evaluation mode.\n# Source: tasks/*.yaml (single-definition format)\n# Run: pnpm generate-configs\n\n${agenticExpandedYaml}`, "utf-8");
    console.log(` ✓ tasks/.expanded.agentic.yaml (${agenticEntries.length} entries)`);
    const taskFiles = ["file://tasks/.expanded.yaml"];
    const agenticTaskFiles = ["file://tasks/.expanded.agentic.yaml"];
    // Load prompt templates
    const prompts = loadPrompts(ROOT);
    console.log(` Loaded prompts: ${Object.keys(prompts).join(", ")}`);
    // Load optional documentation source configuration
    // Usage: pnpm generate-configs [source-name]
    // Or: DOC_SOURCE=branch pnpm generate-configs
    const sourceName = process.argv[2] || process.env.DOC_SOURCE;
    let source;
    if (sourceName) {
        console.log(`\nLoading source: ${sourceName}`);
        try {
            source = loadSource(sourceName);
            console.log(` Base URL: ${source.baseUrl}`);
            console.log(` Dataset: ${source.dataset}`);
            if (source.allowedOrigins?.length) {
                console.log(` Allowed origins: ${source.allowedOrigins.join(", ")}`);
            }
        }
        catch (err) {
            // Best effort: report the failure and continue.
            const reason = err instanceof Error ? err.message : String(err);
            console.warn(`\n⚠ Failed to load source "${sourceName}": ${reason}`);
        }
    }
    console.log("\nGenerating configs...");
    // All three generated configs share the same banner, differing only in file name.
    const bannerFor = (filename) => `# ${filename}\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`;
    writeConfig("promptfooconfig.yaml", generateBaselineConfig(models, taskFiles, prompts), bannerFor("promptfooconfig.yaml"));
    writeConfig("promptfooconfig.observed.yaml", generateObservedConfig(models, taskFiles, prompts), bannerFor("promptfooconfig.observed.yaml"));
    writeConfig("promptfooconfig.agentic.yaml", generateAgenticConfig(models, agenticTaskFiles, prompts, source), bannerFor("promptfooconfig.agentic.yaml"));
    console.log("\nDone! Configs are ready.");
    if (source) {
        console.log(` (using doc source: ${sourceName})`);
    }
}
440
- // ---------------------------------------------------------------------------
441
- // File writing
442
- // ---------------------------------------------------------------------------
/**
 * Serialize a config object to YAML and write it, prefixed by `header`, to
 * `filename` under the package root. Logs a confirmation line afterwards.
 *
 * @param {string} filename - Output file name, relative to the package root.
 * @param {object} config - Config object to dump as YAML.
 * @param {string} header - Comment banner placed before the YAML body.
 */
function writeConfig(filename, config, header) {
    const yamlBody = dump(config, {
        forceQuotes: false,
        lineWidth: 120,
        noRefs: true,
        quotingType: "'",
    });
    writeFileSync(resolve(ROOT, filename), `${header}\n${yamlBody}`, "utf-8");
    console.log(` ✓ ${filename}`);
}
// Run main() only when this file is the script being executed, i.e. the
// script path ends in generate-configs.ts or generate-configs.js; importing
// the module (for example from tests) does not trigger generation.
if (/generate-configs\.(?:ts|js)$/.test(process.argv[1] ?? "")) {
    main();
}
@@ -1,27 +0,0 @@
1
- /**
2
- * grader-api.ts
3
- *
4
- * Shared utility for calling LLM grading APIs from grader scripts.
5
- *
6
- * Dispatches to the correct provider API (OpenAI, Anthropic) based on the
7
- * grader model prefix. Reads the appropriate API key from environment.
8
- *
9
- * Also exports `loadGraderModel()` to resolve the grader from
10
- * `config/models.yaml`.
11
- */
/**
 * Score a single response against a rubric using the grader model.
 *
 * The provider API is chosen from the prefix of `graderModel`.
 *
 * @param graderModel - Grader model ID, including its provider prefix.
 * @param responseText - The response to evaluate.
 * @param rubricText - The rubric to grade the response against.
 * @returns A numeric score (0–100), or `null` if the call or parse fails.
 */
export declare function gradeOnce(graderModel: string, responseText: string, rubricText: string): Promise<null | number>;
/**
 * Resolve the grader model from `config/models.yaml`.
 *
 * Falls back to `openai:gpt-5` if no grader is configured.
 *
 * @returns The grader's model ID and its human-readable label.
 */
export declare function loadGraderModel(): {
    id: string;
    label: string;
};
@@ -1,206 +0,0 @@
1
- /**
2
- * grader-api.ts
3
- *
4
- * Shared utility for calling LLM grading APIs from grader scripts.
5
- *
6
- * Dispatches to the correct provider API (OpenAI, Anthropic) based on the
7
- * grader model prefix. Reads the appropriate API key from environment.
8
- *
9
- * Also exports `loadGraderModel()` to resolve the grader from
10
- * `config/models.yaml`.
11
- */
12
- import { config as dotenvConfig } from "dotenv";
13
- import { existsSync, readFileSync } from "fs";
14
- import { dirname, join, resolve } from "path";
15
- import { fileURLToPath } from "url";
16
- import { load } from "js-yaml";
17
// Directory of this module, derived from its file URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Two directories above this module.
const EVAL_ROOT = resolve(__dirname, "..", "..");
// Load root .env (two levels above packages/eval/) so API keys are available
// even when this module is invoked via `tsx` from packages/eval/.
// Values from that file override variables already present in the environment.
const rootEnvPath = resolve(EVAL_ROOT, "..", "..", ".env");
if (existsSync(rootEnvPath)) {
    dotenvConfig({ path: rootEnvPath, override: true });
}
25
- // ---------------------------------------------------------------------------
26
- // Public API: grading
27
- // ---------------------------------------------------------------------------
28
/**
 * Ask the grader model, once, to score a response against a rubric.
 *
 * The provider is taken from the prefix of `graderModel` (the text before the
 * first ":") and the request is sent through the matching provider call.
 *
 * @param {string} graderModel - Promptfoo-style model ID, e.g. `openai:gpt-5`.
 * @param {string} responseText - Response to grade; only the first 8000 characters are sent.
 * @param {string} rubricText - Rubric the grader is asked to apply.
 * @returns {Promise<null | number>} The score extracted from the grader's reply, or null
 *   when the API call fails or no score can be parsed from the reply.
 * @throws {Error} Propagated from `resolveProvider`, which runs before the try block
 *   (unsupported provider or missing API key).
 */
export async function gradeOnce(graderModel, responseText, rubricText) {
    const providerConfig = resolveProvider(graderModel);
    const truncatedResponse = responseText.slice(0, 8000);
    const prompt = `You are evaluating an AI assistant's response. Grade the response according to the following rubric.

## Response to evaluate:
${truncatedResponse}

## Rubric:
${rubricText}
`;
    try {
        const [providerName] = graderModel.split(":");
        let reply;
        switch (providerName) {
            case "anthropic":
                reply = await callAnthropic(providerConfig, prompt);
                break;
            case "openai":
                reply = await callOpenAI(providerConfig, prompt);
                break;
            default:
                // resolveProvider already throws for unknown providers, but just in case
                return null;
        }
        // A null reply means the provider call already reported an API error.
        if (reply === null) {
            return null;
        }
        const parsedScore = extractScore(reply);
        if (parsedScore === null) {
            console.error(` ⚠ Could not parse grader response: ${reply.slice(0, 100)}`);
        }
        return parsedScore;
    }
    catch (failure) {
        const reason = failure instanceof Error ? failure.message : String(failure);
        console.error(` ⚠ Grader call failed: ${reason}`);
        return null;
    }
}
70
- // ---------------------------------------------------------------------------
71
- // Grader model configuration and provider-specific API calls
72
- // ---------------------------------------------------------------------------
73
/**
 * Read the grader model settings from `config/models.yaml` under EVAL_ROOT.
 *
 * If the file does not exist, an error is printed and the process exits with code 1.
 *
 * @returns {{ id: string, label: string }} The `grader.id` and `grader.label` values from
 *   the file, defaulting to `openai:gpt-5` and `GPT-5 (grader)` when they are absent.
 */
export function loadGraderModel() {
    const modelsPath = join(EVAL_ROOT, "config", "models.yaml");
    if (!existsSync(modelsPath)) {
        console.error("❌ config/models.yaml not found");
        process.exit(1);
    }
    const grader = load(readFileSync(modelsPath, "utf-8"))?.grader;
    const id = grader?.id ?? "openai:gpt-5";
    const label = grader?.label ?? "GPT-5 (grader)";
    return { id, label };
}
91
/**
 * Send a single-message request to the Anthropic endpoint in `config` and return the reply text.
 *
 * @param {{ apiKey: string, baseUrl: string, modelName: string }} config - Resolved provider settings.
 * @param {string} prompt - Sent as the content of one `user` message.
 * @returns {Promise<null | string>} Text of the first `text` content block, "" when the
 *   reply contains none, or null when the HTTP response is not ok (the error is logged).
 */
async function callAnthropic(config, prompt) {
    const requestBody = {
        max_tokens: 256,
        messages: [{ content: prompt, role: "user" }],
        model: config.modelName,
        temperature: 0.2,
    };
    const requestHeaders = {
        "anthropic-version": "2023-06-01",
        "Content-Type": "application/json",
        "x-api-key": config.apiKey,
    };
    const response = await fetch(config.baseUrl, {
        body: JSON.stringify(requestBody),
        headers: requestHeaders,
        method: "POST",
    });
    if (!response.ok) {
        // Log only the first 200 characters of the error body.
        const errorBody = await response.text();
        console.error(` ⚠ Grader API error (Anthropic): ${response.status} ${errorBody.slice(0, 200)}`);
        return null;
    }
    const payload = await response.json();
    const firstTextBlock = payload.content?.find((block) => block.type === "text");
    return firstTextBlock?.text ?? "";
}
115
- // ---------------------------------------------------------------------------
116
- // Provider-specific API calls: OpenAI
117
- // ---------------------------------------------------------------------------
118
/**
 * Send a single-message chat request to the OpenAI endpoint in `config` and return the reply text.
 *
 * NOTE(review): the request always sends `max_tokens` and `temperature: 0.2`; confirm that the
 * configured grader model accepts both parameters — TODO verify against the OpenAI API reference.
 *
 * @param {{ apiKey: string, baseUrl: string, modelName: string }} config - Resolved provider settings.
 * @param {string} prompt - Sent as the content of one `user` message.
 * @returns {Promise<null | string>} Message content of the first choice, "" when it is
 *   absent, or null when the HTTP response is not ok (the error is logged).
 */
async function callOpenAI(config, prompt) {
    const requestBody = {
        max_tokens: 256,
        messages: [{ content: prompt, role: "user" }],
        model: config.modelName,
        temperature: 0.2,
    };
    const requestHeaders = {
        Authorization: `Bearer ${config.apiKey}`,
        "Content-Type": "application/json",
    };
    const response = await fetch(config.baseUrl, {
        body: JSON.stringify(requestBody),
        headers: requestHeaders,
        method: "POST",
    });
    if (!response.ok) {
        // Log only the first 200 characters of the error body.
        const errorBody = await response.text();
        console.error(` ⚠ Grader API error (OpenAI): ${response.status} ${errorBody.slice(0, 200)}`);
        return null;
    }
    const completion = await response.json();
    const firstChoice = completion.choices?.[0];
    return firstChoice?.message?.content ?? "";
}
140
- // ---------------------------------------------------------------------------
141
- // Score extraction and provider resolution
142
- // ---------------------------------------------------------------------------
143
/**
 * Extract a numeric score from the grader's reply text.
 *
 * Strategies, in order:
 *  1. The whole reply is JSON with a numeric `score` field: `{"score": 85, "reason": "..."}`.
 *  2. The reply contains a `"score": <number>` pair somewhere (e.g. JSON wrapped in a
 *     markdown code fence or surrounded by prose). Without this step a number inside the
 *     reason text ("covers 3 of 5 points") would be mistaken for the score.
 *  3. The first bare run of digits anywhere in the reply.
 *
 * @param {string} content - Raw text returned by the grader model.
 * @returns {null | number} The extracted score, or null if the reply contains no number.
 */
function extractScore(content) {
    // 1. Strict JSON: {"score": 85, "reason": "..."}
    try {
        const parsed = JSON.parse(content);
        if (typeof parsed === "object" &&
            parsed !== null &&
            typeof parsed.score === "number") {
            return parsed.score;
        }
    }
    catch {
        // Not JSON — fall through
    }
    // 2. Embedded JSON: prefer the value attached to the "score" key (optionally quoted).
    const keyed = content.match(/"score"\s*:\s*"?(\d+(?:\.\d+)?)/);
    if (keyed) {
        return Number(keyed[1]);
    }
    // 3. Fallback: first bare number
    const bare = content.match(/(\d+)/);
    return bare ? Number.parseInt(bare[1], 10) : null;
}
163
/**
 * Parse a Promptfoo-style model ID and resolve the provider config.
 *
 * Supported formats:
 * - `openai:chat:gpt-5.2` → OpenAI, model = `gpt-5.2`
 * - `openai:gpt-5` → OpenAI, model = `gpt-5`
 * - `anthropic:messages:claude-opus-4-5-20251101` → Anthropic, model = `claude-opus-4-5-20251101`
 * - `anthropic:claude-sonnet-4` → Anthropic, model = `claude-sonnet-4`
 *
 * The API key is read from `OPENAI_API_KEY` or `ANTHROPIC_API_KEY` in the environment.
 *
 * @param {string} graderModel - Model ID whose first ":"-separated segment names the provider.
 * @returns {{ apiKey: string, baseUrl: string, modelName: string }} Settings for the provider call.
 * @throws {Error} If the provider is not `openai` or `anthropic`, if the provider's API key
 *   is not set, or if the ID carries no model name (e.g. `openai` or `anthropic:`).
 */
function resolveProvider(graderModel) {
    const [provider, ...rest] = graderModel.split(":");
    // Shared validation: key first (as before), then reject an empty model name
    // instead of sending a request that has no usable `model` field.
    const buildConfig = (envVar, baseUrl, modelName) => {
        const apiKey = process.env[envVar];
        if (!apiKey) {
            throw new Error(`${envVar} not set. Required for grader model: ${graderModel}`);
        }
        if (!modelName) {
            throw new Error(`Missing model name in grader model "${graderModel}". ` +
                `Expected e.g. "${provider}:<model>".`);
        }
        return { apiKey, baseUrl, modelName };
    };
    if (provider === "anthropic") {
        // "anthropic:messages:claude-opus-4-5" → "claude-opus-4-5"
        // "anthropic:claude-sonnet-4" → "claude-sonnet-4"
        const modelName = rest.length >= 2 && rest[0] === "messages"
            ? rest.slice(1).join(":")
            : rest.join(":");
        return buildConfig("ANTHROPIC_API_KEY", "https://api.anthropic.com/v1/messages", modelName);
    }
    if (provider === "openai") {
        // "openai:chat:gpt-5.2" → "gpt-5.2", "openai:gpt-5" → "gpt-5"
        const modelName = rest.length >= 2 ? rest.slice(1).join(":") : rest[0];
        return buildConfig("OPENAI_API_KEY", "https://api.openai.com/v1/chat/completions", modelName);
    }
    throw new Error(`Unsupported grader provider "${provider}" in model "${graderModel}". ` +
        "Supported: openai, anthropic.");
}