@sanity/ailf 2.0.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173) hide show
  1. package/LICENSE +21 -0
  2. package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
  3. package/dist/_vendor/ailf-core/examples/index.js +66 -1
  4. package/dist/agent-harness/assertions-runtime.d.ts +49 -0
  5. package/dist/agent-harness/assertions-runtime.js +138 -0
  6. package/dist/agent-harness/provider.d.ts +58 -0
  7. package/dist/agent-harness/provider.js +104 -0
  8. package/dist/cli.js +0 -0
  9. package/dist/commands/init.js +3 -0
  10. package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
  11. package/dist/orchestration/steps/generate-configs-step.js +35 -2
  12. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +39 -25
  13. package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
  14. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +9 -0
  15. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +28 -85
  16. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
  17. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
  18. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
  19. package/package.json +25 -24
  20. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  21. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  22. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  23. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  24. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  25. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  26. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  27. package/dist/_vendor/ailf-tasks/index.js +0 -16
  28. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  29. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  30. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  31. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  32. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  33. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  34. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  35. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  36. package/dist/agent-observer/test-imports.d.ts +0 -7
  37. package/dist/agent-observer/test-imports.js +0 -185
  38. package/dist/commands/update-quality-scores.d.ts +0 -5
  39. package/dist/commands/update-quality-scores.js +0 -20
  40. package/dist/lib/agent-behavior-report.d.ts +0 -8
  41. package/dist/lib/agent-behavior-report.js +0 -185
  42. package/dist/lib/baseline.d.ts +0 -19
  43. package/dist/lib/baseline.js +0 -153
  44. package/dist/lib/calculate-scores.d.ts +0 -23
  45. package/dist/lib/calculate-scores.js +0 -42
  46. package/dist/lib/compare.d.ts +0 -18
  47. package/dist/lib/compare.js +0 -170
  48. package/dist/lib/coverage-audit.d.ts +0 -4
  49. package/dist/lib/coverage-audit.js +0 -42
  50. package/dist/lib/discovery-report.d.ts +0 -13
  51. package/dist/lib/discovery-report.js +0 -57
  52. package/dist/lib/fetch-docs.d.ts +0 -30
  53. package/dist/lib/fetch-docs.js +0 -171
  54. package/dist/lib/generate-configs.d.ts +0 -25
  55. package/dist/lib/generate-configs.js +0 -42
  56. package/dist/lib/grader-api.d.ts +0 -21
  57. package/dist/lib/grader-api.js +0 -34
  58. package/dist/lib/grader-compare.d.ts +0 -19
  59. package/dist/lib/grader-compare.js +0 -91
  60. package/dist/lib/grader-consistency.d.ts +0 -27
  61. package/dist/lib/grader-consistency.js +0 -79
  62. package/dist/lib/grader-sensitivity.d.ts +0 -19
  63. package/dist/lib/grader-sensitivity.js +0 -75
  64. package/dist/lib/grader-validate.d.ts +0 -19
  65. package/dist/lib/grader-validate.js +0 -78
  66. package/dist/lib/measure-retrieval.d.ts +0 -14
  67. package/dist/lib/measure-retrieval.js +0 -71
  68. package/dist/lib/pr-comment.d.ts +0 -16
  69. package/dist/lib/pr-comment.js +0 -28
  70. package/dist/lib/readiness-report.d.ts +0 -13
  71. package/dist/lib/readiness-report.js +0 -108
  72. package/dist/lib/webhook-server.d.ts +0 -11
  73. package/dist/lib/webhook-server.js +0 -24
  74. package/dist/lib/weekly-digest.d.ts +0 -24
  75. package/dist/lib/weekly-digest.js +0 -148
  76. package/dist/orchestration/env-bridge.d.ts +0 -21
  77. package/dist/orchestration/env-bridge.js +0 -66
  78. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  79. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  80. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  81. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  82. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  83. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  84. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  85. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  86. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  87. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  88. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  89. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  90. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  91. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  92. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  93. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  94. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  95. package/dist/pipeline/compiler/task-bridge.js +0 -92
  96. package/dist/pipeline/expand-tasks.d.ts +0 -232
  97. package/dist/pipeline/expand-tasks.js +0 -467
  98. package/dist/pipeline/generate-configs.d.ts +0 -92
  99. package/dist/pipeline/generate-configs.js +0 -445
  100. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  101. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  102. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  103. package/dist/pipeline/steps/compare-step.js +0 -90
  104. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  105. package/dist/pipeline/steps/eval-step.js +0 -347
  106. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  107. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  108. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  109. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  110. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  111. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  112. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  113. package/dist/pipeline/steps/publish-report-step.js +0 -243
  114. package/dist/pipeline/steps/report-step.d.ts +0 -13
  115. package/dist/pipeline/steps/report-step.js +0 -56
  116. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  117. package/dist/pipeline/steps/update-scores-step.js +0 -42
  118. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  119. package/dist/scripts/agent-behavior-report.js +0 -315
  120. package/dist/scripts/baseline.d.ts +0 -43
  121. package/dist/scripts/baseline.js +0 -267
  122. package/dist/scripts/calculate-scores.d.ts +0 -166
  123. package/dist/scripts/calculate-scores.js +0 -1296
  124. package/dist/scripts/compare.d.ts +0 -22
  125. package/dist/scripts/compare.js +0 -334
  126. package/dist/scripts/coverage-audit.d.ts +0 -44
  127. package/dist/scripts/coverage-audit.js +0 -209
  128. package/dist/scripts/debug-eval.d.ts +0 -19
  129. package/dist/scripts/debug-eval.js +0 -73
  130. package/dist/scripts/discovery-report.d.ts +0 -58
  131. package/dist/scripts/discovery-report.js +0 -250
  132. package/dist/scripts/fetch-docs.d.ts +0 -35
  133. package/dist/scripts/fetch-docs.js +0 -472
  134. package/dist/scripts/generate-configs.d.ts +0 -66
  135. package/dist/scripts/generate-configs.js +0 -459
  136. package/dist/scripts/grader-api.d.ts +0 -27
  137. package/dist/scripts/grader-api.js +0 -206
  138. package/dist/scripts/grader-compare.d.ts +0 -22
  139. package/dist/scripts/grader-compare.js +0 -368
  140. package/dist/scripts/grader-consistency.d.ts +0 -20
  141. package/dist/scripts/grader-consistency.js +0 -313
  142. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  143. package/dist/scripts/grader-sensitivity.js +0 -354
  144. package/dist/scripts/grader-validate.d.ts +0 -19
  145. package/dist/scripts/grader-validate.js +0 -267
  146. package/dist/scripts/measure-retrieval.d.ts +0 -10
  147. package/dist/scripts/measure-retrieval.js +0 -145
  148. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  149. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  150. package/dist/scripts/pipeline.d.ts +0 -76
  151. package/dist/scripts/pipeline.js +0 -1031
  152. package/dist/scripts/pr-comment.d.ts +0 -10
  153. package/dist/scripts/pr-comment.js +0 -510
  154. package/dist/scripts/readiness-report.d.ts +0 -88
  155. package/dist/scripts/readiness-report.js +0 -342
  156. package/dist/scripts/update-quality-scores.d.ts +0 -15
  157. package/dist/scripts/update-quality-scores.js +0 -184
  158. package/dist/scripts/validate-task-sources.d.ts +0 -21
  159. package/dist/scripts/validate-task-sources.js +0 -210
  160. package/dist/scripts/validate.d.ts +0 -13
  161. package/dist/scripts/validate.js +0 -79
  162. package/dist/scripts/webhook-server.d.ts +0 -26
  163. package/dist/scripts/webhook-server.js +0 -147
  164. package/dist/scripts/weekly-digest.d.ts +0 -24
  165. package/dist/scripts/weekly-digest.js +0 -144
  166. package/dist/sinks/format-slack.d.ts +0 -64
  167. package/dist/sinks/format-slack.js +0 -306
  168. package/dist/sinks/slack-sink.d.ts +0 -27
  169. package/dist/sinks/slack-sink.js +0 -78
  170. package/dist/sinks/webhook-sink.d.ts +0 -19
  171. package/dist/sinks/webhook-sink.js +0 -50
  172. package/tasks/.expanded.agentic.yaml +0 -280
  173. package/tasks/.expanded.yaml +0 -565
@@ -1,459 +0,0 @@
1
- /**
2
- * Generate-configs.ts
3
- *
4
- * Reads config/models.yaml (the central model registry) and generates all
5
- * promptfoo config files with the correct provider entries.
6
- *
7
- * This keeps model definitions in one place — add a model to config/models.yaml
8
- * and run `pnpm generate-configs` to propagate it to all eval modes.
9
- *
10
- * Generated configs:
11
- * - promptfooconfig.yaml (baseline: with-docs vs without-docs)
12
- * - promptfooconfig.observed.yaml (instrumented HTTP recording)
13
- * - promptfooconfig.agentic.yaml (agentic tool-calling: naive vs optimized)
14
- *
15
- * Usage:
16
- * pnpm generate-configs
17
- * # or
18
- * tsx src/scripts/generate-configs.ts
19
- */
20
- import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
21
- import { dirname, resolve } from "path";
22
- import { fileURLToPath } from "url";
23
- import { dump, load } from "js-yaml";
24
- import { loadAndExpandTasks } from "../pipeline/expand-tasks.js";
25
- import { validateModelsYaml } from "../pipeline/validate.js";
26
- import { loadSource } from "../sources.js";
27
- const __dirname = dirname(fileURLToPath(import.meta.url));
28
- const ROOT = resolve(__dirname, "../..");
29
- // ---------------------------------------------------------------------------
30
- // Helpers
31
- // ---------------------------------------------------------------------------
32
- /** Auto-discover all task YAML files in the tasks/ directory. */
33
- export function discoverTaskFiles(rootDir) {
34
- const tasksDir = resolve(rootDir, "tasks");
35
- if (!existsSync(tasksDir)) {
36
- throw new Error(`tasks/ directory not found at ${tasksDir}`);
37
- }
38
- return readdirSync(tasksDir)
39
- .filter((f) => (f.endsWith(".yaml") || f.endsWith(".yml")) && !f.startsWith("."))
40
- .sort()
41
- .map((f) => `file://tasks/${f}`);
42
- }
43
- /**
44
- * Known promptfoo provider prefixes that are NOT part of the model name.
45
- * Order matters: longer prefixes must come first to avoid partial matches.
46
- */
47
- const PROVIDER_PREFIXES = [
48
- "anthropic:messages:",
49
- "openai:chat:",
50
- "openai:responses:",
51
- "openai:",
52
- "anthropic:",
53
- "google:",
54
- ];
55
- /**
56
- * Extract the raw API model name from a promptfoo provider ID.
57
- *
58
- * Promptfoo IDs encode the provider + sub-protocol + model, e.g.:
59
- * "openai:chat:gpt-5.2" → "gpt-5.2"
60
- * "anthropic:messages:claude-opus-4-6" → "claude-opus-4-6"
61
- * "openai:gpt-4o" → "gpt-4o"
62
- * "google:gemini-2.5-pro" → "gemini-2.5-pro"
63
- *
64
- * Falls back to stripping everything before the first colon for unknown
65
- * providers (e.g., "openrouter:deepseek/deepseek-r1" → "deepseek/deepseek-r1").
66
- */
67
- export function extractModelName(id) {
68
- for (const prefix of PROVIDER_PREFIXES) {
69
- if (id.startsWith(prefix)) {
70
- return id.slice(prefix.length);
71
- }
72
- }
73
- // Fallback: strip first colon-separated segment
74
- const parts = id.split(":");
75
- return parts.length > 1 ? parts.slice(1).join(":") : id;
76
- }
77
- /**
78
- * Extract the LLM provider family from a promptfoo provider ID.
79
- *
80
- * "openai:chat:gpt-5.2" → "openai"
81
- * "anthropic:messages:claude-opus-4-6" → "anthropic"
82
- * "google:gemini-2.5-pro" → "google"
83
- */
84
- export function extractProvider(id) {
85
- const colon = id.indexOf(":");
86
- return colon > 0 ? id.slice(0, colon) : "openai";
87
- }
88
- /** Load prompt templates from config/prompts.yaml. Throws if missing or malformed. */
89
- export function loadPrompts(rootDir) {
90
- const promptsPath = resolve(rootDir, "config", "prompts.yaml");
91
- if (!existsSync(promptsPath)) {
92
- throw new Error(`config/prompts.yaml not found at ${promptsPath}. This file is required — it defines the prompt templates for all evaluation modes.`);
93
- }
94
- const raw = readFileSync(promptsPath, "utf-8");
95
- const data = load(raw);
96
- const toPrompt = (entry) => ({
97
- id: entry.id,
98
- label: entry.label,
99
- raw: entry.template,
100
- });
101
- if (!data["with-docs"] || !data["without-docs"] || !data["agentic"]) {
102
- const missing = ["with-docs", "without-docs", "agentic"].filter((k) => !data[k]);
103
- throw new Error(`config/prompts.yaml is missing required keys: ${missing.join(", ")}. Each prompt must have id, label, and template fields.`);
104
- }
105
- return {
106
- agentic: toPrompt(data["agentic"]),
107
- withDocs: toPrompt(data["with-docs"]),
108
- withoutDocs: toPrompt(data["without-docs"]),
109
- };
110
- }
111
- // ---------------------------------------------------------------------------
112
- // Task discovery
113
- // ---------------------------------------------------------------------------
114
- /** Merge default config with model-specific config */
115
- export function mergeConfig(defaults, modelConfig, overrides) {
116
- const result = {};
117
- // Only pick scalar defaults (temperature, max_tokens)
118
- for (const [key, value] of Object.entries(defaults)) {
119
- if (typeof value !== "object" || value === null) {
120
- result[key] = value;
121
- }
122
- }
123
- // Model-specific overrides
124
- if (modelConfig) {
125
- Object.assign(result, modelConfig);
126
- }
127
- // Mode-specific overrides
128
- if (overrides) {
129
- Object.assign(result, overrides);
130
- }
131
- return result;
132
- }
133
- // ---------------------------------------------------------------------------
134
- // Prompt loading
135
- // ---------------------------------------------------------------------------
136
- export function modelMatchesMode(model, mode) {
137
- if (!model.modes || model.modes.length === 0) {
138
- return true;
139
- }
140
- return model.modes.includes(mode);
141
- }
142
- function loadModels() {
143
- const raw = readFileSync(resolve(ROOT, "config", "models.yaml"), "utf-8");
144
- return load(raw);
145
- }
146
- // ---------------------------------------------------------------------------
147
- // Shared components
148
- // ---------------------------------------------------------------------------
149
- const URL_EXTRACTION_ASSERT = {
150
- type: "javascript",
151
- value: `const urlPattern = /https?:\\/\\/[^\\s\\)\\"\\'\\\`>]+/g;
152
- const urls = [...new Set((output.match(urlPattern) || []))];
153
- const sanityUrls = urls.filter(u => u.includes('sanity.io'));
154
- return {
155
- pass: true,
156
- score: 1,
157
- reason: JSON.stringify({
158
- sanityUrls,
159
- otherUrls: urls.filter(u => !u.includes('sanity.io')),
160
- totalUrlCount: urls.length,
161
- sanityUrlCount: sanityUrls.length
162
- })
163
- };`,
164
- weight: 0,
165
- };
166
- /**
167
- * Source isolation assertion — advisory (weight: 0).
168
- * Verifies that the agentic provider only fetched docs from allowed origins.
169
- * Only injected when origin sandboxing is active (DOC_ALLOWED_ORIGINS is set).
170
- */
171
- const SOURCE_ISOLATION_ASSERT = {
172
- metadata: { dimension: "source-isolation" },
173
- type: "javascript",
174
- value: "file://dist/assertions/source-isolation.js",
175
- weight: 0,
176
- };
177
- // ---------------------------------------------------------------------------
178
- // Config generators
179
- // ---------------------------------------------------------------------------
180
- function generateAgenticConfig(models, tests, prompts, source) {
181
- const naiveModels = models.models.filter((m) => modelMatchesMode(m, "agentic-naive"));
182
- const optimizedModels = models.models.filter((m) => modelMatchesMode(m, "agentic-optimized"));
183
- const providers = [];
184
- // Build doc source config to inject into providers
185
- // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string means unset
186
- const searchMode = process.env.EVAL_SEARCH_MODE || "open";
187
- const sourceConfig = source
188
- ? {
189
- ...(source.allowedOrigins?.length
190
- ? { allowedOrigins: source.allowedOrigins }
191
- : {}),
192
- docBaseUrl: source.baseUrl,
193
- ...(source.headers && Object.keys(source.headers).length > 0
194
- ? { customHeaders: source.headers }
195
- : {}),
196
- llmsTxtUrl: source.llmsTxt,
197
- ...(source.priorityDomain
198
- ? { priorityDomain: source.priorityDomain }
199
- : {}),
200
- // Tool access control: search mode for web_search behavior
201
- ...(searchMode !== "open" ? { searchMode } : {}),
202
- }
203
- : {};
204
- for (const model of naiveModels) {
205
- const modelName = extractModelName(model.id);
206
- const provider = extractProvider(model.id);
207
- providers.push({
208
- config: {
209
- ...mergeConfig(models.defaults, model.config, {
210
- agentMode: "naive",
211
- maxToolRounds: models.defaults.maxToolRounds ?? 5,
212
- model: modelName,
213
- provider,
214
- }),
215
- ...sourceConfig,
216
- observe: true,
217
- observerOptions: models.defaults.observerOptions ?? {},
218
- },
219
- id: "file://dist/agent-observer/agentic-provider.js",
220
- label: `${model.label} (Naive Agent)`,
221
- });
222
- }
223
- for (const model of optimizedModels) {
224
- const modelName = extractModelName(model.id);
225
- const provider = extractProvider(model.id);
226
- providers.push({
227
- config: {
228
- ...mergeConfig(models.defaults, model.config, {
229
- agentMode: "optimized",
230
- maxToolRounds: models.defaults.maxToolRounds ?? 5,
231
- model: modelName,
232
- provider,
233
- }),
234
- ...sourceConfig,
235
- observe: true,
236
- observerOptions: models.defaults.observerOptions ?? {},
237
- },
238
- id: "file://dist/agent-observer/agentic-provider.js",
239
- label: `${model.label} (Optimized Agent)`,
240
- });
241
- }
242
- // Inject source isolation assertion when origin sandboxing is active
243
- const hasOriginSandbox = Boolean(process.env.DOC_ALLOWED_ORIGINS);
244
- const agenticAssertions = hasOriginSandbox ? [SOURCE_ISOLATION_ASSERT] : [];
245
- return {
246
- commandLineOptions: { table: false },
247
- defaultTest: {
248
- ...(agenticAssertions.length > 0 ? { assert: agenticAssertions } : {}),
249
- options: {
250
- provider: models.grader.id,
251
- rubricProvider: models.grader.id,
252
- },
253
- },
254
- description: "Sanity AI Literacy Evaluation — Agentic (naive vs optimized)",
255
- ...(models.maxConcurrency
256
- ? { evaluateOptions: { maxConcurrency: models.maxConcurrency } }
257
- : {}),
258
- outputPath: "results/latest/eval-results-agentic.json",
259
- prompts: [prompts.agentic],
260
- providers,
261
- tests,
262
- };
263
- }
264
- function generateBaselineConfig(models, tests, prompts) {
265
- const baselineModels = models.models.filter((m) => modelMatchesMode(m, "baseline"));
266
- const providers = baselineModels.map((model) => ({
267
- config: mergeConfig(models.defaults, model.config),
268
- id: model.id,
269
- label: model.label,
270
- }));
271
- return {
272
- commandLineOptions: { table: false },
273
- defaultTest: {
274
- assert: [URL_EXTRACTION_ASSERT],
275
- options: {
276
- provider: models.grader.id,
277
- rubricProvider: models.grader.id,
278
- },
279
- },
280
- description: "Sanity AI Literacy Evaluation — Baseline",
281
- ...(models.maxConcurrency
282
- ? { evaluateOptions: { maxConcurrency: models.maxConcurrency } }
283
- : {}),
284
- outputPath: "results/latest/eval-results.json",
285
- prompts: [prompts.withDocs, prompts.withoutDocs],
286
- providers,
287
- tests,
288
- };
289
- }
290
- function generateObservedConfig(models, tests, prompts) {
291
- const observedModels = models.models.filter((m) => modelMatchesMode(m, "observed"));
292
- const providers = observedModels.map((model) => {
293
- const modelName = extractModelName(model.id);
294
- return {
295
- config: {
296
- ...mergeConfig(models.defaults, model.config),
297
- modelName,
298
- observe: true,
299
- recordOptions: models.defaults.observerOptions ?? {},
300
- },
301
- id: "file://dist/agent-observer/provider.js",
302
- label: `${model.label} (Observed)`,
303
- };
304
- });
305
- return {
306
- commandLineOptions: { table: false },
307
- defaultTest: {
308
- options: {
309
- provider: models.grader.id,
310
- rubricProvider: models.grader.id,
311
- },
312
- },
313
- description: "Sanity AI Literacy Evaluation — Observed",
314
- ...(models.maxConcurrency
315
- ? { evaluateOptions: { maxConcurrency: models.maxConcurrency } }
316
- : {}),
317
- outputPath: "results/latest/eval-results-observed.json",
318
- prompts: [prompts.withDocs, prompts.withoutDocs],
319
- providers,
320
- tests,
321
- };
322
- }
323
- // ---------------------------------------------------------------------------
324
- // File writing
325
- // ---------------------------------------------------------------------------
326
- function main() {
327
- // Validate config/models.yaml before generating configs
328
- const modelIssues = validateModelsYaml(ROOT);
329
- const modelErrors = modelIssues.filter((i) => i.severity === "error");
330
- if (modelErrors.length > 0) {
331
- console.error("❌ config/models.yaml validation failed:");
332
- for (const e of modelErrors) {
333
- console.error(` ERROR: ${e.message}`);
334
- if (e.path) {
335
- console.error(` at ${e.path}`);
336
- }
337
- }
338
- console.error("\nFix config/models.yaml before generating configs. Run 'pnpm validate' for details.");
339
- process.exit(1);
340
- }
341
- console.log("Loading config/models.yaml...");
342
- const models = loadModels();
343
- const activeModels = models.models.filter((m) => m.id && m.label);
344
- console.log(` Found ${activeModels.length} active model(s):`);
345
- for (const m of activeModels) {
346
- // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty array join → "all"
347
- const modes = m.modes?.join(", ") || "all";
348
- console.log(` - ${m.label} (${m.id}) → [${modes}]`);
349
- }
350
- // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty label falls through to id
351
- console.log(` Grader: ${models.grader.label || models.grader.id}`);
352
- // Expand tasks: read single-definition tasks and generate gold+baseline pairs
353
- // Build filter options from environment variables (set by pipeline.ts)
354
- const filterAreas = process.env.EVAL_FILTER_AREAS;
355
- const filterTaskIds = process.env.EVAL_FILTER_TASKS;
356
- const filter = filterAreas || filterTaskIds
357
- ? {
358
- areas: filterAreas
359
- ? filterAreas.split(",").map((a) => a.trim())
360
- : undefined,
361
- taskIds: filterTaskIds
362
- ? filterTaskIds.split(",").map((t) => t.trim())
363
- : undefined,
364
- }
365
- : undefined;
366
- // Expand tasks for baseline/observed mode (gold + baseline with prompt filters)
367
- const { entries, stats } = loadAndExpandTasks(ROOT, filter, "baseline");
368
- console.log(` Expanded ${stats.singleDefinitions} task(s) → ${stats.expandedTotal} test entries`);
369
- if (stats.legacyEntries > 0) {
370
- console.log(` ⚠ ${stats.legacyEntries} legacy (paired) entries passed through unchanged`);
371
- }
372
- if (filter) {
373
- const parts = [];
374
- if (filter.areas) {
375
- parts.push(`areas: ${filter.areas.join(", ")}`);
376
- }
377
- if (filter.taskIds) {
378
- parts.push(`tasks: ${filter.taskIds.join(", ")}`);
379
- }
380
- console.log(` Scoped to: ${parts.join("; ")}`);
381
- }
382
- // Expand tasks for agentic mode (gold entries only, no prompt filters).
383
- // Agentic mode has a single prompt that doesn't use {{docs}}, so baseline
384
- // entries would produce identical prompts — pure waste of API calls.
385
- const { entries: agenticEntries, stats: agenticStats } = loadAndExpandTasks(ROOT, filter, "agentic");
386
- console.log(` Agentic: ${agenticStats.expandedTotal} entries (gold only, no baseline)`);
387
- // Write expanded tasks to generated files for Promptfoo to consume
388
- const expandedPath = resolve(ROOT, "tasks", ".expanded.yaml");
389
- const expandedYaml = dump(entries, {
390
- forceQuotes: false,
391
- lineWidth: 120,
392
- noRefs: true,
393
- quotingType: "'",
394
- });
395
- writeFileSync(expandedPath, `# .expanded.yaml\n#\n# AUTO-GENERATED — do not edit directly.\n# Source: tasks/*.yaml (single-definition format)\n# Run: pnpm generate-configs\n\n${expandedYaml}`, "utf-8");
396
- console.log(` ✓ tasks/.expanded.yaml (${entries.length} entries)`);
397
- const agenticExpandedPath = resolve(ROOT, "tasks", ".expanded.agentic.yaml");
398
- const agenticExpandedYaml = dump(agenticEntries, {
399
- forceQuotes: false,
400
- lineWidth: 120,
401
- noRefs: true,
402
- quotingType: "'",
403
- });
404
- writeFileSync(agenticExpandedPath, `# .expanded.agentic.yaml\n#\n# AUTO-GENERATED — do not edit directly.\n# Gold entries only (no baseline) for agentic evaluation mode.\n# Source: tasks/*.yaml (single-definition format)\n# Run: pnpm generate-configs\n\n${agenticExpandedYaml}`, "utf-8");
405
- console.log(` ✓ tasks/.expanded.agentic.yaml (${agenticEntries.length} entries)`);
406
- const taskFiles = ["file://tasks/.expanded.yaml"];
407
- const agenticTaskFiles = ["file://tasks/.expanded.agentic.yaml"];
408
- // Load prompt templates
409
- const prompts = loadPrompts(ROOT);
410
- console.log(` Loaded prompts: ${Object.keys(prompts).join(", ")}`);
411
- // Load optional documentation source configuration
412
- // Usage: pnpm generate-configs [source-name]
413
- // Or: DOC_SOURCE=branch pnpm generate-configs
414
- const sourceName = process.argv[2] || process.env.DOC_SOURCE;
415
- let source;
416
- if (sourceName) {
417
- console.log(`\nLoading source: ${sourceName}`);
418
- try {
419
- source = loadSource(sourceName);
420
- console.log(` Base URL: ${source.baseUrl}`);
421
- console.log(` Dataset: ${source.dataset}`);
422
- if (source.allowedOrigins?.length) {
423
- console.log(` Allowed origins: ${source.allowedOrigins.join(", ")}`);
424
- }
425
- }
426
- catch (err) {
427
- const msg = err instanceof Error ? err.message : String(err);
428
- console.warn(`\n⚠ Failed to load source "${sourceName}": ${msg}`);
429
- }
430
- }
431
- console.log("\nGenerating configs...");
432
- writeConfig("promptfooconfig.yaml", generateBaselineConfig(models, taskFiles, prompts), `# promptfooconfig.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`);
433
- writeConfig("promptfooconfig.observed.yaml", generateObservedConfig(models, taskFiles, prompts), `# promptfooconfig.observed.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`);
434
- writeConfig("promptfooconfig.agentic.yaml", generateAgenticConfig(models, agenticTaskFiles, prompts, source), `# promptfooconfig.agentic.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`);
435
- console.log("\nDone! Configs are ready.");
436
- if (source) {
437
- console.log(` (using doc source: ${sourceName})`);
438
- }
439
- }
440
- // ---------------------------------------------------------------------------
441
- // Main
442
- // ---------------------------------------------------------------------------
443
- function writeConfig(filename, config, header) {
444
- const yamlStr = dump(config, {
445
- forceQuotes: false,
446
- lineWidth: 120,
447
- noRefs: true,
448
- quotingType: "'",
449
- });
450
- const content = `${header}\n${yamlStr}`;
451
- const outPath = resolve(ROOT, filename);
452
- writeFileSync(outPath, content, "utf-8");
453
- console.log(` ✓ ${filename}`);
454
- }
455
- // Only run when invoked directly (not when imported for testing)
456
- if (process.argv[1]?.endsWith("generate-configs.ts") ||
457
- process.argv[1]?.endsWith("generate-configs.js")) {
458
- main();
459
- }
@@ -1,27 +0,0 @@
1
- /**
2
- * grader-api.ts
3
- *
4
- * Shared utility for calling LLM grading APIs from grader scripts.
5
- *
6
- * Dispatches to the correct provider API (OpenAI, Anthropic) based on the
7
- * grader model prefix. Reads the appropriate API key from environment.
8
- *
9
- * Also exports `loadGraderModel()` to resolve the grader from
10
- * `config/models.yaml`.
11
- */
12
- /**
13
- * Call the grader model once to score a response against a rubric.
14
- *
15
- * Dispatches to the correct provider API based on the model prefix.
16
- * Returns a numeric score (0–100) or null if the call or parse fails.
17
- */
18
- export declare function gradeOnce(graderModel: string, responseText: string, rubricText: string): Promise<null | number>;
19
- /**
20
- * Load the grader model from `config/models.yaml`.
21
- * Returns both the model ID and human-readable label.
22
- * Falls back to `openai:gpt-5` if not configured.
23
- */
24
- export declare function loadGraderModel(): {
25
- id: string;
26
- label: string;
27
- };
@@ -1,206 +0,0 @@
1
- /**
2
- * grader-api.ts
3
- *
4
- * Shared utility for calling LLM grading APIs from grader scripts.
5
- *
6
- * Dispatches to the correct provider API (OpenAI, Anthropic) based on the
7
- * grader model prefix. Reads the appropriate API key from environment.
8
- *
9
- * Also exports `loadGraderModel()` to resolve the grader from
10
- * `config/models.yaml`.
11
- */
12
- import { config as dotenvConfig } from "dotenv";
13
- import { existsSync, readFileSync } from "fs";
14
- import { dirname, join, resolve } from "path";
15
- import { fileURLToPath } from "url";
16
- import { load } from "js-yaml";
17
// Directory containing this module, and the eval root two levels above it.
const __dirname = dirname(fileURLToPath(import.meta.url));
const EVAL_ROOT = resolve(__dirname, "..", "..");
// The .env file is looked up two directories above EVAL_ROOT so that API keys
// are available even when this module is run from inside the eval package.
// `override: true` is passed so values from the file take precedence.
const rootEnvPath = resolve(EVAL_ROOT, "..", "..", ".env");
const hasRootEnv = existsSync(rootEnvPath);
if (hasRootEnv) {
    dotenvConfig({ override: true, path: rootEnvPath });
}
25
- // ---------------------------------------------------------------------------
26
- // Public API
27
- // ---------------------------------------------------------------------------
28
/**
 * Ask the grader model to score one response against a rubric.
 *
 * The provider is the text before the first ":" in `graderModel`; only the
 * first 8000 characters of `responseText` are included in the prompt.
 *
 * @param {string} graderModel - Promptfoo-style model ID, e.g. `openai:gpt-5`.
 * @param {string} responseText - The assistant response being graded.
 * @param {string} rubricText - The rubric the grader should apply.
 * @returns {Promise<null | number>} The score parsed from the grader's reply,
 *   or null when the API call fails or no score can be parsed.
 * @throws {Error} Whatever `resolveProvider` throws: it runs outside the
 *   try/catch, so configuration errors are not converted to null.
 */
export async function gradeOnce(graderModel, responseText, rubricText) {
    const providerConfig = resolveProvider(graderModel);
    const prompt = `You are evaluating an AI assistant's response. Grade the response according to the following rubric.

## Response to evaluate:
${responseText.slice(0, 8000)}

## Rubric:
${rubricText}
`;
    try {
        const [providerName] = graderModel.split(":");
        let reply;
        switch (providerName) {
            case "anthropic":
                reply = await callAnthropic(providerConfig, prompt);
                break;
            case "openai":
                reply = await callOpenAI(providerConfig, prompt);
                break;
            default:
                // resolveProvider already rejects unknown providers; defensive only.
                return null;
        }
        if (reply === null) {
            return null;
        }
        const score = extractScore(reply);
        if (score === null) {
            console.error(`  ⚠ Could not parse grader response: ${reply.slice(0, 100)}`);
        }
        return score;
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.error(`  ⚠ Grader call failed: ${reason}`);
        return null;
    }
}
70
- // ---------------------------------------------------------------------------
71
- // Grader model configuration and provider-specific API calls
72
- // ---------------------------------------------------------------------------
73
/**
 * Read the grader model settings from `config/models.yaml` under EVAL_ROOT.
 *
 * Logs an error and exits the process with code 1 when the file is missing.
 *
 * @returns {{ id: string, label: string }} The grader's model ID and label;
 *   `openai:gpt-5` / `GPT-5 (grader)` are used for whichever is not configured.
 */
export function loadGraderModel() {
    const modelsFile = join(EVAL_ROOT, "config", "models.yaml");
    if (!existsSync(modelsFile)) {
        console.error("❌ config/models.yaml not found");
        process.exit(1);
    }
    const grader = load(readFileSync(modelsFile, "utf-8"))?.grader;
    const id = grader?.id ?? "openai:gpt-5";
    const label = grader?.label ?? "GPT-5 (grader)";
    return { id, label };
}
91
/**
 * POST the prompt to the Anthropic endpoint in `config.baseUrl`.
 *
 * @param {{ apiKey: string, baseUrl: string, modelName: string }} config
 * @param {string} prompt - Sent as a single user message.
 * @returns {Promise<null | string>} Text of the first `text` content block
 *   ("" when there is none), or null when the HTTP status is not OK.
 */
async function callAnthropic(config, prompt) {
    const requestInit = {
        body: JSON.stringify({
            max_tokens: 256,
            messages: [{ content: prompt, role: "user" }],
            model: config.modelName,
            temperature: 0.2,
        }),
        headers: {
            "anthropic-version": "2023-06-01",
            "Content-Type": "application/json",
            "x-api-key": config.apiKey,
        },
        method: "POST",
    };
    const response = await fetch(config.baseUrl, requestInit);
    if (response.ok) {
        const payload = await response.json();
        const firstText = payload.content?.find((block) => block.type === "text");
        return firstText?.text ?? "";
    }
    // Non-2xx: log a truncated error body and signal failure with null.
    const errorBody = await response.text();
    console.error(`  ⚠ Grader API error (Anthropic): ${response.status} ${errorBody.slice(0, 200)}`);
    return null;
}
115
- // ---------------------------------------------------------------------------
116
- // Provider-specific API calls
117
- // ---------------------------------------------------------------------------
118
/**
 * POST the prompt to the OpenAI chat endpoint in `config.baseUrl`.
 *
 * @param {{ apiKey: string, baseUrl: string, modelName: string }} config
 * @param {string} prompt - Sent as a single user message.
 * @returns {Promise<null | string>} Content of the first choice's message
 *   ("" when absent), or null when the HTTP status is not OK.
 */
async function callOpenAI(config, prompt) {
    // NOTE(review): some newer OpenAI models may reject `max_tokens` or a
    // non-default `temperature` — confirm against the configured grader model.
    const requestInit = {
        body: JSON.stringify({
            max_tokens: 256,
            messages: [{ content: prompt, role: "user" }],
            model: config.modelName,
            temperature: 0.2,
        }),
        headers: {
            Authorization: `Bearer ${config.apiKey}`,
            "Content-Type": "application/json",
        },
        method: "POST",
    };
    const response = await fetch(config.baseUrl, requestInit);
    if (response.ok) {
        const payload = await response.json();
        return payload.choices?.[0]?.message?.content ?? "";
    }
    // Non-2xx: log a truncated error body and signal failure with null.
    const errorBody = await response.text();
    console.error(`  ⚠ Grader API error (OpenAI): ${response.status} ${errorBody.slice(0, 200)}`);
    return null;
}
140
- // ---------------------------------------------------------------------------
141
- // Score extraction and provider resolution
142
- // ---------------------------------------------------------------------------
143
/**
 * Extract a 0–100 score from the grader's reply.
 *
 * Strategies, in order:
 *   1. JSON with a numeric `score` field — either the whole reply or the
 *      outermost `{...}` span (covers replies wrapped in Markdown code fences
 *      or surrounded by prose).
 *   2. A labelled value such as `Score: 70` or `"score": "70"`.
 *   3. The first bare number in the reply.
 *
 * A value outside 0–100 is treated as a misparse and yields null, so callers
 * never receive a number outside the documented range.
 *
 * @param {string} content - Raw text returned by the grader model.
 * @returns {null | number} The score, or null when none can be extracted.
 */
function extractScore(content) {
    if (typeof content !== "string") {
        return null;
    }
    const inRange = (value) => Number.isFinite(value) && value >= 0 && value <= 100;
    const text = content.trim();
    // Candidate JSON payloads: the reply itself, then the outermost {...} span.
    const candidates = [text];
    const open = text.indexOf("{");
    const close = text.lastIndexOf("}");
    if (open !== -1 && close > open) {
        candidates.push(text.slice(open, close + 1));
    }
    for (const candidate of candidates) {
        try {
            const parsed = JSON.parse(candidate);
            if (typeof parsed === "object" &&
                parsed !== null &&
                typeof parsed.score === "number") {
                return inRange(parsed.score) ? parsed.score : null;
            }
        }
        catch {
            // Not JSON — try the next candidate or fall through.
        }
    }
    // Prefer an explicitly labelled score over unrelated numbers in the prose.
    const labelled = text.match(/\bscore\b["']?\s*[:=]\s*["']?(\d+(?:\.\d+)?)/i);
    if (labelled) {
        const value = Number.parseFloat(labelled[1]);
        return inRange(value) ? value : null;
    }
    // Last resort: the first bare number anywhere in the reply.
    const bare = text.match(/\d+(?:\.\d+)?/);
    if (bare) {
        const value = Number.parseFloat(bare[0]);
        return inRange(value) ? value : null;
    }
    return null;
}
163
/**
 * Parse a Promptfoo-style model ID and resolve the provider config.
 *
 * Supported formats:
 *   - `openai:chat:gpt-5.2`                          → OpenAI, model = `gpt-5.2`
 *   - `openai:gpt-5`                                 → OpenAI, model = `gpt-5`
 *   - `openai:ft:gpt-4o-mini:acme::abc`              → OpenAI, model = `ft:gpt-4o-mini:acme::abc`
 *   - `anthropic:messages:claude-opus-4-5-20251101`  → Anthropic, model = `claude-opus-4-5-20251101`
 *   - `anthropic:claude-sonnet-4`                    → Anthropic, model = `claude-sonnet-4`
 *
 * The second segment is dropped only when it is a known endpoint selector and
 * a model name follows it; otherwise everything after the provider is the
 * model name, so IDs that themselves contain ":" are preserved.
 *
 * @param {string} graderModel - Model ID of the form `provider:[endpoint:]model`.
 * @returns {{ apiKey: string, baseUrl: string, modelName: string }}
 * @throws {Error} If the provider is unsupported, the model name is missing,
 *   or the provider's API key environment variable is not set.
 */
function resolveProvider(graderModel) {
    const providers = new Map([
        ["anthropic", {
                apiKeyEnv: "ANTHROPIC_API_KEY",
                baseUrl: "https://api.anthropic.com/v1/messages",
                endpointSegments: ["messages"],
            }],
        ["openai", {
                apiKeyEnv: "OPENAI_API_KEY",
                baseUrl: "https://api.openai.com/v1/chat/completions",
                endpointSegments: ["chat", "completion", "responses"],
            }],
    ]);
    const [provider, ...segments] = graderModel.split(":");
    const spec = providers.get(provider);
    if (spec === undefined) {
        throw new Error(`Unsupported grader provider "${provider}" in model "${graderModel}". ` +
            "Supported: openai, anthropic.");
    }
    // Strip the endpoint selector only when a model name actually follows it.
    const hasEndpointSegment = segments.length >= 2 && spec.endpointSegments.includes(segments[0]);
    const modelName = (hasEndpointSegment ? segments.slice(1) : segments).join(":");
    if (modelName === "") {
        throw new Error(`Missing model name in grader model "${graderModel}". ` +
            `Expected "${provider}:<model>".`);
    }
    const apiKey = process.env[spec.apiKeyEnv];
    if (!apiKey) {
        throw new Error(`${spec.apiKeyEnv} not set. Required for grader model: ${graderModel}`);
    }
    return {
        apiKey,
        baseUrl: spec.baseUrl,
        modelName,
    };
}