@sanity/ailf 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/models.yaml +3 -2
- package/dist/_vendor/ailf-core/types/index.d.ts +53 -0
- package/dist/composition-root.js +7 -2
- package/dist/orchestration/pipeline-orchestrator.js +27 -2
- package/dist/orchestration/step-runner.js +8 -0
- package/dist/orchestration/steps/calculate-scores-step.js +4 -0
- package/dist/orchestration/steps/generate-configs-step.js +1 -0
- package/dist/orchestration/steps/grader-consistency-step.js +1 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +2 -1
- package/dist/pipeline/calculate-scores.d.ts +5 -0
- package/dist/pipeline/calculate-scores.js +219 -146
- package/dist/pipeline/coverage-audit.d.ts +2 -1
- package/dist/pipeline/coverage-audit.js +5 -3
- package/dist/pipeline/expand-tasks.d.ts +2 -1
- package/dist/pipeline/expand-tasks.js +33 -2
- package/dist/pipeline/generate-configs.d.ts +3 -1
- package/dist/pipeline/generate-configs.js +47 -28
- package/dist/pipeline/grader-api.d.ts +2 -1
- package/dist/pipeline/grader-api.js +11 -9
- package/dist/pipeline/grader-compare-runner.d.ts +3 -0
- package/dist/pipeline/grader-compare-runner.js +21 -19
- package/dist/pipeline/grader-consistency-runner.d.ts +3 -0
- package/dist/pipeline/grader-consistency-runner.js +16 -14
- package/dist/pipeline/grader-sensitivity-runner.d.ts +3 -0
- package/dist/pipeline/grader-sensitivity-runner.js +18 -16
- package/dist/pipeline/grader-validate-runner.d.ts +3 -0
- package/dist/pipeline/grader-validate-runner.js +16 -14
- package/dist/pipeline/mirror-repo-tasks.d.ts +3 -1
- package/dist/pipeline/mirror-repo-tasks.js +8 -6
- package/dist/pipeline/provenance.d.ts +3 -0
- package/dist/pipeline/provenance.js +25 -3
- package/dist/sources.d.ts +2 -1
- package/dist/sources.js +28 -1
- package/package.json +3 -3
|
@@ -9,6 +9,7 @@
|
|
|
9
9
|
*
|
|
10
10
|
* @see docs/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
11
11
|
*/
|
|
12
|
+
import type { Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
12
13
|
import type { CoverageAuditReport, ProductFeature } from "./types.js";
|
|
13
14
|
/**
|
|
14
15
|
* Count unique document slugs referenced across all tasks.
|
|
@@ -32,7 +33,7 @@ export declare function formatCoverageMarkdown(report: CoverageAuditReport): str
|
|
|
32
33
|
/**
|
|
33
34
|
* Load and validate the feature registry from config/features.yaml.
|
|
34
35
|
*/
|
|
35
|
-
export declare function loadFeatureRegistry(rootDir: string): null | ProductFeature[];
|
|
36
|
+
export declare function loadFeatureRegistry(rootDir: string, logger?: Logger): null | ProductFeature[];
|
|
36
37
|
/**
|
|
37
38
|
* Run the coverage audit and produce a structured report.
|
|
38
39
|
*/
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
import { existsSync, readFileSync } from "fs";
|
|
13
13
|
import { join } from "path";
|
|
14
14
|
import { load } from "js-yaml";
|
|
15
|
+
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
15
16
|
import { FeatureRegistrySchema } from "./schemas.js";
|
|
16
17
|
import { resolveMappings } from "./resolve-mappings.js";
|
|
17
18
|
// ---------------------------------------------------------------------------
|
|
@@ -113,7 +114,8 @@ export function formatCoverageMarkdown(report) {
|
|
|
113
114
|
/**
|
|
114
115
|
* Load and validate the feature registry from config/features.yaml.
|
|
115
116
|
*/
|
|
116
|
-
export function loadFeatureRegistry(rootDir) {
|
|
117
|
+
export function loadFeatureRegistry(rootDir, logger) {
|
|
118
|
+
const log = logger ?? new ConsoleLogger();
|
|
117
119
|
const filePath = join(rootDir, "config", "features.yaml");
|
|
118
120
|
if (!existsSync(filePath)) {
|
|
119
121
|
return null;
|
|
@@ -122,9 +124,9 @@ export function loadFeatureRegistry(rootDir) {
|
|
|
122
124
|
const parsed = load(raw);
|
|
123
125
|
const result = FeatureRegistrySchema.safeParse(parsed);
|
|
124
126
|
if (!result.success) {
|
|
125
|
-
|
|
127
|
+
log.error("❌ config/features.yaml validation failed:");
|
|
126
128
|
for (const issue of result.error.issues) {
|
|
127
|
-
|
|
129
|
+
log.error(` ${issue.path.join(".")}: ${issue.message}`);
|
|
128
130
|
}
|
|
129
131
|
return null;
|
|
130
132
|
}
|
|
@@ -41,6 +41,7 @@
|
|
|
41
41
|
* 2. Baseline entry — sets docs: "", adds transform, uses abbreviated rubric
|
|
42
42
|
*/
|
|
43
43
|
import type { TaskDefinition } from "../_vendor/ailf-core/index.d.ts";
|
|
44
|
+
import type { Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
44
45
|
import { type RubricConfig } from "./schemas.js";
|
|
45
46
|
import type { FilterOptions } from "./types.js";
|
|
46
47
|
/** Any assertion entry (templated or value-based). */
|
|
@@ -198,7 +199,7 @@ export declare function isTemplatedAssert(entry: AssertEntry): entry is Template
|
|
|
198
199
|
*
|
|
199
200
|
* Returns the expanded entries grouped by source file.
|
|
200
201
|
*/
|
|
201
|
-
export declare function loadAndExpandTasks(rootDir: string, filter?: FilterOptions, mode?: "agentic" | "baseline"): {
|
|
202
|
+
export declare function loadAndExpandTasks(rootDir: string, filter?: FilterOptions, mode?: "agentic" | "baseline", logger?: Logger): {
|
|
202
203
|
/** All expanded test entries, in order. */
|
|
203
204
|
entries: ExpandedTestEntry[];
|
|
204
205
|
/** Statistics about what was processed. */
|
|
@@ -43,6 +43,7 @@
|
|
|
43
43
|
import { existsSync, readFileSync, readdirSync } from "fs";
|
|
44
44
|
import { resolve } from "path";
|
|
45
45
|
import { load } from "js-yaml";
|
|
46
|
+
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
46
47
|
import { RubricConfigSchema } from "./schemas.js";
|
|
47
48
|
// ---------------------------------------------------------------------------
|
|
48
49
|
// Rubric template loading and assembly
|
|
@@ -313,9 +314,13 @@ export function isTemplatedAssert(entry) {
|
|
|
313
314
|
*
|
|
314
315
|
* Returns the expanded entries grouped by source file.
|
|
315
316
|
*/
|
|
316
|
-
export function loadAndExpandTasks(rootDir, filter, mode = "baseline") {
|
|
317
|
+
export function loadAndExpandTasks(rootDir, filter, mode = "baseline", logger) {
|
|
318
|
+
const log = logger ?? new ConsoleLogger();
|
|
317
319
|
const tasksDir = resolve(rootDir, "tasks");
|
|
318
320
|
if (!existsSync(tasksDir)) {
|
|
321
|
+
log.debug("Tasks directory not found, returning empty (tasks may come from Content Lake)", {
|
|
322
|
+
tasksDir,
|
|
323
|
+
});
|
|
319
324
|
// tasks/ may not exist when task definitions come from Content Lake
|
|
320
325
|
return {
|
|
321
326
|
entries: [],
|
|
@@ -332,13 +337,25 @@ export function loadAndExpandTasks(rootDir, filter, mode = "baseline") {
|
|
|
332
337
|
let yamlFiles = readdirSync(tasksDir)
|
|
333
338
|
.filter((f) => (f.endsWith(".yaml") || f.endsWith(".yml")) && !f.startsWith("."))
|
|
334
339
|
.sort();
|
|
340
|
+
log.debug("Discovered task YAML files", {
|
|
341
|
+
directory: tasksDir,
|
|
342
|
+
fileCount: yamlFiles.length,
|
|
343
|
+
files: yamlFiles,
|
|
344
|
+
});
|
|
335
345
|
// Apply area filter — area name = filename stem (e.g., "groq" matches "groq.yaml")
|
|
336
346
|
if (filter?.areas && filter.areas.length > 0) {
|
|
337
347
|
const allowedAreas = new Set(filter.areas.map((a) => a.toLowerCase()));
|
|
348
|
+
const beforeCount = yamlFiles.length;
|
|
338
349
|
yamlFiles = yamlFiles.filter((f) => {
|
|
339
350
|
const stem = f.replace(/\.ya?ml$/, "").toLowerCase();
|
|
340
351
|
return allowedAreas.has(stem);
|
|
341
352
|
});
|
|
353
|
+
log.debug("Applied area filter", {
|
|
354
|
+
allowedAreas: [...allowedAreas],
|
|
355
|
+
beforeCount,
|
|
356
|
+
afterCount: yamlFiles.length,
|
|
357
|
+
matchedFiles: yamlFiles,
|
|
358
|
+
});
|
|
342
359
|
}
|
|
343
360
|
const entries = [];
|
|
344
361
|
let singleDefinitions = 0;
|
|
@@ -353,10 +370,17 @@ export function loadAndExpandTasks(rootDir, filter, mode = "baseline") {
|
|
|
353
370
|
// Apply task ID filter
|
|
354
371
|
if (filter?.taskIds && filter.taskIds.length > 0) {
|
|
355
372
|
const allowedIds = new Set(filter.taskIds);
|
|
373
|
+
const beforeCount = parsed.length;
|
|
356
374
|
parsed = parsed.filter((entry) => typeof entry === "object" &&
|
|
357
375
|
entry !== null &&
|
|
358
376
|
"id" in entry &&
|
|
359
377
|
allowedIds.has(entry.id));
|
|
378
|
+
log.debug("Applied task ID filter", {
|
|
379
|
+
file,
|
|
380
|
+
allowedIds: [...allowedIds],
|
|
381
|
+
beforeCount,
|
|
382
|
+
afterCount: parsed.length,
|
|
383
|
+
});
|
|
360
384
|
}
|
|
361
385
|
for (const entry of parsed) {
|
|
362
386
|
if (isSingleTaskDefinition(entry)) {
|
|
@@ -370,8 +394,15 @@ export function loadAndExpandTasks(rootDir, filter, mode = "baseline") {
|
|
|
370
394
|
}
|
|
371
395
|
}
|
|
372
396
|
if (filter?.areas || filter?.taskIds) {
|
|
373
|
-
|
|
397
|
+
log.info(`Filter: ${filter.areas ? `areas=[${filter.areas.join(", ")}]` : ""}${filter.areas && filter.taskIds ? ", " : ""}${filter.taskIds ? `tasks=[${filter.taskIds.join(", ")}]` : ""}`);
|
|
374
398
|
}
|
|
399
|
+
log.debug("Task expansion complete", {
|
|
400
|
+
totalFiles: yamlFiles.length,
|
|
401
|
+
singleDefinitions,
|
|
402
|
+
legacyEntries,
|
|
403
|
+
expandedTotal: entries.length,
|
|
404
|
+
mode,
|
|
405
|
+
});
|
|
375
406
|
return {
|
|
376
407
|
entries,
|
|
377
408
|
stats: {
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
* @see config/models.yaml — the central model registry
|
|
19
19
|
* @see docs/exec-plans/eliminate-lib-layer.md
|
|
20
20
|
*/
|
|
21
|
-
import { type TaskDefinition } from "../_vendor/ailf-core/index.d.ts";
|
|
21
|
+
import { type Logger, type TaskDefinition } from "../_vendor/ailf-core/index.d.ts";
|
|
22
22
|
import type { FilterOptions } from "./types.js";
|
|
23
23
|
import { type ResolvedSourceConfig } from "../sources.js";
|
|
24
24
|
export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "../_vendor/ailf-core/index.d.ts";
|
|
@@ -49,6 +49,8 @@ export interface GenerateConfigsOptions {
|
|
|
49
49
|
allowedOrigins?: string[];
|
|
50
50
|
/** Filter to specific feature areas or task IDs */
|
|
51
51
|
filter?: FilterOptions;
|
|
52
|
+
/** Logger instance (defaults to ConsoleLogger) */
|
|
53
|
+
logger?: Logger;
|
|
52
54
|
/** Pre-resolved source config (skips loadSource() call) */
|
|
53
55
|
resolvedSource?: ResolvedSourceConfig;
|
|
54
56
|
/** Root directory of the eval package (required) */
|
|
@@ -22,6 +22,7 @@ import { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } fro
|
|
|
22
22
|
import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
|
|
23
23
|
import { resolve } from "path";
|
|
24
24
|
import { dump, load } from "js-yaml";
|
|
25
|
+
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
25
26
|
import { expandTaskDefinitions, loadAndExpandTasks } from "./expand-tasks.js";
|
|
26
27
|
import { validateModelsYaml } from "./validate.js";
|
|
27
28
|
import { loadSource } from "../sources.js";
|
|
@@ -260,6 +261,7 @@ function generateObservedConfig(models, tests, prompts) {
|
|
|
260
261
|
*/
|
|
261
262
|
export function generateConfigs(options) {
|
|
262
263
|
const { rootDir } = options;
|
|
264
|
+
const log = options.logger ?? new ConsoleLogger();
|
|
263
265
|
// Validate config/models.yaml before generating configs
|
|
264
266
|
const modelIssues = validateModelsYaml(rootDir);
|
|
265
267
|
const modelErrors = modelIssues.filter((i) => i.severity === "error");
|
|
@@ -269,17 +271,24 @@ export function generateConfigs(options) {
|
|
|
269
271
|
.join("; ");
|
|
270
272
|
throw new Error(`config/models.yaml validation failed: ${details}. Run 'pnpm validate' for details.`);
|
|
271
273
|
}
|
|
272
|
-
|
|
274
|
+
log.info("Loading config/models.yaml...");
|
|
273
275
|
const models = loadModels(rootDir);
|
|
274
276
|
const activeModels = models.models.filter((m) => m.id && m.label);
|
|
275
|
-
|
|
277
|
+
log.debug("Models loaded from config/models.yaml", {
|
|
278
|
+
totalModels: models.models.length,
|
|
279
|
+
activeModels: activeModels.length,
|
|
280
|
+
modelIds: activeModels.map((m) => m.id),
|
|
281
|
+
graderId: models.grader.id,
|
|
282
|
+
maxConcurrency: models.maxConcurrency,
|
|
283
|
+
});
|
|
284
|
+
log.info(` Found ${activeModels.length} active model(s):`);
|
|
276
285
|
for (const m of activeModels) {
|
|
277
286
|
// oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty array join → "all"
|
|
278
287
|
const modes = m.modes?.join(", ") || "all";
|
|
279
|
-
|
|
288
|
+
log.info(` - ${m.label} (${m.id}) → [${modes}]`);
|
|
280
289
|
}
|
|
281
290
|
// oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty label falls through to id
|
|
282
|
-
|
|
291
|
+
log.info(` Grader: ${models.grader.label || models.grader.id}`);
|
|
283
292
|
// Build filter from options
|
|
284
293
|
const filter = options.filter?.areas || options.filter?.taskIds
|
|
285
294
|
? options.filter
|
|
@@ -290,20 +299,24 @@ export function generateConfigs(options) {
|
|
|
290
299
|
let agenticEntries;
|
|
291
300
|
if (options.tasks) {
|
|
292
301
|
// TaskSource path — tasks already loaded and filtered by the adapter
|
|
302
|
+
log.debug("Expanding tasks from TaskSource adapter", {
|
|
303
|
+
taskCount: options.tasks.length,
|
|
304
|
+
taskIds: options.tasks.map((t) => t.id),
|
|
305
|
+
});
|
|
293
306
|
const baselineResult = expandTaskDefinitions(options.tasks, rootDir, "baseline");
|
|
294
307
|
entries = baselineResult.entries;
|
|
295
|
-
|
|
308
|
+
log.info(` Expanded ${baselineResult.stats.totalTasks} task(s) → ${baselineResult.stats.expandedTotal} test entries (from TaskSource)`);
|
|
296
309
|
const agenticResult = expandTaskDefinitions(options.tasks, rootDir, "agentic");
|
|
297
310
|
agenticEntries = agenticResult.entries;
|
|
298
|
-
|
|
311
|
+
log.info(` Agentic: ${agenticResult.stats.expandedTotal} entries (gold only, no baseline)`);
|
|
299
312
|
}
|
|
300
313
|
else {
|
|
301
314
|
// Legacy path — read from tasks/*.yaml files
|
|
302
|
-
const { entries: baselineEntries, stats } = loadAndExpandTasks(rootDir, filter, "baseline");
|
|
315
|
+
const { entries: baselineEntries, stats } = loadAndExpandTasks(rootDir, filter, "baseline", log);
|
|
303
316
|
entries = baselineEntries;
|
|
304
|
-
|
|
317
|
+
log.info(` Expanded ${stats.singleDefinitions} task(s) → ${stats.expandedTotal} test entries`);
|
|
305
318
|
if (stats.legacyEntries > 0) {
|
|
306
|
-
|
|
319
|
+
log.info(` ⚠ ${stats.legacyEntries} legacy (paired) entries passed through unchanged`);
|
|
307
320
|
}
|
|
308
321
|
if (filter) {
|
|
309
322
|
const parts = [];
|
|
@@ -313,11 +326,11 @@ export function generateConfigs(options) {
|
|
|
313
326
|
if (filter.taskIds) {
|
|
314
327
|
parts.push(`tasks: ${filter.taskIds.join(", ")}`);
|
|
315
328
|
}
|
|
316
|
-
|
|
329
|
+
log.info(` Scoped to: ${parts.join("; ")}`);
|
|
317
330
|
}
|
|
318
|
-
const { entries: agenticFromYaml, stats: agenticStats } = loadAndExpandTasks(rootDir, filter, "agentic");
|
|
331
|
+
const { entries: agenticFromYaml, stats: agenticStats } = loadAndExpandTasks(rootDir, filter, "agentic", log);
|
|
319
332
|
agenticEntries = agenticFromYaml;
|
|
320
|
-
|
|
333
|
+
log.info(` Agentic: ${agenticStats.expandedTotal} entries (gold only, no baseline)`);
|
|
321
334
|
}
|
|
322
335
|
// Write expanded tasks to generated files for Promptfoo to consume
|
|
323
336
|
const expandedPath = resolve(rootDir, "tasks", ".expanded.yaml");
|
|
@@ -328,7 +341,7 @@ export function generateConfigs(options) {
|
|
|
328
341
|
quotingType: "'",
|
|
329
342
|
});
|
|
330
343
|
writeFileSync(expandedPath, `# .expanded.yaml\n#\n# AUTO-GENERATED — do not edit directly.\n# Source: tasks/*.yaml (single-definition format)\n# Run: pnpm generate-configs\n\n${expandedYaml}`, "utf-8");
|
|
331
|
-
|
|
344
|
+
log.info(` ✓ tasks/.expanded.yaml (${entries.length} entries)`);
|
|
332
345
|
const agenticExpandedPath = resolve(rootDir, "tasks", ".expanded.agentic.yaml");
|
|
333
346
|
const agenticExpandedYaml = dump(agenticEntries, {
|
|
334
347
|
forceQuotes: false,
|
|
@@ -337,46 +350,52 @@ export function generateConfigs(options) {
|
|
|
337
350
|
quotingType: "'",
|
|
338
351
|
});
|
|
339
352
|
writeFileSync(agenticExpandedPath, `# .expanded.agentic.yaml\n#\n# AUTO-GENERATED — do not edit directly.\n# Gold entries only (no baseline) for agentic evaluation mode.\n# Source: tasks/*.yaml (single-definition format)\n# Run: pnpm generate-configs\n\n${agenticExpandedYaml}`, "utf-8");
|
|
340
|
-
|
|
353
|
+
log.info(` ✓ tasks/.expanded.agentic.yaml (${agenticEntries.length} entries)`);
|
|
341
354
|
const taskFiles = ["file://tasks/.expanded.yaml"];
|
|
342
355
|
const agenticTaskFiles = ["file://tasks/.expanded.agentic.yaml"];
|
|
343
356
|
// Load prompt templates
|
|
344
357
|
const prompts = loadPrompts(rootDir);
|
|
345
|
-
|
|
358
|
+
log.debug("Prompt templates loaded", {
|
|
359
|
+
keys: Object.keys(prompts),
|
|
360
|
+
withDocsId: prompts.withDocs.id,
|
|
361
|
+
withoutDocsId: prompts.withoutDocs.id,
|
|
362
|
+
agenticId: prompts.agentic.id,
|
|
363
|
+
});
|
|
364
|
+
log.info(` Loaded prompts: ${Object.keys(prompts).join(", ")}`);
|
|
346
365
|
// Load optional documentation source configuration
|
|
347
366
|
// Pre-resolved source wins over name-based lookup
|
|
348
367
|
let source = options.resolvedSource;
|
|
349
368
|
const sourceName = options.source;
|
|
350
369
|
if (!source && sourceName) {
|
|
351
|
-
|
|
370
|
+
log.info(`\nLoading source: ${sourceName}`);
|
|
352
371
|
try {
|
|
353
372
|
source = loadSource(sourceName);
|
|
354
373
|
}
|
|
355
374
|
catch (err) {
|
|
356
375
|
const msg = err instanceof Error ? err.message : String(err);
|
|
357
|
-
|
|
376
|
+
log.warn(`\n⚠ Failed to load source "${sourceName}": ${msg}`);
|
|
358
377
|
}
|
|
359
378
|
}
|
|
360
379
|
if (source) {
|
|
361
|
-
|
|
362
|
-
|
|
380
|
+
log.info(` Base URL: ${source.baseUrl}`);
|
|
381
|
+
log.info(` Dataset: ${source.dataset}`);
|
|
363
382
|
if (source.allowedOrigins?.length) {
|
|
364
|
-
|
|
383
|
+
log.info(` Allowed origins: ${source.allowedOrigins.join(", ")}`);
|
|
365
384
|
}
|
|
366
385
|
}
|
|
367
|
-
|
|
368
|
-
writeConfig(rootDir, "promptfooconfig.yaml", generateBaselineConfig(models, taskFiles, prompts), `# promptfooconfig.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n
|
|
369
|
-
writeConfig(rootDir, "promptfooconfig.observed.yaml", generateObservedConfig(models, taskFiles, prompts), `# promptfooconfig.observed.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n
|
|
370
|
-
writeConfig(rootDir, "promptfooconfig.agentic.yaml", generateAgenticConfig(models, agenticTaskFiles, prompts, source, options.searchMode, options.allowedOrigins), `# promptfooconfig.agentic.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n
|
|
371
|
-
|
|
386
|
+
log.info("\nGenerating configs...");
|
|
387
|
+
writeConfig(rootDir, "promptfooconfig.yaml", generateBaselineConfig(models, taskFiles, prompts), `# promptfooconfig.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`, log);
|
|
388
|
+
writeConfig(rootDir, "promptfooconfig.observed.yaml", generateObservedConfig(models, taskFiles, prompts), `# promptfooconfig.observed.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`, log);
|
|
389
|
+
writeConfig(rootDir, "promptfooconfig.agentic.yaml", generateAgenticConfig(models, agenticTaskFiles, prompts, source, options.searchMode, options.allowedOrigins), `# promptfooconfig.agentic.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`, log);
|
|
390
|
+
log.info("\nDone! Configs are ready.");
|
|
372
391
|
if (source) {
|
|
373
|
-
|
|
392
|
+
log.info(` (using doc source: ${sourceName})`);
|
|
374
393
|
}
|
|
375
394
|
}
|
|
376
395
|
// ---------------------------------------------------------------------------
|
|
377
396
|
// File writing
|
|
378
397
|
// ---------------------------------------------------------------------------
|
|
379
|
-
function writeConfig(rootDir, filename, config, header) {
|
|
398
|
+
function writeConfig(rootDir, filename, config, header, log) {
|
|
380
399
|
const yamlStr = dump(config, {
|
|
381
400
|
forceQuotes: false,
|
|
382
401
|
lineWidth: 120,
|
|
@@ -386,5 +405,5 @@ function writeConfig(rootDir, filename, config, header) {
|
|
|
386
405
|
const content = `${header}\n${yamlStr}`;
|
|
387
406
|
const outPath = resolve(rootDir, filename);
|
|
388
407
|
writeFileSync(outPath, content, "utf-8");
|
|
389
|
-
|
|
408
|
+
log.info(` ✓ ${filename}`);
|
|
390
409
|
}
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
* Migrated from lib/grader-api.ts — no module-level side effects, no
|
|
13
13
|
* process.exit(), accepts rootDir as parameter for file-based operations.
|
|
14
14
|
*/
|
|
15
|
+
import type { Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
15
16
|
interface ProviderConfig {
|
|
16
17
|
apiKey: string;
|
|
17
18
|
baseUrl: string;
|
|
@@ -23,7 +24,7 @@ interface ProviderConfig {
|
|
|
23
24
|
* Dispatches to the correct provider API based on the model prefix.
|
|
24
25
|
* Returns a numeric score (0–100) or null if the call or parse fails.
|
|
25
26
|
*/
|
|
26
|
-
export declare function gradeOnce(graderModel: string, responseText: string, rubricText: string): Promise<null | number>;
|
|
27
|
+
export declare function gradeOnce(graderModel: string, responseText: string, rubricText: string, logger?: Logger): Promise<null | number>;
|
|
27
28
|
/**
|
|
28
29
|
* Load the grader model from `config/models.yaml`.
|
|
29
30
|
* Returns both the model ID and human-readable label.
|
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
import { existsSync, readFileSync } from "fs";
|
|
16
16
|
import { join } from "path";
|
|
17
17
|
import { load } from "js-yaml";
|
|
18
|
+
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
18
19
|
// ---------------------------------------------------------------------------
|
|
19
20
|
// Public API
|
|
20
21
|
// ---------------------------------------------------------------------------
|
|
@@ -24,7 +25,8 @@ import { load } from "js-yaml";
|
|
|
24
25
|
* Dispatches to the correct provider API based on the model prefix.
|
|
25
26
|
* Returns a numeric score (0–100) or null if the call or parse fails.
|
|
26
27
|
*/
|
|
27
|
-
export async function gradeOnce(graderModel, responseText, rubricText) {
|
|
28
|
+
export async function gradeOnce(graderModel, responseText, rubricText, logger) {
|
|
29
|
+
const log = logger ?? new ConsoleLogger();
|
|
28
30
|
const config = resolveProvider(graderModel);
|
|
29
31
|
const prompt = `You are evaluating an AI assistant's response. Grade the response according to the following rubric.
|
|
30
32
|
|
|
@@ -38,10 +40,10 @@ ${rubricText}
|
|
|
38
40
|
const provider = graderModel.split(":")[0];
|
|
39
41
|
let content;
|
|
40
42
|
if (provider === "anthropic") {
|
|
41
|
-
content = await callAnthropic(config, prompt);
|
|
43
|
+
content = await callAnthropic(config, prompt, log);
|
|
42
44
|
}
|
|
43
45
|
else if (provider === "openai") {
|
|
44
|
-
content = await callOpenAI(config, prompt);
|
|
46
|
+
content = await callOpenAI(config, prompt, log);
|
|
45
47
|
}
|
|
46
48
|
else {
|
|
47
49
|
// resolveProvider already throws for unknown providers, but just in case
|
|
@@ -51,12 +53,12 @@ ${rubricText}
|
|
|
51
53
|
return null;
|
|
52
54
|
const score = extractScore(content);
|
|
53
55
|
if (score === null) {
|
|
54
|
-
|
|
56
|
+
log.error(` ⚠ Could not parse grader response: ${content.slice(0, 100)}`);
|
|
55
57
|
}
|
|
56
58
|
return score;
|
|
57
59
|
}
|
|
58
60
|
catch (err) {
|
|
59
|
-
|
|
61
|
+
log.error(` ⚠ Grader call failed: ${err instanceof Error ? err.message : String(err)}`);
|
|
60
62
|
return null;
|
|
61
63
|
}
|
|
62
64
|
}
|
|
@@ -152,7 +154,7 @@ export function resolveProvider(graderModel) {
|
|
|
152
154
|
// ---------------------------------------------------------------------------
|
|
153
155
|
// Provider-specific API calls
|
|
154
156
|
// ---------------------------------------------------------------------------
|
|
155
|
-
async function callAnthropic(config, prompt) {
|
|
157
|
+
async function callAnthropic(config, prompt, log) {
|
|
156
158
|
const response = await fetch(config.baseUrl, {
|
|
157
159
|
body: JSON.stringify({
|
|
158
160
|
max_tokens: 256,
|
|
@@ -169,14 +171,14 @@ async function callAnthropic(config, prompt) {
|
|
|
169
171
|
});
|
|
170
172
|
if (!response.ok) {
|
|
171
173
|
const text = await response.text();
|
|
172
|
-
|
|
174
|
+
log.error(` ⚠ Grader API error (Anthropic): ${response.status} ${text.slice(0, 200)}`);
|
|
173
175
|
return null;
|
|
174
176
|
}
|
|
175
177
|
const data = (await response.json());
|
|
176
178
|
const textBlock = data.content?.find((c) => c.type === "text");
|
|
177
179
|
return textBlock?.text ?? "";
|
|
178
180
|
}
|
|
179
|
-
async function callOpenAI(config, prompt) {
|
|
181
|
+
async function callOpenAI(config, prompt, log) {
|
|
180
182
|
const response = await fetch(config.baseUrl, {
|
|
181
183
|
body: JSON.stringify({
|
|
182
184
|
max_tokens: 256,
|
|
@@ -192,7 +194,7 @@ async function callOpenAI(config, prompt) {
|
|
|
192
194
|
});
|
|
193
195
|
if (!response.ok) {
|
|
194
196
|
const text = await response.text();
|
|
195
|
-
|
|
197
|
+
log.error(` ⚠ Grader API error (OpenAI): ${response.status} ${text.slice(0, 200)}`);
|
|
196
198
|
return null;
|
|
197
199
|
}
|
|
198
200
|
const data = (await response.json());
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
*
|
|
13
13
|
* @see docs/exec-plans/grader-reliability.md — Phase 3
|
|
14
14
|
*/
|
|
15
|
+
import type { Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
15
16
|
import { type GraderComparison } from "./grader-comparison.js";
|
|
16
17
|
export interface GraderCompareRunnerOptions {
|
|
17
18
|
/** Candidate grader models to compare against the baseline */
|
|
@@ -21,6 +22,8 @@ export interface GraderCompareRunnerOptions {
|
|
|
21
22
|
}[];
|
|
22
23
|
/** Output format */
|
|
23
24
|
format?: "json" | "table";
|
|
25
|
+
/** Logger instance (defaults to ConsoleLogger) */
|
|
26
|
+
logger?: Logger;
|
|
24
27
|
/** Custom output path (default: results/latest/grader-comparison.json) */
|
|
25
28
|
outputPath?: string;
|
|
26
29
|
/** Path to eval results (default: results/latest/eval-results.json) */
|
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
16
16
|
import { join } from "path";
|
|
17
17
|
import { load } from "js-yaml";
|
|
18
|
+
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
18
19
|
import { compareGraders, } from "./grader-comparison.js";
|
|
19
20
|
import { classifyCorrelation } from "./grader-validation.js";
|
|
20
21
|
import { gradeOnce } from "./grader-api.js";
|
|
@@ -198,10 +199,11 @@ export function formatComparisonReport(result) {
|
|
|
198
199
|
*/
|
|
199
200
|
export async function runGraderCompare(options) {
|
|
200
201
|
const { rootDir, format = "table" } = options;
|
|
202
|
+
const log = options.logger ?? new ConsoleLogger();
|
|
201
203
|
const resultsPath = options.resultsPath
|
|
202
204
|
? join(rootDir, options.resultsPath)
|
|
203
205
|
: join(rootDir, "results", "latest", "eval-results.json");
|
|
204
|
-
|
|
206
|
+
log.info("=== Grader Comparison ===\n");
|
|
205
207
|
// Load config
|
|
206
208
|
const { baseline, candidates } = loadConfig(rootDir, options.candidates);
|
|
207
209
|
if (candidates.length === 0) {
|
|
@@ -215,32 +217,32 @@ export async function runGraderCompare(options) {
|
|
|
215
217
|
const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
|
|
216
218
|
// Extract judgments
|
|
217
219
|
const judgments = extractJudgments(file);
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
220
|
+
log.info(` Baseline: ${baseline.label} (${baseline.id})`);
|
|
221
|
+
log.info(` Candidates: ${candidates.map((c) => c.label).join(", ")}`);
|
|
222
|
+
log.info(` Judgments: ${judgments.length}`);
|
|
221
223
|
if (judgments.length === 0) {
|
|
222
224
|
throw new Error("No gradable judgments found in results.");
|
|
223
225
|
}
|
|
224
226
|
const totalCalls = judgments.length * (1 + candidates.length);
|
|
225
227
|
const estimatedCost = totalCalls * 0.005;
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
228
|
+
log.info(` API calls: ${totalCalls} (${judgments.length} × ${1 + candidates.length} models)`);
|
|
229
|
+
log.info(` Est. cost: ~$${estimatedCost.toFixed(2)}`);
|
|
230
|
+
log.info("");
|
|
229
231
|
// Grade with baseline
|
|
230
|
-
|
|
231
|
-
const baselineScores = await gradeJudgments(judgments, baseline.id);
|
|
232
|
+
log.info(` Grading with baseline: ${baseline.label}...`);
|
|
233
|
+
const baselineScores = await gradeJudgments(judgments, baseline.id, log);
|
|
232
234
|
// Grade with each candidate
|
|
233
235
|
const candidateScoreSets = [];
|
|
234
236
|
for (const candidate of candidates) {
|
|
235
|
-
|
|
236
|
-
const scores = await gradeJudgments(judgments, candidate.id);
|
|
237
|
+
log.info(` Grading with candidate: ${candidate.label}...`);
|
|
238
|
+
const scores = await gradeJudgments(judgments, candidate.id, log);
|
|
237
239
|
candidateScoreSets.push({
|
|
238
240
|
label: candidate.label,
|
|
239
241
|
modelId: candidate.id,
|
|
240
242
|
scores,
|
|
241
243
|
});
|
|
242
244
|
}
|
|
243
|
-
|
|
245
|
+
log.info("");
|
|
244
246
|
// Compare
|
|
245
247
|
const baselineScoreSet = {
|
|
246
248
|
label: baseline.label,
|
|
@@ -250,10 +252,10 @@ export async function runGraderCompare(options) {
|
|
|
250
252
|
const result = compareGraders(baselineScoreSet, candidateScoreSets);
|
|
251
253
|
// Output
|
|
252
254
|
if (format === "table") {
|
|
253
|
-
|
|
255
|
+
log.info(formatComparisonReport(result));
|
|
254
256
|
}
|
|
255
257
|
else {
|
|
256
|
-
|
|
258
|
+
log.info(JSON.stringify(result, null, 2));
|
|
257
259
|
}
|
|
258
260
|
// Write output
|
|
259
261
|
const outPath = options.outputPath ??
|
|
@@ -261,7 +263,7 @@ export async function runGraderCompare(options) {
|
|
|
261
263
|
const outDir = join(outPath, "..");
|
|
262
264
|
mkdirSync(outDir, { recursive: true });
|
|
263
265
|
writeFileSync(outPath, JSON.stringify(result, null, 2));
|
|
264
|
-
|
|
266
|
+
log.info(`\n 📄 Results written to ${outPath}`);
|
|
265
267
|
return result;
|
|
266
268
|
}
|
|
267
269
|
// ---------------------------------------------------------------------------
|
|
@@ -271,7 +273,7 @@ export async function runGraderCompare(options) {
|
|
|
271
273
|
* Grade a set of judgments with a specific grader model.
|
|
272
274
|
* Returns GraderScore[] with one score per judgment.
|
|
273
275
|
*/
|
|
274
|
-
async function gradeJudgments(judgments, graderModel) {
|
|
276
|
+
async function gradeJudgments(judgments, graderModel, log) {
|
|
275
277
|
const scores = [];
|
|
276
278
|
let completed = 0;
|
|
277
279
|
let failed = 0;
|
|
@@ -280,7 +282,7 @@ async function gradeJudgments(judgments, graderModel) {
|
|
|
280
282
|
completed++;
|
|
281
283
|
if (completed % 10 === 0 || completed === judgments.length) {
|
|
282
284
|
const pct = Math.round((completed / judgments.length) * 100);
|
|
283
|
-
|
|
285
|
+
log.info(` Progress: ${completed}/${judgments.length} (${pct}%)`);
|
|
284
286
|
}
|
|
285
287
|
if (score === null) {
|
|
286
288
|
failed++;
|
|
@@ -293,9 +295,9 @@ async function gradeJudgments(judgments, graderModel) {
|
|
|
293
295
|
taskId: judgment.description,
|
|
294
296
|
});
|
|
295
297
|
}
|
|
296
|
-
|
|
298
|
+
log.info(""); // newline after progress
|
|
297
299
|
if (failed > 0) {
|
|
298
|
-
|
|
300
|
+
log.warn(` ⚠ ${failed} grading calls failed (excluded)`);
|
|
299
301
|
}
|
|
300
302
|
return scores;
|
|
301
303
|
}
|
|
@@ -14,11 +14,14 @@
|
|
|
14
14
|
*
|
|
15
15
|
* @see docs/exec-plans/grader-reliability.md — Phase 1
|
|
16
16
|
*/
|
|
17
|
+
import { type Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
17
18
|
import type { RawPromptfooFile } from "./calculate-scores.js";
|
|
18
19
|
import { type GraderConsistency } from "./grader-consistency.js";
|
|
19
20
|
import type { DimensionName } from "./types.js";
|
|
20
21
|
/** Options for the grader consistency runner. */
|
|
21
22
|
export interface GraderConsistencyRunnerOptions {
|
|
23
|
+
/** Logger for structured output. Falls back to ConsoleLogger if omitted. */
|
|
24
|
+
logger?: Logger;
|
|
22
25
|
/** Number of additional grading replications (default: 5) */
|
|
23
26
|
replications: number;
|
|
24
27
|
/** Path to eval-results.json */
|