@sanity/ailf 0.1.34 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/config/airbyte/ai_literacy_framework.connector.yaml +6 -0
- package/config/bigquery/views/reports.sql +1 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +10 -20
- package/dist/_vendor/ailf-core/examples/index.js +10 -20
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +2 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +65 -0
- package/dist/_vendor/ailf-tasks/schemas.d.ts +12 -0
- package/dist/_vendor/ailf-tasks/schemas.js +4 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +9 -1
- package/dist/adapters/task-sources/repo-task-source.js +19 -4
- package/dist/commands/calculate-scores.js +5 -1
- package/dist/commands/publish.js +3 -0
- package/dist/composition-root.js +7 -2
- package/dist/orchestration/pipeline-orchestrator.js +27 -2
- package/dist/orchestration/step-runner.js +8 -0
- package/dist/orchestration/steps/calculate-scores-step.js +22 -19
- package/dist/orchestration/steps/generate-configs-step.js +1 -0
- package/dist/orchestration/steps/grader-consistency-step.js +1 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +2 -1
- package/dist/orchestration/steps/publish-report-step.js +3 -0
- package/dist/pipeline/calculate-scores.d.ts +11 -1
- package/dist/pipeline/calculate-scores.js +222 -157
- package/dist/pipeline/coverage-audit.d.ts +2 -1
- package/dist/pipeline/coverage-audit.js +5 -3
- package/dist/pipeline/expand-tasks.d.ts +2 -1
- package/dist/pipeline/expand-tasks.js +33 -2
- package/dist/pipeline/generate-configs.d.ts +3 -1
- package/dist/pipeline/generate-configs.js +51 -37
- package/dist/pipeline/grader-api.d.ts +2 -1
- package/dist/pipeline/grader-api.js +11 -9
- package/dist/pipeline/grader-compare-runner.d.ts +3 -0
- package/dist/pipeline/grader-compare-runner.js +21 -19
- package/dist/pipeline/grader-consistency-runner.d.ts +3 -0
- package/dist/pipeline/grader-consistency-runner.js +16 -14
- package/dist/pipeline/grader-sensitivity-runner.d.ts +3 -0
- package/dist/pipeline/grader-sensitivity-runner.js +18 -16
- package/dist/pipeline/grader-validate-runner.d.ts +3 -0
- package/dist/pipeline/grader-validate-runner.js +16 -14
- package/dist/pipeline/mirror-repo-tasks.d.ts +80 -1
- package/dist/pipeline/mirror-repo-tasks.js +148 -32
- package/dist/pipeline/provenance.d.ts +3 -0
- package/dist/pipeline/provenance.js +25 -3
- package/dist/pipeline/report-title.d.ts +66 -0
- package/dist/pipeline/report-title.js +118 -0
- package/dist/report-store.js +2 -0
- package/dist/sinks/bigquery/index.d.ts +1 -0
- package/dist/sinks/bigquery/index.js +1 -0
- package/dist/sources.d.ts +2 -1
- package/dist/sources.js +28 -1
- package/package.json +23 -23
|
@@ -9,6 +9,7 @@
|
|
|
9
9
|
*
|
|
10
10
|
* @see docs/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
11
11
|
*/
|
|
12
|
+
import type { Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
12
13
|
import type { CoverageAuditReport, ProductFeature } from "./types.js";
|
|
13
14
|
/**
|
|
14
15
|
* Count unique document slugs referenced across all tasks.
|
|
@@ -32,7 +33,7 @@ export declare function formatCoverageMarkdown(report: CoverageAuditReport): str
|
|
|
32
33
|
/**
|
|
33
34
|
* Load and validate the feature registry from config/features.yaml.
|
|
34
35
|
*/
|
|
35
|
-
export declare function loadFeatureRegistry(rootDir: string): null | ProductFeature[];
|
|
36
|
+
export declare function loadFeatureRegistry(rootDir: string, logger?: Logger): null | ProductFeature[];
|
|
36
37
|
/**
|
|
37
38
|
* Run the coverage audit and produce a structured report.
|
|
38
39
|
*/
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
import { existsSync, readFileSync } from "fs";
|
|
13
13
|
import { join } from "path";
|
|
14
14
|
import { load } from "js-yaml";
|
|
15
|
+
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
15
16
|
import { FeatureRegistrySchema } from "./schemas.js";
|
|
16
17
|
import { resolveMappings } from "./resolve-mappings.js";
|
|
17
18
|
// ---------------------------------------------------------------------------
|
|
@@ -113,7 +114,8 @@ export function formatCoverageMarkdown(report) {
|
|
|
113
114
|
/**
|
|
114
115
|
* Load and validate the feature registry from config/features.yaml.
|
|
115
116
|
*/
|
|
116
|
-
export function loadFeatureRegistry(rootDir) {
|
|
117
|
+
export function loadFeatureRegistry(rootDir, logger) {
|
|
118
|
+
const log = logger ?? new ConsoleLogger();
|
|
117
119
|
const filePath = join(rootDir, "config", "features.yaml");
|
|
118
120
|
if (!existsSync(filePath)) {
|
|
119
121
|
return null;
|
|
@@ -122,9 +124,9 @@ export function loadFeatureRegistry(rootDir) {
|
|
|
122
124
|
const parsed = load(raw);
|
|
123
125
|
const result = FeatureRegistrySchema.safeParse(parsed);
|
|
124
126
|
if (!result.success) {
|
|
125
|
-
|
|
127
|
+
log.error("❌ config/features.yaml validation failed:");
|
|
126
128
|
for (const issue of result.error.issues) {
|
|
127
|
-
|
|
129
|
+
log.error(` ${issue.path.join(".")}: ${issue.message}`);
|
|
128
130
|
}
|
|
129
131
|
return null;
|
|
130
132
|
}
|
|
@@ -41,6 +41,7 @@
|
|
|
41
41
|
* 2. Baseline entry — sets docs: "", adds transform, uses abbreviated rubric
|
|
42
42
|
*/
|
|
43
43
|
import type { TaskDefinition } from "../_vendor/ailf-core/index.d.ts";
|
|
44
|
+
import type { Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
44
45
|
import { type RubricConfig } from "./schemas.js";
|
|
45
46
|
import type { FilterOptions } from "./types.js";
|
|
46
47
|
/** Any assertion entry (templated or value-based). */
|
|
@@ -198,7 +199,7 @@ export declare function isTemplatedAssert(entry: AssertEntry): entry is Template
|
|
|
198
199
|
*
|
|
199
200
|
* Returns the expanded entries grouped by source file.
|
|
200
201
|
*/
|
|
201
|
-
export declare function loadAndExpandTasks(rootDir: string, filter?: FilterOptions, mode?: "agentic" | "baseline"): {
|
|
202
|
+
export declare function loadAndExpandTasks(rootDir: string, filter?: FilterOptions, mode?: "agentic" | "baseline", logger?: Logger): {
|
|
202
203
|
/** All expanded test entries, in order. */
|
|
203
204
|
entries: ExpandedTestEntry[];
|
|
204
205
|
/** Statistics about what was processed. */
|
|
@@ -43,6 +43,7 @@
|
|
|
43
43
|
import { existsSync, readFileSync, readdirSync } from "fs";
|
|
44
44
|
import { resolve } from "path";
|
|
45
45
|
import { load } from "js-yaml";
|
|
46
|
+
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
46
47
|
import { RubricConfigSchema } from "./schemas.js";
|
|
47
48
|
// ---------------------------------------------------------------------------
|
|
48
49
|
// Rubric template loading and assembly
|
|
@@ -313,9 +314,13 @@ export function isTemplatedAssert(entry) {
|
|
|
313
314
|
*
|
|
314
315
|
* Returns the expanded entries grouped by source file.
|
|
315
316
|
*/
|
|
316
|
-
export function loadAndExpandTasks(rootDir, filter, mode = "baseline") {
|
|
317
|
+
export function loadAndExpandTasks(rootDir, filter, mode = "baseline", logger) {
|
|
318
|
+
const log = logger ?? new ConsoleLogger();
|
|
317
319
|
const tasksDir = resolve(rootDir, "tasks");
|
|
318
320
|
if (!existsSync(tasksDir)) {
|
|
321
|
+
log.debug("Tasks directory not found, returning empty (tasks may come from Content Lake)", {
|
|
322
|
+
tasksDir,
|
|
323
|
+
});
|
|
319
324
|
// tasks/ may not exist when task definitions come from Content Lake
|
|
320
325
|
return {
|
|
321
326
|
entries: [],
|
|
@@ -332,13 +337,25 @@ export function loadAndExpandTasks(rootDir, filter, mode = "baseline") {
|
|
|
332
337
|
let yamlFiles = readdirSync(tasksDir)
|
|
333
338
|
.filter((f) => (f.endsWith(".yaml") || f.endsWith(".yml")) && !f.startsWith("."))
|
|
334
339
|
.sort();
|
|
340
|
+
log.debug("Discovered task YAML files", {
|
|
341
|
+
directory: tasksDir,
|
|
342
|
+
fileCount: yamlFiles.length,
|
|
343
|
+
files: yamlFiles,
|
|
344
|
+
});
|
|
335
345
|
// Apply area filter — area name = filename stem (e.g., "groq" matches "groq.yaml")
|
|
336
346
|
if (filter?.areas && filter.areas.length > 0) {
|
|
337
347
|
const allowedAreas = new Set(filter.areas.map((a) => a.toLowerCase()));
|
|
348
|
+
const beforeCount = yamlFiles.length;
|
|
338
349
|
yamlFiles = yamlFiles.filter((f) => {
|
|
339
350
|
const stem = f.replace(/\.ya?ml$/, "").toLowerCase();
|
|
340
351
|
return allowedAreas.has(stem);
|
|
341
352
|
});
|
|
353
|
+
log.debug("Applied area filter", {
|
|
354
|
+
allowedAreas: [...allowedAreas],
|
|
355
|
+
beforeCount,
|
|
356
|
+
afterCount: yamlFiles.length,
|
|
357
|
+
matchedFiles: yamlFiles,
|
|
358
|
+
});
|
|
342
359
|
}
|
|
343
360
|
const entries = [];
|
|
344
361
|
let singleDefinitions = 0;
|
|
@@ -353,10 +370,17 @@ export function loadAndExpandTasks(rootDir, filter, mode = "baseline") {
|
|
|
353
370
|
// Apply task ID filter
|
|
354
371
|
if (filter?.taskIds && filter.taskIds.length > 0) {
|
|
355
372
|
const allowedIds = new Set(filter.taskIds);
|
|
373
|
+
const beforeCount = parsed.length;
|
|
356
374
|
parsed = parsed.filter((entry) => typeof entry === "object" &&
|
|
357
375
|
entry !== null &&
|
|
358
376
|
"id" in entry &&
|
|
359
377
|
allowedIds.has(entry.id));
|
|
378
|
+
log.debug("Applied task ID filter", {
|
|
379
|
+
file,
|
|
380
|
+
allowedIds: [...allowedIds],
|
|
381
|
+
beforeCount,
|
|
382
|
+
afterCount: parsed.length,
|
|
383
|
+
});
|
|
360
384
|
}
|
|
361
385
|
for (const entry of parsed) {
|
|
362
386
|
if (isSingleTaskDefinition(entry)) {
|
|
@@ -370,8 +394,15 @@ export function loadAndExpandTasks(rootDir, filter, mode = "baseline") {
|
|
|
370
394
|
}
|
|
371
395
|
}
|
|
372
396
|
if (filter?.areas || filter?.taskIds) {
|
|
373
|
-
|
|
397
|
+
log.info(`Filter: ${filter.areas ? `areas=[${filter.areas.join(", ")}]` : ""}${filter.areas && filter.taskIds ? ", " : ""}${filter.taskIds ? `tasks=[${filter.taskIds.join(", ")}]` : ""}`);
|
|
374
398
|
}
|
|
399
|
+
log.debug("Task expansion complete", {
|
|
400
|
+
totalFiles: yamlFiles.length,
|
|
401
|
+
singleDefinitions,
|
|
402
|
+
legacyEntries,
|
|
403
|
+
expandedTotal: entries.length,
|
|
404
|
+
mode,
|
|
405
|
+
});
|
|
375
406
|
return {
|
|
376
407
|
entries,
|
|
377
408
|
stats: {
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
* @see config/models.yaml — the central model registry
|
|
19
19
|
* @see docs/exec-plans/eliminate-lib-layer.md
|
|
20
20
|
*/
|
|
21
|
-
import { type TaskDefinition } from "../_vendor/ailf-core/index.d.ts";
|
|
21
|
+
import { type Logger, type TaskDefinition } from "../_vendor/ailf-core/index.d.ts";
|
|
22
22
|
import type { FilterOptions } from "./types.js";
|
|
23
23
|
import { type ResolvedSourceConfig } from "../sources.js";
|
|
24
24
|
export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "../_vendor/ailf-core/index.d.ts";
|
|
@@ -49,6 +49,8 @@ export interface GenerateConfigsOptions {
|
|
|
49
49
|
allowedOrigins?: string[];
|
|
50
50
|
/** Filter to specific feature areas or task IDs */
|
|
51
51
|
filter?: FilterOptions;
|
|
52
|
+
/** Logger instance (defaults to ConsoleLogger) */
|
|
53
|
+
logger?: Logger;
|
|
52
54
|
/** Pre-resolved source config (skips loadSource() call) */
|
|
53
55
|
resolvedSource?: ResolvedSourceConfig;
|
|
54
56
|
/** Root directory of the eval package (required) */
|
|
@@ -22,6 +22,7 @@ import { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } fro
|
|
|
22
22
|
import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
|
|
23
23
|
import { resolve } from "path";
|
|
24
24
|
import { dump, load } from "js-yaml";
|
|
25
|
+
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
25
26
|
import { expandTaskDefinitions, loadAndExpandTasks } from "./expand-tasks.js";
|
|
26
27
|
import { validateModelsYaml } from "./validate.js";
|
|
27
28
|
import { loadSource } from "../sources.js";
|
|
@@ -260,31 +261,34 @@ function generateObservedConfig(models, tests, prompts) {
|
|
|
260
261
|
*/
|
|
261
262
|
export function generateConfigs(options) {
|
|
262
263
|
const { rootDir } = options;
|
|
264
|
+
const log = options.logger ?? new ConsoleLogger();
|
|
263
265
|
// Validate config/models.yaml before generating configs
|
|
264
266
|
const modelIssues = validateModelsYaml(rootDir);
|
|
265
267
|
const modelErrors = modelIssues.filter((i) => i.severity === "error");
|
|
266
268
|
if (modelErrors.length > 0) {
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
console.error(` at ${e.path}`);
|
|
272
|
-
}
|
|
273
|
-
}
|
|
274
|
-
console.error("\nFix config/models.yaml before generating configs. Run 'pnpm validate' for details.");
|
|
275
|
-
process.exit(1);
|
|
269
|
+
const details = modelErrors
|
|
270
|
+
.map((e) => (e.path ? `${e.message} (at ${e.path})` : e.message))
|
|
271
|
+
.join("; ");
|
|
272
|
+
throw new Error(`config/models.yaml validation failed: ${details}. Run 'pnpm validate' for details.`);
|
|
276
273
|
}
|
|
277
|
-
|
|
274
|
+
log.info("Loading config/models.yaml...");
|
|
278
275
|
const models = loadModels(rootDir);
|
|
279
276
|
const activeModels = models.models.filter((m) => m.id && m.label);
|
|
280
|
-
|
|
277
|
+
log.debug("Models loaded from config/models.yaml", {
|
|
278
|
+
totalModels: models.models.length,
|
|
279
|
+
activeModels: activeModels.length,
|
|
280
|
+
modelIds: activeModels.map((m) => m.id),
|
|
281
|
+
graderId: models.grader.id,
|
|
282
|
+
maxConcurrency: models.maxConcurrency,
|
|
283
|
+
});
|
|
284
|
+
log.info(` Found ${activeModels.length} active model(s):`);
|
|
281
285
|
for (const m of activeModels) {
|
|
282
286
|
// oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty array join → "all"
|
|
283
287
|
const modes = m.modes?.join(", ") || "all";
|
|
284
|
-
|
|
288
|
+
log.info(` - ${m.label} (${m.id}) → [${modes}]`);
|
|
285
289
|
}
|
|
286
290
|
// oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty label falls through to id
|
|
287
|
-
|
|
291
|
+
log.info(` Grader: ${models.grader.label || models.grader.id}`);
|
|
288
292
|
// Build filter from options
|
|
289
293
|
const filter = options.filter?.areas || options.filter?.taskIds
|
|
290
294
|
? options.filter
|
|
@@ -295,20 +299,24 @@ export function generateConfigs(options) {
|
|
|
295
299
|
let agenticEntries;
|
|
296
300
|
if (options.tasks) {
|
|
297
301
|
// TaskSource path — tasks already loaded and filtered by the adapter
|
|
302
|
+
log.debug("Expanding tasks from TaskSource adapter", {
|
|
303
|
+
taskCount: options.tasks.length,
|
|
304
|
+
taskIds: options.tasks.map((t) => t.id),
|
|
305
|
+
});
|
|
298
306
|
const baselineResult = expandTaskDefinitions(options.tasks, rootDir, "baseline");
|
|
299
307
|
entries = baselineResult.entries;
|
|
300
|
-
|
|
308
|
+
log.info(` Expanded ${baselineResult.stats.totalTasks} task(s) → ${baselineResult.stats.expandedTotal} test entries (from TaskSource)`);
|
|
301
309
|
const agenticResult = expandTaskDefinitions(options.tasks, rootDir, "agentic");
|
|
302
310
|
agenticEntries = agenticResult.entries;
|
|
303
|
-
|
|
311
|
+
log.info(` Agentic: ${agenticResult.stats.expandedTotal} entries (gold only, no baseline)`);
|
|
304
312
|
}
|
|
305
313
|
else {
|
|
306
314
|
// Legacy path — read from tasks/*.yaml files
|
|
307
|
-
const { entries: baselineEntries, stats } = loadAndExpandTasks(rootDir, filter, "baseline");
|
|
315
|
+
const { entries: baselineEntries, stats } = loadAndExpandTasks(rootDir, filter, "baseline", log);
|
|
308
316
|
entries = baselineEntries;
|
|
309
|
-
|
|
317
|
+
log.info(` Expanded ${stats.singleDefinitions} task(s) → ${stats.expandedTotal} test entries`);
|
|
310
318
|
if (stats.legacyEntries > 0) {
|
|
311
|
-
|
|
319
|
+
log.info(` ⚠ ${stats.legacyEntries} legacy (paired) entries passed through unchanged`);
|
|
312
320
|
}
|
|
313
321
|
if (filter) {
|
|
314
322
|
const parts = [];
|
|
@@ -318,11 +326,11 @@ export function generateConfigs(options) {
|
|
|
318
326
|
if (filter.taskIds) {
|
|
319
327
|
parts.push(`tasks: ${filter.taskIds.join(", ")}`);
|
|
320
328
|
}
|
|
321
|
-
|
|
329
|
+
log.info(` Scoped to: ${parts.join("; ")}`);
|
|
322
330
|
}
|
|
323
|
-
const { entries: agenticFromYaml, stats: agenticStats } = loadAndExpandTasks(rootDir, filter, "agentic");
|
|
331
|
+
const { entries: agenticFromYaml, stats: agenticStats } = loadAndExpandTasks(rootDir, filter, "agentic", log);
|
|
324
332
|
agenticEntries = agenticFromYaml;
|
|
325
|
-
|
|
333
|
+
log.info(` Agentic: ${agenticStats.expandedTotal} entries (gold only, no baseline)`);
|
|
326
334
|
}
|
|
327
335
|
// Write expanded tasks to generated files for Promptfoo to consume
|
|
328
336
|
const expandedPath = resolve(rootDir, "tasks", ".expanded.yaml");
|
|
@@ -333,7 +341,7 @@ export function generateConfigs(options) {
|
|
|
333
341
|
quotingType: "'",
|
|
334
342
|
});
|
|
335
343
|
writeFileSync(expandedPath, `# .expanded.yaml\n#\n# AUTO-GENERATED — do not edit directly.\n# Source: tasks/*.yaml (single-definition format)\n# Run: pnpm generate-configs\n\n${expandedYaml}`, "utf-8");
|
|
336
|
-
|
|
344
|
+
log.info(` ✓ tasks/.expanded.yaml (${entries.length} entries)`);
|
|
337
345
|
const agenticExpandedPath = resolve(rootDir, "tasks", ".expanded.agentic.yaml");
|
|
338
346
|
const agenticExpandedYaml = dump(agenticEntries, {
|
|
339
347
|
forceQuotes: false,
|
|
@@ -342,46 +350,52 @@ export function generateConfigs(options) {
|
|
|
342
350
|
quotingType: "'",
|
|
343
351
|
});
|
|
344
352
|
writeFileSync(agenticExpandedPath, `# .expanded.agentic.yaml\n#\n# AUTO-GENERATED — do not edit directly.\n# Gold entries only (no baseline) for agentic evaluation mode.\n# Source: tasks/*.yaml (single-definition format)\n# Run: pnpm generate-configs\n\n${agenticExpandedYaml}`, "utf-8");
|
|
345
|
-
|
|
353
|
+
log.info(` ✓ tasks/.expanded.agentic.yaml (${agenticEntries.length} entries)`);
|
|
346
354
|
const taskFiles = ["file://tasks/.expanded.yaml"];
|
|
347
355
|
const agenticTaskFiles = ["file://tasks/.expanded.agentic.yaml"];
|
|
348
356
|
// Load prompt templates
|
|
349
357
|
const prompts = loadPrompts(rootDir);
|
|
350
|
-
|
|
358
|
+
log.debug("Prompt templates loaded", {
|
|
359
|
+
keys: Object.keys(prompts),
|
|
360
|
+
withDocsId: prompts.withDocs.id,
|
|
361
|
+
withoutDocsId: prompts.withoutDocs.id,
|
|
362
|
+
agenticId: prompts.agentic.id,
|
|
363
|
+
});
|
|
364
|
+
log.info(` Loaded prompts: ${Object.keys(prompts).join(", ")}`);
|
|
351
365
|
// Load optional documentation source configuration
|
|
352
366
|
// Pre-resolved source wins over name-based lookup
|
|
353
367
|
let source = options.resolvedSource;
|
|
354
368
|
const sourceName = options.source;
|
|
355
369
|
if (!source && sourceName) {
|
|
356
|
-
|
|
370
|
+
log.info(`\nLoading source: ${sourceName}`);
|
|
357
371
|
try {
|
|
358
372
|
source = loadSource(sourceName);
|
|
359
373
|
}
|
|
360
374
|
catch (err) {
|
|
361
375
|
const msg = err instanceof Error ? err.message : String(err);
|
|
362
|
-
|
|
376
|
+
log.warn(`\n⚠ Failed to load source "${sourceName}": ${msg}`);
|
|
363
377
|
}
|
|
364
378
|
}
|
|
365
379
|
if (source) {
|
|
366
|
-
|
|
367
|
-
|
|
380
|
+
log.info(` Base URL: ${source.baseUrl}`);
|
|
381
|
+
log.info(` Dataset: ${source.dataset}`);
|
|
368
382
|
if (source.allowedOrigins?.length) {
|
|
369
|
-
|
|
383
|
+
log.info(` Allowed origins: ${source.allowedOrigins.join(", ")}`);
|
|
370
384
|
}
|
|
371
385
|
}
|
|
372
|
-
|
|
373
|
-
writeConfig(rootDir, "promptfooconfig.yaml", generateBaselineConfig(models, taskFiles, prompts), `# promptfooconfig.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n
|
|
374
|
-
writeConfig(rootDir, "promptfooconfig.observed.yaml", generateObservedConfig(models, taskFiles, prompts), `# promptfooconfig.observed.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n
|
|
375
|
-
writeConfig(rootDir, "promptfooconfig.agentic.yaml", generateAgenticConfig(models, agenticTaskFiles, prompts, source, options.searchMode, options.allowedOrigins), `# promptfooconfig.agentic.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n
|
|
376
|
-
|
|
386
|
+
log.info("\nGenerating configs...");
|
|
387
|
+
writeConfig(rootDir, "promptfooconfig.yaml", generateBaselineConfig(models, taskFiles, prompts), `# promptfooconfig.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`, log);
|
|
388
|
+
writeConfig(rootDir, "promptfooconfig.observed.yaml", generateObservedConfig(models, taskFiles, prompts), `# promptfooconfig.observed.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`, log);
|
|
389
|
+
writeConfig(rootDir, "promptfooconfig.agentic.yaml", generateAgenticConfig(models, agenticTaskFiles, prompts, source, options.searchMode, options.allowedOrigins), `# promptfooconfig.agentic.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`, log);
|
|
390
|
+
log.info("\nDone! Configs are ready.");
|
|
377
391
|
if (source) {
|
|
378
|
-
|
|
392
|
+
log.info(` (using doc source: ${sourceName})`);
|
|
379
393
|
}
|
|
380
394
|
}
|
|
381
395
|
// ---------------------------------------------------------------------------
|
|
382
396
|
// File writing
|
|
383
397
|
// ---------------------------------------------------------------------------
|
|
384
|
-
function writeConfig(rootDir, filename, config, header) {
|
|
398
|
+
function writeConfig(rootDir, filename, config, header, log) {
|
|
385
399
|
const yamlStr = dump(config, {
|
|
386
400
|
forceQuotes: false,
|
|
387
401
|
lineWidth: 120,
|
|
@@ -391,5 +405,5 @@ function writeConfig(rootDir, filename, config, header) {
|
|
|
391
405
|
const content = `${header}\n${yamlStr}`;
|
|
392
406
|
const outPath = resolve(rootDir, filename);
|
|
393
407
|
writeFileSync(outPath, content, "utf-8");
|
|
394
|
-
|
|
408
|
+
log.info(` ✓ ${filename}`);
|
|
395
409
|
}
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
* Migrated from lib/grader-api.ts — no module-level side effects, no
|
|
13
13
|
* process.exit(), accepts rootDir as parameter for file-based operations.
|
|
14
14
|
*/
|
|
15
|
+
import type { Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
15
16
|
interface ProviderConfig {
|
|
16
17
|
apiKey: string;
|
|
17
18
|
baseUrl: string;
|
|
@@ -23,7 +24,7 @@ interface ProviderConfig {
|
|
|
23
24
|
* Dispatches to the correct provider API based on the model prefix.
|
|
24
25
|
* Returns a numeric score (0–100) or null if the call or parse fails.
|
|
25
26
|
*/
|
|
26
|
-
export declare function gradeOnce(graderModel: string, responseText: string, rubricText: string): Promise<null | number>;
|
|
27
|
+
export declare function gradeOnce(graderModel: string, responseText: string, rubricText: string, logger?: Logger): Promise<null | number>;
|
|
27
28
|
/**
|
|
28
29
|
* Load the grader model from `config/models.yaml`.
|
|
29
30
|
* Returns both the model ID and human-readable label.
|
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
import { existsSync, readFileSync } from "fs";
|
|
16
16
|
import { join } from "path";
|
|
17
17
|
import { load } from "js-yaml";
|
|
18
|
+
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
18
19
|
// ---------------------------------------------------------------------------
|
|
19
20
|
// Public API
|
|
20
21
|
// ---------------------------------------------------------------------------
|
|
@@ -24,7 +25,8 @@ import { load } from "js-yaml";
|
|
|
24
25
|
* Dispatches to the correct provider API based on the model prefix.
|
|
25
26
|
* Returns a numeric score (0–100) or null if the call or parse fails.
|
|
26
27
|
*/
|
|
27
|
-
export async function gradeOnce(graderModel, responseText, rubricText) {
|
|
28
|
+
export async function gradeOnce(graderModel, responseText, rubricText, logger) {
|
|
29
|
+
const log = logger ?? new ConsoleLogger();
|
|
28
30
|
const config = resolveProvider(graderModel);
|
|
29
31
|
const prompt = `You are evaluating an AI assistant's response. Grade the response according to the following rubric.
|
|
30
32
|
|
|
@@ -38,10 +40,10 @@ ${rubricText}
|
|
|
38
40
|
const provider = graderModel.split(":")[0];
|
|
39
41
|
let content;
|
|
40
42
|
if (provider === "anthropic") {
|
|
41
|
-
content = await callAnthropic(config, prompt);
|
|
43
|
+
content = await callAnthropic(config, prompt, log);
|
|
42
44
|
}
|
|
43
45
|
else if (provider === "openai") {
|
|
44
|
-
content = await callOpenAI(config, prompt);
|
|
46
|
+
content = await callOpenAI(config, prompt, log);
|
|
45
47
|
}
|
|
46
48
|
else {
|
|
47
49
|
// resolveProvider already throws for unknown providers, but just in case
|
|
@@ -51,12 +53,12 @@ ${rubricText}
|
|
|
51
53
|
return null;
|
|
52
54
|
const score = extractScore(content);
|
|
53
55
|
if (score === null) {
|
|
54
|
-
|
|
56
|
+
log.error(` ⚠ Could not parse grader response: ${content.slice(0, 100)}`);
|
|
55
57
|
}
|
|
56
58
|
return score;
|
|
57
59
|
}
|
|
58
60
|
catch (err) {
|
|
59
|
-
|
|
61
|
+
log.error(` ⚠ Grader call failed: ${err instanceof Error ? err.message : String(err)}`);
|
|
60
62
|
return null;
|
|
61
63
|
}
|
|
62
64
|
}
|
|
@@ -152,7 +154,7 @@ export function resolveProvider(graderModel) {
|
|
|
152
154
|
// ---------------------------------------------------------------------------
|
|
153
155
|
// Provider-specific API calls
|
|
154
156
|
// ---------------------------------------------------------------------------
|
|
155
|
-
async function callAnthropic(config, prompt) {
|
|
157
|
+
async function callAnthropic(config, prompt, log) {
|
|
156
158
|
const response = await fetch(config.baseUrl, {
|
|
157
159
|
body: JSON.stringify({
|
|
158
160
|
max_tokens: 256,
|
|
@@ -169,14 +171,14 @@ async function callAnthropic(config, prompt) {
|
|
|
169
171
|
});
|
|
170
172
|
if (!response.ok) {
|
|
171
173
|
const text = await response.text();
|
|
172
|
-
|
|
174
|
+
log.error(` ⚠ Grader API error (Anthropic): ${response.status} ${text.slice(0, 200)}`);
|
|
173
175
|
return null;
|
|
174
176
|
}
|
|
175
177
|
const data = (await response.json());
|
|
176
178
|
const textBlock = data.content?.find((c) => c.type === "text");
|
|
177
179
|
return textBlock?.text ?? "";
|
|
178
180
|
}
|
|
179
|
-
async function callOpenAI(config, prompt) {
|
|
181
|
+
async function callOpenAI(config, prompt, log) {
|
|
180
182
|
const response = await fetch(config.baseUrl, {
|
|
181
183
|
body: JSON.stringify({
|
|
182
184
|
max_tokens: 256,
|
|
@@ -192,7 +194,7 @@ async function callOpenAI(config, prompt) {
|
|
|
192
194
|
});
|
|
193
195
|
if (!response.ok) {
|
|
194
196
|
const text = await response.text();
|
|
195
|
-
|
|
197
|
+
log.error(` ⚠ Grader API error (OpenAI): ${response.status} ${text.slice(0, 200)}`);
|
|
196
198
|
return null;
|
|
197
199
|
}
|
|
198
200
|
const data = (await response.json());
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
*
|
|
13
13
|
* @see docs/exec-plans/grader-reliability.md — Phase 3
|
|
14
14
|
*/
|
|
15
|
+
import type { Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
15
16
|
import { type GraderComparison } from "./grader-comparison.js";
|
|
16
17
|
export interface GraderCompareRunnerOptions {
|
|
17
18
|
/** Candidate grader models to compare against the baseline */
|
|
@@ -21,6 +22,8 @@ export interface GraderCompareRunnerOptions {
|
|
|
21
22
|
}[];
|
|
22
23
|
/** Output format */
|
|
23
24
|
format?: "json" | "table";
|
|
25
|
+
/** Logger instance (defaults to ConsoleLogger) */
|
|
26
|
+
logger?: Logger;
|
|
24
27
|
/** Custom output path (default: results/latest/grader-comparison.json) */
|
|
25
28
|
outputPath?: string;
|
|
26
29
|
/** Path to eval results (default: results/latest/eval-results.json) */
|
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
16
16
|
import { join } from "path";
|
|
17
17
|
import { load } from "js-yaml";
|
|
18
|
+
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
18
19
|
import { compareGraders, } from "./grader-comparison.js";
|
|
19
20
|
import { classifyCorrelation } from "./grader-validation.js";
|
|
20
21
|
import { gradeOnce } from "./grader-api.js";
|
|
@@ -198,10 +199,11 @@ export function formatComparisonReport(result) {
|
|
|
198
199
|
*/
|
|
199
200
|
export async function runGraderCompare(options) {
|
|
200
201
|
const { rootDir, format = "table" } = options;
|
|
202
|
+
const log = options.logger ?? new ConsoleLogger();
|
|
201
203
|
const resultsPath = options.resultsPath
|
|
202
204
|
? join(rootDir, options.resultsPath)
|
|
203
205
|
: join(rootDir, "results", "latest", "eval-results.json");
|
|
204
|
-
|
|
206
|
+
log.info("=== Grader Comparison ===\n");
|
|
205
207
|
// Load config
|
|
206
208
|
const { baseline, candidates } = loadConfig(rootDir, options.candidates);
|
|
207
209
|
if (candidates.length === 0) {
|
|
@@ -215,32 +217,32 @@ export async function runGraderCompare(options) {
|
|
|
215
217
|
const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
|
|
216
218
|
// Extract judgments
|
|
217
219
|
const judgments = extractJudgments(file);
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
220
|
+
log.info(` Baseline: ${baseline.label} (${baseline.id})`);
|
|
221
|
+
log.info(` Candidates: ${candidates.map((c) => c.label).join(", ")}`);
|
|
222
|
+
log.info(` Judgments: ${judgments.length}`);
|
|
221
223
|
if (judgments.length === 0) {
|
|
222
224
|
throw new Error("No gradable judgments found in results.");
|
|
223
225
|
}
|
|
224
226
|
const totalCalls = judgments.length * (1 + candidates.length);
|
|
225
227
|
const estimatedCost = totalCalls * 0.005;
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
228
|
+
log.info(` API calls: ${totalCalls} (${judgments.length} × ${1 + candidates.length} models)`);
|
|
229
|
+
log.info(` Est. cost: ~$${estimatedCost.toFixed(2)}`);
|
|
230
|
+
log.info("");
|
|
229
231
|
// Grade with baseline
|
|
230
|
-
|
|
231
|
-
const baselineScores = await gradeJudgments(judgments, baseline.id);
|
|
232
|
+
log.info(` Grading with baseline: ${baseline.label}...`);
|
|
233
|
+
const baselineScores = await gradeJudgments(judgments, baseline.id, log);
|
|
232
234
|
// Grade with each candidate
|
|
233
235
|
const candidateScoreSets = [];
|
|
234
236
|
for (const candidate of candidates) {
|
|
235
|
-
|
|
236
|
-
const scores = await gradeJudgments(judgments, candidate.id);
|
|
237
|
+
log.info(` Grading with candidate: ${candidate.label}...`);
|
|
238
|
+
const scores = await gradeJudgments(judgments, candidate.id, log);
|
|
237
239
|
candidateScoreSets.push({
|
|
238
240
|
label: candidate.label,
|
|
239
241
|
modelId: candidate.id,
|
|
240
242
|
scores,
|
|
241
243
|
});
|
|
242
244
|
}
|
|
243
|
-
|
|
245
|
+
log.info("");
|
|
244
246
|
// Compare
|
|
245
247
|
const baselineScoreSet = {
|
|
246
248
|
label: baseline.label,
|
|
@@ -250,10 +252,10 @@ export async function runGraderCompare(options) {
|
|
|
250
252
|
const result = compareGraders(baselineScoreSet, candidateScoreSets);
|
|
251
253
|
// Output
|
|
252
254
|
if (format === "table") {
|
|
253
|
-
|
|
255
|
+
log.info(formatComparisonReport(result));
|
|
254
256
|
}
|
|
255
257
|
else {
|
|
256
|
-
|
|
258
|
+
log.info(JSON.stringify(result, null, 2));
|
|
257
259
|
}
|
|
258
260
|
// Write output
|
|
259
261
|
const outPath = options.outputPath ??
|
|
@@ -261,7 +263,7 @@ export async function runGraderCompare(options) {
|
|
|
261
263
|
const outDir = join(outPath, "..");
|
|
262
264
|
mkdirSync(outDir, { recursive: true });
|
|
263
265
|
writeFileSync(outPath, JSON.stringify(result, null, 2));
|
|
264
|
-
|
|
266
|
+
log.info(`\n 📄 Results written to ${outPath}`);
|
|
265
267
|
return result;
|
|
266
268
|
}
|
|
267
269
|
// ---------------------------------------------------------------------------
|
|
@@ -271,7 +273,7 @@ export async function runGraderCompare(options) {
|
|
|
271
273
|
* Grade a set of judgments with a specific grader model.
|
|
272
274
|
* Returns GraderScore[] with one score per judgment.
|
|
273
275
|
*/
|
|
274
|
-
async function gradeJudgments(judgments, graderModel) {
|
|
276
|
+
async function gradeJudgments(judgments, graderModel, log) {
|
|
275
277
|
const scores = [];
|
|
276
278
|
let completed = 0;
|
|
277
279
|
let failed = 0;
|
|
@@ -280,7 +282,7 @@ async function gradeJudgments(judgments, graderModel) {
|
|
|
280
282
|
completed++;
|
|
281
283
|
if (completed % 10 === 0 || completed === judgments.length) {
|
|
282
284
|
const pct = Math.round((completed / judgments.length) * 100);
|
|
283
|
-
|
|
285
|
+
log.info(` Progress: ${completed}/${judgments.length} (${pct}%)`);
|
|
284
286
|
}
|
|
285
287
|
if (score === null) {
|
|
286
288
|
failed++;
|
|
@@ -293,9 +295,9 @@ async function gradeJudgments(judgments, graderModel) {
|
|
|
293
295
|
taskId: judgment.description,
|
|
294
296
|
});
|
|
295
297
|
}
|
|
296
|
-
|
|
298
|
+
log.info(""); // newline after progress
|
|
297
299
|
if (failed > 0) {
|
|
298
|
-
|
|
300
|
+
log.warn(` ⚠ ${failed} grading calls failed (excluded)`);
|
|
299
301
|
}
|
|
300
302
|
return scores;
|
|
301
303
|
}
|
|
@@ -14,11 +14,14 @@
|
|
|
14
14
|
*
|
|
15
15
|
* @see docs/exec-plans/grader-reliability.md — Phase 1
|
|
16
16
|
*/
|
|
17
|
+
import { type Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
17
18
|
import type { RawPromptfooFile } from "./calculate-scores.js";
|
|
18
19
|
import { type GraderConsistency } from "./grader-consistency.js";
|
|
19
20
|
import type { DimensionName } from "./types.js";
|
|
20
21
|
/** Options for the grader consistency runner. */
|
|
21
22
|
export interface GraderConsistencyRunnerOptions {
|
|
23
|
+
/** Logger for structured output. Falls back to ConsoleLogger if omitted. */
|
|
24
|
+
logger?: Logger;
|
|
22
25
|
/** Number of additional grading replications (default: 5) */
|
|
23
26
|
replications: number;
|
|
24
27
|
/** Path to eval-results.json */
|