@sanity/ailf 0.1.34 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/LICENSE +21 -0
  2. package/config/airbyte/ai_literacy_framework.connector.yaml +6 -0
  3. package/config/bigquery/views/reports.sql +1 -0
  4. package/dist/_vendor/ailf-core/examples/index.d.ts +10 -20
  5. package/dist/_vendor/ailf-core/examples/index.js +10 -20
  6. package/dist/_vendor/ailf-core/ports/task-source.d.ts +2 -0
  7. package/dist/_vendor/ailf-core/types/index.d.ts +65 -0
  8. package/dist/_vendor/ailf-tasks/schemas.d.ts +12 -0
  9. package/dist/_vendor/ailf-tasks/schemas.js +4 -0
  10. package/dist/adapters/task-sources/content-lake-task-source.js +9 -1
  11. package/dist/adapters/task-sources/repo-task-source.js +19 -4
  12. package/dist/commands/calculate-scores.js +5 -1
  13. package/dist/commands/publish.js +3 -0
  14. package/dist/composition-root.js +7 -2
  15. package/dist/orchestration/pipeline-orchestrator.js +27 -2
  16. package/dist/orchestration/step-runner.js +8 -0
  17. package/dist/orchestration/steps/calculate-scores-step.js +22 -19
  18. package/dist/orchestration/steps/generate-configs-step.js +1 -0
  19. package/dist/orchestration/steps/grader-consistency-step.js +1 -0
  20. package/dist/orchestration/steps/mirror-repo-tasks-step.js +2 -1
  21. package/dist/orchestration/steps/publish-report-step.js +3 -0
  22. package/dist/pipeline/calculate-scores.d.ts +11 -1
  23. package/dist/pipeline/calculate-scores.js +222 -157
  24. package/dist/pipeline/coverage-audit.d.ts +2 -1
  25. package/dist/pipeline/coverage-audit.js +5 -3
  26. package/dist/pipeline/expand-tasks.d.ts +2 -1
  27. package/dist/pipeline/expand-tasks.js +33 -2
  28. package/dist/pipeline/generate-configs.d.ts +3 -1
  29. package/dist/pipeline/generate-configs.js +51 -37
  30. package/dist/pipeline/grader-api.d.ts +2 -1
  31. package/dist/pipeline/grader-api.js +11 -9
  32. package/dist/pipeline/grader-compare-runner.d.ts +3 -0
  33. package/dist/pipeline/grader-compare-runner.js +21 -19
  34. package/dist/pipeline/grader-consistency-runner.d.ts +3 -0
  35. package/dist/pipeline/grader-consistency-runner.js +16 -14
  36. package/dist/pipeline/grader-sensitivity-runner.d.ts +3 -0
  37. package/dist/pipeline/grader-sensitivity-runner.js +18 -16
  38. package/dist/pipeline/grader-validate-runner.d.ts +3 -0
  39. package/dist/pipeline/grader-validate-runner.js +16 -14
  40. package/dist/pipeline/mirror-repo-tasks.d.ts +80 -1
  41. package/dist/pipeline/mirror-repo-tasks.js +148 -32
  42. package/dist/pipeline/provenance.d.ts +3 -0
  43. package/dist/pipeline/provenance.js +25 -3
  44. package/dist/pipeline/report-title.d.ts +66 -0
  45. package/dist/pipeline/report-title.js +118 -0
  46. package/dist/report-store.js +2 -0
  47. package/dist/sinks/bigquery/index.d.ts +1 -0
  48. package/dist/sinks/bigquery/index.js +1 -0
  49. package/dist/sources.d.ts +2 -1
  50. package/dist/sources.js +28 -1
  51. package/package.json +23 -23
@@ -9,6 +9,7 @@
9
9
  *
10
10
  * @see docs/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
11
11
  */
12
+ import type { Logger } from "../_vendor/ailf-core/index.d.ts";
12
13
  import type { CoverageAuditReport, ProductFeature } from "./types.js";
13
14
  /**
14
15
  * Count unique document slugs referenced across all tasks.
@@ -32,7 +33,7 @@ export declare function formatCoverageMarkdown(report: CoverageAuditReport): str
32
33
  /**
33
34
  * Load and validate the feature registry from config/features.yaml.
34
35
  */
35
- export declare function loadFeatureRegistry(rootDir: string): null | ProductFeature[];
36
+ export declare function loadFeatureRegistry(rootDir: string, logger?: Logger): null | ProductFeature[];
36
37
  /**
37
38
  * Run the coverage audit and produce a structured report.
38
39
  */
@@ -12,6 +12,7 @@
12
12
  import { existsSync, readFileSync } from "fs";
13
13
  import { join } from "path";
14
14
  import { load } from "js-yaml";
15
+ import { ConsoleLogger } from "../adapters/loggers/index.js";
15
16
  import { FeatureRegistrySchema } from "./schemas.js";
16
17
  import { resolveMappings } from "./resolve-mappings.js";
17
18
  // ---------------------------------------------------------------------------
@@ -113,7 +114,8 @@ export function formatCoverageMarkdown(report) {
113
114
  /**
114
115
  * Load and validate the feature registry from config/features.yaml.
115
116
  */
116
- export function loadFeatureRegistry(rootDir) {
117
+ export function loadFeatureRegistry(rootDir, logger) {
118
+ const log = logger ?? new ConsoleLogger();
117
119
  const filePath = join(rootDir, "config", "features.yaml");
118
120
  if (!existsSync(filePath)) {
119
121
  return null;
@@ -122,9 +124,9 @@ export function loadFeatureRegistry(rootDir) {
122
124
  const parsed = load(raw);
123
125
  const result = FeatureRegistrySchema.safeParse(parsed);
124
126
  if (!result.success) {
125
- console.error("❌ config/features.yaml validation failed:");
127
+ log.error("❌ config/features.yaml validation failed:");
126
128
  for (const issue of result.error.issues) {
127
- console.error(` ${issue.path.join(".")}: ${issue.message}`);
129
+ log.error(` ${issue.path.join(".")}: ${issue.message}`);
128
130
  }
129
131
  return null;
130
132
  }
@@ -41,6 +41,7 @@
41
41
  * 2. Baseline entry — sets docs: "", adds transform, uses abbreviated rubric
42
42
  */
43
43
  import type { TaskDefinition } from "../_vendor/ailf-core/index.d.ts";
44
+ import type { Logger } from "../_vendor/ailf-core/index.d.ts";
44
45
  import { type RubricConfig } from "./schemas.js";
45
46
  import type { FilterOptions } from "./types.js";
46
47
  /** Any assertion entry (templated or value-based). */
@@ -198,7 +199,7 @@ export declare function isTemplatedAssert(entry: AssertEntry): entry is Template
198
199
  *
199
200
  * Returns the expanded entries grouped by source file.
200
201
  */
201
- export declare function loadAndExpandTasks(rootDir: string, filter?: FilterOptions, mode?: "agentic" | "baseline"): {
202
+ export declare function loadAndExpandTasks(rootDir: string, filter?: FilterOptions, mode?: "agentic" | "baseline", logger?: Logger): {
202
203
  /** All expanded test entries, in order. */
203
204
  entries: ExpandedTestEntry[];
204
205
  /** Statistics about what was processed. */
@@ -43,6 +43,7 @@
43
43
  import { existsSync, readFileSync, readdirSync } from "fs";
44
44
  import { resolve } from "path";
45
45
  import { load } from "js-yaml";
46
+ import { ConsoleLogger } from "../adapters/loggers/index.js";
46
47
  import { RubricConfigSchema } from "./schemas.js";
47
48
  // ---------------------------------------------------------------------------
48
49
  // Rubric template loading and assembly
@@ -313,9 +314,13 @@ export function isTemplatedAssert(entry) {
313
314
  *
314
315
  * Returns the expanded entries grouped by source file.
315
316
  */
316
- export function loadAndExpandTasks(rootDir, filter, mode = "baseline") {
317
+ export function loadAndExpandTasks(rootDir, filter, mode = "baseline", logger) {
318
+ const log = logger ?? new ConsoleLogger();
317
319
  const tasksDir = resolve(rootDir, "tasks");
318
320
  if (!existsSync(tasksDir)) {
321
+ log.debug("Tasks directory not found, returning empty (tasks may come from Content Lake)", {
322
+ tasksDir,
323
+ });
319
324
  // tasks/ may not exist when task definitions come from Content Lake
320
325
  return {
321
326
  entries: [],
@@ -332,13 +337,25 @@ export function loadAndExpandTasks(rootDir, filter, mode = "baseline") {
332
337
  let yamlFiles = readdirSync(tasksDir)
333
338
  .filter((f) => (f.endsWith(".yaml") || f.endsWith(".yml")) && !f.startsWith("."))
334
339
  .sort();
340
+ log.debug("Discovered task YAML files", {
341
+ directory: tasksDir,
342
+ fileCount: yamlFiles.length,
343
+ files: yamlFiles,
344
+ });
335
345
  // Apply area filter — area name = filename stem (e.g., "groq" matches "groq.yaml")
336
346
  if (filter?.areas && filter.areas.length > 0) {
337
347
  const allowedAreas = new Set(filter.areas.map((a) => a.toLowerCase()));
348
+ const beforeCount = yamlFiles.length;
338
349
  yamlFiles = yamlFiles.filter((f) => {
339
350
  const stem = f.replace(/\.ya?ml$/, "").toLowerCase();
340
351
  return allowedAreas.has(stem);
341
352
  });
353
+ log.debug("Applied area filter", {
354
+ allowedAreas: [...allowedAreas],
355
+ beforeCount,
356
+ afterCount: yamlFiles.length,
357
+ matchedFiles: yamlFiles,
358
+ });
342
359
  }
343
360
  const entries = [];
344
361
  let singleDefinitions = 0;
@@ -353,10 +370,17 @@ export function loadAndExpandTasks(rootDir, filter, mode = "baseline") {
353
370
  // Apply task ID filter
354
371
  if (filter?.taskIds && filter.taskIds.length > 0) {
355
372
  const allowedIds = new Set(filter.taskIds);
373
+ const beforeCount = parsed.length;
356
374
  parsed = parsed.filter((entry) => typeof entry === "object" &&
357
375
  entry !== null &&
358
376
  "id" in entry &&
359
377
  allowedIds.has(entry.id));
378
+ log.debug("Applied task ID filter", {
379
+ file,
380
+ allowedIds: [...allowedIds],
381
+ beforeCount,
382
+ afterCount: parsed.length,
383
+ });
360
384
  }
361
385
  for (const entry of parsed) {
362
386
  if (isSingleTaskDefinition(entry)) {
@@ -370,8 +394,15 @@ export function loadAndExpandTasks(rootDir, filter, mode = "baseline") {
370
394
  }
371
395
  }
372
396
  if (filter?.areas || filter?.taskIds) {
373
- console.log(` Filter: ${filter.areas ? `areas=[${filter.areas.join(", ")}]` : ""}${filter.areas && filter.taskIds ? ", " : ""}${filter.taskIds ? `tasks=[${filter.taskIds.join(", ")}]` : ""}`);
397
+ log.info(`Filter: ${filter.areas ? `areas=[${filter.areas.join(", ")}]` : ""}${filter.areas && filter.taskIds ? ", " : ""}${filter.taskIds ? `tasks=[${filter.taskIds.join(", ")}]` : ""}`);
374
398
  }
399
+ log.debug("Task expansion complete", {
400
+ totalFiles: yamlFiles.length,
401
+ singleDefinitions,
402
+ legacyEntries,
403
+ expandedTotal: entries.length,
404
+ mode,
405
+ });
375
406
  return {
376
407
  entries,
377
408
  stats: {
@@ -18,7 +18,7 @@
18
18
  * @see config/models.yaml — the central model registry
19
19
  * @see docs/exec-plans/eliminate-lib-layer.md
20
20
  */
21
- import { type TaskDefinition } from "../_vendor/ailf-core/index.d.ts";
21
+ import { type Logger, type TaskDefinition } from "../_vendor/ailf-core/index.d.ts";
22
22
  import type { FilterOptions } from "./types.js";
23
23
  import { type ResolvedSourceConfig } from "../sources.js";
24
24
  export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "../_vendor/ailf-core/index.d.ts";
@@ -49,6 +49,8 @@ export interface GenerateConfigsOptions {
49
49
  allowedOrigins?: string[];
50
50
  /** Filter to specific feature areas or task IDs */
51
51
  filter?: FilterOptions;
52
+ /** Logger instance (defaults to ConsoleLogger) */
53
+ logger?: Logger;
52
54
  /** Pre-resolved source config (skips loadSource() call) */
53
55
  resolvedSource?: ResolvedSourceConfig;
54
56
  /** Root directory of the eval package (required) */
@@ -22,6 +22,7 @@ import { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } fro
22
22
  import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
23
23
  import { resolve } from "path";
24
24
  import { dump, load } from "js-yaml";
25
+ import { ConsoleLogger } from "../adapters/loggers/index.js";
25
26
  import { expandTaskDefinitions, loadAndExpandTasks } from "./expand-tasks.js";
26
27
  import { validateModelsYaml } from "./validate.js";
27
28
  import { loadSource } from "../sources.js";
@@ -260,31 +261,34 @@ function generateObservedConfig(models, tests, prompts) {
260
261
  */
261
262
  export function generateConfigs(options) {
262
263
  const { rootDir } = options;
264
+ const log = options.logger ?? new ConsoleLogger();
263
265
  // Validate config/models.yaml before generating configs
264
266
  const modelIssues = validateModelsYaml(rootDir);
265
267
  const modelErrors = modelIssues.filter((i) => i.severity === "error");
266
268
  if (modelErrors.length > 0) {
267
- console.error("❌ config/models.yaml validation failed:");
268
- for (const e of modelErrors) {
269
- console.error(` ERROR: ${e.message}`);
270
- if (e.path) {
271
- console.error(` at ${e.path}`);
272
- }
273
- }
274
- console.error("\nFix config/models.yaml before generating configs. Run 'pnpm validate' for details.");
275
- process.exit(1);
269
+ const details = modelErrors
270
+ .map((e) => (e.path ? `${e.message} (at ${e.path})` : e.message))
271
+ .join("; ");
272
+ throw new Error(`config/models.yaml validation failed: ${details}. Run 'pnpm validate' for details.`);
276
273
  }
277
- console.log("Loading config/models.yaml...");
274
+ log.info("Loading config/models.yaml...");
278
275
  const models = loadModels(rootDir);
279
276
  const activeModels = models.models.filter((m) => m.id && m.label);
280
- console.log(` Found ${activeModels.length} active model(s):`);
277
+ log.debug("Models loaded from config/models.yaml", {
278
+ totalModels: models.models.length,
279
+ activeModels: activeModels.length,
280
+ modelIds: activeModels.map((m) => m.id),
281
+ graderId: models.grader.id,
282
+ maxConcurrency: models.maxConcurrency,
283
+ });
284
+ log.info(` Found ${activeModels.length} active model(s):`);
281
285
  for (const m of activeModels) {
282
286
  // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty array join → "all"
283
287
  const modes = m.modes?.join(", ") || "all";
284
- console.log(` - ${m.label} (${m.id}) → [${modes}]`);
288
+ log.info(` - ${m.label} (${m.id}) → [${modes}]`);
285
289
  }
286
290
  // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty label falls through to id
287
- console.log(` Grader: ${models.grader.label || models.grader.id}`);
291
+ log.info(` Grader: ${models.grader.label || models.grader.id}`);
288
292
  // Build filter from options
289
293
  const filter = options.filter?.areas || options.filter?.taskIds
290
294
  ? options.filter
@@ -295,20 +299,24 @@ export function generateConfigs(options) {
295
299
  let agenticEntries;
296
300
  if (options.tasks) {
297
301
  // TaskSource path — tasks already loaded and filtered by the adapter
302
+ log.debug("Expanding tasks from TaskSource adapter", {
303
+ taskCount: options.tasks.length,
304
+ taskIds: options.tasks.map((t) => t.id),
305
+ });
298
306
  const baselineResult = expandTaskDefinitions(options.tasks, rootDir, "baseline");
299
307
  entries = baselineResult.entries;
300
- console.log(` Expanded ${baselineResult.stats.totalTasks} task(s) → ${baselineResult.stats.expandedTotal} test entries (from TaskSource)`);
308
+ log.info(` Expanded ${baselineResult.stats.totalTasks} task(s) → ${baselineResult.stats.expandedTotal} test entries (from TaskSource)`);
301
309
  const agenticResult = expandTaskDefinitions(options.tasks, rootDir, "agentic");
302
310
  agenticEntries = agenticResult.entries;
303
- console.log(` Agentic: ${agenticResult.stats.expandedTotal} entries (gold only, no baseline)`);
311
+ log.info(` Agentic: ${agenticResult.stats.expandedTotal} entries (gold only, no baseline)`);
304
312
  }
305
313
  else {
306
314
  // Legacy path — read from tasks/*.yaml files
307
- const { entries: baselineEntries, stats } = loadAndExpandTasks(rootDir, filter, "baseline");
315
+ const { entries: baselineEntries, stats } = loadAndExpandTasks(rootDir, filter, "baseline", log);
308
316
  entries = baselineEntries;
309
- console.log(` Expanded ${stats.singleDefinitions} task(s) → ${stats.expandedTotal} test entries`);
317
+ log.info(` Expanded ${stats.singleDefinitions} task(s) → ${stats.expandedTotal} test entries`);
310
318
  if (stats.legacyEntries > 0) {
311
- console.log(` ⚠ ${stats.legacyEntries} legacy (paired) entries passed through unchanged`);
319
+ log.info(` ⚠ ${stats.legacyEntries} legacy (paired) entries passed through unchanged`);
312
320
  }
313
321
  if (filter) {
314
322
  const parts = [];
@@ -318,11 +326,11 @@ export function generateConfigs(options) {
318
326
  if (filter.taskIds) {
319
327
  parts.push(`tasks: ${filter.taskIds.join(", ")}`);
320
328
  }
321
- console.log(` Scoped to: ${parts.join("; ")}`);
329
+ log.info(` Scoped to: ${parts.join("; ")}`);
322
330
  }
323
- const { entries: agenticFromYaml, stats: agenticStats } = loadAndExpandTasks(rootDir, filter, "agentic");
331
+ const { entries: agenticFromYaml, stats: agenticStats } = loadAndExpandTasks(rootDir, filter, "agentic", log);
324
332
  agenticEntries = agenticFromYaml;
325
- console.log(` Agentic: ${agenticStats.expandedTotal} entries (gold only, no baseline)`);
333
+ log.info(` Agentic: ${agenticStats.expandedTotal} entries (gold only, no baseline)`);
326
334
  }
327
335
  // Write expanded tasks to generated files for Promptfoo to consume
328
336
  const expandedPath = resolve(rootDir, "tasks", ".expanded.yaml");
@@ -333,7 +341,7 @@ export function generateConfigs(options) {
333
341
  quotingType: "'",
334
342
  });
335
343
  writeFileSync(expandedPath, `# .expanded.yaml\n#\n# AUTO-GENERATED — do not edit directly.\n# Source: tasks/*.yaml (single-definition format)\n# Run: pnpm generate-configs\n\n${expandedYaml}`, "utf-8");
336
- console.log(` ✓ tasks/.expanded.yaml (${entries.length} entries)`);
344
+ log.info(` ✓ tasks/.expanded.yaml (${entries.length} entries)`);
337
345
  const agenticExpandedPath = resolve(rootDir, "tasks", ".expanded.agentic.yaml");
338
346
  const agenticExpandedYaml = dump(agenticEntries, {
339
347
  forceQuotes: false,
@@ -342,46 +350,52 @@ export function generateConfigs(options) {
342
350
  quotingType: "'",
343
351
  });
344
352
  writeFileSync(agenticExpandedPath, `# .expanded.agentic.yaml\n#\n# AUTO-GENERATED — do not edit directly.\n# Gold entries only (no baseline) for agentic evaluation mode.\n# Source: tasks/*.yaml (single-definition format)\n# Run: pnpm generate-configs\n\n${agenticExpandedYaml}`, "utf-8");
345
- console.log(` ✓ tasks/.expanded.agentic.yaml (${agenticEntries.length} entries)`);
353
+ log.info(` ✓ tasks/.expanded.agentic.yaml (${agenticEntries.length} entries)`);
346
354
  const taskFiles = ["file://tasks/.expanded.yaml"];
347
355
  const agenticTaskFiles = ["file://tasks/.expanded.agentic.yaml"];
348
356
  // Load prompt templates
349
357
  const prompts = loadPrompts(rootDir);
350
- console.log(` Loaded prompts: ${Object.keys(prompts).join(", ")}`);
358
+ log.debug("Prompt templates loaded", {
359
+ keys: Object.keys(prompts),
360
+ withDocsId: prompts.withDocs.id,
361
+ withoutDocsId: prompts.withoutDocs.id,
362
+ agenticId: prompts.agentic.id,
363
+ });
364
+ log.info(` Loaded prompts: ${Object.keys(prompts).join(", ")}`);
351
365
  // Load optional documentation source configuration
352
366
  // Pre-resolved source wins over name-based lookup
353
367
  let source = options.resolvedSource;
354
368
  const sourceName = options.source;
355
369
  if (!source && sourceName) {
356
- console.log(`\nLoading source: ${sourceName}`);
370
+ log.info(`\nLoading source: ${sourceName}`);
357
371
  try {
358
372
  source = loadSource(sourceName);
359
373
  }
360
374
  catch (err) {
361
375
  const msg = err instanceof Error ? err.message : String(err);
362
- console.warn(`\n⚠ Failed to load source "${sourceName}": ${msg}`);
376
+ log.warn(`\n⚠ Failed to load source "${sourceName}": ${msg}`);
363
377
  }
364
378
  }
365
379
  if (source) {
366
- console.log(` Base URL: ${source.baseUrl}`);
367
- console.log(` Dataset: ${source.dataset}`);
380
+ log.info(` Base URL: ${source.baseUrl}`);
381
+ log.info(` Dataset: ${source.dataset}`);
368
382
  if (source.allowedOrigins?.length) {
369
- console.log(` Allowed origins: ${source.allowedOrigins.join(", ")}`);
383
+ log.info(` Allowed origins: ${source.allowedOrigins.join(", ")}`);
370
384
  }
371
385
  }
372
- console.log("\nGenerating configs...");
373
- writeConfig(rootDir, "promptfooconfig.yaml", generateBaselineConfig(models, taskFiles, prompts), `# promptfooconfig.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`);
374
- writeConfig(rootDir, "promptfooconfig.observed.yaml", generateObservedConfig(models, taskFiles, prompts), `# promptfooconfig.observed.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`);
375
- writeConfig(rootDir, "promptfooconfig.agentic.yaml", generateAgenticConfig(models, agenticTaskFiles, prompts, source, options.searchMode, options.allowedOrigins), `# promptfooconfig.agentic.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`);
376
- console.log("\nDone! Configs are ready.");
386
+ log.info("\nGenerating configs...");
387
+ writeConfig(rootDir, "promptfooconfig.yaml", generateBaselineConfig(models, taskFiles, prompts), `# promptfooconfig.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`, log);
388
+ writeConfig(rootDir, "promptfooconfig.observed.yaml", generateObservedConfig(models, taskFiles, prompts), `# promptfooconfig.observed.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`, log);
389
+ writeConfig(rootDir, "promptfooconfig.agentic.yaml", generateAgenticConfig(models, agenticTaskFiles, prompts, source, options.searchMode, options.allowedOrigins), `# promptfooconfig.agentic.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`, log);
390
+ log.info("\nDone! Configs are ready.");
377
391
  if (source) {
378
- console.log(` (using doc source: ${sourceName})`);
392
+ log.info(` (using doc source: ${sourceName})`);
379
393
  }
380
394
  }
381
395
  // ---------------------------------------------------------------------------
382
396
  // File writing
383
397
  // ---------------------------------------------------------------------------
384
- function writeConfig(rootDir, filename, config, header) {
398
+ function writeConfig(rootDir, filename, config, header, log) {
385
399
  const yamlStr = dump(config, {
386
400
  forceQuotes: false,
387
401
  lineWidth: 120,
@@ -391,5 +405,5 @@ function writeConfig(rootDir, filename, config, header) {
391
405
  const content = `${header}\n${yamlStr}`;
392
406
  const outPath = resolve(rootDir, filename);
393
407
  writeFileSync(outPath, content, "utf-8");
394
- console.log(` ✓ ${filename}`);
408
+ log.info(` ✓ ${filename}`);
395
409
  }
@@ -12,6 +12,7 @@
12
12
  * Migrated from lib/grader-api.ts — no module-level side effects, no
13
13
  * process.exit(), accepts rootDir as parameter for file-based operations.
14
14
  */
15
+ import type { Logger } from "../_vendor/ailf-core/index.d.ts";
15
16
  interface ProviderConfig {
16
17
  apiKey: string;
17
18
  baseUrl: string;
@@ -23,7 +24,7 @@ interface ProviderConfig {
23
24
  * Dispatches to the correct provider API based on the model prefix.
24
25
  * Returns a numeric score (0–100) or null if the call or parse fails.
25
26
  */
26
- export declare function gradeOnce(graderModel: string, responseText: string, rubricText: string): Promise<null | number>;
27
+ export declare function gradeOnce(graderModel: string, responseText: string, rubricText: string, logger?: Logger): Promise<null | number>;
27
28
  /**
28
29
  * Load the grader model from `config/models.yaml`.
29
30
  * Returns both the model ID and human-readable label.
@@ -15,6 +15,7 @@
15
15
  import { existsSync, readFileSync } from "fs";
16
16
  import { join } from "path";
17
17
  import { load } from "js-yaml";
18
+ import { ConsoleLogger } from "../adapters/loggers/index.js";
18
19
  // ---------------------------------------------------------------------------
19
20
  // Public API
20
21
  // ---------------------------------------------------------------------------
@@ -24,7 +25,8 @@ import { load } from "js-yaml";
24
25
  * Dispatches to the correct provider API based on the model prefix.
25
26
  * Returns a numeric score (0–100) or null if the call or parse fails.
26
27
  */
27
- export async function gradeOnce(graderModel, responseText, rubricText) {
28
+ export async function gradeOnce(graderModel, responseText, rubricText, logger) {
29
+ const log = logger ?? new ConsoleLogger();
28
30
  const config = resolveProvider(graderModel);
29
31
  const prompt = `You are evaluating an AI assistant's response. Grade the response according to the following rubric.
30
32
 
@@ -38,10 +40,10 @@ ${rubricText}
38
40
  const provider = graderModel.split(":")[0];
39
41
  let content;
40
42
  if (provider === "anthropic") {
41
- content = await callAnthropic(config, prompt);
43
+ content = await callAnthropic(config, prompt, log);
42
44
  }
43
45
  else if (provider === "openai") {
44
- content = await callOpenAI(config, prompt);
46
+ content = await callOpenAI(config, prompt, log);
45
47
  }
46
48
  else {
47
49
  // resolveProvider already throws for unknown providers, but just in case
@@ -51,12 +53,12 @@ ${rubricText}
51
53
  return null;
52
54
  const score = extractScore(content);
53
55
  if (score === null) {
54
- console.error(` ⚠ Could not parse grader response: ${content.slice(0, 100)}`);
56
+ log.error(` ⚠ Could not parse grader response: ${content.slice(0, 100)}`);
55
57
  }
56
58
  return score;
57
59
  }
58
60
  catch (err) {
59
- console.error(` ⚠ Grader call failed: ${err instanceof Error ? err.message : String(err)}`);
61
+ log.error(` ⚠ Grader call failed: ${err instanceof Error ? err.message : String(err)}`);
60
62
  return null;
61
63
  }
62
64
  }
@@ -152,7 +154,7 @@ export function resolveProvider(graderModel) {
152
154
  // ---------------------------------------------------------------------------
153
155
  // Provider-specific API calls
154
156
  // ---------------------------------------------------------------------------
155
- async function callAnthropic(config, prompt) {
157
+ async function callAnthropic(config, prompt, log) {
156
158
  const response = await fetch(config.baseUrl, {
157
159
  body: JSON.stringify({
158
160
  max_tokens: 256,
@@ -169,14 +171,14 @@ async function callAnthropic(config, prompt) {
169
171
  });
170
172
  if (!response.ok) {
171
173
  const text = await response.text();
172
- console.error(` ⚠ Grader API error (Anthropic): ${response.status} ${text.slice(0, 200)}`);
174
+ log.error(` ⚠ Grader API error (Anthropic): ${response.status} ${text.slice(0, 200)}`);
173
175
  return null;
174
176
  }
175
177
  const data = (await response.json());
176
178
  const textBlock = data.content?.find((c) => c.type === "text");
177
179
  return textBlock?.text ?? "";
178
180
  }
179
- async function callOpenAI(config, prompt) {
181
+ async function callOpenAI(config, prompt, log) {
180
182
  const response = await fetch(config.baseUrl, {
181
183
  body: JSON.stringify({
182
184
  max_tokens: 256,
@@ -192,7 +194,7 @@ async function callOpenAI(config, prompt) {
192
194
  });
193
195
  if (!response.ok) {
194
196
  const text = await response.text();
195
- console.error(` ⚠ Grader API error (OpenAI): ${response.status} ${text.slice(0, 200)}`);
197
+ log.error(` ⚠ Grader API error (OpenAI): ${response.status} ${text.slice(0, 200)}`);
196
198
  return null;
197
199
  }
198
200
  const data = (await response.json());
@@ -12,6 +12,7 @@
12
12
  *
13
13
  * @see docs/exec-plans/grader-reliability.md — Phase 3
14
14
  */
15
+ import type { Logger } from "../_vendor/ailf-core/index.d.ts";
15
16
  import { type GraderComparison } from "./grader-comparison.js";
16
17
  export interface GraderCompareRunnerOptions {
17
18
  /** Candidate grader models to compare against the baseline */
@@ -21,6 +22,8 @@ export interface GraderCompareRunnerOptions {
21
22
  }[];
22
23
  /** Output format */
23
24
  format?: "json" | "table";
25
+ /** Logger instance (defaults to ConsoleLogger) */
26
+ logger?: Logger;
24
27
  /** Custom output path (default: results/latest/grader-comparison.json) */
25
28
  outputPath?: string;
26
29
  /** Path to eval results (default: results/latest/eval-results.json) */
@@ -15,6 +15,7 @@
15
15
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
16
16
  import { join } from "path";
17
17
  import { load } from "js-yaml";
18
+ import { ConsoleLogger } from "../adapters/loggers/index.js";
18
19
  import { compareGraders, } from "./grader-comparison.js";
19
20
  import { classifyCorrelation } from "./grader-validation.js";
20
21
  import { gradeOnce } from "./grader-api.js";
@@ -198,10 +199,11 @@ export function formatComparisonReport(result) {
198
199
  */
199
200
  export async function runGraderCompare(options) {
200
201
  const { rootDir, format = "table" } = options;
202
+ const log = options.logger ?? new ConsoleLogger();
201
203
  const resultsPath = options.resultsPath
202
204
  ? join(rootDir, options.resultsPath)
203
205
  : join(rootDir, "results", "latest", "eval-results.json");
204
- console.log("=== Grader Comparison ===\n");
206
+ log.info("=== Grader Comparison ===\n");
205
207
  // Load config
206
208
  const { baseline, candidates } = loadConfig(rootDir, options.candidates);
207
209
  if (candidates.length === 0) {
@@ -215,32 +217,32 @@ export async function runGraderCompare(options) {
215
217
  const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
216
218
  // Extract judgments
217
219
  const judgments = extractJudgments(file);
218
- console.log(` Baseline: ${baseline.label} (${baseline.id})`);
219
- console.log(` Candidates: ${candidates.map((c) => c.label).join(", ")}`);
220
- console.log(` Judgments: ${judgments.length}`);
220
+ log.info(` Baseline: ${baseline.label} (${baseline.id})`);
221
+ log.info(` Candidates: ${candidates.map((c) => c.label).join(", ")}`);
222
+ log.info(` Judgments: ${judgments.length}`);
221
223
  if (judgments.length === 0) {
222
224
  throw new Error("No gradable judgments found in results.");
223
225
  }
224
226
  const totalCalls = judgments.length * (1 + candidates.length);
225
227
  const estimatedCost = totalCalls * 0.005;
226
- console.log(` API calls: ${totalCalls} (${judgments.length} × ${1 + candidates.length} models)`);
227
- console.log(` Est. cost: ~$${estimatedCost.toFixed(2)}`);
228
- console.log();
228
+ log.info(` API calls: ${totalCalls} (${judgments.length} × ${1 + candidates.length} models)`);
229
+ log.info(` Est. cost: ~$${estimatedCost.toFixed(2)}`);
230
+ log.info("");
229
231
  // Grade with baseline
230
- console.log(` Grading with baseline: ${baseline.label}...`);
231
- const baselineScores = await gradeJudgments(judgments, baseline.id);
232
+ log.info(` Grading with baseline: ${baseline.label}...`);
233
+ const baselineScores = await gradeJudgments(judgments, baseline.id, log);
232
234
  // Grade with each candidate
233
235
  const candidateScoreSets = [];
234
236
  for (const candidate of candidates) {
235
- console.log(` Grading with candidate: ${candidate.label}...`);
236
- const scores = await gradeJudgments(judgments, candidate.id);
237
+ log.info(` Grading with candidate: ${candidate.label}...`);
238
+ const scores = await gradeJudgments(judgments, candidate.id, log);
237
239
  candidateScoreSets.push({
238
240
  label: candidate.label,
239
241
  modelId: candidate.id,
240
242
  scores,
241
243
  });
242
244
  }
243
- console.log();
245
+ log.info("");
244
246
  // Compare
245
247
  const baselineScoreSet = {
246
248
  label: baseline.label,
@@ -250,10 +252,10 @@ export async function runGraderCompare(options) {
250
252
  const result = compareGraders(baselineScoreSet, candidateScoreSets);
251
253
  // Output
252
254
  if (format === "table") {
253
- console.log(formatComparisonReport(result));
255
+ log.info(formatComparisonReport(result));
254
256
  }
255
257
  else {
256
- console.log(JSON.stringify(result, null, 2));
258
+ log.info(JSON.stringify(result, null, 2));
257
259
  }
258
260
  // Write output
259
261
  const outPath = options.outputPath ??
@@ -261,7 +263,7 @@ export async function runGraderCompare(options) {
261
263
  const outDir = join(outPath, "..");
262
264
  mkdirSync(outDir, { recursive: true });
263
265
  writeFileSync(outPath, JSON.stringify(result, null, 2));
264
- console.log(`\n 📄 Results written to ${outPath}`);
266
+ log.info(`\n 📄 Results written to ${outPath}`);
265
267
  return result;
266
268
  }
267
269
  // ---------------------------------------------------------------------------
@@ -271,7 +273,7 @@ export async function runGraderCompare(options) {
271
273
  * Grade a set of judgments with a specific grader model.
272
274
  * Returns GraderScore[] with one score per judgment.
273
275
  */
274
- async function gradeJudgments(judgments, graderModel) {
276
+ async function gradeJudgments(judgments, graderModel, log) {
275
277
  const scores = [];
276
278
  let completed = 0;
277
279
  let failed = 0;
@@ -280,7 +282,7 @@ async function gradeJudgments(judgments, graderModel) {
280
282
  completed++;
281
283
  if (completed % 10 === 0 || completed === judgments.length) {
282
284
  const pct = Math.round((completed / judgments.length) * 100);
283
- process.stdout.write(`\r Progress: ${completed}/${judgments.length} (${pct}%)`);
285
+ log.info(` Progress: ${completed}/${judgments.length} (${pct}%)`);
284
286
  }
285
287
  if (score === null) {
286
288
  failed++;
@@ -293,9 +295,9 @@ async function gradeJudgments(judgments, graderModel) {
293
295
  taskId: judgment.description,
294
296
  });
295
297
  }
296
- console.log(); // newline after progress
298
+ log.info(""); // newline after progress
297
299
  if (failed > 0) {
298
- console.log(` ⚠ ${failed} grading calls failed (excluded)`);
300
+ log.warn(` ⚠ ${failed} grading calls failed (excluded)`);
299
301
  }
300
302
  return scores;
301
303
  }
@@ -14,11 +14,14 @@
14
14
  *
15
15
  * @see docs/exec-plans/grader-reliability.md — Phase 1
16
16
  */
17
+ import { type Logger } from "../_vendor/ailf-core/index.d.ts";
17
18
  import type { RawPromptfooFile } from "./calculate-scores.js";
18
19
  import { type GraderConsistency } from "./grader-consistency.js";
19
20
  import type { DimensionName } from "./types.js";
20
21
  /** Options for the grader consistency runner. */
21
22
  export interface GraderConsistencyRunnerOptions {
23
+ /** Logger for structured output. Falls back to ConsoleLogger if omitted. */
24
+ logger?: Logger;
22
25
  /** Number of additional grading replications (default: 5) */
23
26
  replications: number;
24
27
  /** Path to eval-results.json */