@sanity/ailf 0.2.0 → 0.3.1

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (34)
  1. package/config/models.yaml +3 -2
  2. package/dist/_vendor/ailf-core/types/index.d.ts +53 -0
  3. package/dist/composition-root.js +7 -2
  4. package/dist/orchestration/pipeline-orchestrator.js +27 -2
  5. package/dist/orchestration/step-runner.js +8 -0
  6. package/dist/orchestration/steps/calculate-scores-step.js +4 -0
  7. package/dist/orchestration/steps/generate-configs-step.js +1 -0
  8. package/dist/orchestration/steps/grader-consistency-step.js +1 -0
  9. package/dist/orchestration/steps/mirror-repo-tasks-step.js +2 -1
  10. package/dist/pipeline/calculate-scores.d.ts +5 -0
  11. package/dist/pipeline/calculate-scores.js +219 -146
  12. package/dist/pipeline/coverage-audit.d.ts +2 -1
  13. package/dist/pipeline/coverage-audit.js +5 -3
  14. package/dist/pipeline/expand-tasks.d.ts +2 -1
  15. package/dist/pipeline/expand-tasks.js +33 -2
  16. package/dist/pipeline/generate-configs.d.ts +3 -1
  17. package/dist/pipeline/generate-configs.js +47 -28
  18. package/dist/pipeline/grader-api.d.ts +2 -1
  19. package/dist/pipeline/grader-api.js +11 -9
  20. package/dist/pipeline/grader-compare-runner.d.ts +3 -0
  21. package/dist/pipeline/grader-compare-runner.js +21 -19
  22. package/dist/pipeline/grader-consistency-runner.d.ts +3 -0
  23. package/dist/pipeline/grader-consistency-runner.js +16 -14
  24. package/dist/pipeline/grader-sensitivity-runner.d.ts +3 -0
  25. package/dist/pipeline/grader-sensitivity-runner.js +18 -16
  26. package/dist/pipeline/grader-validate-runner.d.ts +3 -0
  27. package/dist/pipeline/grader-validate-runner.js +16 -14
  28. package/dist/pipeline/mirror-repo-tasks.d.ts +3 -1
  29. package/dist/pipeline/mirror-repo-tasks.js +8 -6
  30. package/dist/pipeline/provenance.d.ts +3 -0
  31. package/dist/pipeline/provenance.js +25 -3
  32. package/dist/sources.d.ts +2 -1
  33. package/dist/sources.js +28 -1
  34. package/package.json +3 -3
@@ -46,10 +46,11 @@ models:
46
46
  max_tokens: 4096
47
47
  modes: [baseline, observed, agentic-naive, agentic-optimized]
48
48
  - id: openai:chat:gpt-5.4
49
- label: GPT 5.4 (high)
49
+ label: GPT 5.4
50
50
  config:
51
- reasoning_effort: "high"
51
+ reasoning_effort: "medium"
52
52
  max_output_tokens: 4096
53
+ maxRetries: 1
53
54
  modes: [baseline, observed, agentic-naive, agentic-optimized]
54
55
 
55
56
  # ── Anthropic ───────────────────────────────────────────────
@@ -462,6 +462,12 @@ export interface PipelineState {
462
462
  * below threshold — this is informational, not a hard failure.
463
463
  */
464
464
  belowCritical?: string[];
465
+ /**
466
+ * Test execution summary. Set by CalculateScoresStep which reads
467
+ * the eval results and counts passed/failed/errored tests.
468
+ * Consumed by the orchestrator for the enriched PipelineResult.
469
+ */
470
+ testSummary?: TestSummary;
465
471
  }
466
472
  /**
467
473
  * Release auto-scope metadata — which tasks are affected by a content
@@ -484,7 +490,48 @@ export interface ReleaseAutoScope {
484
490
  };
485
491
  }
486
492
  /** Result of a full pipeline run */
493
+ /** Classified failure reason for a pipeline run. */
494
+ export type PipelineFailureReason = {
495
+ type: "validation";
496
+ message: string;
497
+ } | {
498
+ type: "missing-results";
499
+ message: string;
500
+ } | {
501
+ type: "step-failed";
502
+ step: string;
503
+ message: string;
504
+ } | {
505
+ type: "all-tests-errored";
506
+ message: string;
507
+ };
508
+ /** Summary of test execution outcomes. */
509
+ export interface TestSummary {
510
+ /** Total test cases executed */
511
+ total: number;
512
+ /** Tests that passed grading */
513
+ passed: number;
514
+ /** Tests that failed grading */
515
+ failed: number;
516
+ /** Tests that errored (API timeout, malformed response, etc.) */
517
+ errored: number;
518
+ /** Details of errored tests (model, task, error message) */
519
+ errors?: {
520
+ model: string;
521
+ task: string;
522
+ error: string;
523
+ }[];
524
+ }
525
+ /** Token usage and estimated cost for a pipeline run. */
526
+ export interface PipelineUsage {
527
+ totalTokens: number;
528
+ evalTokens: number;
529
+ graderTokens: number;
530
+ estimatedCostUsd?: number;
531
+ }
487
532
  export interface PipelineResult {
533
+ /** Feature areas that scored below the critical threshold. Informational — not a failure. */
534
+ belowCritical?: string[];
488
535
  /** Cache hit/miss statistics for this run */
489
536
  cache?: {
490
537
  hits: number;
@@ -494,6 +541,8 @@ export interface PipelineResult {
494
541
  };
495
542
  /** Total duration in milliseconds */
496
543
  durationMs: number;
544
+ /** Classified failure reason (when success is false). */
545
+ failureReason?: PipelineFailureReason;
497
546
  /** @deprecated Use `promptfooUrls` — kept for backward compatibility */
498
547
  promptfooUrl?: string;
499
548
  /** Per-mode Promptfoo share URLs (one per sub-eval that produced a shareable link) */
@@ -502,6 +551,10 @@ export interface PipelineResult {
502
551
  steps: Record<string, StepResult>;
503
552
  /** Overall success (all non-skipped steps succeeded) */
504
553
  success: boolean;
554
+ /** Summary of test execution outcomes. */
555
+ testSummary?: TestSummary;
556
+ /** Token usage and estimated cost. */
557
+ usage?: PipelineUsage;
505
558
  /** Validation issues found (if any) */
506
559
  validation: ValidationResult;
507
560
  }
@@ -63,9 +63,14 @@ export function createAppContext(config) {
63
63
  function createLogger() {
64
64
  if (process.env.AILF_LOG_FORMAT === "json")
65
65
  return new JsonLogger();
66
- if (process.env.AILF_QUIET === "1")
66
+ if (process.env.AILF_LOG_LEVEL === "quiet" ||
67
+ process.env.AILF_QUIET === "1") {
67
68
  return new QuietLogger();
68
- return new ConsoleLogger({ verbose: process.env.AILF_VERBOSE === "1" });
69
+ }
70
+ return new ConsoleLogger({
71
+ verbose: process.env.AILF_LOG_LEVEL === "verbose" ||
72
+ process.env.AILF_VERBOSE === "1",
73
+ });
69
74
  }
70
75
  function createCache(config) {
71
76
  const local = new FilesystemCache(config.rootDir);
@@ -77,12 +77,20 @@ export async function orchestratePipeline(ctx, steps) {
77
77
  const pipelineStart = Date.now();
78
78
  const hasJob = !!ctx.config.jobId;
79
79
  ctx.logger.section("ai-literacy-framework — Evaluation Pipeline");
80
+ ctx.logger.debug(`Pipeline starting with ${steps.length} steps`, {
81
+ steps: steps.map((s) => s.name),
82
+ mode: ctx.config.mode,
83
+ source: ctx.config.source,
84
+ noCache: ctx.config.noCache,
85
+ jobId: ctx.config.jobId,
86
+ });
80
87
  // Report initial running status
81
88
  if (hasJob) {
82
89
  await reportJobProgress(ctx, steps[0]?.name ?? "init", 0, steps.length, "running");
83
90
  }
84
91
  for (let i = 0; i < steps.length; i++) {
85
92
  const step = steps[i];
93
+ ctx.logger.debug(`Starting step ${i + 1}/${steps.length}: ${step.name}`);
86
94
  ctx.logger.section(step.name);
87
95
  // Report current step progress
88
96
  if (hasJob) {
@@ -97,16 +105,23 @@ export async function orchestratePipeline(ctx, steps) {
97
105
  // Fail fast on required step failure
98
106
  if (result.status === "failed" && !step.optional) {
99
107
  ctx.logger.error(`Pipeline aborted: ${step.name} failed`);
108
+ const failedError = result.status === "failed" ? result.error : `${step.name} failed`;
100
109
  // Report failure to job store
101
110
  if (hasJob) {
102
- const errorMsg = result.status === "failed" ? result.error : `${step.name} failed`;
103
111
  await reportJobProgress(ctx, step.name, i + 1, steps.length, "failed", {
104
- message: errorMsg,
112
+ message: failedError,
105
113
  step: step.name,
106
114
  });
107
115
  }
108
116
  return {
117
+ belowCritical: state.belowCritical,
109
118
  durationMs: Date.now() - pipelineStart,
119
+ failureReason: {
120
+ type: "step-failed",
121
+ step: step.name,
122
+ message: failedError,
123
+ },
124
+ promptfooUrls: state.promptfooUrls,
110
125
  steps: results,
111
126
  success: false,
112
127
  validation,
@@ -120,6 +135,13 @@ export async function orchestratePipeline(ctx, steps) {
120
135
  const durationMs = Date.now() - pipelineStart;
121
136
  ctx.logger.section("Pipeline Complete");
122
137
  ctx.logger.info(`All steps completed in ${durationMs}ms`);
138
+ ctx.logger.debug("Pipeline state at completion", {
139
+ belowCritical: state.belowCritical,
140
+ reportId: state.reportId,
141
+ remoteCacheHits: state.remoteCacheHits
142
+ ? [...state.remoteCacheHits]
143
+ : undefined,
144
+ });
123
145
  // Report completion to job store (with reportId from state if available)
124
146
  if (hasJob) {
125
147
  try {
@@ -145,9 +167,12 @@ export async function orchestratePipeline(ctx, steps) {
145
167
  }
146
168
  }
147
169
  return {
170
+ belowCritical: state.belowCritical,
148
171
  durationMs,
172
+ promptfooUrls: state.promptfooUrls,
149
173
  steps: results,
150
174
  success: true,
175
+ testSummary: state.testSummary,
151
176
  validation,
152
177
  };
153
178
  }
@@ -36,10 +36,13 @@ export async function runStep(step, ctx, state = {}) {
36
36
  if (canCache) {
37
37
  try {
38
38
  const inputs = step.cacheInputs(ctx);
39
+ ctx.logger.debug(`[${step.name}] Cache inputs: ${inputs.length} files`);
39
40
  const key = await ctx.cache.computeKey(inputs);
40
41
  cacheKey = key;
42
+ ctx.logger.debug(`[${step.name}] Cache key: ${key}`);
41
43
  const cached = await ctx.cache.lookup(step.name, key);
42
44
  if (cached.hit) {
45
+ ctx.logger.debug(`[${step.name}] Cache HIT — skipping execution`);
43
46
  const result = {
44
47
  durationMs: Date.now() - start,
45
48
  status: "success",
@@ -48,11 +51,16 @@ export async function runStep(step, ctx, state = {}) {
48
51
  ctx.logger.step(step.name, result);
49
52
  return result;
50
53
  }
54
+ ctx.logger.debug(`[${step.name}] Cache MISS — executing`);
51
55
  }
52
56
  catch {
53
57
  // Cache lookup failure is non-fatal — proceed to execute
58
+ ctx.logger.debug(`[${step.name}] Cache lookup failed — proceeding`);
54
59
  }
55
60
  }
61
+ else {
62
+ ctx.logger.debug(`[${step.name}] Cache skipped (${!ctx.cache ? "no cache adapter" : ctx.config.noCache ? "--no-cache" : "no cacheInputs"})`);
63
+ }
56
64
  // 3. Execute
57
65
  try {
58
66
  const result = await step.execute(ctx, state);
@@ -67,6 +67,7 @@ export class CalculateScoresStep {
67
67
  try {
68
68
  const result = calculateAndWriteScores({
69
69
  allowedOrigins: ctx.config.allowedOrigins,
70
+ logger: ctx.logger,
70
71
  mode: ctx.config.mode,
71
72
  resolvedSource,
72
73
  resultsPath: primaryMode !== "baseline"
@@ -77,6 +78,9 @@ export class CalculateScoresStep {
77
78
  source: ctx.config.source,
78
79
  });
79
80
  belowCritical = result.belowCritical;
81
+ if (result.testSummary) {
82
+ state.testSummary = result.testSummary;
83
+ }
80
84
  }
81
85
  catch (err) {
82
86
  return {
@@ -65,6 +65,7 @@ export class GenerateConfigsStep {
65
65
  tags: ctx.config.tags,
66
66
  }
67
67
  : undefined,
68
+ logger: ctx.logger,
68
69
  resolvedSource,
69
70
  rootDir: ctx.config.rootDir,
70
71
  searchMode: ctx.config.searchMode,
@@ -34,6 +34,7 @@ export class GraderConsistencyStep {
34
34
  }
35
35
  try {
36
36
  await runGraderConsistency({
37
+ logger: ctx.logger,
37
38
  replications,
38
39
  resultsPath: resolve(ctx.config.rootDir, resultsFile),
39
40
  rootDir: ctx.config.rootDir,
@@ -59,8 +59,9 @@ export class MirrorRepoTasksStep {
59
59
  // Run the mirror
60
60
  const result = await mirrorRepoTasks({
61
61
  client,
62
- tasks: repoTasks,
63
62
  git,
63
+ logger: ctx.logger,
64
+ tasks: repoTasks,
64
65
  });
65
66
  // Log results
66
67
  if (result.areasCreated.length > 0) {
@@ -1,3 +1,4 @@
1
+ import type { Logger, TestSummary } from "../_vendor/ailf-core/index.d.ts";
1
2
  import { type ResolvedSourceConfig } from "../sources.js";
2
3
  import { type ActualScoreEntry, type ComponentResult } from "../_vendor/ailf-core/index.d.ts";
3
4
  import type { GraderJudgment, PerModelEntry } from "./types.js";
@@ -86,6 +87,8 @@ export declare function scoreAgenticResults(resultsPath: string, weights: Record
86
87
  export interface CalculateScoresOptions {
87
88
  /** Allowed origins for source isolation reporting */
88
89
  allowedOrigins?: string[];
90
+ /** Logger instance (defaults to ConsoleLogger if not provided) */
91
+ logger?: Logger;
89
92
  /** Evaluation mode (controls which result files are read) */
90
93
  mode?: string;
91
94
  /** Pre-resolved source config (skips loadSource() call) */
@@ -103,5 +106,7 @@ export interface CalculateScoresOptions {
103
106
  export interface CalculateScoresResult {
104
107
  /** Feature areas that scored below the critical threshold (40). */
105
108
  belowCritical: string[];
109
+ /** Summary of test execution outcomes (total, passed, failed, errored). */
110
+ testSummary?: TestSummary;
106
111
  }
107
112
  export declare function calculateAndWriteScores(options: CalculateScoresOptions): CalculateScoresResult;