@sanity/ailf 0.1.34 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/LICENSE +21 -0
  2. package/config/airbyte/ai_literacy_framework.connector.yaml +6 -0
  3. package/config/bigquery/views/reports.sql +1 -0
  4. package/dist/_vendor/ailf-core/examples/index.d.ts +10 -20
  5. package/dist/_vendor/ailf-core/examples/index.js +10 -20
  6. package/dist/_vendor/ailf-core/ports/task-source.d.ts +2 -0
  7. package/dist/_vendor/ailf-core/types/index.d.ts +65 -0
  8. package/dist/_vendor/ailf-tasks/schemas.d.ts +12 -0
  9. package/dist/_vendor/ailf-tasks/schemas.js +4 -0
  10. package/dist/adapters/task-sources/content-lake-task-source.js +9 -1
  11. package/dist/adapters/task-sources/repo-task-source.js +19 -4
  12. package/dist/commands/calculate-scores.js +5 -1
  13. package/dist/commands/publish.js +3 -0
  14. package/dist/composition-root.js +7 -2
  15. package/dist/orchestration/pipeline-orchestrator.js +27 -2
  16. package/dist/orchestration/step-runner.js +8 -0
  17. package/dist/orchestration/steps/calculate-scores-step.js +22 -19
  18. package/dist/orchestration/steps/generate-configs-step.js +1 -0
  19. package/dist/orchestration/steps/grader-consistency-step.js +1 -0
  20. package/dist/orchestration/steps/mirror-repo-tasks-step.js +2 -1
  21. package/dist/orchestration/steps/publish-report-step.js +3 -0
  22. package/dist/pipeline/calculate-scores.d.ts +11 -1
  23. package/dist/pipeline/calculate-scores.js +222 -157
  24. package/dist/pipeline/coverage-audit.d.ts +2 -1
  25. package/dist/pipeline/coverage-audit.js +5 -3
  26. package/dist/pipeline/expand-tasks.d.ts +2 -1
  27. package/dist/pipeline/expand-tasks.js +33 -2
  28. package/dist/pipeline/generate-configs.d.ts +3 -1
  29. package/dist/pipeline/generate-configs.js +51 -37
  30. package/dist/pipeline/grader-api.d.ts +2 -1
  31. package/dist/pipeline/grader-api.js +11 -9
  32. package/dist/pipeline/grader-compare-runner.d.ts +3 -0
  33. package/dist/pipeline/grader-compare-runner.js +21 -19
  34. package/dist/pipeline/grader-consistency-runner.d.ts +3 -0
  35. package/dist/pipeline/grader-consistency-runner.js +16 -14
  36. package/dist/pipeline/grader-sensitivity-runner.d.ts +3 -0
  37. package/dist/pipeline/grader-sensitivity-runner.js +18 -16
  38. package/dist/pipeline/grader-validate-runner.d.ts +3 -0
  39. package/dist/pipeline/grader-validate-runner.js +16 -14
  40. package/dist/pipeline/mirror-repo-tasks.d.ts +80 -1
  41. package/dist/pipeline/mirror-repo-tasks.js +148 -32
  42. package/dist/pipeline/provenance.d.ts +3 -0
  43. package/dist/pipeline/provenance.js +25 -3
  44. package/dist/pipeline/report-title.d.ts +66 -0
  45. package/dist/pipeline/report-title.js +118 -0
  46. package/dist/report-store.js +2 -0
  47. package/dist/sinks/bigquery/index.d.ts +1 -0
  48. package/dist/sinks/bigquery/index.js +1 -0
  49. package/dist/sources.d.ts +2 -1
  50. package/dist/sources.js +28 -1
  51. package/package.json +23 -23
@@ -36,11 +36,15 @@ export function createCalculateScoresCommand() {
36
36
  remote: false,
37
37
  apiUrl: "https://ailf-api.sanity.build",
38
38
  });
39
- calculateAndWriteScores({
39
+ const result = calculateAndWriteScores({
40
40
  resultsPath,
41
41
  rootDir: ctx.config.rootDir,
42
42
  source: opts.source,
43
43
  });
44
+ // At the CLI boundary, exit non-zero if areas are below threshold
45
+ if (result.belowCritical.length > 0) {
46
+ process.exitCode = 1;
47
+ }
44
48
  }
45
49
  catch (err) {
46
50
  process.exitCode = 1;
@@ -24,6 +24,7 @@ import { fileURLToPath } from "url";
24
24
  import { Command } from "commander";
25
25
  import { createAppContext } from "../composition-root.js";
26
26
  import { buildProvenance, } from "../pipeline/provenance.js";
27
+ import { generateReportTitle } from "../pipeline/report-title.js";
27
28
  import { generateReportId, } from "../report-store.js";
28
29
  import { withRetry } from "../sinks/retry.js";
29
30
  const __dirname = dirname(fileURLToPath(import.meta.url));
@@ -166,6 +167,7 @@ async function runPublishCommand(summaryPath, opts) {
166
167
  };
167
168
  }
168
169
  const reportId = generateReportId();
170
+ const title = generateReportTitle({ provenance });
169
171
  const report = {
170
172
  comparison: comparison ?? undefined,
171
173
  completedAt: now,
@@ -174,6 +176,7 @@ async function runPublishCommand(summaryPath, opts) {
174
176
  provenance,
175
177
  summary,
176
178
  tag: opts.tag,
179
+ title,
177
180
  };
178
181
  // -----------------------------------------------------------------------
179
182
  // 4. Dry run — print preview and exit
@@ -63,9 +63,14 @@ export function createAppContext(config) {
63
63
  function createLogger() {
64
64
  if (process.env.AILF_LOG_FORMAT === "json")
65
65
  return new JsonLogger();
66
- if (process.env.AILF_QUIET === "1")
66
+ if (process.env.AILF_LOG_LEVEL === "quiet" ||
67
+ process.env.AILF_QUIET === "1") {
67
68
  return new QuietLogger();
68
- return new ConsoleLogger({ verbose: process.env.AILF_VERBOSE === "1" });
69
+ }
70
+ return new ConsoleLogger({
71
+ verbose: process.env.AILF_LOG_LEVEL === "verbose" ||
72
+ process.env.AILF_VERBOSE === "1",
73
+ });
69
74
  }
70
75
  function createCache(config) {
71
76
  const local = new FilesystemCache(config.rootDir);
@@ -77,12 +77,20 @@ export async function orchestratePipeline(ctx, steps) {
77
77
  const pipelineStart = Date.now();
78
78
  const hasJob = !!ctx.config.jobId;
79
79
  ctx.logger.section("ai-literacy-framework — Evaluation Pipeline");
80
+ ctx.logger.debug(`Pipeline starting with ${steps.length} steps`, {
81
+ steps: steps.map((s) => s.name),
82
+ mode: ctx.config.mode,
83
+ source: ctx.config.source,
84
+ noCache: ctx.config.noCache,
85
+ jobId: ctx.config.jobId,
86
+ });
80
87
  // Report initial running status
81
88
  if (hasJob) {
82
89
  await reportJobProgress(ctx, steps[0]?.name ?? "init", 0, steps.length, "running");
83
90
  }
84
91
  for (let i = 0; i < steps.length; i++) {
85
92
  const step = steps[i];
93
+ ctx.logger.debug(`Starting step ${i + 1}/${steps.length}: ${step.name}`);
86
94
  ctx.logger.section(step.name);
87
95
  // Report current step progress
88
96
  if (hasJob) {
@@ -97,16 +105,23 @@ export async function orchestratePipeline(ctx, steps) {
97
105
  // Fail fast on required step failure
98
106
  if (result.status === "failed" && !step.optional) {
99
107
  ctx.logger.error(`Pipeline aborted: ${step.name} failed`);
108
+ const failedError = result.status === "failed" ? result.error : `${step.name} failed`;
100
109
  // Report failure to job store
101
110
  if (hasJob) {
102
- const errorMsg = result.status === "failed" ? result.error : `${step.name} failed`;
103
111
  await reportJobProgress(ctx, step.name, i + 1, steps.length, "failed", {
104
- message: errorMsg,
112
+ message: failedError,
105
113
  step: step.name,
106
114
  });
107
115
  }
108
116
  return {
117
+ belowCritical: state.belowCritical,
109
118
  durationMs: Date.now() - pipelineStart,
119
+ failureReason: {
120
+ type: "step-failed",
121
+ step: step.name,
122
+ message: failedError,
123
+ },
124
+ promptfooUrls: state.promptfooUrls,
110
125
  steps: results,
111
126
  success: false,
112
127
  validation,
@@ -120,6 +135,13 @@ export async function orchestratePipeline(ctx, steps) {
120
135
  const durationMs = Date.now() - pipelineStart;
121
136
  ctx.logger.section("Pipeline Complete");
122
137
  ctx.logger.info(`All steps completed in ${durationMs}ms`);
138
+ ctx.logger.debug("Pipeline state at completion", {
139
+ belowCritical: state.belowCritical,
140
+ reportId: state.reportId,
141
+ remoteCacheHits: state.remoteCacheHits
142
+ ? [...state.remoteCacheHits]
143
+ : undefined,
144
+ });
123
145
  // Report completion to job store (with reportId from state if available)
124
146
  if (hasJob) {
125
147
  try {
@@ -145,9 +167,12 @@ export async function orchestratePipeline(ctx, steps) {
145
167
  }
146
168
  }
147
169
  return {
170
+ belowCritical: state.belowCritical,
148
171
  durationMs,
172
+ promptfooUrls: state.promptfooUrls,
149
173
  steps: results,
150
174
  success: true,
175
+ testSummary: state.testSummary,
151
176
  validation,
152
177
  };
153
178
  }
@@ -36,10 +36,13 @@ export async function runStep(step, ctx, state = {}) {
36
36
  if (canCache) {
37
37
  try {
38
38
  const inputs = step.cacheInputs(ctx);
39
+ ctx.logger.debug(`[${step.name}] Cache inputs: ${inputs.length} files`);
39
40
  const key = await ctx.cache.computeKey(inputs);
40
41
  cacheKey = key;
42
+ ctx.logger.debug(`[${step.name}] Cache key: ${key}`);
41
43
  const cached = await ctx.cache.lookup(step.name, key);
42
44
  if (cached.hit) {
45
+ ctx.logger.debug(`[${step.name}] Cache HIT — skipping execution`);
43
46
  const result = {
44
47
  durationMs: Date.now() - start,
45
48
  status: "success",
@@ -48,11 +51,16 @@ export async function runStep(step, ctx, state = {}) {
48
51
  ctx.logger.step(step.name, result);
49
52
  return result;
50
53
  }
54
+ ctx.logger.debug(`[${step.name}] Cache MISS — executing`);
51
55
  }
52
56
  catch {
53
57
  // Cache lookup failure is non-fatal — proceed to execute
58
+ ctx.logger.debug(`[${step.name}] Cache lookup failed — proceeding`);
54
59
  }
55
60
  }
61
+ else {
62
+ ctx.logger.debug(`[${step.name}] Cache skipped (${!ctx.cache ? "no cache adapter" : ctx.config.noCache ? "--no-cache" : "no cacheInputs"})`);
63
+ }
56
64
  // 3. Execute
57
65
  try {
58
66
  const result = await step.execute(ctx, state);
@@ -63,9 +63,11 @@ export class CalculateScoresStep {
63
63
  catch {
64
64
  // Non-fatal — proceed without source metadata
65
65
  }
66
+ let belowCritical = [];
66
67
  try {
67
- calculateAndWriteScores({
68
+ const result = calculateAndWriteScores({
68
69
  allowedOrigins: ctx.config.allowedOrigins,
70
+ logger: ctx.logger,
69
71
  mode: ctx.config.mode,
70
72
  resolvedSource,
71
73
  resultsPath: primaryMode !== "baseline"
@@ -75,25 +77,17 @@ export class CalculateScoresStep {
75
77
  searchMode: ctx.config.searchMode,
76
78
  source: ctx.config.source,
77
79
  });
80
+ belowCritical = result.belowCritical;
81
+ if (result.testSummary) {
82
+ state.testSummary = result.testSummary;
83
+ }
78
84
  }
79
85
  catch (err) {
80
- const code = err !== null && typeof err === "object" && "status" in err
81
- ? err.status
82
- : undefined;
83
- if (code !== undefined && code !== 1) {
84
- return {
85
- durationMs: Date.now() - start,
86
- error: `calculate-scores failed with exit code ${code}`,
87
- status: "failed",
88
- };
89
- }
90
- if (code === undefined) {
91
- return {
92
- durationMs: Date.now() - start,
93
- error: `calculate-scores failed: ${err instanceof Error ? err.message : String(err)}`,
94
- status: "failed",
95
- };
96
- }
86
+ return {
87
+ durationMs: Date.now() - start,
88
+ error: `calculate-scores failed: ${err instanceof Error ? err.message : String(err)}`,
89
+ status: "failed",
90
+ };
97
91
  }
98
92
  // Postcondition: score summary exists and is valid
99
93
  const summaryIssues = checkScoreSummaryValid(ctx.config.rootDir);
@@ -105,10 +99,19 @@ export class CalculateScoresStep {
105
99
  status: "failed",
106
100
  };
107
101
  }
102
+ // Propagate belowCritical into pipeline state for downstream consumers
103
+ // (e.g., orchestrator reporting, publish step metadata).
104
+ // This is informational — the pipeline continues to run subsequent steps.
105
+ if (belowCritical.length > 0) {
106
+ state.belowCritical = belowCritical;
107
+ }
108
+ const criticalSuffix = belowCritical.length > 0
109
+ ? ` (${belowCritical.length} area(s) below critical threshold: ${belowCritical.join(", ")})`
110
+ : "";
108
111
  return {
109
112
  durationMs: Date.now() - start,
110
113
  status: "success",
111
- summary: "Scores calculated and summary written",
114
+ summary: `Scores calculated and summary written${criticalSuffix}`,
112
115
  };
113
116
  }
114
117
  cacheInputs(ctx) {
@@ -65,6 +65,7 @@ export class GenerateConfigsStep {
65
65
  tags: ctx.config.tags,
66
66
  }
67
67
  : undefined,
68
+ logger: ctx.logger,
68
69
  resolvedSource,
69
70
  rootDir: ctx.config.rootDir,
70
71
  searchMode: ctx.config.searchMode,
@@ -34,6 +34,7 @@ export class GraderConsistencyStep {
34
34
  }
35
35
  try {
36
36
  await runGraderConsistency({
37
+ logger: ctx.logger,
37
38
  replications,
38
39
  resultsPath: resolve(ctx.config.rootDir, resultsFile),
39
40
  rootDir: ctx.config.rootDir,
@@ -59,8 +59,9 @@ export class MirrorRepoTasksStep {
59
59
  // Run the mirror
60
60
  const result = await mirrorRepoTasks({
61
61
  client,
62
- tasks: repoTasks,
63
62
  git,
63
+ logger: ctx.logger,
64
+ tasks: repoTasks,
64
65
  });
65
66
  // Log results
66
67
  if (result.areasCreated.length > 0) {
@@ -14,6 +14,7 @@ import { readFileSync } from "fs";
14
14
  import { resolve } from "path";
15
15
  import { checkScoreSummaryValid } from "../../pipeline/checks.js";
16
16
  import { buildProvenance, } from "../../pipeline/provenance.js";
17
+ import { generateReportTitle } from "../../pipeline/report-title.js";
17
18
  import { generateReportId } from "../../report-store.js";
18
19
  import { withRetry } from "../../sinks/retry.js";
19
20
  export class PublishReportStep {
@@ -101,6 +102,7 @@ export class PublishReportStep {
101
102
  comparedAgainst: autoCompareResult.baselineReportId,
102
103
  };
103
104
  }
105
+ const title = generateReportTitle({ provenance });
104
106
  const report = {
105
107
  comparison: comparison ?? undefined,
106
108
  completedAt: now,
@@ -109,6 +111,7 @@ export class PublishReportStep {
109
111
  provenance,
110
112
  summary,
111
113
  tag: this.options.publishTag ?? ctx.config.publishTag,
114
+ title,
112
115
  };
113
116
  // Share reportId with downstream steps (CallbackStep + orchestrator job update)
114
117
  state.reportId = reportId;
@@ -1,3 +1,4 @@
1
+ import type { Logger, TestSummary } from "../_vendor/ailf-core/index.d.ts";
1
2
  import { type ResolvedSourceConfig } from "../sources.js";
2
3
  import { type ActualScoreEntry, type ComponentResult } from "../_vendor/ailf-core/index.d.ts";
3
4
  import type { GraderJudgment, PerModelEntry } from "./types.js";
@@ -86,6 +87,8 @@ export declare function scoreAgenticResults(resultsPath: string, weights: Record
86
87
  export interface CalculateScoresOptions {
87
88
  /** Allowed origins for source isolation reporting */
88
89
  allowedOrigins?: string[];
90
+ /** Logger instance (defaults to ConsoleLogger if not provided) */
91
+ logger?: Logger;
89
92
  /** Evaluation mode (controls which result files are read) */
90
93
  mode?: string;
91
94
  /** Pre-resolved source config (skips loadSource() call) */
@@ -99,4 +102,11 @@ export interface CalculateScoresOptions {
99
102
  /** Documentation source name */
100
103
  source?: string;
101
104
  }
102
- export declare function calculateAndWriteScores(options: CalculateScoresOptions): void;
105
+ /** Result from calculateAndWriteScores — replaces process.exit() calls. */
106
+ export interface CalculateScoresResult {
107
+ /** Feature areas that scored below the critical threshold (40). */
108
+ belowCritical: string[];
109
+ /** Summary of test execution outcomes (total, passed, failed, errored). */
110
+ testSummary?: TestSummary;
111
+ }
112
+ export declare function calculateAndWriteScores(options: CalculateScoresOptions): CalculateScoresResult;