@sanity/ailf 0.1.34 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/LICENSE +21 -0
  2. package/config/airbyte/ai_literacy_framework.connector.yaml +6 -0
  3. package/config/bigquery/views/reports.sql +1 -0
  4. package/dist/_vendor/ailf-core/examples/index.d.ts +10 -20
  5. package/dist/_vendor/ailf-core/examples/index.js +10 -20
  6. package/dist/_vendor/ailf-core/ports/task-source.d.ts +2 -0
  7. package/dist/_vendor/ailf-core/types/index.d.ts +65 -0
  8. package/dist/_vendor/ailf-tasks/schemas.d.ts +12 -0
  9. package/dist/_vendor/ailf-tasks/schemas.js +4 -0
  10. package/dist/adapters/task-sources/content-lake-task-source.js +9 -1
  11. package/dist/adapters/task-sources/repo-task-source.js +19 -4
  12. package/dist/commands/calculate-scores.js +5 -1
  13. package/dist/commands/publish.js +3 -0
  14. package/dist/composition-root.js +7 -2
  15. package/dist/orchestration/pipeline-orchestrator.js +27 -2
  16. package/dist/orchestration/step-runner.js +8 -0
  17. package/dist/orchestration/steps/calculate-scores-step.js +22 -19
  18. package/dist/orchestration/steps/generate-configs-step.js +1 -0
  19. package/dist/orchestration/steps/grader-consistency-step.js +1 -0
  20. package/dist/orchestration/steps/mirror-repo-tasks-step.js +2 -1
  21. package/dist/orchestration/steps/publish-report-step.js +3 -0
  22. package/dist/pipeline/calculate-scores.d.ts +11 -1
  23. package/dist/pipeline/calculate-scores.js +222 -157
  24. package/dist/pipeline/coverage-audit.d.ts +2 -1
  25. package/dist/pipeline/coverage-audit.js +5 -3
  26. package/dist/pipeline/expand-tasks.d.ts +2 -1
  27. package/dist/pipeline/expand-tasks.js +33 -2
  28. package/dist/pipeline/generate-configs.d.ts +3 -1
  29. package/dist/pipeline/generate-configs.js +51 -37
  30. package/dist/pipeline/grader-api.d.ts +2 -1
  31. package/dist/pipeline/grader-api.js +11 -9
  32. package/dist/pipeline/grader-compare-runner.d.ts +3 -0
  33. package/dist/pipeline/grader-compare-runner.js +21 -19
  34. package/dist/pipeline/grader-consistency-runner.d.ts +3 -0
  35. package/dist/pipeline/grader-consistency-runner.js +16 -14
  36. package/dist/pipeline/grader-sensitivity-runner.d.ts +3 -0
  37. package/dist/pipeline/grader-sensitivity-runner.js +18 -16
  38. package/dist/pipeline/grader-validate-runner.d.ts +3 -0
  39. package/dist/pipeline/grader-validate-runner.js +16 -14
  40. package/dist/pipeline/mirror-repo-tasks.d.ts +80 -1
  41. package/dist/pipeline/mirror-repo-tasks.js +148 -32
  42. package/dist/pipeline/provenance.d.ts +3 -0
  43. package/dist/pipeline/provenance.js +25 -3
  44. package/dist/pipeline/report-title.d.ts +66 -0
  45. package/dist/pipeline/report-title.js +118 -0
  46. package/dist/report-store.js +2 -0
  47. package/dist/sinks/bigquery/index.d.ts +1 -0
  48. package/dist/sinks/bigquery/index.js +1 -0
  49. package/dist/sources.d.ts +2 -1
  50. package/dist/sources.js +28 -1
  51. package/package.json +23 -23
@@ -16,6 +16,7 @@
16
16
  */
17
17
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
18
18
  import { join } from "path";
19
+ import { ConsoleLogger } from "../adapters/loggers/index.js";
19
20
  import { gradeOnce } from "./grader-api.js";
20
21
  import { analyzeConsistency, } from "./grader-consistency.js";
21
22
  // ---------------------------------------------------------------------------
@@ -192,7 +193,8 @@ export function formatConsistencyReport(result, graderModel) {
192
193
  */
193
194
  export async function runGraderConsistency(options) {
194
195
  const { replications, resultsPath, rootDir } = options;
195
- console.log("=== Grader Consistency Analysis ===\n");
196
+ const log = options.logger ?? new ConsoleLogger();
197
+ log.section("Grader Consistency Analysis");
196
198
  // Validate inputs
197
199
  if (!existsSync(resultsPath)) {
198
200
  throw new Error(`Results file not found: ${resultsPath}. Run 'pnpm eval' first to generate results.`);
@@ -201,8 +203,8 @@ export async function runGraderConsistency(options) {
201
203
  throw new Error("Need at least 2 replications for meaningful analysis.");
202
204
  }
203
205
  // Load eval results
204
- console.log(` Results: ${resultsPath}`);
205
- console.log(` Replications: ${replications}`);
206
+ log.info(`Results: ${resultsPath}`);
207
+ log.info(`Replications: ${replications}`);
206
208
  const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
207
209
  // Extract grader model
208
210
  const graderModel = file.config?.defaultTest?.options?.rubricProvider ??
@@ -210,20 +212,20 @@ export async function runGraderConsistency(options) {
210
212
  if (!graderModel) {
211
213
  throw new Error("Could not determine grader model from eval results config.");
212
214
  }
213
- console.log(` Grader: ${graderModel}`);
215
+ log.info(`Grader: ${graderModel}`);
214
216
  // Extract judgments
215
217
  const judgments = extractGradingJudgments(file);
216
- console.log(` Judgments: ${judgments.length} (gold tests × rubric dimensions)`);
218
+ log.info(`Judgments: ${judgments.length} (gold tests × rubric dimensions)`);
217
219
  if (judgments.length === 0) {
218
220
  throw new Error("No gradable judgments found in results.");
219
221
  }
220
222
  const totalCalls = judgments.length * replications;
221
223
  const estimatedCost = totalCalls * 0.005;
222
- console.log(` API calls: ${totalCalls} (${judgments.length} × ${replications})`);
223
- console.log(` Est. cost: ~$${estimatedCost.toFixed(2)}`);
224
- console.log();
224
+ log.info(`API calls: ${totalCalls} (${judgments.length} × ${replications})`);
225
+ log.info(`Est. cost: ~$${estimatedCost.toFixed(2)}`);
226
+ log.info("");
225
227
  // Re-grade each judgment N times
226
- console.log(` Running ${replications} replications per judgment...`);
228
+ log.info(`Running ${replications} replications per judgment...`);
227
229
  const gradings = [];
228
230
  let completed = 0;
229
231
  let failed = 0;
@@ -251,20 +253,20 @@ export async function runGraderConsistency(options) {
251
253
  taskId: judgment.description,
252
254
  });
253
255
  }
254
- console.log(); // newline after progress
256
+ log.info(""); // newline after progress
255
257
  if (failed > 0) {
256
- console.log(` ⚠ ${failed} grading calls failed (excluded from analysis)`);
258
+ log.warn(`${failed} grading calls failed (excluded from analysis)`);
257
259
  }
258
- console.log();
260
+ log.info("");
259
261
  // Analyze consistency
260
262
  const result = analyzeConsistency(gradings);
261
263
  // Print report
262
- console.log(formatConsistencyReport(result, graderModel));
264
+ log.info(formatConsistencyReport(result, graderModel));
263
265
  // Write output
264
266
  const outDir = join(rootDir, "results", "latest");
265
267
  mkdirSync(outDir, { recursive: true });
266
268
  const outPath = join(outDir, "grader-consistency.json");
267
269
  writeFileSync(outPath, JSON.stringify(result, null, 2));
268
- console.log(`\n 📄 Results written to ${outPath}`);
270
+ log.info(`\n📄 Results written to ${outPath}`);
269
271
  return result;
270
272
  }
@@ -13,12 +13,15 @@
13
13
  *
14
14
  * @see docs/exec-plans/grader-reliability.md — Phase 4
15
15
  */
16
+ import type { Logger } from "../_vendor/ailf-core/index.d.ts";
16
17
  import { type GraderSensitivityResult } from "./grader-sensitivity.js";
17
18
  export interface GraderSensitivityRunnerOptions {
18
19
  /** Filter to a specific feature area (e.g., "groq") */
19
20
  areaFilter?: string;
20
21
  /** Output format */
21
22
  format?: "json" | "table";
23
+ /** Logger instance (defaults to ConsoleLogger) */
24
+ logger?: Logger;
22
25
  /** Custom output path */
23
26
  outputPath?: string;
24
27
  /** Root directory of the eval package */
@@ -15,6 +15,7 @@
15
15
  */
16
16
  import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync, } from "fs";
17
17
  import { basename, join } from "path";
18
+ import { ConsoleLogger } from "../adapters/loggers/index.js";
18
19
  import { DEGRADATION_STRATEGIES } from "./degradations.js";
19
20
  import { gradeOnce, loadGraderModel } from "./grader-api.js";
20
21
  import { analyzeSensitivity, } from "./grader-sensitivity.js";
@@ -182,19 +183,20 @@ export function formatSensitivityReport(result) {
182
183
  */
183
184
  export async function runGraderSensitivity(options) {
184
185
  const { rootDir, areaFilter, format = "table" } = options;
185
- console.log("=== Grader Sensitivity Analysis ===\n");
186
+ const log = options.logger ?? new ConsoleLogger();
187
+ log.info("=== Grader Sensitivity Analysis ===\n");
186
188
  // Resolve grader model
187
189
  const grader = loadGraderModel(rootDir);
188
- console.log(` Grader: ${grader.label} (${grader.id})`);
190
+ log.info(` Grader: ${grader.label} (${grader.id})`);
189
191
  // Discover reference solutions
190
192
  const solutions = discoverReferenceSolutions(rootDir, areaFilter);
191
- console.log(` Solutions: ${solutions.length} reference files`);
193
+ log.info(` Solutions: ${solutions.length} reference files`);
192
194
  if (areaFilter) {
193
- console.log(` Area filter: ${areaFilter}`);
195
+ log.info(` Area filter: ${areaFilter}`);
194
196
  }
195
197
  // Generate degraded pairs
196
198
  const degradedPairs = generateDegradedPairs(solutions);
197
- console.log(` Pairs: ${degradedPairs.length} (solutions × degradations)`);
199
+ log.info(` Pairs: ${degradedPairs.length} (solutions × degradations)`);
198
200
  if (degradedPairs.length === 0) {
199
201
  throw new Error("No degraded pairs generated. Check reference solutions.");
200
202
  }
@@ -206,11 +208,11 @@ export async function runGraderSensitivity(options) {
206
208
  ];
207
209
  const totalCalls = degradedPairs.length * dimensions.length * 2;
208
210
  const estimatedCost = totalCalls * 0.005;
209
- console.log(` API calls: ${totalCalls}`);
210
- console.log(` Est. cost: ~$${estimatedCost.toFixed(2)}`);
211
- console.log();
211
+ log.info(` API calls: ${totalCalls}`);
212
+ log.info(` Est. cost: ~$${estimatedCost.toFixed(2)}`);
213
+ log.info("");
212
214
  // Grade each pair
213
- console.log(" Grading original and degraded pairs...");
215
+ log.info(" Grading original and degraded pairs...");
214
216
  const rubricTemplate = (dim) => {
215
217
  const labels = {
216
218
  codeCorrectness: "Evaluate code correctness: Does the code follow best practices, use correct APIs, and avoid anti-patterns? Score 0–100.",
@@ -237,7 +239,7 @@ export async function runGraderSensitivity(options) {
237
239
  completed === degradedPairs.length * dimensions.length) {
238
240
  const total = degradedPairs.length * dimensions.length;
239
241
  const pct = Math.round((completed / total) * 100);
240
- process.stdout.write(`\r Progress: ${completed}/${total} (${pct}%)`);
242
+ log.info(` Progress: ${completed}/${total} (${pct}%)`);
241
243
  }
242
244
  if (originalScore === null || degradedScore === null) {
243
245
  failed++;
@@ -254,11 +256,11 @@ export async function runGraderSensitivity(options) {
254
256
  });
255
257
  }
256
258
  }
257
- console.log(); // newline after progress
259
+ log.info(""); // newline after progress
258
260
  if (failed > 0) {
259
- console.log(` ⚠ ${failed} grading pairs failed (excluded)`);
261
+ log.warn(` ⚠ ${failed} grading pairs failed (excluded)`);
260
262
  }
261
- console.log();
263
+ log.info("");
262
264
  if (sensitivityPairs.length === 0) {
263
265
  throw new Error("No sensitivity pairs to analyze. All grading calls failed.");
264
266
  }
@@ -266,10 +268,10 @@ export async function runGraderSensitivity(options) {
266
268
  const result = analyzeSensitivity(sensitivityPairs, grader.id);
267
269
  // Output
268
270
  if (format === "table") {
269
- console.log(formatSensitivityReport(result));
271
+ log.info(formatSensitivityReport(result));
270
272
  }
271
273
  else {
272
- console.log(JSON.stringify(result, null, 2));
274
+ log.info(JSON.stringify(result, null, 2));
273
275
  }
274
276
  // Write output
275
277
  const outPath = options.outputPath ??
@@ -277,6 +279,6 @@ export async function runGraderSensitivity(options) {
277
279
  const outDir = join(outPath, "..");
278
280
  mkdirSync(outDir, { recursive: true });
279
281
  writeFileSync(outPath, JSON.stringify(result, null, 2));
280
- console.log(`\n 📄 Results written to ${outPath}`);
282
+ log.info(`\n 📄 Results written to ${outPath}`);
281
283
  return result;
282
284
  }
@@ -13,10 +13,13 @@
13
13
  *
14
14
  * @see docs/exec-plans/grader-reliability.md — Phase 2
15
15
  */
16
+ import type { Logger } from "../_vendor/ailf-core/index.d.ts";
16
17
  import { type GraderValidation } from "./grader-validation.js";
17
18
  export interface GraderValidateRunnerOptions {
18
19
  /** Grader model to validate (defaults to loadGraderModel(rootDir).id) */
19
20
  graderModel?: string;
21
+ /** Logger instance (defaults to ConsoleLogger) */
22
+ logger?: Logger;
20
23
  /** MAE threshold for pass/fail (default: 10) */
21
24
  maeThreshold?: number;
22
25
  /** Root directory of the eval package */
@@ -16,6 +16,7 @@
16
16
  import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
17
17
  import { join } from "path";
18
18
  import { load } from "js-yaml";
19
+ import { ConsoleLogger } from "../adapters/loggers/index.js";
19
20
  import { gradeOnce, loadGraderModel } from "./grader-api.js";
20
21
  import { classifyCorrelation, validateGrader, } from "./grader-validation.js";
21
22
  // ---------------------------------------------------------------------------
@@ -154,26 +155,27 @@ export function formatValidationReport(result) {
154
155
  */
155
156
  export async function runGraderValidate(options) {
156
157
  const { rootDir } = options;
158
+ const log = options.logger ?? new ConsoleLogger();
157
159
  const maeThreshold = options.maeThreshold ?? 10;
158
- console.log("=== Grader Validation ===\n");
160
+ log.section("Grader Validation");
159
161
  // Resolve grader model
160
162
  const graderModel = options.graderModel ?? loadGraderModel(rootDir).id;
161
- console.log(` Grader: ${graderModel}`);
162
- console.log(` Threshold: MAE < ${maeThreshold}`);
163
+ log.info(`Grader: ${graderModel}`);
164
+ log.info(`Threshold: MAE < ${maeThreshold}`);
163
165
  // Load reference grades
164
166
  const rawGrades = loadReferenceGrades(rootDir);
165
- console.log(` Samples: ${rawGrades.length} reference-graded responses`);
167
+ log.info(`Samples: ${rawGrades.length} reference-graded responses`);
166
168
  // Count total rubric judgments
167
169
  let totalJudgments = 0;
168
170
  for (const rg of rawGrades) {
169
171
  totalJudgments += rg.rubrics.length;
170
172
  }
171
- console.log(` Judgments: ${totalJudgments} (response × rubric pairs)`);
173
+ log.info(`Judgments: ${totalJudgments} (response × rubric pairs)`);
172
174
  const estimatedCost = totalJudgments * 0.005;
173
- console.log(` Est. cost: ~$${estimatedCost.toFixed(2)}`);
174
- console.log();
175
+ log.info(`Est. cost: ~$${estimatedCost.toFixed(2)}`);
176
+ log.info("");
175
177
  // Grade each reference sample
176
- console.log(" Running grader on reference samples...");
178
+ log.info("Running grader on reference samples...");
177
179
  const grades = [];
178
180
  let completed = 0;
179
181
  let failed = 0;
@@ -181,7 +183,7 @@ export async function runGraderValidate(options) {
181
183
  for (const rubric of ref.rubrics) {
182
184
  const dimension = mapDimension(rubric.dimension);
183
185
  if (!dimension) {
184
- console.error(` ⚠ Unknown dimension '${rubric.dimension}' — skipping`);
186
+ log.error(`⚠ Unknown dimension '${rubric.dimension}' — skipping`);
185
187
  continue;
186
188
  }
187
189
  const graderScore = await gradeOnce(graderModel, ref.response, rubric.rubricText);
@@ -203,24 +205,24 @@ export async function runGraderValidate(options) {
203
205
  });
204
206
  }
205
207
  }
206
- console.log(); // newline after progress
208
+ log.info(""); // newline after progress
207
209
  if (failed > 0) {
208
- console.log(` ⚠ ${failed} grading calls failed (excluded from analysis)`);
210
+ log.warn(`${failed} grading calls failed (excluded from analysis)`);
209
211
  }
210
- console.log();
212
+ log.info("");
211
213
  if (grades.length === 0) {
212
214
  throw new Error("No grades to analyze. All grading calls failed.");
213
215
  }
214
216
  // Validate
215
217
  const result = validateGrader(grades, graderModel, { maeThreshold });
216
218
  // Print report
217
- console.log(formatValidationReport(result));
219
+ log.info(formatValidationReport(result));
218
220
  // Write output
219
221
  const outDir = join(rootDir, "results", "latest");
220
222
  mkdirSync(outDir, { recursive: true });
221
223
  const outPath = join(outDir, "grader-validation.json");
222
224
  writeFileSync(outPath, JSON.stringify(result, null, 2));
223
- console.log(`\n 📄 Results written to ${outPath}`);
225
+ log.info(`\n📄 Results written to ${outPath}`);
224
226
  // Throw if threshold not met (instead of process.exit)
225
227
  if (!result.passesThreshold) {
226
228
  throw new Error(`VALIDATION FAILED: MAE ${result.overallMae} exceeds threshold ${maeThreshold}`);
@@ -13,7 +13,7 @@
13
13
  * @see docs/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
14
14
  */
15
15
  import type { SanityClient } from "@sanity/client";
16
- import { type TaskDefinition } from "../_vendor/ailf-core/index.d.ts";
16
+ import { type Logger, type TaskDefinition } from "../_vendor/ailf-core/index.d.ts";
17
17
  export interface MirrorOptions {
18
18
  /** Sanity client with write access */
19
19
  client: SanityClient;
@@ -23,6 +23,17 @@ export interface MirrorOptions {
23
23
  git: GitContext;
24
24
  /** If true, log what would be done without writing */
25
25
  dryRun?: boolean;
26
+ /** Logger instance (defaults to ConsoleLogger) */
27
+ logger?: Logger;
28
+ }
29
+ /** Authorship info extracted from git context or GitHub Actions environment. */
30
+ export interface GitAuthor {
31
+ /** Git commit author name (e.g., "Jordan Smith") */
32
+ gitName?: string;
33
+ /** Git commit author email (e.g., "jordan@example.com") */
34
+ gitEmail?: string;
35
+ /** GitHub username (from GITHUB_ACTOR or event payload) */
36
+ githubUsername?: string;
26
37
  }
27
38
  export interface GitContext {
28
39
  /** Full repo identifier (e.g., "sanity-io/visual-editing") */
@@ -35,6 +46,8 @@ export interface GitContext {
35
46
  branch: string;
36
47
  /** HEAD commit SHA */
37
48
  commitSha: string;
49
+ /** Author of the current commit/trigger */
50
+ author: GitAuthor;
38
51
  }
39
52
  export interface MirrorResult {
40
53
  /** Total tasks processed */
@@ -84,3 +97,69 @@ export declare function mirrorDocId(owner: string, repo: string, taskId: string)
84
97
  * that's not mirrored.
85
98
  */
86
99
  export declare function computeTaskHash(task: TaskDefinition): string;
100
+ /** @internal Exported for testing — not part of the public API. */
101
+ export declare function buildMirrorDocument(task: TaskDefinition, opts: {
102
+ contentHash: string;
103
+ docId: string;
104
+ /** Existing author from the current mirror document (write-once preservation) */
105
+ existingAuthor?: GitAuthor;
106
+ git: GitContext;
107
+ slugToDocId: Map<string, string>;
108
+ }): {
109
+ baseline?: {
110
+ rubric?: "full" | "abbreviated" | "none" | undefined;
111
+ enabled?: boolean | undefined;
112
+ } | undefined;
113
+ _id: string;
114
+ _type: string;
115
+ ownership: string;
116
+ status: "active" | "draft" | "paused" | "archived";
117
+ assert: Record<string, unknown>[];
118
+ canonicalDocs: ({
119
+ _key: string;
120
+ reason: string;
121
+ } | {
122
+ refType: string;
123
+ path: string;
124
+ _key: string;
125
+ reason: string;
126
+ } | {
127
+ doc?: {
128
+ _ref: string;
129
+ _type: string;
130
+ } | undefined;
131
+ docId?: string | undefined;
132
+ refType: string;
133
+ _key: string;
134
+ reason: string;
135
+ } | {
136
+ refType: string;
137
+ perspective: string;
138
+ _key: string;
139
+ reason: string;
140
+ })[];
141
+ description: string;
142
+ docCoverage: boolean;
143
+ featureArea: {
144
+ _ref: string;
145
+ _type: string;
146
+ };
147
+ id: {
148
+ _type: string;
149
+ current: string;
150
+ };
151
+ origin: {
152
+ branch: string;
153
+ commitSha: string;
154
+ contentHash: string;
155
+ lastSyncedAt: string;
156
+ path: string;
157
+ repo: string;
158
+ repoName: string;
159
+ repoOwner: string;
160
+ type: string;
161
+ author: GitAuthor;
162
+ lastEditor: GitAuthor;
163
+ };
164
+ taskPrompt: string;
165
+ };