@sanity/ailf 3.5.0 → 3.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/ailf.js CHANGED
@@ -33,9 +33,24 @@ const callerCwd = process.cwd()
33
33
  // ---------------------------------------------------------------------------
34
34
  if (existsSync(tsSrc)) {
35
35
  try {
36
+ // Enable the `ailf-source` export condition so @sanity/ailf-shared and
37
+ // @sanity/ailf-core resolve to their `src/index.ts` entrypoints rather
38
+ // than whatever happens to be in their `dist/` directories. Without
39
+ // this, running `ailf …` against a freshly pulled monorepo (or any
40
+ // workspace with a stale dist) fails at import time whenever the
41
+ // source introduces a new export that the dist hasn't caught up with.
42
+ const existingNodeOptions = process.env.NODE_OPTIONS ?? ""
43
+ const conditionFlag = "--conditions=ailf-source"
44
+ const nodeOptions = existingNodeOptions.includes(conditionFlag)
45
+ ? existingNodeOptions
46
+ : `${existingNodeOptions} ${conditionFlag}`.trim()
36
47
  execFileSync("npx", ["tsx", tsSrc, ...args], {
37
48
  cwd: ROOT,
38
- env: { ...process.env, AILF_CALLER_CWD: callerCwd },
49
+ env: {
50
+ ...process.env,
51
+ AILF_CALLER_CWD: callerCwd,
52
+ NODE_OPTIONS: nodeOptions,
53
+ },
39
54
  stdio: "inherit",
40
55
  })
41
56
  process.exit(0)
@@ -39,26 +39,55 @@ from `docs/design-docs/report-store/bigquery.md`.
39
39
  ### 1. Create the raw dataset (Airbyte writes here)
40
40
 
41
41
  ```bash
42
- bq mk --dataset data-platform-302218:ailf_raw
42
+ bq --project_id=data-platform-302218 --location=EU mk --dataset ailf_raw
43
43
  ```
44
44
 
45
45
  ### 2. Create the analytics dataset (views live here)
46
46
 
47
47
  ```bash
48
- bq mk --dataset data-platform-302218:ailf
48
+ bq --project_id=data-platform-302218 --location=EU mk --dataset ailf
49
49
  ```
50
50
 
51
51
  ### 3. Create the views
52
52
 
53
+ **Important ordering (learned 2026-04-23):** Airbyte must be redeployed with the
54
+ current manifest **before** you run these view SQLs. Each view binds to specific
55
+ columns on `ailf_raw.reports`; if the raw table is missing columns the Airbyte
56
+ projection expects, the `CREATE VIEW` statement fails with
57
+ `Unrecognized name: <column>`.
58
+
59
+ If your Airbyte destination has **schema evolution enabled** ("Propagate column
60
+ changes" in the UI), new columns appear automatically on the next incremental
61
+ sync. If not, flip it on, trigger a resync, and confirm the expected columns
62
+ exist before creating views:
63
+
64
+ ```bash
65
+ bq --project_id=data-platform-302218 --location=EU query --use_legacy_sql=false \
66
+ "SELECT column_name FROM ailf_raw.INFORMATION_SCHEMA.COLUMNS WHERE table_name = 'reports' ORDER BY column_name"
67
+ ```
68
+
69
+ If propagation is disabled and you can't flip it quickly, manually
70
+ `ALTER TABLE ailf_raw.reports ADD COLUMN IF NOT EXISTS …` for each missing
71
+ column as a stop-gap. Values will be `NULL` until Airbyte writes to them on the
72
+ next sync.
73
+
74
+ Once the raw table has the expected columns:
75
+
53
76
  ```bash
54
- bq query --use_legacy_sql=false < views/reports.sql
55
- bq query --use_legacy_sql=false < views/area_scores.sql
56
- bq query --use_legacy_sql=false < views/official_runs.sql
57
- bq query --use_legacy_sql=false < views/official_area_scores.sql
77
+ cd packages/eval/config/bigquery
78
+ bq --project_id=data-platform-302218 --location=EU query --use_legacy_sql=false < views/reports.sql
79
+ bq --project_id=data-platform-302218 --location=EU query --use_legacy_sql=false < views/area_scores.sql
80
+ bq --project_id=data-platform-302218 --location=EU query --use_legacy_sql=false < views/official_runs.sql
81
+ bq --project_id=data-platform-302218 --location=EU query --use_legacy_sql=false < views/official_area_scores.sql
58
82
  # per-team views are optional — copy views/team_runs_template.sql,
59
83
  # fill in the slug, and run.
60
84
  ```
61
85
 
86
+ > `--project_id` / `--location=EU` are required because `bq` needs an explicit
87
+ > billing project and the `ailf*` datasets live in the EU multi-region. If you
88
+ > run `bq query` from this repo regularly, consider setting the default project with
89
+ > `gcloud config set project data-platform-302218`.
90
+
62
91
  ## Naming conventions
63
92
 
64
93
  - **`ailf_raw.*`** — raw Airbyte-loaded tables (nested JSON, Airbyte metadata
@@ -341,3 +341,26 @@ export interface CustomTaskDefinition extends TaskCommonFields {
341
341
  * when authoring tasks.
342
342
  */
343
343
  export type GeneralizedTaskDefinition = LiteracyTaskDefinition | MCPServerTaskDefinition | AgentHarnessTaskDefinition | KnowledgeProbeTaskDefinition | CustomTaskDefinition;
344
+ /**
345
+ * The subset of task modes that can be authored as `ailf.task` documents in
346
+ * the Content Lake (Sanity Studio). Today exactly `"literacy"`.
347
+ *
348
+ * Expanding this set is a deliberate decision: execution-bound fields
349
+ * (filesystem handles, local commands, sandbox config, module paths) cannot
350
+ * round-trip through Content Lake, so not every mode belongs here. Adding a
351
+ * mode requires a new or superseding ADR and a coordinated schema update
352
+ * across the domain type, Studio schema, and `ContentLakeTaskSource` adapter
353
+ * per the `ailf-schema-sync` skill.
354
+ *
355
+ * @see docs/decisions/D0038-content-lake-authorable-task-modes.md
356
+ */
357
+ export type ContentLakeAuthorableMode = "literacy";
358
+ /**
359
+ * The slice of `GeneralizedTaskDefinition` authorable in the Content Lake,
360
+ * derived mechanically from `ContentLakeAuthorableMode`. Used as the return
361
+ * type of `ContentLakeTaskSource` so the adapter's mode literal is
362
+ * type-checked against the boundary rather than a loose cast.
363
+ */
364
+ export type ContentLakeAuthorableTask = Extract<GeneralizedTaskDefinition, {
365
+ mode: ContentLakeAuthorableMode;
366
+ }>;
@@ -25,7 +25,7 @@ export type { VariableDeclaration, VariableEnvelope, VariableProvenance, Variabl
25
25
  export type { EvalTrace, ToolCallCategory, ToolCallRecord, TraceEvent, TraceSpan, TraceTokenUsage, } from "./trace.js";
26
26
  export type { ArtifactId, AssociationAxis, AssociationValues, Brand, EntryKey, Err, FixtureId, IdValidationError, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
27
27
  export { err, fixtureId, generateRunId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
28
- export type { AgentHarnessTaskDefinition, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PathDocRef, PerspectiveDocRef, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
28
+ export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLakeAuthorableTask, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PathDocRef, PerspectiveDocRef, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
29
29
  type DocumentRef = _DocumentRef;
30
30
  /** Aggregated retrieval metrics for a feature area */
31
31
  export interface AreaRetrievalMetrics {
@@ -13,6 +13,16 @@
13
13
  * @see packages/eval/src/adapters/task-sources/repo-task-source.ts
14
14
  */
15
15
  import { type PipelineRequest } from "../../_vendor/ailf-core/index.d.ts";
16
+ /**
17
+ * Thrown when `buildRemoteRequest` can't find any runnable tasks.
18
+ *
19
+ * The CLI catches this separately from ZodError so it can print the
20
+ * message without an accompanying stack trace — the message is already
21
+ * the whole story for the user.
22
+ */
23
+ export declare class NoRunnableTasksError extends Error {
24
+ readonly name = "NoRunnableTasksError";
25
+ }
16
26
  /** Options for building a remote pipeline request. */
17
27
  export interface BuildRequestOptions {
18
28
  /** Path to .ailf/tasks/ directory. */
@@ -27,6 +37,7 @@ export interface BuildRequestOptions {
27
37
  */
28
38
  export interface RemoteConfigSlice {
29
39
  mode?: string;
40
+ variant?: string;
30
41
  debug?: {
31
42
  enabled?: boolean;
32
43
  firstN?: number;
@@ -16,7 +16,6 @@ import { existsSync } from "fs";
16
16
  import { resolve } from "path";
17
17
  import { PipelineRequestSchema, } from "../../_vendor/ailf-core/index.js";
18
18
  import { LEGACY_EVAL_MODE_ALIASES, isRunClassification, } from "../../_vendor/ailf-shared/index.js";
19
- import { LiteracyVariant } from "../../pipeline/normalize-mode.js";
20
19
  import { RepoTaskSource } from "../task-sources/repo-task-source.js";
21
20
  const LEGACY_LITERACY_VARIANT_SET = new Set(LEGACY_EVAL_MODE_ALIASES);
22
21
  /**
@@ -27,6 +26,16 @@ const LEGACY_LITERACY_VARIANT_SET = new Set(LEGACY_EVAL_MODE_ALIASES);
27
26
  function resolveCanonicalTaskMode(configMode) {
28
27
  return LEGACY_LITERACY_VARIANT_SET.has(configMode) ? "literacy" : configMode;
29
28
  }
29
+ /**
30
+ * Thrown when `buildRemoteRequest` can't find any runnable tasks.
31
+ *
32
+ * The CLI catches this separately from ZodError so it can print the
33
+ * message without an accompanying stack trace — the message is already
34
+ * the whole story for the user.
35
+ */
36
+ export class NoRunnableTasksError extends Error {
37
+ name = "NoRunnableTasksError";
38
+ }
30
39
  // ---------------------------------------------------------------------------
31
40
  // Public API
32
41
  // ---------------------------------------------------------------------------
@@ -56,11 +65,13 @@ export async function buildRemoteRequest(options) {
56
65
  ? allTasks.filter((t) => t.mode === taskModeFilter)
57
66
  : allTasks;
58
67
  if (tasks.length === 0) {
59
- throw new Error("No tasks found after applying filters.\n" +
60
- ` Tasks directory: ${tasksDir}\n` +
61
- (config.areas ? ` Area filter: ${config.areas.join(", ")}\n` : "") +
62
- (config.tasks ? ` Task filter: ${config.tasks.join(", ")}\n` : "") +
63
- " Check that your .ailf/tasks/ YAML files define tasks matching these filters.");
68
+ throw await emptyTasksError({
69
+ taskSource,
70
+ tasksDir,
71
+ config,
72
+ filterOptions,
73
+ taskModeFilter,
74
+ });
64
75
  }
65
76
  // 2. Convert tasks to inline format
66
77
  const inlineTasks = tasks.map(taskToInlineFormat);
@@ -69,10 +80,14 @@ export async function buildRemoteRequest(options) {
69
80
  taskMode: "inline",
70
81
  inlineTasks,
71
82
  };
72
- // Mode
73
- if (config.mode && config.mode !== LiteracyVariant.FULL) {
83
+ // Mode + variant — send both when set so the server sees the caller's
84
+ // canonical intent. Legacy aliases ("full", "baseline", …) are accepted
85
+ // by `PipelineRequestSchema.mode` for back-compat but the CLI now emits
86
+ // the canonical form (`mode: "literacy"` + explicit `variant`).
87
+ if (config.mode)
74
88
  raw.mode = config.mode;
75
- }
89
+ if (config.variant)
90
+ raw.variant = config.variant;
76
91
  // Debug
77
92
  if (config.debug?.enabled) {
78
93
  raw.debug = config.debug;
@@ -206,6 +221,88 @@ function taskToInlineFormat(task) {
206
221
  }
207
222
  return inline;
208
223
  }
224
+ /**
225
+ * Build a descriptive error when the task list is empty after filtering.
226
+ *
227
+ * Loads the full task list a second time with `includeDrafts: true` so we
228
+ * can distinguish the two common failure modes:
229
+ *
230
+ * 1. Every discovered task is non-active (`status: "draft"` from
231
+ * `ailf init` scaffolding, or `status: "paused"`). Tell the user how
232
+ * to opt a task in.
233
+ * 2. The tasks directory is genuinely empty for this filter combination.
234
+ * Echo the filters back so the mismatch is obvious.
235
+ *
236
+ * The directory-missing and file-missing cases are already surfaced
237
+ * earlier by `RepoTaskSource.loadTasks()`, so we never reach this helper
238
+ * for those.
239
+ */
240
+ async function emptyTasksError(args) {
241
+ const { taskSource, tasksDir, config, filterOptions, taskModeFilter } = args;
242
+ // Re-load without the status gate to categorize what got filtered.
243
+ let relaxed = [];
244
+ try {
245
+ relaxed = await taskSource.loadTasks({
246
+ ...(filterOptions ?? {}),
247
+ includeDrafts: true,
248
+ });
249
+ }
250
+ catch {
251
+ // Fall through to the generic message if re-loading fails for any
252
+ // reason (e.g. directory removed mid-run).
253
+ }
254
+ const modeMatched = taskModeFilter
255
+ ? relaxed.filter((t) => t.mode === taskModeFilter)
256
+ : relaxed;
257
+ const drafts = modeMatched.filter((t) => (t.status ?? "active") === "draft");
258
+ const paused = modeMatched.filter((t) => t.status === "paused");
259
+ const filtersBlock = (config.areas?.length
260
+ ? ` Area filter: ${config.areas.join(", ")}\n`
261
+ : "") +
262
+ (config.tasks?.length
263
+ ? ` Task filter: ${config.tasks.join(", ")}\n`
264
+ : "") +
265
+ (config.tags?.length ? ` Tag filter: ${config.tags.join(", ")}\n` : "") +
266
+ (taskModeFilter ? ` Mode filter: ${taskModeFilter}\n` : "");
267
+ if (modeMatched.length === 0) {
268
+ return new NoRunnableTasksError("No tasks matched your filters.\n" +
269
+ ` Tasks directory: ${tasksDir}\n` +
270
+ filtersBlock +
271
+ " Check that your .ailf/tasks/ YAML or .task.ts files define tasks\n" +
272
+ " matching these filters.");
273
+ }
274
+ // All matched tasks were excluded by the status gate.
275
+ const draftIds = drafts.map((t) => t.id);
276
+ const pausedIds = paused.map((t) => t.id);
277
+ const draftSample = draftIds.slice(0, 3).join(", ");
278
+ const draftMore = draftIds.length > 3 ? `, +${draftIds.length - 3} more` : "";
279
+ const pausedSample = pausedIds.slice(0, 3).join(", ");
280
+ const pausedMore = pausedIds.length > 3 ? `, +${pausedIds.length - 3} more` : "";
281
+ const lines = [];
282
+ lines.push("No runnable tasks after applying filters.");
283
+ lines.push(` Tasks directory: ${tasksDir}`);
284
+ if (filtersBlock)
285
+ lines.push(filtersBlock.trimEnd());
286
+ if (drafts.length > 0) {
287
+ lines.push(` ${drafts.length} task(s) skipped because status: "draft": ${draftSample}${draftMore}`);
288
+ }
289
+ if (paused.length > 0) {
290
+ lines.push(` ${paused.length} task(s) skipped because status: "paused": ${pausedSample}${pausedMore}`);
291
+ }
292
+ lines.push("");
293
+ lines.push(" To run one of these anyway, either:");
294
+ if (drafts.length > 0) {
295
+ lines.push(` • Change the task's status field from "draft" to "active", or`);
296
+ lines.push(` • Target it explicitly: --task ${drafts[0]?.id ?? "<id>"}`);
297
+ }
298
+ else if (paused.length > 0) {
299
+ lines.push(` • Target it explicitly by id: --task ${paused[0]?.id ?? "<id>"}, or`);
300
+ lines.push(` • Flip its status from "paused" to "active"`);
301
+ }
302
+ lines.push(" Tasks scaffolded by `ailf init` ship as drafts so you can edit");
303
+ lines.push(" them before they start contributing to your literacy score.");
304
+ return new NoRunnableTasksError(lines.join("\n"));
305
+ }
209
306
  function buildFilterOptions(config) {
210
307
  const areas = config.areas?.length ? config.areas : undefined;
211
308
  const taskIds = config.tasks?.length ? config.tasks : undefined;
@@ -5,7 +5,7 @@
5
5
  * import { ApiClient, buildRemoteRequest, resolveTasksDir } from "./adapters/api-client/index.js"
6
6
  */
7
7
  export { ApiClient } from "./api-client.js";
8
- export { buildCallerEnvelope, buildRemoteRequest, resolveTasksDir, type BuildRequestOptions, type RemoteConfigSlice, } from "./build-request.js";
8
+ export { buildCallerEnvelope, buildRemoteRequest, NoRunnableTasksError, resolveTasksDir, type BuildRequestOptions, type RemoteConfigSlice, } from "./build-request.js";
9
9
  export { ApiAuthError, ApiConnectionError, ApiError, ApiTimeoutError, } from "./errors.js";
10
10
  export { formatJobError } from "./format-error.js";
11
11
  export { createProgressDisplay } from "./progress.js";
@@ -5,7 +5,7 @@
5
5
  * import { ApiClient, buildRemoteRequest, resolveTasksDir } from "./adapters/api-client/index.js"
6
6
  */
7
7
  export { ApiClient } from "./api-client.js";
8
- export { buildCallerEnvelope, buildRemoteRequest, resolveTasksDir, } from "./build-request.js";
8
+ export { buildCallerEnvelope, buildRemoteRequest, NoRunnableTasksError, resolveTasksDir, } from "./build-request.js";
9
9
  export { ApiAuthError, ApiConnectionError, ApiError, ApiTimeoutError, } from "./errors.js";
10
10
  export { formatJobError } from "./format-error.js";
11
11
  export { createProgressDisplay } from "./progress.js";
@@ -2,13 +2,18 @@
2
2
  * Adapter: Load task definitions from the Sanity Content Lake.
3
3
  *
4
4
  * Fetches ailf.task documents via GROQ and maps them to
5
- * GeneralizedTaskDefinition (LiteracyTaskDefinition variant).
6
- * The pipeline never knows which adapter loaded the tasks.
5
+ * `ContentLakeAuthorableTask` — the subset of `GeneralizedTaskDefinition`
6
+ * authorable in Studio per D0038. Today that subset is exactly the
7
+ * literacy variant.
8
+ *
9
+ * The pipeline never knows which adapter loaded the tasks; the
10
+ * `TaskSource` port widens the return type back to
11
+ * `GeneralizedTaskDefinition[]`.
7
12
  *
8
13
  * Wired in the composition root as the default task source.
9
14
  *
10
15
  * @see packages/core/src/ports/task-source.ts — TaskSource port
11
- * @see docs/archive/exec-plans/tasks-as-content/phase-2-pipeline-integration.md
16
+ * @see docs/decisions/D0038-content-lake-authorable-task-modes.md
12
17
  */
13
18
  import type { SanityClient } from "@sanity/client";
14
19
  import type { FilterOptions, GeneralizedTaskDefinition, TaskSource } from "../../_vendor/ailf-core/index.d.ts";
@@ -2,13 +2,18 @@
2
2
  * Adapter: Load task definitions from the Sanity Content Lake.
3
3
  *
4
4
  * Fetches ailf.task documents via GROQ and maps them to
5
- * GeneralizedTaskDefinition (LiteracyTaskDefinition variant).
6
- * The pipeline never knows which adapter loaded the tasks.
5
+ * `ContentLakeAuthorableTask` — the subset of `GeneralizedTaskDefinition`
6
+ * authorable in Studio per D0038. Today that subset is exactly the
7
+ * literacy variant.
8
+ *
9
+ * The pipeline never knows which adapter loaded the tasks; the
10
+ * `TaskSource` port widens the return type back to
11
+ * `GeneralizedTaskDefinition[]`.
7
12
  *
8
13
  * Wired in the composition root as the default task source.
9
14
  *
10
15
  * @see packages/core/src/ports/task-source.ts — TaskSource port
11
- * @see docs/archive/exec-plans/tasks-as-content/phase-2-pipeline-integration.md
16
+ * @see docs/decisions/D0038-content-lake-authorable-task-modes.md
12
17
  */
13
18
  // ---------------------------------------------------------------------------
14
19
  // GROQ query — fetches ailf.task documents with resolved references
@@ -47,11 +52,15 @@ const TASKS_QUERY = /* groq */ `
47
52
  && (!defined($tags) || count((tags)[@ in $tags]) > 0)
48
53
  ] | order(coalesce(area->areaId.current, featureArea->areaId.current) asc, id.current asc) {
49
54
  "taskId": id.current,
50
- // Coalesce current and legacy field names so documents created before
51
- // the schema rename are still readable.
55
+ // The coalesce on title preserves back-compat: older documents that used
56
+ // the description field as the task label (before title was required)
57
+ // still read cleanly. New documents have title and description as
58
+ // distinct fields.
52
59
  "title": coalesce(title, description),
60
+ description,
53
61
  "areaId": coalesce(area->areaId.current, featureArea->areaId.current),
54
62
  "promptText": coalesce(promptText, taskPrompt),
63
+ status,
55
64
  docCoverage,
56
65
  "contextDocs": coalesce(contextDocs, canonicalDocs)[] {
57
66
  refType,
@@ -86,7 +95,7 @@ export class ContentLakeTaskSource {
86
95
  }
87
96
  const definitions = [];
88
97
  for (const entry of raw) {
89
- const mapped = mapToLiteracyTask(entry);
98
+ const mapped = mapToAuthorableTask(entry);
90
99
  if (!mapped)
91
100
  continue;
92
101
  definitions.push(mapped);
@@ -115,14 +124,14 @@ function buildGroqParams(filter) {
115
124
  // Mapping: Content Lake → ContentLakeAuthorableTask (today: LiteracyTaskDefinition)
116
125
  // ---------------------------------------------------------------------------
117
126
  /**
118
- * Map a Content Lake ailf.task document directly to a LiteracyTaskDefinition.
127
+ * Map a Content Lake ailf.task document to a `ContentLakeAuthorableTask`.
119
128
  *
120
129
  * Returns null if the document is missing required fields (taskId,
121
130
  * title, areaId, promptText). These are required by the
122
131
  * Studio schema, but defensive coding handles edge cases (drafts,
123
132
  * partially-created documents, etc.).
124
133
  */
125
- function mapToLiteracyTask(raw) {
134
+ function mapToAuthorableTask(raw) {
126
135
  // Required fields — skip malformed documents
127
136
  if (!raw.taskId || !raw.title || !raw.areaId || !raw.promptText) {
128
137
  return null;
@@ -172,6 +181,8 @@ function mapToLiteracyTask(raw) {
172
181
  referenceSolution: "",
173
182
  ...(baseline ? { baseline } : {}),
174
183
  ...(raw.tags?.length ? { tags: raw.tags } : {}),
184
+ ...(raw.status ? { status: raw.status } : {}),
185
+ ...(raw.description ? { description: raw.description } : {}),
175
186
  };
176
187
  }
177
188
  /**