@sanity/ailf 3.5.0 → 3.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ailf.js +16 -1
- package/config/bigquery/README.md +35 -6
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +23 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +11 -0
- package/dist/adapters/api-client/build-request.js +106 -9
- package/dist/adapters/api-client/index.d.ts +1 -1
- package/dist/adapters/api-client/index.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +8 -3
- package/dist/adapters/task-sources/content-lake-task-source.js +19 -8
- package/dist/adapters/task-sources/repo-schemas.d.ts +1093 -41
- package/dist/adapters/task-sources/repo-schemas.js +178 -44
- package/dist/commands/pipeline-action.js +8 -1
- package/dist/commands/pipeline.js +1 -2
- package/dist/commands/remote-pipeline.js +6 -1
- package/package.json +1 -1
|
@@ -139,76 +139,209 @@ const AssertionSchema = z.union([
|
|
|
139
139
|
ValueAssertionSchema,
|
|
140
140
|
]);
|
|
141
141
|
// ---------------------------------------------------------------------------
|
|
142
|
-
//
|
|
142
|
+
// Shared field schemas — building blocks reused across mode variants
|
|
143
143
|
// ---------------------------------------------------------------------------
|
|
144
|
-
const
|
|
145
|
-
.
|
|
144
|
+
/**
 * Prompt block of a task. Every field is optional: `template`, `text` and
 * `systemMessage` are strings; `vars` is a string-keyed map of arbitrary
 * values.
 */
const TaskPromptSchema = z.object({
    template: z.optional(z.string()),
    text: z.optional(z.string()),
    systemMessage: z.optional(z.string()),
    vars: z.optional(z.record(z.string(), z.unknown())),
});
|
|
150
|
+
/**
 * Rubric attached to a task, in one of two shapes:
 *  - `{ ref }`: a non-empty string reference, or
 *  - `{ inline, dimensions? }`: a non-empty inline string with an optional
 *    list of `{ key, weight }` dimensions.
 */
const RubricRefSchema = z.union([
    z.object({
        ref: z.string().min(1),
    }),
    z.object({
        inline: z.string().min(1),
        dimensions: z.optional(
            z.array(
                z.object({
                    key: z.string().min(1),
                    weight: z.number(),
                }),
            ),
        ),
    }),
]);
|
|
159
|
+
/**
 * One provider entry of a task: a required non-empty `id` plus an optional
 * free-form `config` map.
 */
const TaskProviderConfigSchema = z.object({
    id: z.string().min(1),
    config: z.optional(z.record(z.string(), z.unknown())),
});
|
|
163
|
+
/**
 * Per-task execution options; all fields are optional.
 * NOTE(review): the unit of `timeout` is not visible here — confirm against
 * the consumer before documenting it.
 */
const TaskOptionsSchema = z.object({
    timeout: z.optional(z.number()),
    cache: z.optional(z.boolean()),
    transformOutput: z.optional(z.string()),
    promptfooOverrides: z.optional(z.record(z.string(), z.unknown())),
});
|
|
169
|
+
/**
 * Baseline settings: an optional `enabled` flag and an optional `rubric`
 * level, one of "abbreviated", "full" or "none".
 */
const BaselineConfigSchema = z.object({
    enabled: z.optional(z.boolean()),
    rubric: z.optional(z.enum(["abbreviated", "full", "none"])),
});
|
|
173
|
+
/**
 * Multi-turn conversation: a required `turns` list where each turn has a
 * `role` ("user" or "assistant") and string `content`.
 */
const MultiTurnSchema = z.object({
    turns: z
        .object({
            role: z.enum(["user", "assistant"]),
            content: z.string(),
        })
        .array(),
});
|
|
179
|
+
// Context variants — shape differs slightly per mode. Keeping them separate
|
|
180
|
+
// is what enforces variant-specific strictness (e.g. only mcp-server carries
|
|
181
|
+
// `context.tools`).
|
|
182
|
+
/**
 * Context shape used by the literacy, agent-harness, knowledge-probe and
 * custom variants: optional doc references and optional fixture strings.
 */
const ContextLiteracyLikeSchema = z.object({
    docs: z.optional(z.array(CanonicalDocRefSchema)),
    fixtures: z.optional(z.array(z.string())),
});
|
|
186
|
+
/**
 * Context shape for the mcp-server variant: the same optional `docs` and
 * `fixtures` as the literacy-like context, plus an optional `tools` list.
 */
const ContextMcpSchema = z.object({
    docs: z.optional(z.array(CanonicalDocRefSchema)),
    fixtures: z.optional(z.array(z.string())),
    tools: z.optional(z.array(z.string())),
});
|
|
191
|
+
// Mode-specific nested shapes
|
|
192
|
+
/**
 * Authentication settings nested under an MCP server config. `type` is
 * required; every other field is optional, and this schema does not enforce
 * which fields go with which `type`.
 */
const McpAuthSchema = z.object({
    type: z.enum(["bearer", "basic", "api_key", "oauth"]),
    token: z.optional(z.string()),
    username: z.optional(z.string()),
    password: z.optional(z.string()),
    value: z.optional(z.string()),
    keyName: z.optional(z.string()),
    placement: z.optional(z.enum(["header", "query"])),
    grantType: z.optional(z.enum(["client_credentials", "password"])),
    tokenUrl: z.optional(z.string()),
    clientId: z.optional(z.string()),
    clientSecret: z.optional(z.string()),
    scopes: z.optional(z.array(z.string())),
});
|
|
206
|
+
/**
 * Connection settings for an MCP server. `transport` is required; `command`,
 * `url`, `env`, `headers`, `startupTimeoutMs` and `auth` are all optional.
 * The schema does not tie `command`/`url` to a particular transport.
 */
const McpServerConfigSchema = z.object({
    transport: z.enum(["stdio", "sse", "streamable-http"]),
    command: z.optional(z.string()),
    url: z.optional(z.string()),
    env: z.optional(z.record(z.string(), z.string())),
    headers: z.optional(z.record(z.string(), z.string())),
    startupTimeoutMs: z.optional(z.number()),
    auth: z.optional(McpAuthSchema),
});
|
|
215
|
+
/**
 * Sandbox settings for agent-harness tasks. `type` is required; `image` and
 * the `limits` object (with all of its fields) are optional.
 */
const SandboxSchema = z.object({
    type: z.enum(["docker", "git-worktree", "none", "nsjail", "tempdir"]),
    image: z.optional(z.string()),
    limits: z.optional(
        z.object({
            cpus: z.optional(z.number()),
            memoryBytes: z.optional(z.number()),
            diskBytes: z.optional(z.number()),
            networkAccess: z.optional(z.boolean()),
        }),
    ),
});
|
|
227
|
+
/**
 * Knowledge base targeted by a knowledge-probe task: a required `type` and
 * non-empty `name`, plus an optional free-form `config` map.
 */
const KnowledgeBaseSchema = z.object({
    type: z.enum(["sanity-dataset", "embeddings-index", "file-corpus"]),
    name: z.string().min(1),
    config: z.optional(z.record(z.string(), z.unknown())),
});
|
|
150
232
|
// ---------------------------------------------------------------------------
|
|
151
|
-
// CanonicalTaskSchema —
|
|
233
|
+
// CanonicalTaskSchema — mode-discriminated union
|
|
152
234
|
//
|
|
153
|
-
//
|
|
154
|
-
//
|
|
235
|
+
// Mirrors `GeneralizedTaskDefinition` from
|
|
236
|
+
// packages/core/src/types/generalized-task.ts. Each variant declares only
|
|
237
|
+
// the fields that belong to its mode — cross-variant field mixing (e.g.
|
|
238
|
+
// `{ mode: "mcp-server", sandbox: {...} }`) is rejected because strict
|
|
239
|
+
// objects do not permit unknown keys.
|
|
155
240
|
//
|
|
156
|
-
// YAML tasks may omit `mode` (defaults to "literacy").
|
|
157
|
-
//
|
|
158
|
-
//
|
|
241
|
+
// YAML tasks may omit `mode` (defaults to "literacy"). The default is
|
|
242
|
+
// applied in `parseCanonicalTaskFile` before the discriminated union runs,
|
|
243
|
+
// since Zod cannot default a discriminator inside a discriminated union.
|
|
159
244
|
// ---------------------------------------------------------------------------
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
* handler, etc.) without listing every possible field. Mode-specific
|
|
165
|
-
* validation is deferred to the pipeline's mode handlers.
|
|
166
|
-
*/
|
|
167
|
-
export const CanonicalTaskSchema = z
|
|
168
|
-
.object({
|
|
245
|
+
// Common fields shared by every variant — used as a base for each per-mode
|
|
246
|
+
// schema via `.extend()`. Kept internal to avoid consumers building tasks
|
|
247
|
+
// from the base alone.
|
|
248
|
+
const TaskCommonFieldsSchema = z.object({
    // Required identity: a lowercase slug and a non-empty title.
    id: z
        .string()
        .min(1)
        .regex(/^[a-z0-9][a-z0-9-]*$/, "Task ID must be lowercase alphanumeric with hyphens"),
    title: z.string().min(1),
    // Optional descriptive metadata.
    description: z.optional(z.string()),
    area: z.optional(z.string()),
    difficulty: z.optional(z.enum(["basic", "intermediate", "advanced"])),
    tags: z.optional(z.array(z.string())),
    // `status` falls back to "active" when it is omitted.
    status: z
        .enum(["active", "draft", "paused", "archived"])
        .optional()
        .default("active"),
    // Optional evaluation inputs shared by every mode.
    assertions: z.optional(z.array(AssertionSchema)),
    rubric: z.optional(RubricRefSchema),
    providers: z.optional(z.array(TaskProviderConfigSchema)),
    options: z.optional(TaskOptionsSchema),
    prompt: z.optional(TaskPromptSchema),
    metadata: z.optional(z.record(z.string(), z.unknown())),
});
|
|
269
|
+
// `.strict()` on each variant rejects unknown keys — this is what enforces
|
|
270
|
+
// cross-variant strictness (e.g. a literacy task cannot carry `sandbox`).
|
|
271
|
+
/**
 * Task variant for `mode: "literacy"`: the common fields plus an optional
 * context, reference solution, doc-coverage flag (defaults to `false`) and
 * baseline settings. Unknown keys are rejected.
 */
const LiteracyTaskSchema = TaskCommonFieldsSchema.extend({
    mode: z.literal("literacy"),
    context: z.optional(ContextLiteracyLikeSchema),
    referenceSolution: z.optional(z.string()),
    docCoverage: z.boolean().optional().default(false),
    baseline: z.optional(BaselineConfigSchema),
}).strict();
|
|
278
|
+
/**
 * Task variant for `mode: "mcp-server"`: the common fields plus optional
 * server config, capabilities, MCP context (the only context shape with
 * `tools`), multi-turn script, model list and tool-round cap. Unknown keys
 * are rejected.
 */
const MCPServerTaskSchema = TaskCommonFieldsSchema.extend({
    mode: z.literal("mcp-server"),
    serverConfig: z.optional(McpServerConfigSchema),
    capabilities: z.optional(z.array(z.string())),
    context: z.optional(ContextMcpSchema),
    multiTurn: z.optional(MultiTurnSchema),
    models: z.optional(z.array(z.string())),
    maxToolRounds: z.optional(z.number()),
}).strict();
|
|
287
|
+
/**
 * Task variant for `mode: "agent-harness"`: the common fields plus optional
 * sandbox settings, top-level `tools` and `fixtures` lists, a literacy-like
 * context and a multi-turn script. Unknown keys are rejected.
 */
const AgentHarnessTaskSchema = TaskCommonFieldsSchema.extend({
    mode: z.literal("agent-harness"),
    sandbox: z.optional(SandboxSchema),
    tools: z.optional(z.array(z.string())),
    fixtures: z.optional(z.array(z.string())),
    context: z.optional(ContextLiteracyLikeSchema),
    multiTurn: z.optional(MultiTurnSchema),
}).strict();
|
|
295
|
+
/**
 * Task variant for `mode: "knowledge-probe"`: the common fields plus an
 * optional probe strategy, knowledge base and literacy-like context.
 * Unknown keys are rejected.
 */
const KnowledgeProbeTaskSchema = TaskCommonFieldsSchema.extend({
    mode: z.literal("knowledge-probe"),
    probeStrategy: z.optional(
        z.enum(["breadth-first", "depth-first", "random-sample", "coverage-guided"]),
    ),
    knowledgeBase: z.optional(KnowledgeBaseSchema),
    context: z.optional(ContextLiteracyLikeSchema),
}).strict();
|
|
303
|
+
/**
 * Task variant for `mode: "custom"`: the common fields plus a required
 * non-empty `handler` string, an optional free-form `schema` map and an
 * optional literacy-like context. Unknown keys are rejected.
 */
const CustomTaskSchema = TaskCommonFieldsSchema.extend({
    mode: z.literal("custom"),
    handler: z.string().min(1),
    schema: z.optional(z.record(z.string(), z.unknown())),
    context: z.optional(ContextLiteracyLikeSchema),
}).strict();
|
|
309
|
+
/**
 * Zod schema for one task definition, expressed as a union discriminated on
 * `mode` and mirroring `GeneralizedTaskDefinition`.
 *
 * Every variant is a strict object, so keys it does not declare are
 * rejected; a `mode` value matching none of the variants is rejected by the
 * discriminator.
 */
export const CanonicalTaskSchema = z.discriminatedUnion("mode", [
    LiteracyTaskSchema,
    MCPServerTaskSchema,
    AgentHarnessTaskSchema,
    KnowledgeProbeTaskSchema,
    CustomTaskSchema,
]);
|
|
207
323
|
/**
 * Schema for the contents of a single .ailf/tasks/*.yaml file: a non-empty
 * array of canonical tasks.
 */
export const CanonicalTaskFileSchema = CanonicalTaskSchema.array().min(1);
|
|
328
|
+
/**
 * Pre-process raw task entries before discriminated-union parsing: when
 * `mode` is missing, default it to `"literacy"`. Zod cannot default a
 * discriminator in-place, so we normalise here.
 *
 * Anything that is not a plain task object is passed through untouched so
 * that schema validation reports the real type problem: non-array input,
 * primitives, `null`, and nested arrays. Spreading an array entry would
 * turn it into an object with numeric keys and hide that error.
 *
 * The input is never mutated; entries that need the default are shallow
 * copies, all other entries are returned by reference.
 *
 * @param {unknown} raw - Pre-parsed YAML data for one task file.
 * @returns {unknown} `raw` itself when it is not an array, otherwise a new
 *   array with `mode: "literacy"` added to object entries lacking `mode`.
 */
function applyModeDefault(raw) {
    if (!Array.isArray(raw)) {
        return raw;
    }
    return raw.map((entry) => {
        const isTaskObject = typeof entry === "object" && entry !== null && !Array.isArray(entry);
        if (!isTaskObject || entry.mode !== undefined) {
            return entry;
        }
        return { ...entry, mode: "literacy" };
    });
}
|
|
212
345
|
/**
|
|
213
346
|
* Parse and validate a task file's content against the canonical schema.
|
|
214
347
|
* Returns typed tasks or throws with a user-friendly Zod error message.
|
|
@@ -216,7 +349,8 @@ export const CanonicalTaskFileSchema = z.array(CanonicalTaskSchema).min(1);
|
|
|
216
349
|
* Accepts pre-parsed YAML data (unknown), not a raw string.
|
|
217
350
|
*/
|
|
218
351
|
export function parseCanonicalTaskFile(raw, filename) {
|
|
219
|
-
const
|
|
352
|
+
const prepared = applyModeDefault(raw);
|
|
353
|
+
const result = CanonicalTaskFileSchema.safeParse(prepared);
|
|
220
354
|
if (!result.success) {
|
|
221
355
|
const messages = result.error.issues
|
|
222
356
|
.map((i) => ` [${i.path.join(".")}]: ${i.message}`)
|
|
@@ -14,7 +14,7 @@ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
|
14
14
|
import { dirname, resolve } from "path";
|
|
15
15
|
import { fileURLToPath } from "url";
|
|
16
16
|
import { classifyUrls } from "../pipeline/classify-url.js";
|
|
17
|
-
import { normalizeMode } from "../pipeline/normalize-mode.js";
|
|
17
|
+
import { LiteracyVariant, normalizeMode } from "../pipeline/normalize-mode.js";
|
|
18
18
|
import { assessImpact, buildReverseMapping, } from "../pipeline/reverse-mapping.js";
|
|
19
19
|
import { buildAppContext, parseArtifactUploadEnv, } from "../orchestration/build-app-context.js";
|
|
20
20
|
import { buildStepSequence } from "../orchestration/build-step-sequence.js";
|
|
@@ -47,6 +47,13 @@ export function computeResolvedOptions(opts) {
|
|
|
47
47
|
mode = normalized.mode;
|
|
48
48
|
// Explicit --variant flag takes precedence over what normalizeMode inferred
|
|
49
49
|
variant = opts.variant ?? normalized.variant;
|
|
50
|
+
// Canonical mode "literacy" with no variant defaults to the full variant
|
|
51
|
+
// (standard + agentic). This preserves the pre-canonical CLI behavior
|
|
52
|
+
// where `--mode full` was the default, without emitting the legacy alias
|
|
53
|
+
// deprecation warning for users who pass no flags at all.
|
|
54
|
+
if (mode === "literacy" && !variant) {
|
|
55
|
+
variant = LiteracyVariant.FULL;
|
|
56
|
+
}
|
|
50
57
|
}
|
|
51
58
|
catch (err) {
|
|
52
59
|
console.error(`❌ ${err instanceof Error ? err.message : String(err)}`);
|
|
@@ -8,12 +8,11 @@
|
|
|
8
8
|
* @see docs/cli.md for the full flag reference.
|
|
9
9
|
*/
|
|
10
10
|
import { Command } from "commander";
|
|
11
|
-
import { LiteracyVariant } from "../pipeline/normalize-mode.js";
|
|
12
11
|
import { addAgenticOptions, addDebugOptions, addSanitySourceOptions, } from "./shared/options.js";
|
|
13
12
|
export function createPipelineCommand() {
|
|
14
13
|
const cmd = new Command("pipeline")
|
|
15
14
|
.description("Run the full evaluation pipeline")
|
|
16
|
-
.option("-m, --mode <mode>", "Evaluation mode: literacy (default), mcp-server, agent-harness, knowledge-probe, custom. Legacy aliases (baseline, agentic, observed, full) are accepted and normalized to literacy + variant.",
|
|
15
|
+
.option("-m, --mode <mode>", "Evaluation mode: literacy (default), mcp-server, agent-harness, knowledge-probe, custom. Legacy aliases (baseline, agentic, observed, full) are accepted and normalized to literacy + variant.", "literacy")
|
|
17
16
|
.option("--variant <variant>", "Literacy variant: full (default — standard + agentic), baseline (standard only), agentic (agentic only), observed. Only applies to --mode literacy.")
|
|
18
17
|
.option("-s, --source <name>", "Documentation source name (from sources.yaml)")
|
|
19
18
|
.option("-n, --dry-run", "Validate configuration only, no execution", false)
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
* @see docs/design-docs/cli-as-api-client.md — design doc
|
|
15
15
|
*/
|
|
16
16
|
import { ZodError } from "zod";
|
|
17
|
-
import { ApiClient, buildRemoteRequest, createProgressDisplay, formatJobError, resolveTasksDir, } from "../adapters/api-client/index.js";
|
|
17
|
+
import { ApiClient, buildRemoteRequest, createProgressDisplay, formatJobError, NoRunnableTasksError, resolveTasksDir, } from "../adapters/api-client/index.js";
|
|
18
18
|
import { writeRemoteResults } from "./remote-results.js";
|
|
19
19
|
// ---------------------------------------------------------------------------
|
|
20
20
|
// Public API
|
|
@@ -66,6 +66,10 @@ export async function runRemotePipeline(opts, rootDir) {
|
|
|
66
66
|
console.error("💡 Fix the issues above in your .ailf/tasks/ YAML files.");
|
|
67
67
|
process.exit(2);
|
|
68
68
|
}
|
|
69
|
+
if (err instanceof NoRunnableTasksError) {
|
|
70
|
+
console.error(`❌ ${err.message}`);
|
|
71
|
+
process.exit(2);
|
|
72
|
+
}
|
|
69
73
|
throw err;
|
|
70
74
|
}
|
|
71
75
|
console.log(`📦 Found ${taskCount} task(s) in ${tasksDir}`);
|
|
@@ -103,6 +107,7 @@ export async function runRemotePipeline(opts, rootDir) {
|
|
|
103
107
|
function toConfigSlice(opts) {
|
|
104
108
|
return {
|
|
105
109
|
mode: opts.mode,
|
|
110
|
+
variant: opts.variant,
|
|
106
111
|
debug: opts.debug,
|
|
107
112
|
areas: opts.areaOption
|
|
108
113
|
?.split(",")
|