@sanity/ailf 3.3.0 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -320,6 +320,32 @@ const graderPromptPreviewSchema = z.object({
320
320
  rubricName: z.string().max(60).optional(),
321
321
  snippet: z.string().max(120),
322
322
  });
323
+ /**
324
+ * Preview shape for the run-scoped `pipelineContext` bulk artifact (W0063 /
325
+ * D0033 M7). Lets the Studio Overview tab render a Pipeline Execution header
326
+ * row (step count, wall-clock, failed-step badge, quality-gate badge, cache
327
+ * hit count) without fetching the full context payload — `config` and per-
328
+ * step detail only land when the panel is expanded.
329
+ *
330
+ * Bounds chosen so the worst-case preview fits comfortably under 384 bytes:
331
+ * - `failedSteps` is capped at 5 entries with each name ≤ 40 chars. Real
332
+ * step names ("fetch-docs", "calculate-scores", "gap-analysis") are 10–
333
+ * 25 chars; 40 is a defensive ceiling. The array cap exists because
334
+ * `fitPreviewToCap` only shortens string fields — an unbounded array
335
+ * could push the preview over cap and force it to drop entirely.
336
+ * 5 is a triage ceiling: the panel shows "showed 5 of N failed steps"
337
+ * when `failedSteps.length < stepCount - successCount`, and the full
338
+ * per-step list is available in the drilldown payload.
339
+ * - `belowCritical` and `cacheHits` are optional — absent on old runs,
340
+ * skipped pipelines, or runs without remote-cache telemetry.
341
+ */
342
+ const pipelineContextPreviewSchema = z.object({
343
+ stepCount: z.number().int().nonnegative(),
344
+ totalDurationMs: z.number().nonnegative(),
345
+ failedSteps: z.array(z.string().max(40)).max(5),
346
+ belowCritical: z.boolean().optional(),
347
+ cacheHits: z.number().int().nonnegative().optional(),
348
+ });
323
349
  // Aspirational: most payload shapes are still loose. Tightening per-type as
324
350
  // consumers stabilize is explicitly a W0050/W0051 concern — W0049 fixes the
325
351
  // structural shape around them without changing the payload contracts.
@@ -495,6 +521,56 @@ export const ARTIFACT_REGISTRY = {
495
521
  entrySchema: unknownEntry,
496
522
  mime: "application/json",
497
523
  capBytes: 64_000,
524
+ manifestPreview: {
525
+ schema: pipelineContextPreviewSchema,
526
+ extract: (entry) => {
527
+ // Producer shape from `capturePipelineContext` in
528
+ // packages/eval/src/orchestration/pipeline-orchestrator.ts:
529
+ // { config, state: { belowCritical, remoteCacheHits, ... },
530
+ // steps: [{ name, status: "success"|"failed"|"skipped",
531
+ // durationMs? }] }
532
+ //
533
+ // `config` and everything else on `state` are drilldown-only and
534
+ // intentionally absent from the preview — they're what the panel
535
+ // fetches lazily when expanded.
536
+ const e = entry;
537
+ const stepsRaw = Array.isArray(e.steps) ? e.steps : [];
538
+ let totalDurationMs = 0;
539
+ const failedSteps = [];
540
+ let stepCount = 0;
541
+ for (const raw of stepsRaw) {
542
+ if (raw === null || typeof raw !== "object")
543
+ continue;
544
+ stepCount += 1;
545
+ const s = raw;
546
+ if (typeof s.durationMs === "number" &&
547
+ Number.isFinite(s.durationMs) &&
548
+ s.durationMs >= 0) {
549
+ totalDurationMs += s.durationMs;
550
+ }
551
+ if (s.status === "failed" &&
552
+ typeof s.name === "string" &&
553
+ failedSteps.length < 5) {
554
+ failedSteps.push(truncateString(s.name, 40));
555
+ }
556
+ }
557
+ const belowCritical = typeof e.state?.belowCritical === "boolean"
558
+ ? e.state.belowCritical
559
+ : undefined;
560
+ const cacheHitsRaw = e.state?.remoteCacheHits;
561
+ const cacheHits = Array.isArray(cacheHitsRaw)
562
+ ? cacheHitsRaw.length
563
+ : undefined;
564
+ return {
565
+ stepCount,
566
+ totalDurationMs,
567
+ failedSteps,
568
+ ...(belowCritical === undefined ? {} : { belowCritical }),
569
+ ...(cacheHits === undefined ? {} : { cacheHits }),
570
+ };
571
+ },
572
+ capBytes: 384,
573
+ },
498
574
  }),
499
575
  documentManifest: buildDescriptor({
500
576
  type: "documentManifest",
@@ -0,0 +1,59 @@
1
+ /**
2
+ * Feature flags — compile-time UI/feature visibility toggles.
3
+ *
4
+ * Single source of truth for "temporary" flags that hide in-flight features,
5
+ * gate partially-built panels, or carry a known rollback. Each entry carries
6
+ * the metadata needed to answer "why is this off and when can it go?" so
7
+ * flags don't rot into undiscoverable tombstones.
8
+ *
9
+ * This is intentionally NOT a runtime feature-flag system — no user
10
+ * segmentation, no A/B, no env-var overrides. Just a typed map of booleans
11
+ * with audit metadata. Flipping a flag is a code change.
12
+ *
13
+ * Adding a flag:
14
+ * 1. Add an entry below with every metadata field populated.
15
+ * 2. Import `FEATURE_FLAGS` at the call site and read `.enabled`.
16
+ * 3. When the re-enable condition is met, remove the entry and the gate.
17
+ *
18
+ * See docs/guides/feature-flags.md for the full lifecycle.
19
+ */
20
+ /** Shape of a single feature-flag entry. All fields required. */
21
+ export interface FeatureFlag {
22
+ /** Whether the gated feature is visible / active. */
23
+ readonly enabled: boolean;
24
+ /** Why the flag exists. Answers "what problem did turning this off solve?" */
25
+ readonly rationale: string;
26
+ /** The condition under which this flag should be re-enabled or removed. */
27
+ readonly reEnableWhen: string;
28
+ /** ID of the work item that owns the flag's resolution, or null if none. */
29
+ readonly relatedWorkItem: `W${string}` | null;
30
+ /** ISO 8601 date (YYYY-MM-DD) the flag was introduced. Used for staleness audits. */
31
+ readonly addedAt: string;
32
+ }
33
+ /**
34
+ * Registry of all active feature flags across AILF packages.
35
+ *
36
+ * Consumers read values directly:
37
+ * if (FEATURE_FLAGS.showFailureModes.enabled) { ... }
38
+ *
39
+ * Adding a key here extends the `FeatureFlagKey` union automatically; typos
40
+ * at call sites fail at compile time.
41
+ */
42
+ export declare const FEATURE_FLAGS: {
43
+ readonly showFailureModes: {
44
+ readonly enabled: false;
45
+ readonly rationale: "Current classification is too broad (majority \"Unclassified\") to be actionable in the diagnostics view.";
46
+ readonly reEnableWhen: "Failure taxonomy is refined so non-Unclassified buckets carry meaningful signal.";
47
+ readonly relatedWorkItem: "W0037-detect-model-output-failures";
48
+ readonly addedAt: "2026-04-22";
49
+ };
50
+ readonly showRegressedSinceLastRun: {
51
+ readonly enabled: false;
52
+ readonly rationale: "Bare list of regressed area names lacks explanatory context for why each regressed.";
53
+ readonly reEnableWhen: "Per-area regression attribution can be surfaced alongside the list.";
54
+ readonly relatedWorkItem: null;
55
+ readonly addedAt: "2026-04-22";
56
+ };
57
+ };
58
+ /** Union of all registered flag keys. Typos at call sites fail at compile time. */
59
+ export type FeatureFlagKey = keyof typeof FEATURE_FLAGS;
@@ -0,0 +1,44 @@
1
+ /**
2
+ * Feature flags — compile-time UI/feature visibility toggles.
3
+ *
4
+ * Single source of truth for "temporary" flags that hide in-flight features,
5
+ * gate partially-built panels, or carry a known rollback. Each entry carries
6
+ * the metadata needed to answer "why is this off and when can it go?" so
7
+ * flags don't rot into undiscoverable tombstones.
8
+ *
9
+ * This is intentionally NOT a runtime feature-flag system — no user
10
+ * segmentation, no A/B, no env-var overrides. Just a typed map of booleans
11
+ * with audit metadata. Flipping a flag is a code change.
12
+ *
13
+ * Adding a flag:
14
+ * 1. Add an entry below with every metadata field populated.
15
+ * 2. Import `FEATURE_FLAGS` at the call site and read `.enabled`.
16
+ * 3. When the re-enable condition is met, remove the entry and the gate.
17
+ *
18
+ * See docs/guides/feature-flags.md for the full lifecycle.
19
+ */
20
/**
 * Every feature flag currently active across the AILF packages, keyed by
 * flag name.
 *
 * Call sites read a flag straight off the map, e.g.
 *   if (FEATURE_FLAGS.showFailureModes.enabled) { ... }
 *
 * Each entry records:
 *   - `enabled`         whether the gated feature is visible / active
 *   - `rationale`       why the flag exists
 *   - `reEnableWhen`    the condition for re-enabling or removing it
 *   - `relatedWorkItem` the owning work item, or `null` when there is none
 *   - `addedAt`         the YYYY-MM-DD date the flag was introduced
 */
export const FEATURE_FLAGS = {
    showFailureModes: {
        enabled: false,
        rationale: "Current classification is too broad (majority \"Unclassified\") to be actionable in the diagnostics view.",
        reEnableWhen: "Failure taxonomy is refined so non-Unclassified buckets carry meaningful signal.",
        relatedWorkItem: "W0037-detect-model-output-failures",
        addedAt: "2026-04-22",
    },
    showRegressedSinceLastRun: {
        enabled: false,
        rationale: "Bare list of regressed area names lacks explanatory context for why each regressed.",
        reEnableWhen: "Per-area regression attribution can be surfaced alongside the list.",
        relatedWorkItem: null,
        addedAt: "2026-04-22",
    },
};
@@ -10,6 +10,7 @@
10
10
  * @sanity/ailf-studio. It is the leaf of the dependency graph.
11
11
  */
12
12
  export * from "./document-ref.js";
13
+ export * from "./feature-flags.js";
13
14
  export * from "./score-grades.js";
14
15
  export * from "./noise-threshold.js";
15
16
  export * from "./eval-modes.js";
@@ -10,6 +10,7 @@
10
10
  * @sanity/ailf-studio. It is the leaf of the dependency graph.
11
11
  */
12
12
  export * from "./document-ref.js";
13
+ export * from "./feature-flags.js";
13
14
  export * from "./score-grades.js";
14
15
  export * from "./noise-threshold.js";
15
16
  export * from "./eval-modes.js";
@@ -0,0 +1,55 @@
1
+ /**
2
+ * ailf-resolver.ts — locate `@sanity/ailf` for user TS files, with a bundled fallback.
3
+ *
4
+ * User `.ailf/*.ts` files import `defineTask` / `defineConfig` / `definePreset`
5
+ * from `@sanity/ailf`. In a fresh project with no local install, that bare
6
+ * specifier cannot resolve from the user's tree. To keep `ailf init` → `ailf
7
+ * pipeline` working out of the box we transparently fall back to the CLI's own
8
+ * copy of `@sanity/ailf` by registering a jiti module alias. A user-local
9
+ * install always wins — the fallback kicks in only when resolution fails.
10
+ *
11
+ * All jiti callsites across the eval package use `resolveAilfAlias()` to get
12
+ * a consistent resolution + warning story. Callers pass the returned map (or
13
+ * nothing) to `createJiti`.
14
+ */
15
+ /**
16
+ * Probe whether the user has `@sanity/ailf` installed as a local dependency
17
+ * reachable from the given path. Walks up the directory tree looking for a
18
+ * `node_modules/@sanity/ailf/package.json`. Returns the package entry point
19
+ * path on success, null otherwise.
20
+ *
21
+ * We intentionally do NOT use Node's `require.resolve` self-reference path:
22
+ * tsx and some bundler setups make it unreliable, and a self-reference
23
+ * would only match when the caller *is* the `@sanity/ailf` package (the
24
+ * monorepo development case), which is semantically the same as having no
25
+ * install — the bundled fallback handles it.
26
+ */
27
+ export declare function probeUserLocalAilf(fromPath: string): string | null;
28
+ /**
29
+ * Return the path to the CLI's own bundled copy of `@sanity/ailf`. Used as the
30
+ * fallback target when a user's project does not have it installed.
31
+ *
32
+ * We walk the filesystem rather than `require.resolve("@sanity/ailf")` because
33
+ * self-reference resolution is unreliable under tsx and some bundler setups.
34
+ * Returns null in exotic setups where no ancestor package.json matches.
35
+ */
36
+ export declare function getBundledAilfPath(): string | null;
37
+ /**
38
+ * Emit a one-shot stderr advisory when the loader falls back to the bundled
39
+ * `@sanity/ailf`. The flag is module-scoped so a single pipeline run warns at
40
+ * most once, no matter how many TS files trigger the fallback.
41
+ */
42
+ export declare function warnBundledFallbackOnce(): void;
43
+ /** Test-only: reset the warn-once flag between unit tests. */
44
+ export declare function resetBundledFallbackWarning(): void;
45
+ /**
46
+ * Decide whether jiti should alias `@sanity/ailf` → bundled-path for the given
47
+ * file. Returns the alias map or null.
48
+ *
49
+ * - User-local resolves → returns null (jiti's natural walk finds it).
50
+ * - User-local fails + bundled path available → returns alias map, fires
51
+ * one-shot warning, logs at verbose level.
52
+ * - User-local fails + no bundled path → returns null (nothing we can do;
53
+ * jiti will surface the original MODULE_NOT_FOUND).
54
+ */
55
+ export declare function resolveAilfAlias(filePath: string): Record<string, string> | null;
@@ -0,0 +1,147 @@
1
+ /**
2
+ * ailf-resolver.ts — locate `@sanity/ailf` for user TS files, with a bundled fallback.
3
+ *
4
+ * User `.ailf/*.ts` files import `defineTask` / `defineConfig` / `definePreset`
5
+ * from `@sanity/ailf`. In a fresh project with no local install, that bare
6
+ * specifier cannot resolve from the user's tree. To keep `ailf init` → `ailf
7
+ * pipeline` working out of the box we transparently fall back to the CLI's own
8
+ * copy of `@sanity/ailf` by registering a jiti module alias. A user-local
9
+ * install always wins — the fallback kicks in only when resolution fails.
10
+ *
11
+ * All jiti callsites across the eval package use `resolveAilfAlias()` to get
12
+ * a consistent resolution + warning story. Callers pass the returned map (or
13
+ * nothing) to `createJiti`.
14
+ */
15
+ import { existsSync, readFileSync, statSync } from "node:fs";
16
+ import { dirname, resolve as pathResolve } from "node:path";
17
+ import { fileURLToPath } from "node:url";
18
/**
 * Find the root directory of the `@sanity/ailf` package that contains this
 * module.
 *
 * The search begins in this module's own directory and climbs one parent at
 * a time. At each level a `package.json`, if present, is parsed and its
 * `name` compared against `@sanity/ailf`; the first match wins. Source
 * (`src/`) and compiled (`dist/`) layouts both sit beneath the same
 * manifest, so the same walk serves either.
 *
 * @returns {string | null} The matching directory, or `null` when the walk
 *   reaches the filesystem root (which is not itself inspected) without a
 *   match.
 */
function findAilfPackageRoot() {
    const moduleDir = dirname(fileURLToPath(import.meta.url));
    for (let current = moduleDir, parent = dirname(current); current !== parent; current = parent, parent = dirname(current)) {
        const manifestPath = pathResolve(current, "package.json");
        if (!existsSync(manifestPath)) {
            continue;
        }
        try {
            const manifest = JSON.parse(readFileSync(manifestPath, "utf-8"));
            if (manifest.name === "@sanity/ailf") {
                return current;
            }
        }
        catch {
            // Unreadable or malformed manifest: not ours, keep climbing.
        }
    }
    return null;
}
42
+ /**
43
+ * Probe whether the user has `@sanity/ailf` installed as a local dependency
44
+ * reachable from the given path. Walks up the directory tree looking for a
45
+ * `node_modules/@sanity/ailf/package.json`. Returns the package entry point
46
+ * path on success, null otherwise.
47
+ *
48
+ * We intentionally do NOT use Node's `require.resolve` self-reference path:
49
+ * tsx and some bundler setups make it unreliable, and a self-reference
50
+ * would only match when the caller *is* the `@sanity/ailf` package (the
51
+ * monorepo devving case), which is semantically the same as having no
52
+ * install — the bundled fallback handles it.
53
+ */
54
+ export function probeUserLocalAilf(fromPath) {
55
+ let dir;
56
+ try {
57
+ dir =
58
+ existsSync(fromPath) && statSync(fromPath).isDirectory()
59
+ ? fromPath
60
+ : dirname(fromPath);
61
+ }
62
+ catch {
63
+ dir = dirname(fromPath);
64
+ }
65
+ while (dir !== dirname(dir)) {
66
+ const pkgJson = pathResolve(dir, "node_modules", "@sanity", "ailf", "package.json");
67
+ if (existsSync(pkgJson)) {
68
+ try {
69
+ const pkg = JSON.parse(readFileSync(pkgJson, "utf-8"));
70
+ const entry = pkg.module ?? pkg.main ?? "index.js";
71
+ return pathResolve(dirname(pkgJson), entry);
72
+ }
73
+ catch {
74
+ return null;
75
+ }
76
+ }
77
+ dir = dirname(dir);
78
+ }
79
+ return null;
80
+ }
81
/**
 * Resolve the entry file of the CLI's own bundled `@sanity/ailf`, which
 * serves as the fallback when a user's project has no local install.
 *
 * The package root is found by walking the filesystem instead of calling
 * `require.resolve("@sanity/ailf")`, because self-reference resolution is
 * unreliable under tsx and some bundler setups.
 *
 * @returns {string | null} The first entry file that exists — the compiled
 *   `dist/index.js`, otherwise the source `src/index.ts` — or `null` when
 *   the package root cannot be found or neither file is present.
 */
export function getBundledAilfPath() {
    const packageRoot = findAilfPackageRoot();
    if (!packageRoot) {
        return null;
    }
    const entryCandidates = [
        // Production layout: packages/eval/dist/index.js
        pathResolve(packageRoot, "dist", "index.js"),
        // Development layout (tsx on source): packages/eval/src/index.ts
        pathResolve(packageRoot, "src", "index.ts"),
    ];
    return entryCandidates.find((candidate) => existsSync(candidate)) ?? null;
}
103
+ let hasWarnedOnce = false;
104
+ /**
105
+ * Emit a one-shot stderr advisory when the loader falls back to the bundled
106
+ * `@sanity/ailf`. The flag is module-scoped so a single pipeline run warns at
107
+ * most once, no matter how many TS files trigger the fallback.
108
+ */
109
+ export function warnBundledFallbackOnce() {
110
+ if (hasWarnedOnce)
111
+ return;
112
+ hasWarnedOnce = true;
113
+ process.stderr.write(" ⚠ @sanity/ailf is not installed in your project — using the CLI's bundled copy.\n" +
114
+ " Pin it locally for reproducibility: npm install -D @sanity/ailf\n");
115
+ }
116
+ /** Test-only: reset the warn-once flag between unit tests. */
117
+ export function resetBundledFallbackWarning() {
118
+ hasWarnedOnce = false;
119
+ }
120
/**
 * Work out the jiti alias map, if any, needed to load `filePath`.
 *
 * Outcomes:
 *   - A user-local `@sanity/ailf` is reachable → `null`; jiti's own
 *     resolution will find it.
 *   - No user-local install, but the bundled copy is available → an alias
 *     map pointing `@sanity/ailf` at the bundled entry. The one-shot
 *     fallback warning is triggered on this path.
 *   - Neither is available → `null`; jiti will surface the original
 *     MODULE_NOT_FOUND.
 *
 * With `AILF_LOG_LEVEL=verbose`, the first two outcomes also write a
 * `[ts-loader]` trace line to stderr.
 *
 * @param {string} filePath The user file about to be loaded.
 * @returns {Record<string, string> | null} Alias map for `createJiti`, or
 *   `null` when no alias should be registered.
 */
export function resolveAilfAlias(filePath) {
    // The message is built lazily so nothing is formatted unless verbose.
    const trace = process.env.AILF_LOG_LEVEL === "verbose"
        ? (buildLine) => {
            process.stderr.write(buildLine());
        }
        : () => { };
    const localEntry = probeUserLocalAilf(filePath);
    if (localEntry) {
        trace(() => ` [ts-loader] ${filePath} → @sanity/ailf resolved locally at ${localEntry}\n`);
        return null;
    }
    const bundledEntry = getBundledAilfPath();
    if (!bundledEntry) {
        return null;
    }
    trace(() => ` [ts-loader] ${filePath} → @sanity/ailf not installed locally; using bundled copy at ${bundledEntry}\n`);
    warnBundledFallbackOnce();
    return { "@sanity/ailf": bundledEntry };
}
@@ -15,6 +15,7 @@
15
15
  import { existsSync } from "fs";
16
16
  import { pathToFileURL } from "node:url";
17
17
  import { createJiti } from "jiti";
18
+ import { resolveAilfAlias } from "./ailf-resolver.js";
18
19
  // ---------------------------------------------------------------------------
19
20
  // jiti instance factory — resolves imports relative to the loaded file
20
21
  // ---------------------------------------------------------------------------
@@ -28,13 +29,19 @@ import { createJiti } from "jiti";
28
29
  *
29
30
  * We pass a `file://` URL (not a bare path) so jiti uses ESM resolution,
30
31
  * which matches the `"import"` condition in package.json exports maps.
32
+ *
33
+ * When the user's project cannot resolve `@sanity/ailf` (fresh directory
34
+ * without a local install), we register an alias pointing at the CLI's own
35
+ * bundled copy so the load still succeeds. See `ailf-resolver.ts`.
31
36
  */
32
37
  function createJitiForFile(filePath) {
38
+ const alias = resolveAilfAlias(filePath);
33
39
  return createJiti(pathToFileURL(filePath).href, {
34
40
  // Interop: handle both `export default` and `module.exports`
35
41
  interopDefault: true,
36
42
  // Keep jiti's module cache enabled so repeated loads reuse the evaluated module
37
43
  requireCache: true,
44
+ ...(alias ? { alias } : {}),
38
45
  });
39
46
  }
40
47
  /**
@@ -19,15 +19,17 @@ import { z } from "zod";
19
19
  /**
20
20
  * The set of assertion types allowed in task files.
21
21
  *
22
- * This is a curated subset of Promptfoo assertion types we expose only the
23
- * types that are stable, well-documented, and useful for external authors.
22
+ * Combines a curated subset of Promptfoo assertion types (stable, well-
23
+ * documented, useful for external authors) with the agent-harness-specific
24
+ * types mapped by `mode-handlers/agent-harness/assertions.ts`.
24
25
  */
25
- export declare const CURATED_ASSERTION_TYPES: readonly ["llm-rubric", "contains", "contains-any", "contains-all", "not-contains", "icontains", "icontains-any", "regex", "javascript", "similar", "cost", "latency"];
26
+ export declare const CURATED_ASSERTION_TYPES: readonly ["llm-rubric", "contains", "contains-any", "contains-all", "not-contains", "icontains", "icontains-any", "regex", "javascript", "similar", "cost", "latency", "file-exists", "file-contains", "command-succeeds", "diff-matches"];
26
27
  export type CuratedAssertionType = (typeof CURATED_ASSERTION_TYPES)[number];
27
28
  /**
28
- * Valid rubric template names — must match keys in config/rubrics.yaml.
29
+ * Valid rubric template names — must match template keys in
30
+ * `packages/eval/config/rubrics.ts`.
29
31
  */
30
- export declare const RUBRIC_TEMPLATE_NAMES: readonly ["task-completion", "code-correctness", "doc-coverage"];
32
+ export declare const RUBRIC_TEMPLATE_NAMES: readonly ["task-completion", "code-correctness", "doc-coverage", "mcp-input-validation", "mcp-output-correctness", "mcp-error-handling", "mcp-security", "factual-correctness", "completeness", "currency", "process-quality", "agent-output", "agent-tool-usage"];
31
33
  export type RubricTemplateName = (typeof RUBRIC_TEMPLATE_NAMES)[number];
32
34
  /**
33
35
  * Zod schema for a single task definition using canonical field names.
@@ -84,6 +86,16 @@ export declare const CanonicalTaskSchema: z.ZodObject<{
84
86
  "task-completion": "task-completion";
85
87
  "code-correctness": "code-correctness";
86
88
  "doc-coverage": "doc-coverage";
89
+ "mcp-input-validation": "mcp-input-validation";
90
+ "mcp-output-correctness": "mcp-output-correctness";
91
+ "mcp-error-handling": "mcp-error-handling";
92
+ "mcp-security": "mcp-security";
93
+ "factual-correctness": "factual-correctness";
94
+ completeness: "completeness";
95
+ currency: "currency";
96
+ "process-quality": "process-quality";
97
+ "agent-output": "agent-output";
98
+ "agent-tool-usage": "agent-tool-usage";
87
99
  }>;
88
100
  criteria: z.ZodArray<z.ZodString>;
89
101
  weight: z.ZodOptional<z.ZodNumber>;
@@ -101,6 +113,10 @@ export declare const CanonicalTaskSchema: z.ZodObject<{
101
113
  similar: "similar";
102
114
  cost: "cost";
103
115
  latency: "latency";
116
+ "file-exists": "file-exists";
117
+ "file-contains": "file-contains";
118
+ "command-succeeds": "command-succeeds";
119
+ "diff-matches": "diff-matches";
104
120
  }>;
105
121
  value: z.ZodOptional<z.ZodUnknown>;
106
122
  threshold: z.ZodOptional<z.ZodNumber>;
@@ -174,6 +190,16 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodObject<{
174
190
  "task-completion": "task-completion";
175
191
  "code-correctness": "code-correctness";
176
192
  "doc-coverage": "doc-coverage";
193
+ "mcp-input-validation": "mcp-input-validation";
194
+ "mcp-output-correctness": "mcp-output-correctness";
195
+ "mcp-error-handling": "mcp-error-handling";
196
+ "mcp-security": "mcp-security";
197
+ "factual-correctness": "factual-correctness";
198
+ completeness: "completeness";
199
+ currency: "currency";
200
+ "process-quality": "process-quality";
201
+ "agent-output": "agent-output";
202
+ "agent-tool-usage": "agent-tool-usage";
177
203
  }>;
178
204
  criteria: z.ZodArray<z.ZodString>;
179
205
  weight: z.ZodOptional<z.ZodNumber>;
@@ -191,6 +217,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodObject<{
191
217
  similar: "similar";
192
218
  cost: "cost";
193
219
  latency: "latency";
220
+ "file-exists": "file-exists";
221
+ "file-contains": "file-contains";
222
+ "command-succeeds": "command-succeeds";
223
+ "diff-matches": "diff-matches";
194
224
  }>;
195
225
  value: z.ZodOptional<z.ZodUnknown>;
196
226
  threshold: z.ZodOptional<z.ZodNumber>;
@@ -22,8 +22,9 @@ import { z } from "zod";
22
22
  /**
23
23
  * The set of assertion types allowed in task files.
24
24
  *
25
- * This is a curated subset of Promptfoo assertion types we expose only the
26
- * types that are stable, well-documented, and useful for external authors.
25
+ * Combines a curated subset of Promptfoo assertion types (stable, well-
26
+ * documented, useful for external authors) with the agent-harness-specific
27
+ * types mapped by `mode-handlers/agent-harness/assertions.ts`.
27
28
  */
28
29
  export const CURATED_ASSERTION_TYPES = [
29
30
  "llm-rubric",
@@ -38,14 +39,35 @@ export const CURATED_ASSERTION_TYPES = [
38
39
  "similar",
39
40
  "cost",
40
41
  "latency",
42
+ // Agent-harness assertions — verify sandbox state after the agent runs.
43
+ // See src/pipeline/compiler/mode-handlers/agent-harness/assertions.ts
44
+ "file-exists",
45
+ "file-contains",
46
+ "command-succeeds",
47
+ "diff-matches",
41
48
  ];
42
49
  /**
43
- * Valid rubric template names — must match keys in config/rubrics.yaml.
50
+ * Valid rubric template names — must match template keys in
51
+ * `packages/eval/config/rubrics.ts`.
44
52
  */
45
53
  export const RUBRIC_TEMPLATE_NAMES = [
54
+ // Core literacy dimensions
46
55
  "task-completion",
47
56
  "code-correctness",
48
57
  "doc-coverage",
58
+ // MCP server dimensions
59
+ "mcp-input-validation",
60
+ "mcp-output-correctness",
61
+ "mcp-error-handling",
62
+ "mcp-security",
63
+ // Knowledge probe dimensions
64
+ "factual-correctness",
65
+ "completeness",
66
+ "currency",
67
+ // Agent harness dimensions
68
+ "process-quality",
69
+ "agent-output",
70
+ "agent-tool-usage",
49
71
  ];
50
72
  // ---------------------------------------------------------------------------
51
73
  // Doc ref schemas — polymorphic canonical doc references
@@ -25,6 +25,7 @@ import { existsSync, readdirSync } from "fs";
25
25
  import { pathToFileURL } from "node:url";
26
26
  import { resolve } from "path";
27
27
  import { createJiti } from "jiti";
28
+ import { resolveAilfAlias } from "../config-sources/ailf-resolver.js";
28
29
  import { loadTsConfig } from "../config-sources/ts-config-loader.js";
29
30
  /**
30
31
  * Discover TS/JS task files in a directory.
@@ -72,9 +73,11 @@ export async function loadTsTaskFile(filePath) {
72
73
  * Needed by resolve-mappings.ts which is called from sync contexts.
73
74
  */
74
75
  export function loadTsTaskFileSync(filePath) {
76
+ const alias = resolveAilfAlias(filePath);
75
77
  const jiti = createJiti(pathToFileURL(filePath).href, {
76
78
  interopDefault: true,
77
79
  requireCache: true,
80
+ ...(alias ? { alias } : {}),
78
81
  });
79
82
  const mod = jiti(filePath);
80
83
  const value = mod && typeof mod === "object" && "default" in mod ? mod.default : mod;
@@ -20,6 +20,7 @@ import { Command } from "commander";
20
20
  import { existsSync, mkdirSync, writeFileSync } from "fs";
21
21
  import { resolve, relative } from "path";
22
22
  import { ailfConfigData, ailfConfigYaml, ailfConfigTs, taskYamlFiles, taskTsFiles, TASK_FILE_NAMES, TASK_EXAMPLES, allTaskData, workflowYaml, } from "../_vendor/ailf-core/index.js";
23
+ import { probeUserLocalAilf } from "../adapters/config-sources/ailf-resolver.js";
23
24
  // ---------------------------------------------------------------------------
24
25
  // Command factory
25
26
  // ---------------------------------------------------------------------------
@@ -82,6 +83,13 @@ async function runInit(opts) {
82
83
  console.log();
83
84
  console.log(" 🚀 Initializing AI Literacy Framework");
84
85
  console.log();
86
+ if (format === "ts" && !probeUserLocalAilf(targetDir)) {
87
+ console.log(" ℹ @sanity/ailf is not installed in this project yet.");
88
+ console.log(" For reproducibility and IDE autocomplete, install it after init:");
89
+ console.log(" npm install -D @sanity/ailf (or pnpm add -D, yarn add -D)");
90
+ console.log(" The pipeline will fall back to the CLI's bundled copy until you do.");
91
+ console.log();
92
+ }
85
93
  // 1. Create directories
86
94
  mkdirSync(tasksDir, { recursive: true });
87
95
  console.log(` ✓ Created ${rel(targetDir, ailfDir)}/`);
@@ -252,7 +260,7 @@ async function runInit(opts) {
252
260
  console.log(` 2. Validate locally: npx @sanity/ailf@latest validate-tasks .ailf/tasks/`);
253
261
  console.log(" 3. Add a GitHub Actions secret");
254
262
  console.log(" (Settings → Secrets and variables → Actions):");
255
- console.log(" • AILF_API_KEY — your API key (starts with ailf_live_sk_)");
263
+ console.log(" • AILF_API_KEY — your API key");
256
264
  console.log(" 4. Push — the workflow at .github/workflows/ailf-eval.yml runs");
257
265
  console.log(" automatically on PRs");
258
266
  if (format === "ts") {
@@ -268,9 +276,15 @@ async function runInit(opts) {
268
276
  console.log();
269
277
  console.log(" Not a Sanity employee? Request an API key from the AILF team.");
270
278
  console.log();
271
- console.log(" 💡 Test locally before pushing:");
279
+ console.log(" 💡 Test a remote run (executes against the AILF API) before pushing:");
272
280
  console.log(" AILF_API_KEY=... npx @sanity/ailf@latest pipeline --remote --debug");
273
281
  console.log();
282
+ console.log(" 💡 Or test a remote run against your repo tasks:");
283
+ console.log(" AILF_API_KEY=... npx @sanity/ailf@latest pipeline --remote --task-source=repo --debug");
284
+ console.log();
285
+ console.log(" 💡 Or run locally against your repo tasks:");
286
+ console.log(" AILF_API_KEY=... npx @sanity/ailf@latest pipeline --mode=literacy --variant=full --task-source=repo --debug --explain -y");
287
+ console.log();
274
288
  }
275
289
  // ---------------------------------------------------------------------------
276
290
  // Custom preset scaffold template
@@ -25,6 +25,7 @@ import { createRequire } from "module";
25
25
  import { existsSync, readFileSync } from "fs";
26
26
  import { load } from "js-yaml";
27
27
  import { resolve } from "path";
28
+ import { resolveAilfAlias } from "../../adapters/config-sources/ailf-resolver.js";
28
29
  /**
29
30
  * Load a config file by name, searching for TS/JS/YAML/JSON variants.
30
31
  *
@@ -134,7 +135,11 @@ function loadTsFile(filePath, format) {
134
135
  // jiti supports sync loading. Use createRequire for ESM compatibility.
135
136
  const esmRequire = createRequire(import.meta.url);
136
137
  const { createJiti } = esmRequire("jiti");
137
- const jiti = createJiti(filePath, { interopDefault: true });
138
+ const alias = resolveAilfAlias(filePath);
139
+ const jiti = createJiti(filePath, {
140
+ interopDefault: true,
141
+ ...(alias ? { alias } : {}),
142
+ });
138
143
  const mod = jiti(filePath);
139
144
  const data = (mod?.default ?? mod);
140
145
  return { data, filePath, format };
@@ -14,6 +14,7 @@ import { existsSync } from "fs";
14
14
  import { resolve } from "path";
15
15
  import { pathToFileURL } from "url";
16
16
  import { createJiti } from "jiti";
17
+ import { resolveAilfAlias } from "../../adapters/config-sources/ailf-resolver.js";
17
18
  /** Thrown for preset-specific load errors (distinguishes from third-party errors) */
18
19
  class PresetLoadError extends Error {
19
20
  constructor(message) {
@@ -53,9 +54,11 @@ function loadSinglePreset(ref, rootDir) {
53
54
  }
54
55
  }
55
56
  try {
57
+ const alias = resolveAilfAlias(filePath);
56
58
  const jiti = createJiti(pathToFileURL(rootDir).href, {
57
59
  interopDefault: true,
58
60
  requireCache: true,
61
+ ...(alias ? { alias } : {}),
59
62
  });
60
63
  // jiti() is the synchronous loader
61
64
  const mod = jiti(filePath);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "3.3.0",
3
+ "version": "3.4.0",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"