@sanity/ailf 3.1.1 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +1 -1
- package/dist/_vendor/ailf-core/artifact-registry.js +188 -9
- package/dist/_vendor/ailf-core/examples/index.d.ts +8 -8
- package/dist/_vendor/ailf-core/examples/index.js +8 -8
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +8 -6
- package/dist/_vendor/ailf-core/types/branded-ids.js +35 -24
- package/dist/_vendor/ailf-core/types/index.d.ts +11 -4
- package/dist/_vendor/ailf-core/types/scoring-input.d.ts +6 -0
- package/dist/artifact-capture/api-gateway-artifact-writer.js +5 -3
- package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +11 -7
- package/dist/artifact-capture/fanout-artifact-writer.d.ts +9 -6
- package/dist/artifact-capture/fanout-artifact-writer.js +9 -6
- package/dist/artifact-capture/gcs-artifact-writer.js +17 -22
- package/dist/artifact-capture/prepare-upload-body.d.ts +27 -0
- package/dist/artifact-capture/prepare-upload-body.js +36 -0
- package/dist/commands/init.d.ts +1 -1
- package/dist/commands/init.js +3 -3
- package/dist/commands/pipeline-action.js +51 -6
- package/dist/commands/pipeline.js +1 -1
- package/dist/commands/validate-tasks.d.ts +14 -3
- package/dist/commands/validate-tasks.js +125 -81
- package/dist/composition-root.js +7 -2
- package/dist/index.d.ts +2 -0
- package/dist/index.js +4 -0
- package/dist/orchestration/pipeline-orchestrator.js +34 -5
- package/dist/pipeline/calculate-scores.d.ts +6 -0
- package/dist/pipeline/calculate-scores.js +2 -0
- package/package.json +1 -1
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared preamble for uploading an artifact payload from an `ArtifactWriter`.
|
|
3
|
+
*
|
|
4
|
+
* All three `ArtifactWriter` implementations (direct GCS, API Gateway, batching
|
|
5
|
+
* API Gateway) must apply the same `redact → serialize → bytecount` pipeline so
|
|
6
|
+
* secrets are stripped before leaving the process. Routing each writer through
|
|
7
|
+
* this helper prevents drift — any future writer that skips the helper would
|
|
8
|
+
* fail the contract test in
|
|
9
|
+
* `src/__tests__/artifact-upload-redaction.test.ts`.
|
|
10
|
+
*
|
|
11
|
+
* NDJSON streaming is **not** handled here — each row is redacted independently
|
|
12
|
+
* by the NDJSON writer path before being concatenated into a part body.
|
|
13
|
+
*/
|
|
14
|
+
import type { ArtifactMime } from "../_vendor/ailf-core/index.d.ts";
|
|
15
|
+
export interface PreparedUploadBody {
|
|
16
|
+
readonly body: string;
|
|
17
|
+
readonly bytes: number;
|
|
18
|
+
}
|
|
19
|
+
/**
|
|
20
|
+
* Redact, serialize, and size `payload` for upload.
|
|
21
|
+
*
|
|
22
|
+
* Serialization branches on `mime`:
|
|
23
|
+
* - `application/json` (and anything else JSON-shaped, including the
|
|
24
|
+
* single-shot side of `application/x-ndjson`) → `JSON.stringify`.
|
|
25
|
+
* - `text/markdown` / `application/yaml` → coerce to string via `String()`.
|
|
26
|
+
*/
|
|
27
|
+
export declare function prepareUploadBody(payload: unknown, mime: ArtifactMime): PreparedUploadBody;
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared preamble for uploading an artifact payload from an `ArtifactWriter`.
|
|
3
|
+
*
|
|
4
|
+
* All three `ArtifactWriter` implementations (direct GCS, API Gateway, batching
|
|
5
|
+
* API Gateway) must apply the same `redact → serialize → bytecount` pipeline so
|
|
6
|
+
* secrets are stripped before leaving the process. Routing each writer through
|
|
7
|
+
* this helper prevents drift — any future writer that skips the helper would
|
|
8
|
+
* fail the contract test in
|
|
9
|
+
* `src/__tests__/artifact-upload-redaction.test.ts`.
|
|
10
|
+
*
|
|
11
|
+
* NDJSON streaming is **not** handled here — each row is redacted independently
|
|
12
|
+
* by the NDJSON writer path before being concatenated into a part body.
|
|
13
|
+
*/
|
|
14
|
+
import { redactArtifactData } from "./redact-artifact.js";
|
|
15
|
+
/**
 * Build the upload body for an artifact payload and measure its size.
 *
 * The payload is first passed through `redactArtifactData`; the redacted
 * value is then handed to `serializeForMime`, which branches on `mime`:
 * `text/markdown` and `application/yaml` payloads are turned into text,
 * every other MIME type is JSON-encoded. The size is the UTF-8 byte length
 * of the serialized string.
 *
 * @param payload - Artifact payload to upload.
 * @param mime - MIME type that selects the serialization branch.
 * @returns The serialized `body` together with its size in `bytes`.
 */
export function prepareUploadBody(payload, mime) {
    const body = serializeForMime(redactArtifactData(payload), mime);
    return { body, bytes: Buffer.byteLength(body, "utf-8") };
}
|
|
29
|
+
/**
 * Serialize an already-redacted payload into the string body for `mime`.
 *
 * - `text/markdown` / `application/yaml`: strings are returned unchanged;
 *   any other value is coerced with `String()`, with `null`/`undefined`
 *   becoming the empty string.
 * - Every other MIME type: the payload is JSON-encoded.
 *
 * Always returns a string. `JSON.stringify` yields `undefined` (not a string)
 * for values that have no JSON representation, such as `undefined` or a
 * function; those are encoded as `"null"`, mirroring how `JSON.stringify`
 * encodes them inside arrays, so a byte-length computation on the result
 * never receives a non-string body.
 *
 * @param payload - Value to serialize.
 * @param mime - Artifact MIME type selecting the serialization branch.
 * @returns The serialized body.
 */
function serializeForMime(payload, mime) {
    if (mime === "text/markdown" || mime === "application/yaml") {
        if (typeof payload === "string")
            return payload;
        // NOTE(review): a plain object stringifies as "[object Object]" here —
        // presumably callers pass pre-rendered text for these MIME types; confirm.
        return String(payload ?? "");
    }
    const json = JSON.stringify(payload);
    // JSON.stringify returns undefined for undefined / functions / symbols.
    return typeof json === "string" ? json : "null";
}
|
package/dist/commands/init.d.ts
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* task files. The generated files are ready-to-edit starting points —
|
|
6
6
|
* not live evaluation tasks.
|
|
7
7
|
*
|
|
8
|
-
* TypeScript output (default) uses define* helpers from @sanity/ailf
|
|
8
|
+
* TypeScript output (default) uses define* helpers from @sanity/ailf
|
|
9
9
|
* for full IDE autocomplete and type checking. YAML output serializes the
|
|
10
10
|
* parsed task data. JSON output is a plain serialization of the parsed data.
|
|
11
11
|
*
|
package/dist/commands/init.js
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* task files. The generated files are ready-to-edit starting points —
|
|
6
6
|
* not live evaluation tasks.
|
|
7
7
|
*
|
|
8
|
-
* TypeScript output (default) uses define* helpers from @sanity/ailf
|
|
8
|
+
* TypeScript output (default) uses define* helpers from @sanity/ailf
|
|
9
9
|
* for full IDE autocomplete and type checking. YAML output serializes the
|
|
10
10
|
* parsed task data. JSON output is a plain serialization of the parsed data.
|
|
11
11
|
*
|
|
@@ -258,7 +258,7 @@ async function runInit(opts) {
|
|
|
258
258
|
if (format === "ts") {
|
|
259
259
|
console.log();
|
|
260
260
|
console.log(` 💡 TypeScript tasks (${taskExt}) give you full IDE autocomplete`);
|
|
261
|
-
console.log(" via defineTask() from @sanity/ailf
|
|
261
|
+
console.log(" via defineTask() from @sanity/ailf.");
|
|
262
262
|
}
|
|
263
263
|
console.log();
|
|
264
264
|
console.log(" 🔑 Retrieve the API key from 1Password (Sanity employees):");
|
|
@@ -289,7 +289,7 @@ const CUSTOM_PRESET_TS = `/**
|
|
|
289
289
|
* @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/presets.md
|
|
290
290
|
*/
|
|
291
291
|
|
|
292
|
-
import { definePreset } from "
|
|
292
|
+
import { definePreset } from "@sanity/ailf"
|
|
293
293
|
|
|
294
294
|
export default definePreset({
|
|
295
295
|
name: "my-docs-evaluation",
|
|
@@ -191,12 +191,18 @@ export function computeResolvedOptions(opts) {
|
|
|
191
191
|
// Smart default: full runs auto-publish when store is configured
|
|
192
192
|
publishEnabled = reportStoreConfigured && !debugEnabled;
|
|
193
193
|
}
|
|
194
|
+
// Resolve task source + repo tasks path before anything that depends on
|
|
195
|
+
// them (report store overrides, output dir). When --task-source=repo is
|
|
196
|
+
// set without --repo-tasks-path, default to ./.ailf/tasks/ — the location
|
|
197
|
+
// created by `ailf init`.
|
|
198
|
+
const resolvedTaskSourceType = resolveTaskSourceType(opts.taskSource);
|
|
199
|
+
const resolvedRepoTasksPath = resolveRepoTasksPath(callerCwd, opts.repoTasksPath, resolvedTaskSourceType);
|
|
194
200
|
// Report store overrides — resolution order:
|
|
195
201
|
// 1. Explicit CLI flags (--report-dataset, --report-project)
|
|
196
202
|
// 2. Environment variables (AILF_REPORT_DATASET, AILF_REPORT_PROJECT_ID)
|
|
197
|
-
// 3. .ailf/config.yaml reportStore block (when
|
|
203
|
+
// 3. .ailf/config.yaml reportStore block (when repo tasks path is set)
|
|
198
204
|
// 4. Eval dataset override (so perspective evals publish to the same dataset)
|
|
199
|
-
const repoConfig = loadRepoConfigIfPresent(
|
|
205
|
+
const repoConfig = loadRepoConfigIfPresent(resolvedRepoTasksPath);
|
|
200
206
|
const reportDataset = opts.reportDataset ??
|
|
201
207
|
process.env.AILF_REPORT_DATASET ??
|
|
202
208
|
repoConfig?.reportStore?.dataset ??
|
|
@@ -211,10 +217,6 @@ export function computeResolvedOptions(opts) {
|
|
|
211
217
|
const apiUrl = opts.apiUrl ?? process.env.AILF_API_URL ?? "https://ailf-api.sanity.build";
|
|
212
218
|
const apiKey = process.env.AILF_API_KEY ?? undefined;
|
|
213
219
|
// Output directory: explicit --output-dir → $CWD/.ailf/results/latest/
|
|
214
|
-
const resolvedRepoTasksPath = opts.repoTasksPath
|
|
215
|
-
? resolve(callerCwd, opts.repoTasksPath)
|
|
216
|
-
: undefined;
|
|
217
|
-
const resolvedTaskSourceType = resolveTaskSourceType(opts.taskSource);
|
|
218
220
|
const outputDir = resolveOutputDir(opts.outputDir);
|
|
219
221
|
return {
|
|
220
222
|
allowedOriginArgs,
|
|
@@ -299,6 +301,39 @@ function resolveTaskSourceType(raw) {
|
|
|
299
301
|
console.error(`❌ Invalid --task-source "${raw}". Must be "repo" or "content-lake".`);
|
|
300
302
|
process.exit(1);
|
|
301
303
|
}
|
|
304
|
+
/**
 * Work out which directory repo-based tasks should be read from.
 *
 * - A non-empty `explicitPath` is resolved against `callerCwd` and used.
 * - Otherwise, when `taskSourceType` is `"repo"`, `<callerCwd>/.ailf/tasks`
 *   is used.
 * - Otherwise there is no repo tasks path and `undefined` is returned.
 *
 * Whenever a directory is selected it must exist on disk; if it does not, an
 * explanatory message is written to stderr and the process exits with code 1.
 *
 * @param callerCwd - Base directory that relative paths are resolved against.
 * @param explicitPath - Value of `--repo-tasks-path`, if any.
 * @param taskSourceType - Resolved task source type.
 * @returns The absolute tasks directory, or `undefined` when none applies.
 */
function resolveRepoTasksPath(callerCwd, explicitPath, taskSourceType) {
    const usingExplicitPath = Boolean(explicitPath);
    if (!usingExplicitPath && taskSourceType !== "repo") {
        return undefined;
    }
    const tasksDir = usingExplicitPath
        ? resolve(callerCwd, explicitPath)
        : resolve(callerCwd, ".ailf", "tasks");
    if (!existsSync(tasksDir)) {
        // The wording differs depending on whether the user named the path.
        const message = usingExplicitPath
            ? `❌ Repo tasks directory not found: ${tasksDir}\n` +
                " Provide a valid --repo-tasks-path, or run 'ailf init' to scaffold .ailf/tasks/."
            : `❌ --task-source=repo was set but no tasks directory was found.\n` +
                ` Looked for: ${tasksDir}\n` +
                " Run 'ailf init' to scaffold .ailf/tasks/, or pass --repo-tasks-path <path>.";
        console.error(message);
        process.exit(1);
    }
    return tasksDir;
}
|
|
302
337
|
// ---------------------------------------------------------------------------
|
|
303
338
|
// Pipeline entry point
|
|
304
339
|
// ---------------------------------------------------------------------------
|
|
@@ -330,6 +365,16 @@ export async function executePipeline(cliOpts) {
|
|
|
330
365
|
if (cliOpts.repoTasksPath) {
|
|
331
366
|
config.repoTasksPath = resolve(callerCwd, cliOpts.repoTasksPath);
|
|
332
367
|
}
|
|
368
|
+
else if (config.taskSourceType === "repo" && !config.repoTasksPath) {
|
|
369
|
+
// Default: when taskSource=repo but no path set, look in .ailf/tasks/
|
|
370
|
+
// (matches the `ailf init` scaffold location). Silent fallback here —
|
|
371
|
+
// composition root will surface a helpful error if the directory is
|
|
372
|
+
// missing.
|
|
373
|
+
const defaultPath = resolve(callerCwd, ".ailf", "tasks");
|
|
374
|
+
if (existsSync(defaultPath)) {
|
|
375
|
+
config.repoTasksPath = defaultPath;
|
|
376
|
+
}
|
|
377
|
+
}
|
|
333
378
|
if (cliOpts.output) {
|
|
334
379
|
config.outputPath = resolve(callerCwd, cliOpts.output);
|
|
335
380
|
}
|
|
@@ -51,7 +51,7 @@ export function createPipelineCommand() {
|
|
|
51
51
|
.option("--output-dir <path>", "Base directory for pipeline output artifacts (default: inferred from execution context)")
|
|
52
52
|
.option("--promptfoo-url <url>", "Promptfoo share URL for report")
|
|
53
53
|
.option("--task-source <type>", "Task definition source: content-lake (default — Sanity Content Lake), repo (repo tasks only, no Content Lake merge)", "content-lake")
|
|
54
|
-
.option("--repo-tasks-path <path>", "Path to repo-based task definitions
|
|
54
|
+
.option("--repo-tasks-path <path>", "Path to repo-based task definitions. Defaults to ./.ailf/tasks/ when --task-source=repo.")
|
|
55
55
|
.option("--remote", "Submit evaluation to the AILF API instead of running locally", false)
|
|
56
56
|
.option("--api-url <url>", "AILF API base URL (default: https://ailf-api.sanity.build)")
|
|
57
57
|
.option("--no-artifacts", "Disable all artifact writers (D0033). Overrides --artifacts-dir.")
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* validate-tasks command — standalone validation of task files.
|
|
3
3
|
*
|
|
4
|
-
* Validates .ailf/tasks/*.yaml files against the
|
|
5
|
-
* running the full pipeline. Useful for
|
|
6
|
-
* in external repos.
|
|
4
|
+
* Validates .ailf/tasks/*.yaml and .ailf/tasks/*.task.ts files against the
|
|
5
|
+
* CanonicalTaskSchema without running the full pipeline. Useful for
|
|
6
|
+
* pre-commit hooks and CI checks in external repos.
|
|
7
7
|
*
|
|
8
8
|
* Usage:
|
|
9
9
|
* ailf validate-tasks .ailf/tasks/
|
|
@@ -11,6 +11,17 @@
|
|
|
11
11
|
*
|
|
12
12
|
* @see packages/eval/src/adapters/task-sources/repo-schemas.ts
|
|
13
13
|
* @see packages/eval/src/adapters/task-sources/repo-validation.ts
|
|
14
|
+
* @see packages/eval/src/adapters/task-sources/task-file-loader.ts
|
|
14
15
|
*/
|
|
15
16
|
import { Command } from "commander";
|
|
17
|
+
export interface ValidateTasksOptions {
|
|
18
|
+
strict: boolean;
|
|
19
|
+
callerCwd?: string;
|
|
20
|
+
}
|
|
16
21
|
export declare function createValidateTasksCommand(): Command;
|
|
22
|
+
/**
|
|
23
|
+
* Execute the validate-tasks command logic. Returns the exit code (0 success,
|
|
24
|
+
* 1 failure) so callers can decide how to surface it — the CLI wrapper calls
|
|
25
|
+
* `process.exit`, tests can assert directly.
|
|
26
|
+
*/
|
|
27
|
+
export declare function runValidateTasks(tasksPath: string, opts: ValidateTasksOptions): Promise<number>;
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* validate-tasks command — standalone validation of task files.
|
|
3
3
|
*
|
|
4
|
-
* Validates .ailf/tasks/*.yaml files against the
|
|
5
|
-
* running the full pipeline. Useful for
|
|
6
|
-
* in external repos.
|
|
4
|
+
* Validates .ailf/tasks/*.yaml and .ailf/tasks/*.task.ts files against the
|
|
5
|
+
* CanonicalTaskSchema without running the full pipeline. Useful for
|
|
6
|
+
* pre-commit hooks and CI checks in external repos.
|
|
7
7
|
*
|
|
8
8
|
* Usage:
|
|
9
9
|
* ailf validate-tasks .ailf/tasks/
|
|
@@ -11,97 +11,141 @@
|
|
|
11
11
|
*
|
|
12
12
|
* @see packages/eval/src/adapters/task-sources/repo-schemas.ts
|
|
13
13
|
* @see packages/eval/src/adapters/task-sources/repo-validation.ts
|
|
14
|
+
* @see packages/eval/src/adapters/task-sources/task-file-loader.ts
|
|
14
15
|
*/
|
|
15
16
|
import { existsSync, readdirSync, readFileSync } from "fs";
|
|
16
|
-
import { resolve, relative } from "path";
|
|
17
|
+
import { resolve, relative, basename } from "path";
|
|
17
18
|
import { Command } from "commander";
|
|
18
19
|
import { load } from "js-yaml";
|
|
19
20
|
import { detectLegacyFieldNames, parseCanonicalTaskFile, } from "../adapters/task-sources/repo-schemas.js";
|
|
20
21
|
import { validateCanonicalTasks, formatValidationResult, } from "../adapters/task-sources/repo-validation.js";
|
|
22
|
+
import { discoverTsTaskFiles, loadTsTaskFile, } from "../adapters/task-sources/task-file-loader.js";
|
|
21
23
|
export function createValidateTasksCommand() {
|
|
22
24
|
return new Command("validate-tasks")
|
|
23
|
-
.description("Validate task YAML
|
|
25
|
+
.description("Validate task files (YAML and TypeScript) in .ailf/tasks/ against the canonical schema")
|
|
24
26
|
.argument("[path]", "Path to tasks directory (default: .ailf/tasks/)", ".ailf/tasks")
|
|
25
27
|
.option("--strict", "Treat warnings as errors", false)
|
|
26
28
|
.action(async (tasksPath, opts) => {
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
29
|
+
const exitCode = await runValidateTasks(tasksPath, opts);
|
|
30
|
+
process.exit(exitCode);
|
|
31
|
+
});
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* Execute the validate-tasks command logic. Returns the exit code (0 success,
|
|
35
|
+
* 1 failure) so callers can decide how to surface it — the CLI wrapper calls
|
|
36
|
+
* `process.exit`, tests can assert directly.
|
|
37
|
+
*/
|
|
38
|
+
export async function runValidateTasks(tasksPath, opts) {
|
|
39
|
+
// Resolve relative to the caller's working directory, not the
|
|
40
|
+
// eval package root (which differs when run via bin/ailf.js)
|
|
41
|
+
const callerCwd = opts.callerCwd ?? process.env.AILF_CALLER_CWD ?? process.cwd();
|
|
42
|
+
const resolvedPath = resolve(callerCwd, tasksPath);
|
|
43
|
+
if (!existsSync(resolvedPath)) {
|
|
44
|
+
console.error(`Directory not found: ${resolvedPath}`);
|
|
45
|
+
return 1;
|
|
46
|
+
}
|
|
47
|
+
const yamlFiles = readdirSync(resolvedPath).filter((f) => (f.endsWith(".yaml") || f.endsWith(".yml")) && !f.startsWith("."));
|
|
48
|
+
const tsFiles = discoverTsTaskFiles(resolvedPath);
|
|
49
|
+
const fileCount = yamlFiles.length + tsFiles.length;
|
|
50
|
+
if (fileCount === 0) {
|
|
51
|
+
console.error(`No task files found in ${resolvedPath}\n` +
|
|
52
|
+
" Expected .yaml, .yml, .task.ts, or .task.js files");
|
|
53
|
+
return 1;
|
|
54
|
+
}
|
|
55
|
+
console.log(`\nValidating ${fileCount} task file(s) in ${relative(process.cwd(), resolvedPath)}/\n`);
|
|
56
|
+
let totalTasks = 0;
|
|
57
|
+
let hasErrors = false;
|
|
58
|
+
const allTasks = [];
|
|
59
|
+
for (const file of yamlFiles) {
|
|
60
|
+
const filePath = resolve(resolvedPath, file);
|
|
61
|
+
const raw = readFileSync(filePath, "utf-8");
|
|
62
|
+
let parsed;
|
|
63
|
+
try {
|
|
64
|
+
parsed = load(raw);
|
|
34
65
|
}
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
console.error(`
|
|
38
|
-
|
|
66
|
+
catch (err) {
|
|
67
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
68
|
+
console.error(` ${file}: YAML parse error`);
|
|
69
|
+
console.error(` ${msg}\n`);
|
|
70
|
+
hasErrors = true;
|
|
71
|
+
continue;
|
|
39
72
|
}
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
for (const file of yamlFiles) {
|
|
45
|
-
const filePath = resolve(resolvedPath, file);
|
|
46
|
-
const raw = readFileSync(filePath, "utf-8");
|
|
47
|
-
let parsed;
|
|
48
|
-
try {
|
|
49
|
-
parsed = load(raw);
|
|
50
|
-
}
|
|
51
|
-
catch (err) {
|
|
52
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
53
|
-
console.error(` ${file}: YAML parse error`);
|
|
54
|
-
console.error(` ${msg}\n`);
|
|
55
|
-
hasErrors = true;
|
|
56
|
-
continue;
|
|
57
|
-
}
|
|
58
|
-
if (!Array.isArray(parsed)) {
|
|
59
|
-
console.error(` ${file}: Expected a YAML array of task definitions`);
|
|
60
|
-
hasErrors = true;
|
|
61
|
-
continue;
|
|
62
|
-
}
|
|
63
|
-
// Detect legacy field names before Zod validation
|
|
64
|
-
const legacyWarnings = detectLegacyFieldNames(parsed, file);
|
|
65
|
-
if (legacyWarnings.length > 0) {
|
|
66
|
-
console.error(` ${file}: Uses legacy field names`);
|
|
67
|
-
for (const w of legacyWarnings) {
|
|
68
|
-
console.error(` ${w}`);
|
|
69
|
-
}
|
|
70
|
-
console.error();
|
|
71
|
-
hasErrors = true;
|
|
72
|
-
continue;
|
|
73
|
-
}
|
|
74
|
-
try {
|
|
75
|
-
const tasks = parseCanonicalTaskFile(parsed, file);
|
|
76
|
-
console.log(` ${file}: ${tasks.length} task${tasks.length === 1 ? "" : "s"} valid`);
|
|
77
|
-
totalTasks += tasks.length;
|
|
78
|
-
allTasks.push(...tasks);
|
|
79
|
-
}
|
|
80
|
-
catch (err) {
|
|
81
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
82
|
-
console.error(` ${file}: Schema validation failed`);
|
|
83
|
-
console.error(`${msg
|
|
84
|
-
.split("\n")
|
|
85
|
-
.map((l) => ` ${l}`)
|
|
86
|
-
.join("\n")}\n`);
|
|
87
|
-
hasErrors = true;
|
|
88
|
-
}
|
|
73
|
+
if (!Array.isArray(parsed)) {
|
|
74
|
+
console.error(` ${file}: Expected a YAML array of task definitions`);
|
|
75
|
+
hasErrors = true;
|
|
76
|
+
continue;
|
|
89
77
|
}
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
const semanticResult = validateCanonicalTasks(allTasks);
|
|
94
|
-
const formatted = formatValidationResult(semanticResult);
|
|
95
|
-
console.log(formatted);
|
|
96
|
-
if (!semanticResult.valid) {
|
|
97
|
-
hasErrors = true;
|
|
98
|
-
}
|
|
99
|
-
if (opts.strict && semanticResult.warnings.length > 0) {
|
|
100
|
-
hasErrors = true;
|
|
101
|
-
console.log("\n --strict mode: warnings treated as errors");
|
|
102
|
-
}
|
|
78
|
+
if (!validateTaskArray(parsed, file, allTasks)) {
|
|
79
|
+
hasErrors = true;
|
|
80
|
+
continue;
|
|
103
81
|
}
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
82
|
+
totalTasks += parsed.length;
|
|
83
|
+
}
|
|
84
|
+
for (const tsFilePath of tsFiles) {
|
|
85
|
+
const file = basename(tsFilePath);
|
|
86
|
+
let loaded;
|
|
87
|
+
try {
|
|
88
|
+
loaded = await loadTsTaskFile(tsFilePath);
|
|
89
|
+
}
|
|
90
|
+
catch (err) {
|
|
91
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
92
|
+
console.error(` ${file}: Failed to load TypeScript task file`);
|
|
93
|
+
console.error(` ${msg}\n`);
|
|
94
|
+
hasErrors = true;
|
|
95
|
+
continue;
|
|
96
|
+
}
|
|
97
|
+
if (!validateTaskArray(loaded.tasks, file, allTasks)) {
|
|
98
|
+
hasErrors = true;
|
|
99
|
+
continue;
|
|
100
|
+
}
|
|
101
|
+
totalTasks += loaded.tasks.length;
|
|
102
|
+
}
|
|
103
|
+
if (allTasks.length > 0) {
|
|
104
|
+
console.log();
|
|
105
|
+
const semanticResult = validateCanonicalTasks(allTasks);
|
|
106
|
+
const formatted = formatValidationResult(semanticResult);
|
|
107
|
+
console.log(formatted);
|
|
108
|
+
if (!semanticResult.valid) {
|
|
109
|
+
hasErrors = true;
|
|
110
|
+
}
|
|
111
|
+
if (opts.strict && semanticResult.warnings.length > 0) {
|
|
112
|
+
hasErrors = true;
|
|
113
|
+
console.log("\n --strict mode: warnings treated as errors");
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
console.log(`\n${hasErrors ? "FAIL" : "OK"} ${totalTasks} task${totalTasks === 1 ? "" : "s"} across ${fileCount} file${fileCount === 1 ? "" : "s"}\n`);
|
|
117
|
+
return hasErrors ? 1 : 0;
|
|
118
|
+
}
|
|
119
|
+
/**
 * Check one file's raw task entries and collect the ones that parse.
 *
 * The entries are first scanned with `detectLegacyFieldNames`; any warning
 * is reported on stderr and the file is rejected without schema parsing.
 * Otherwise the entries are parsed with `parseCanonicalTaskFile`, a summary
 * line is printed, and the parsed tasks are appended to `accumulator`.
 * A thrown error is reported on stderr as a schema validation failure.
 *
 * @param entries - Raw task entries loaded from the file.
 * @param file - File name used as the prefix of every reported line.
 * @param accumulator - Array that receives the successfully parsed tasks.
 * @returns `true` when the file validated, `false` when an error was
 *   reported (the caller tracks its own overall error state).
 */
function validateTaskArray(entries, file, accumulator) {
    const legacyFieldWarnings = detectLegacyFieldNames(entries, file);
    if (legacyFieldWarnings.length > 0) {
        console.error(` ${file}: Uses legacy field names`);
        for (const warning of legacyFieldWarnings) {
            console.error(` ${warning}`);
        }
        console.error();
        return false;
    }
    try {
        const validTasks = parseCanonicalTaskFile(entries, file);
        const plural = validTasks.length === 1 ? "" : "s";
        console.log(` ${file}: ${validTasks.length} task${plural} valid`);
        accumulator.push(...validTasks);
        return true;
    }
    catch (failure) {
        const detail = failure instanceof Error ? failure.message : String(failure);
        console.error(` ${file}: Schema validation failed`);
        // Prefix every line of the message so it sits under the file heading.
        const indented = detail
            .split("\n")
            .map((line) => ` ${line}`)
            .join("\n");
        console.error(`${indented}\n`);
        return false;
    }
}
|
package/dist/composition-root.js
CHANGED
|
@@ -188,14 +188,19 @@ export function createArtifactWriter(config, logger, progress) {
|
|
|
188
188
|
exclude,
|
|
189
189
|
...(remote ? {} : { progress }),
|
|
190
190
|
});
|
|
191
|
+
// W0064 — when a remote backend is wired, list it first so its ArtifactRef
|
|
192
|
+
// wins the fanout's firstNonNull() selection and the published manifest
|
|
193
|
+
// points at a cross-machine-readable store. Local stays attached as the
|
|
194
|
+
// resilience tier: if the remote leg fails, firstNonNull falls through to
|
|
195
|
+
// local and the pipeline still produces a non-null ref.
|
|
191
196
|
const base = remote
|
|
192
|
-
? new FanoutArtifactWriter([
|
|
197
|
+
? new FanoutArtifactWriter([remote, local], { progress })
|
|
193
198
|
: local;
|
|
194
199
|
if (!remote) {
|
|
195
200
|
logger.debug(`Artifact writer: LocalFilesystemArtifactWriter only (rootDir=${rootDir})`);
|
|
196
201
|
}
|
|
197
202
|
else {
|
|
198
|
-
logger.debug(`Artifact writer: FanoutArtifactWriter([
|
|
203
|
+
logger.debug(`Artifact writer: FanoutArtifactWriter([${remote.constructor.name}, local=${rootDir}])`);
|
|
199
204
|
}
|
|
200
205
|
// Wrap in the accumulator so FinalizeRunStep can build a populated
|
|
201
206
|
// RunManifest without each producer bookkeeping its own ArtifactRefs
|
package/dist/index.d.ts
CHANGED
|
@@ -39,3 +39,5 @@ export { env } from "./_vendor/ailf-core/index.d.ts";
|
|
|
39
39
|
export type { AgentHarnessTaskDefinition, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PathDocRef, PerspectiveDocRef, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./_vendor/ailf-core/index.d.ts";
|
|
40
40
|
export { CanonicalTaskFileSchema, CanonicalTaskSchema, CURATED_ASSERTION_TYPES, detectLegacyFieldNames, parseCanonicalTaskFile, RUBRIC_TEMPLATE_NAMES, type CanonicalTask, type CuratedAssertionType, type RubricTemplateName, } from "./adapters/task-sources/repo-schemas.js";
|
|
41
41
|
export { formatValidationResult, validateCanonicalTasks, type ValidationMessage, type ValidationResult, } from "./adapters/task-sources/repo-validation.js";
|
|
42
|
+
export { InMemoryPluginRegistry } from "./_vendor/ailf-core/index.d.ts";
|
|
43
|
+
export type { CompilationContext, ModeBase, ModeCompileResult, ModeHandler, PresetDefinition, } from "./_vendor/ailf-core/index.d.ts";
|
package/dist/index.js
CHANGED
|
@@ -46,3 +46,7 @@ export { env } from "./_vendor/ailf-core/index.js";
|
|
|
46
46
|
// ---------------------------------------------------------------------------
|
|
47
47
|
export { CanonicalTaskFileSchema, CanonicalTaskSchema, CURATED_ASSERTION_TYPES, detectLegacyFieldNames, parseCanonicalTaskFile, RUBRIC_TEMPLATE_NAMES, } from "./adapters/task-sources/repo-schemas.js";
|
|
48
48
|
export { formatValidationResult, validateCanonicalTasks, } from "./adapters/task-sources/repo-validation.js";
|
|
49
|
+
// ---------------------------------------------------------------------------
|
|
50
|
+
// Plugin extension points — for authoring custom presets, modes, and registries
|
|
51
|
+
// ---------------------------------------------------------------------------
|
|
52
|
+
export { InMemoryPluginRegistry } from "./_vendor/ailf-core/index.js";
|
|
@@ -130,6 +130,12 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
130
130
|
const pipelineStart = Date.now();
|
|
131
131
|
const hasJob = !!ctx.config.jobId;
|
|
132
132
|
const jobUpdates = [];
|
|
133
|
+
// DOC-2064 — tracks whether the pre-finalize pipelineContext emit fired so
|
|
134
|
+
// the post-loop fallback can skip redundant writes. A second emit to the
|
|
135
|
+
// same GCS path produces a 412 Precondition Failed from the signed-URL
|
|
136
|
+
// writer (which enforces no-overwrite), logging spurious warnings on every
|
|
137
|
+
// successful run.
|
|
138
|
+
let pipelineContextEmitted = false;
|
|
133
139
|
ctx.logger.section("ai-literacy-framework — Evaluation Pipeline");
|
|
134
140
|
ctx.logger.debug(`Pipeline starting with ${steps.length} steps`, {
|
|
135
141
|
steps: steps.map((s) => s.name),
|
|
@@ -152,6 +158,16 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
152
158
|
ctx.logger.debug(`Starting step ${i + 1}/${steps.length}: ${step.name}`);
|
|
153
159
|
ctx.logger.section(step.name);
|
|
154
160
|
exportPhase.maybeOpen(step.name);
|
|
161
|
+
// DOC-2064 — emit pipelineContext BEFORE finalize-run so the artifact
|
|
162
|
+
// ref registers with the accumulator and lands in RunManifest.artifacts,
|
|
163
|
+
// which PublishReportStep then snapshots into Report.artifactManifest.
|
|
164
|
+
// The previous post-loop emit ran after publish and was invisible to
|
|
165
|
+
// Content Lake readers. The failure-path capture below still fires on
|
|
166
|
+
// pre-finalize aborts so aborted runs retain the on-disk artifact.
|
|
167
|
+
if (step.name === "finalize-run") {
|
|
168
|
+
await capturePipelineContext(ctx, state, results);
|
|
169
|
+
pipelineContextEmitted = true;
|
|
170
|
+
}
|
|
155
171
|
// Report current step progress
|
|
156
172
|
if (hasJob) {
|
|
157
173
|
await reportJobProgress(ctx, step.name, i, steps.length, "running", undefined, jobUpdates);
|
|
@@ -175,8 +191,12 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
175
191
|
}
|
|
176
192
|
// Capture pipeline context before exiting. `job-updates` was an
|
|
177
193
|
// observability-only capture not tied to a registered artifact type;
|
|
178
|
-
// dropped in W0050. Use the JobStore path for job telemetry.
|
|
179
|
-
|
|
194
|
+
// dropped in W0050. Use the JobStore path for job telemetry. Skip
|
|
195
|
+
// when the pre-finalize emit already fired to avoid a 412 overwrite
|
|
196
|
+
// warning (DOC-2064).
|
|
197
|
+
if (!pipelineContextEmitted) {
|
|
198
|
+
await capturePipelineContext(ctx, state, results);
|
|
199
|
+
}
|
|
180
200
|
exportPhase.close();
|
|
181
201
|
return {
|
|
182
202
|
belowCritical: state.belowCritical,
|
|
@@ -231,9 +251,18 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
231
251
|
ctx.logger.warn("Failed to report job completion — continuing");
|
|
232
252
|
}
|
|
233
253
|
}
|
|
234
|
-
//
|
|
235
|
-
//
|
|
236
|
-
|
|
254
|
+
// DOC-2064 — post-loop fallback. Only fires when the pre-finalize emit
|
|
255
|
+
// inside the step loop didn't run — typically because the pipeline has no
|
|
256
|
+
// finalize-run step (test harnesses, air-gapped runs). Skipping this when
|
|
257
|
+
// the pre-finalize emit already fired avoids a 412 Precondition Failed
|
|
258
|
+
// from the signed-URL writer, which refuses to overwrite the existing
|
|
259
|
+
// path. The tradeoff is that pipelineContext captures pipeline state as
|
|
260
|
+
// of finalize-run, not post-publish — reportId is absent. Acceptable
|
|
261
|
+
// because runId is the primary join key and reportId is trivially
|
|
262
|
+
// looked up from Content Lake via runId.
|
|
263
|
+
if (!pipelineContextEmitted) {
|
|
264
|
+
await capturePipelineContext(ctx, state, results);
|
|
265
|
+
}
|
|
237
266
|
exportPhase.close();
|
|
238
267
|
return {
|
|
239
268
|
belowCritical: state.belowCritical,
|
|
@@ -223,6 +223,7 @@ export function extractStoredTestResults(resultsPath) {
|
|
|
223
223
|
}
|
|
224
224
|
dimensions.push({ dimension, reason, score });
|
|
225
225
|
}
|
|
226
|
+
const tokenUsage = result.response?.tokenUsage;
|
|
226
227
|
testResults.push({
|
|
227
228
|
area,
|
|
228
229
|
cost: result.cost || undefined,
|
|
@@ -233,6 +234,7 @@ export function extractStoredTestResults(resultsPath) {
|
|
|
233
234
|
responseOutput,
|
|
234
235
|
...(responseOutputTruncated && { responseOutputTruncated: true }),
|
|
235
236
|
taskId,
|
|
237
|
+
...(tokenUsage && { tokenUsage }),
|
|
236
238
|
variant,
|
|
237
239
|
});
|
|
238
240
|
}
|