@sanity/ailf 3.7.0 → 3.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/airbyte/ai_literacy_framework.connector.yaml +1 -1
- package/config/thresholds.ts +3 -3
- package/dist/_vendor/ailf-core/examples/index.d.ts +2 -2
- package/dist/_vendor/ailf-core/examples/index.js +2 -2
- package/dist/_vendor/ailf-core/ports/context.d.ts +0 -4
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +38 -12
- package/dist/_vendor/ailf-core/schemas/eval-config.js +102 -22
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +4 -6
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -3
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +2 -2
- package/dist/_vendor/ailf-shared/run-classification.d.ts +2 -2
- package/dist/_vendor/ailf-shared/run-classification.js +1 -1
- package/dist/_vendor/ailf-shared/run-context.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +0 -2
- package/dist/adapters/api-client/build-request.js +2 -6
- package/dist/adapters/config-sources/cli-config-adapter.d.ts +1 -1
- package/dist/adapters/config-sources/file-config-adapter.d.ts +1 -1
- package/dist/adapters/config-sources/file-config-adapter.js +38 -12
- package/dist/adapters/task-sources/repo-schemas.d.ts +38 -0
- package/dist/adapters/task-sources/repo-schemas.js +127 -0
- package/dist/cli.d.ts +2 -2
- package/dist/cli.js +134 -38
- package/dist/commands/agent-report.js +1 -1
- package/dist/commands/calculate-scores.js +0 -2
- package/dist/commands/check-staleness.js +1 -1
- package/dist/commands/chronic-failures.js +4 -4
- package/dist/commands/coverage-audit.js +6 -7
- package/dist/commands/discovery-report.js +16 -4
- package/dist/commands/eval.d.ts +1 -1
- package/dist/commands/eval.js +1 -1
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +13 -44
- package/dist/commands/fetch-docs.js +0 -2
- package/dist/commands/generate-configs.js +0 -2
- package/dist/commands/grader/index.js +3 -3
- package/dist/commands/init.d.ts +2 -2
- package/dist/commands/init.js +10 -9
- package/dist/commands/interactive.d.ts +1 -1
- package/dist/commands/interactive.js +8 -8
- package/dist/commands/pipeline-action.d.ts +1 -3
- package/dist/commands/pipeline-action.js +174 -140
- package/dist/commands/pr-comment.js +1 -3
- package/dist/commands/publish.d.ts +1 -1
- package/dist/commands/publish.js +2 -4
- package/dist/commands/readiness-report.js +17 -8
- package/dist/commands/remote-pipeline.d.ts +1 -1
- package/dist/commands/remote-pipeline.js +1 -3
- package/dist/commands/run.d.ts +64 -0
- package/dist/commands/{pipeline.js → run.js} +19 -30
- package/dist/commands/shared/help.js +4 -4
- package/dist/commands/shared/options.d.ts +29 -3
- package/dist/commands/shared/options.js +37 -13
- package/dist/commands/validate-tasks.js +1 -1
- package/dist/commands/validate.d.ts +1 -1
- package/dist/commands/validate.js +2 -2
- package/dist/commands/weekly-digest.js +3 -3
- package/dist/config/thresholds.ts +3 -3
- package/dist/orchestration/build-app-context.js +0 -2
- package/dist/orchestration/build-step-sequence.js +1 -11
- package/dist/orchestration/steps/fetch-docs-step.js +1 -1
- package/dist/orchestration/steps/index.d.ts +0 -2
- package/dist/orchestration/steps/index.js +0 -2
- package/dist/orchestration/steps/run-eval-step.js +1 -1
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/map-request-to-config.js +0 -2
- package/dist/pipeline/plan.d.ts +2 -4
- package/dist/pipeline/plan.js +4 -32
- package/dist/pipeline/run-context.d.ts +1 -1
- package/dist/pipeline/run-context.js +4 -4
- package/dist/pipeline/validate.d.ts +1 -1
- package/dist/pipeline/validate.js +1 -1
- package/package.json +7 -7
- package/dist/commands/pipeline.d.ts +0 -77
- package/dist/orchestration/steps/discovery-report-step.d.ts +0 -13
- package/dist/orchestration/steps/discovery-report-step.js +0 -62
- package/dist/orchestration/steps/readiness-step.d.ts +0 -13
- package/dist/orchestration/steps/readiness-step.js +0 -98
|
@@ -1,23 +1,29 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* run command — the main evaluation entry point.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
4
|
+
* Renamed from `ailf pipeline` to `ailf run` per W0077 Phase 1
|
|
5
|
+
* (see docs/design-docs/pipeline-command-surface.md §3). The command drives
|
|
6
|
+
* the evaluation pipeline; the orchestrator internals retain the "pipeline"
|
|
7
|
+
* name because they describe a multi-step process, not the CLI verb.
|
|
8
|
+
*
|
|
9
|
+
* Defines all 44+ CLI flags via Commander, resolves them into a typed
|
|
5
10
|
* options object, bridges to process.env for downstream modules, and
|
|
6
11
|
* delegates to runPipeline().
|
|
7
12
|
*
|
|
8
|
-
* @see docs/cli.md for
|
|
13
|
+
* @see docs/guides/cli-guide.md for per-flag behavior notes, or
|
|
14
|
+
* docs/references/cli-reference.md for the auto-generated flag matrix.
|
|
9
15
|
*/
|
|
10
16
|
import { Command } from "commander";
|
|
11
|
-
import { addAgenticOptions, addDebugOptions, addSanitySourceOptions, } from "./shared/options.js";
|
|
12
|
-
export function createPipelineCommand() {
|
|
13
|
-
const cmd = new Command("pipeline")
|
|
17
|
+
import { addAgenticOptions, addDebugOptions, addSanityScopeOptions, } from "./shared/options.js";
|
|
18
|
+
export function createRunCommand() {
|
|
19
|
+
const cmd = new Command("run")
|
|
14
20
|
.description("Run the full evaluation pipeline")
|
|
15
21
|
.option("-m, --mode <mode>", "Evaluation mode: literacy (default), mcp-server, agent-harness, knowledge-probe, custom. Legacy aliases (baseline, agentic, observed, full) are accepted and normalized to literacy + variant.", "literacy")
|
|
16
22
|
.option("--variant <variant>", "Literacy variant: full (default — standard + agentic), baseline (standard only), agentic (agentic only), observed. Only applies to --mode literacy.")
|
|
17
23
|
.option("-s, --source <name>", "Documentation source name (from sources.yaml)")
|
|
18
24
|
.option("-n, --dry-run", "Validate configuration only, no execution", false)
|
|
19
|
-
.option("--
|
|
20
|
-
.option("--
|
|
25
|
+
.option("--no-fetch", "Reuse cached documentation contexts")
|
|
26
|
+
.option("--no-eval", "Recalculate from existing eval results")
|
|
21
27
|
.option("--no-cache", "Bypass all pipeline-level caching")
|
|
22
28
|
.option("--no-remote-cache", "Disable Content Lake cache lookup (local cache still active)")
|
|
23
29
|
.option("--no-auto-scope", "Disable release-aware auto-scoping (evaluate all tasks even when a perspective is set)")
|
|
@@ -31,39 +37,22 @@ export function createPipelineCommand() {
|
|
|
31
37
|
.filter(Boolean),
|
|
32
38
|
], [])
|
|
33
39
|
.option("--changed-docs <slugs>", "Auto-scope to tasks affected by these document slugs")
|
|
34
|
-
.option("-
|
|
35
|
-
.option("
|
|
36
|
-
.option("--before <source>", "Before-state for impact evaluation")
|
|
37
|
-
.option("-c, --compare", "Compare scores against latest baseline", false)
|
|
38
|
-
.option("--compare-baseline <path>", "Specific baseline file to compare")
|
|
40
|
+
.option("--before-source <name>", "Swap the doc source to a `before` state and run a paired evaluation for impact analysis. Pairs with `ailf baseline` and `--compare`. Distinct from `--compare <baseline>`, which compares scores against a saved snapshot.")
|
|
41
|
+
.option("-c, --compare [baseline]", "Compare scores against the latest baseline. Pass a path to pin a specific baseline file (e.g. --compare results/baselines/2026-04-22.json).")
|
|
39
42
|
.option("--threshold <n>", "Noise threshold for comparison (default: 2)", parseFloat)
|
|
40
|
-
.option("--no-gap-analysis", "Skip failure mode + impact analysis")
|
|
41
|
-
.option("--readiness", "Generate launch readiness checklist", false)
|
|
42
|
-
.option("--discovery-report", "Generate agent discoverability report", false)
|
|
43
43
|
.option("-p, --publish", "Write report to Sanity + fan out to sinks (auto-enabled for full runs when report store is configured)")
|
|
44
44
|
.option("--no-publish", "Suppress auto-publishing")
|
|
45
45
|
.option("--publish-tag <tag>", "Label for published report")
|
|
46
|
-
.option("--report-dataset <name>", "Sanity dataset for report store")
|
|
47
|
-
.option("--report-project <id>", "Sanity project ID for report store")
|
|
48
46
|
.option("--config <path>", "Load pipeline config from a TS/JS/YAML/JSON file (overrides most CLI flags)")
|
|
49
47
|
.option("-o, --output <path>", "Write PR comment markdown to file")
|
|
50
|
-
.option("--output-dir <path>", "Base directory for pipeline output artifacts (default: inferred from execution context)")
|
|
51
48
|
.option("--promptfoo-url <url>", "Promptfoo share URL for report")
|
|
52
|
-
.option("--task-source <type>", "Task definition source: content-lake (default — Sanity Content Lake), repo (repo tasks only, no Content Lake merge)", "content-lake")
|
|
53
|
-
.option("--repo-tasks-path <path>", "Path to repo-based task definitions. Defaults to ./.ailf/tasks/ when --task-source=repo.")
|
|
54
49
|
.option("--remote", "Submit evaluation to the AILF API instead of running locally", false)
|
|
55
|
-
.option("--
|
|
56
|
-
.option("--no-artifacts", "Disable all artifact writers (D0033). Overrides --artifacts-dir.")
|
|
57
|
-
.option("--artifacts-dir <path>", "Root directory for local artifact output (D0033; default: .ailf/results/captures/)")
|
|
58
|
-
.option("--artifacts-dry-run", "Run artifact writers in dry-run mode — log intended writes, touch no storage", false)
|
|
59
|
-
.option("--artifacts-exclude <types>", "Comma-separated artifact types to skip (e.g. traces,graderPrompts)")
|
|
50
|
+
.option("--no-artifacts-write", "Run artifact writers in dry-run mode — log intended writes, touch no storage")
|
|
60
51
|
// D0037 caller envelope (W0069) — threads through --remote so the
|
|
61
52
|
// server-side pipeline attributes provenance to the caller, not the
|
|
62
53
|
// API gateway runner. All env-var equivalents are honored too;
|
|
63
54
|
// explicit flags win over env vars.
|
|
64
|
-
.option("--classification <value>", "Run classification for provenance: official |
|
|
65
|
-
.option("--owner-team <slug>", "Team slug this run is attributable to. Overrides AILF_OWNER_TEAM.")
|
|
66
|
-
.option("--owner-individual <slug>", "Individual (GH actor / user ID) this run is attributable to. Overrides AILF_OWNER_INDIVIDUAL.")
|
|
55
|
+
.option("--classification <value>", "Run classification for provenance: official | adhoc | experimental | test | external. Overrides AILF_CLASSIFICATION. See D0037.")
|
|
67
56
|
.option("--purpose <text>", 'Free-text "why I ran this" attached to provenance. Overrides AILF_PURPOSE.')
|
|
68
57
|
.option("--label <value>", "Free-form searchable label (repeatable). Appends to any AILF_LABELS env value.", (val, prev) => [
|
|
69
58
|
...prev,
|
|
@@ -78,7 +67,7 @@ export function createPipelineCommand() {
|
|
|
78
67
|
});
|
|
79
68
|
// Add shared option groups
|
|
80
69
|
addDebugOptions(cmd);
|
|
81
|
-
|
|
70
|
+
addSanityScopeOptions(cmd);
|
|
82
71
|
addAgenticOptions(cmd);
|
|
83
72
|
return cmd;
|
|
84
73
|
}
|
|
@@ -67,14 +67,14 @@ function hasColorSupport() {
|
|
|
67
67
|
// ---------------------------------------------------------------------------
|
|
68
68
|
const afterHelpText = `
|
|
69
69
|
Quick Start:
|
|
70
|
-
$ ailf
|
|
71
|
-
$ ailf
|
|
72
|
-
$ ailf
|
|
70
|
+
$ ailf run --debug Run a quick evaluation (first 2 tests)
|
|
71
|
+
$ ailf run --area groq Evaluate a specific feature area
|
|
72
|
+
$ ailf run --explain Preview the execution plan
|
|
73
73
|
$ ailf init Set up AILF in a new project
|
|
74
74
|
|
|
75
75
|
Documentation:
|
|
76
76
|
Repository https://github.com/sanity-io/ai-literacy-framework
|
|
77
|
-
CLI Guide https://github.com/sanity-io/ai-literacy-framework/blob/main/docs/cli.md
|
|
77
|
+
CLI Guide https://github.com/sanity-io/ai-literacy-framework/blob/main/docs/guides/cli-guide.md
|
|
78
78
|
Getting Started https://github.com/sanity-io/ai-literacy-framework/blob/main/docs/getting-started.md
|
|
79
79
|
|
|
80
80
|
Run ailf <command> --help for detailed usage of any command.`;
|
|
@@ -7,11 +7,22 @@
|
|
|
7
7
|
*/
|
|
8
8
|
import type { Command } from "commander";
|
|
9
9
|
/**
|
|
10
|
-
* Add agentic options
|
|
10
|
+
* Add agentic options on `ailf run`: `--url` and `--search`. Both are
|
|
11
|
+
* per-run overrides.
|
|
12
|
+
*
|
|
13
|
+
* **`--header` and `--allowed-origin` retired in W0077 Phase 6f** —
|
|
14
|
+
* configure them in `.ailf/config.yaml`'s `agentic` block instead. The
|
|
15
|
+
* `DOC_HEADERS` and `DOC_ALLOWED_ORIGIN(S)` env vars still apply.
|
|
11
16
|
*/
|
|
12
17
|
export declare function addAgenticOptions(cmd: Command): Command;
|
|
13
18
|
/**
|
|
14
|
-
* Add debug
|
|
19
|
+
* Add debug + filter options.
|
|
20
|
+
*
|
|
21
|
+
* `-d, --debug` is a shortcut that runs only the first 2 tests for fast
|
|
22
|
+
* feedback. `--filter-first-n`, `--filter-pattern`, and `--filter-sample`
|
|
23
|
+
* narrow the test set explicitly and mirror the same-named flags on
|
|
24
|
+
* `ailf eval` so the pipeline surface matches the underlying Promptfoo
|
|
25
|
+
* passthrough.
|
|
15
26
|
*/
|
|
16
27
|
export declare function addDebugOptions(cmd: Command): Command;
|
|
17
28
|
/**
|
|
@@ -27,9 +38,24 @@ export declare function addOutputOptions(cmd: Command): Command;
|
|
|
27
38
|
*/
|
|
28
39
|
export declare function addOutputDirOption(cmd: Command): Command;
|
|
29
40
|
/**
|
|
30
|
-
* Add Sanity
|
|
41
|
+
* Add the full Sanity-source CLI surface — dataset, project, perspective,
|
|
42
|
+
* studio origin, document. Used by `ailf fetch-docs`, which is invoked
|
|
43
|
+
* directly with explicit Sanity targeting per run.
|
|
44
|
+
*
|
|
45
|
+
* **Not used by `ailf run`** post-W0077 Phase 6d. The dataset, project, and
|
|
46
|
+
* studio-origin trio moved to `.ailf/config.yaml`'s `source` block (with
|
|
47
|
+
* `SANITY_DATASET` / `SANITY_PROJECT_ID` / `SANITY_STUDIO_ORIGIN` env-var
|
|
48
|
+
* fallbacks). `ailf run` uses `addSanityScopeOptions` instead, which keeps
|
|
49
|
+
* only the per-run flags (`--sanity-perspective`, `--sanity-document`).
|
|
31
50
|
*/
|
|
32
51
|
export declare function addSanitySourceOptions(cmd: Command): Command;
|
|
52
|
+
/**
|
|
53
|
+
* Add the per-run Sanity-scope CLI surface — `--sanity-perspective` and
|
|
54
|
+
* `--sanity-document`. Used by `ailf run`. The per-environment trio
|
|
55
|
+
* (dataset, project, studio origin) lives in `.ailf/config.yaml`'s
|
|
56
|
+
* `source` block instead (W0077 Phase 6d).
|
|
57
|
+
*/
|
|
58
|
+
export declare function addSanityScopeOptions(cmd: Command): Command;
|
|
33
59
|
/**
|
|
34
60
|
* Collect repeatable string options into an array.
|
|
35
61
|
* Used as a Commander argParser for options like --url, --header, --allowed-origin.
|
|
@@ -6,27 +6,33 @@
|
|
|
6
6
|
* (e.g., debug options, Sanity source options, output options).
|
|
7
7
|
*/
|
|
8
8
|
/**
|
|
9
|
-
* Add agentic options
|
|
9
|
+
* Add agentic options on `ailf run`: `--url` and `--search`. Both are
|
|
10
|
+
* per-run overrides.
|
|
11
|
+
*
|
|
12
|
+
* **`--header` and `--allowed-origin` retired in W0077 Phase 6f** —
|
|
13
|
+
* configure them in `.ailf/config.yaml`'s `agentic` block instead. The
|
|
14
|
+
* `DOC_HEADERS` and `DOC_ALLOWED_ORIGIN(S)` env vars still apply.
|
|
10
15
|
*/
|
|
11
16
|
export function addAgenticOptions(cmd) {
|
|
12
17
|
return cmd
|
|
13
18
|
.option("--url <url>", "Documentation URL (repeatable)", collect, [])
|
|
14
|
-
.option("--urls <url>", "Alias for --url (repeatable)", collect, [])
|
|
15
|
-
.option("--header <header>", 'Custom HTTP header "Key: Value" (repeatable)', collect, [])
|
|
16
|
-
.option("--headers <header>", "Alias for --header (repeatable)", collect, [])
|
|
17
|
-
.option("--allowed-origin <origin>", "Agent origin sandbox (repeatable, supports globs)", collect, [])
|
|
18
|
-
.option("--allowed-origins <origin>", "Alias for --allowed-origin (repeatable)", collect, [])
|
|
19
19
|
.option("-S, --search <mode>", "Web search mode: open, origin-only, off");
|
|
20
20
|
}
|
|
21
21
|
/**
|
|
22
|
-
* Add debug
|
|
22
|
+
* Add debug + filter options.
|
|
23
|
+
*
|
|
24
|
+
* `-d, --debug` is a shortcut that runs only the first 2 tests for fast
|
|
25
|
+
* feedback. `--filter-first-n`, `--filter-pattern`, and `--filter-sample`
|
|
26
|
+
* narrow the test set explicitly and mirror the same-named flags on
|
|
27
|
+
* `ailf eval` so the pipeline surface matches the underlying Promptfoo
|
|
28
|
+
* passthrough.
|
|
23
29
|
*/
|
|
24
30
|
export function addDebugOptions(cmd) {
|
|
25
31
|
return cmd
|
|
26
32
|
.option("-d, --debug", "Run subset of tests for fast feedback", false)
|
|
27
|
-
.option("--
|
|
28
|
-
.option("--
|
|
29
|
-
.option("--
|
|
33
|
+
.option("--filter-first-n <n>", "Run only first N tests", parseInt)
|
|
34
|
+
.option("--filter-pattern <regex>", "Filter tests by description regex")
|
|
35
|
+
.option("--filter-sample <n>", "Random sample of N tests", parseInt);
|
|
30
36
|
}
|
|
31
37
|
/**
|
|
32
38
|
* Add output options: --output, --format
|
|
@@ -47,7 +53,15 @@ export function addOutputDirOption(cmd) {
|
|
|
47
53
|
return cmd.option("--output-dir <path>", "Base directory for output artifacts (default: .ailf/results/latest/)");
|
|
48
54
|
}
|
|
49
55
|
/**
|
|
50
|
-
* Add Sanity
|
|
56
|
+
* Add the full Sanity-source CLI surface — dataset, project, perspective,
|
|
57
|
+
* studio origin, document. Used by `ailf fetch-docs`, which is invoked
|
|
58
|
+
* directly with explicit Sanity targeting per run.
|
|
59
|
+
*
|
|
60
|
+
* **Not used by `ailf run`** post-W0077 Phase 6d. The dataset, project, and
|
|
61
|
+
* studio-origin trio moved to `.ailf/config.yaml`'s `source` block (with
|
|
62
|
+
* `SANITY_DATASET` / `SANITY_PROJECT_ID` / `SANITY_STUDIO_ORIGIN` env-var
|
|
63
|
+
* fallbacks). `ailf run` uses `addSanityScopeOptions` instead, which keeps
|
|
64
|
+
* only the per-run flags (`--sanity-perspective`, `--sanity-document`).
|
|
51
65
|
*/
|
|
52
66
|
export function addSanitySourceOptions(cmd) {
|
|
53
67
|
return cmd
|
|
@@ -55,8 +69,18 @@ export function addSanitySourceOptions(cmd) {
|
|
|
55
69
|
.option("--sanity-project <id>", "Override Sanity project ID")
|
|
56
70
|
.option("--sanity-perspective <id>", "Sanity release perspective ID")
|
|
57
71
|
.option("--sanity-studio-origin <url>", "Sanity Studio base URL")
|
|
58
|
-
.option("--sanity-document <id>", "Evaluate specific Sanity document(s) (repeatable)", collect, [])
|
|
59
|
-
|
|
72
|
+
.option("--sanity-document <id>", "Evaluate specific Sanity document(s) (repeatable)", collect, []);
|
|
73
|
+
}
|
|
74
|
+
/**
|
|
75
|
+
* Add the per-run Sanity-scope CLI surface — `--sanity-perspective` and
|
|
76
|
+
* `--sanity-document`. Used by `ailf run`. The per-environment trio
|
|
77
|
+
* (dataset, project, studio origin) lives in `.ailf/config.yaml`'s
|
|
78
|
+
* `source` block instead (W0077 Phase 6d).
|
|
79
|
+
*/
|
|
80
|
+
export function addSanityScopeOptions(cmd) {
|
|
81
|
+
return cmd
|
|
82
|
+
.option("--sanity-perspective <id>", "Sanity release perspective ID")
|
|
83
|
+
.option("--sanity-document <id>", "Evaluate specific Sanity document(s) (repeatable)", collect, []);
|
|
60
84
|
}
|
|
61
85
|
/**
|
|
62
86
|
* Collect repeatable string options into an array.
|
|
@@ -21,7 +21,7 @@ import { detectLegacyFieldNames, parseCanonicalTaskFile, } from "../adapters/tas
|
|
|
21
21
|
import { validateCanonicalTasks, formatValidationResult, } from "../adapters/task-sources/repo-validation.js";
|
|
22
22
|
import { discoverTsTaskFiles, loadTsTaskFile, } from "../adapters/task-sources/task-file-loader.js";
|
|
23
23
|
export function createValidateTasksCommand() {
|
|
24
|
-
return new Command("
|
|
24
|
+
return new Command("tasks")
|
|
25
25
|
.description("Validate task files (YAML and TypeScript) in .ailf/tasks/ against the canonical schema")
|
|
26
26
|
.argument("[path]", "Path to tasks directory (default: .ailf/tasks/)", ".ailf/tasks")
|
|
27
27
|
.option("--strict", "Treat warnings as errors", false)
|
|
@@ -10,8 +10,8 @@ import { dirname, resolve } from "path";
|
|
|
10
10
|
import { fileURLToPath } from "url";
|
|
11
11
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
12
12
|
const ROOT = resolve(__dirname, "..", "..");
|
|
13
|
-
export function
|
|
14
|
-
return new Command("
|
|
13
|
+
export function createValidateConfigCommand() {
|
|
14
|
+
return new Command("config")
|
|
15
15
|
.description("Validate pipeline configuration")
|
|
16
16
|
.option("--strict", "Treat warnings as errors", false)
|
|
17
17
|
.option("--contexts", "Check that context files exist", false)
|
|
@@ -8,11 +8,11 @@
|
|
|
8
8
|
*/
|
|
9
9
|
import { Command } from "commander";
|
|
10
10
|
export function createWeeklyDigestCommand() {
|
|
11
|
-
return new Command("
|
|
11
|
+
return new Command("digest")
|
|
12
12
|
.description("Generate and deliver a weekly evaluation digest")
|
|
13
13
|
.option("-n, --dry-run", "Print to stdout only, do not send to Slack", false)
|
|
14
14
|
.option("--lookback <days>", "Lookback window in days (default: from config or 7)", parseInt)
|
|
15
|
-
.option("--
|
|
15
|
+
.option("-f, --format <fmt>", "Output format: console or json", "console")
|
|
16
16
|
.action(async (opts) => {
|
|
17
17
|
const { generateDigest } = await import("../schedules/digest.js");
|
|
18
18
|
const { getDigestConfig } = await import("../schedules/loader.js");
|
|
@@ -45,7 +45,7 @@ export function createWeeklyDigestCommand() {
|
|
|
45
45
|
console.log(` Regressed: ${digest.regressed.join(", ") || "none"}`);
|
|
46
46
|
console.log(` Stable: ${digest.stable.join(", ") || "none"}`);
|
|
47
47
|
console.log();
|
|
48
|
-
if (opts.json) {
|
|
48
|
+
if (opts.format === "json") {
|
|
49
49
|
console.log(JSON.stringify(digest, null, 2));
|
|
50
50
|
return;
|
|
51
51
|
}
|
|
@@ -2,9 +2,9 @@
|
|
|
2
2
|
* thresholds.ts — Quality thresholds for readiness gates and regression alerts.
|
|
3
3
|
*
|
|
4
4
|
* Used by:
|
|
5
|
-
* - `npx @sanity/ailf
|
|
6
|
-
* - `npx @sanity/ailf
|
|
7
|
-
* - `npx @sanity/ailf
|
|
5
|
+
* - `npx @sanity/ailf report readiness` (launch readiness checklist)
|
|
6
|
+
* - `npx @sanity/ailf run --publish` (severity-aware sink routing)
|
|
7
|
+
* - `npx @sanity/ailf run --compare` (regression alerting)
|
|
8
8
|
*
|
|
9
9
|
* @see docs/archive/exec-plans/scenario-matrix-implementation/phase-5-readiness-thresholds.md
|
|
10
10
|
*/
|
|
@@ -44,8 +44,6 @@ export function mapToResolvedConfig(opts, rootDir) {
|
|
|
44
44
|
compareThreshold: opts.compareThreshold,
|
|
45
45
|
compareBaseline: opts.compareBaseline,
|
|
46
46
|
gapAnalysisEnabled: opts.gapAnalysisEnabled,
|
|
47
|
-
readinessEnabled: opts.readinessEnabled,
|
|
48
|
-
discoveryReportEnabled: opts.discoveryReportEnabled,
|
|
49
47
|
publishEnabled: opts.publishEnabled,
|
|
50
48
|
publishTag: opts.publishTag,
|
|
51
49
|
noCache: opts.noCache,
|
|
@@ -9,14 +9,12 @@ import { LiteracyVariant } from "../pipeline/normalize-mode.js";
|
|
|
9
9
|
import { CallbackStep } from "./steps/callback-step.js";
|
|
10
10
|
import { CalculateScoresStep } from "./steps/calculate-scores-step.js";
|
|
11
11
|
import { CompareStep } from "./steps/compare-step.js";
|
|
12
|
-
import { DiscoveryReportStep } from "./steps/discovery-report-step.js";
|
|
13
12
|
import { FetchDocsStep } from "./steps/fetch-docs-step.js";
|
|
14
13
|
import { FinalizeRunStep } from "./steps/finalize-run-step.js";
|
|
15
14
|
import { GapAnalysisStep } from "./steps/gap-analysis-step.js";
|
|
16
15
|
import { GenerateConfigsStep } from "./steps/generate-configs-step.js";
|
|
17
16
|
import { GraderConsistencyStep } from "./steps/grader-consistency-step.js";
|
|
18
17
|
import { PublishReportStep } from "./steps/publish-report-step.js";
|
|
19
|
-
import { ReadinessStep } from "./steps/readiness-step.js";
|
|
20
18
|
import { ReportStep } from "./steps/report-step.js";
|
|
21
19
|
import { RunEvalStep } from "./steps/run-eval-step.js";
|
|
22
20
|
import { MirrorRepoTasksStep } from "./steps/mirror-repo-tasks-step.js";
|
|
@@ -93,15 +91,7 @@ export function buildStepSequence(ctx, pipelineStart = Date.now()) {
|
|
|
93
91
|
if (config.compareEnabled) {
|
|
94
92
|
steps.push(new CompareStep());
|
|
95
93
|
}
|
|
96
|
-
// Step
|
|
97
|
-
if (config.readinessEnabled) {
|
|
98
|
-
steps.push(new ReadinessStep());
|
|
99
|
-
}
|
|
100
|
-
// Step 6c: Discovery report (optional)
|
|
101
|
-
if (config.discoveryReportEnabled) {
|
|
102
|
-
steps.push(new DiscoveryReportStep());
|
|
103
|
-
}
|
|
104
|
-
// Step 7: Callback delivery (optional, API-triggered evaluations)
|
|
94
|
+
// Step 6: Callback delivery (optional, API-triggered evaluations)
|
|
105
95
|
if (config.callback?.url) {
|
|
106
96
|
steps.push(new CallbackStep(config.callback, config.jobId));
|
|
107
97
|
}
|
|
@@ -27,7 +27,7 @@ export class FetchDocsStep {
|
|
|
27
27
|
}
|
|
28
28
|
async execute(ctx, state) {
|
|
29
29
|
if (ctx.config.skipFetch) {
|
|
30
|
-
return { status: "skipped", reason: "--
|
|
30
|
+
return { status: "skipped", reason: "--no-fetch" };
|
|
31
31
|
}
|
|
32
32
|
const start = Date.now();
|
|
33
33
|
// Load tasks — use the same source as GenerateConfigsStep to avoid
|
|
@@ -6,14 +6,12 @@
|
|
|
6
6
|
*/
|
|
7
7
|
export { CalculateScoresStep } from "./calculate-scores-step.js";
|
|
8
8
|
export { CompareStep } from "./compare-step.js";
|
|
9
|
-
export { DiscoveryReportStep } from "./discovery-report-step.js";
|
|
10
9
|
export { FetchDocsStep } from "./fetch-docs-step.js";
|
|
11
10
|
export { GapAnalysisStep } from "./gap-analysis-step.js";
|
|
12
11
|
export { GenerateConfigsStep } from "./generate-configs-step.js";
|
|
13
12
|
export { MirrorRepoTasksStep } from "./mirror-repo-tasks-step.js";
|
|
14
13
|
export { GraderConsistencyStep } from "./grader-consistency-step.js";
|
|
15
14
|
export { PublishReportStep } from "./publish-report-step.js";
|
|
16
|
-
export { ReadinessStep } from "./readiness-step.js";
|
|
17
15
|
export { ReportStep } from "./report-step.js";
|
|
18
16
|
export { RunEvalStep } from "./run-eval-step.js";
|
|
19
17
|
export { ValidateStep } from "./validate-step.js";
|
|
@@ -6,14 +6,12 @@
|
|
|
6
6
|
*/
|
|
7
7
|
export { CalculateScoresStep } from "./calculate-scores-step.js";
|
|
8
8
|
export { CompareStep } from "./compare-step.js";
|
|
9
|
-
export { DiscoveryReportStep } from "./discovery-report-step.js";
|
|
10
9
|
export { FetchDocsStep } from "./fetch-docs-step.js";
|
|
11
10
|
export { GapAnalysisStep } from "./gap-analysis-step.js";
|
|
12
11
|
export { GenerateConfigsStep } from "./generate-configs-step.js";
|
|
13
12
|
export { MirrorRepoTasksStep } from "./mirror-repo-tasks-step.js";
|
|
14
13
|
export { GraderConsistencyStep } from "./grader-consistency-step.js";
|
|
15
14
|
export { PublishReportStep } from "./publish-report-step.js";
|
|
16
|
-
export { ReadinessStep } from "./readiness-step.js";
|
|
17
15
|
export { ReportStep } from "./report-step.js";
|
|
18
16
|
export { RunEvalStep } from "./run-eval-step.js";
|
|
19
17
|
export { ValidateStep } from "./validate-step.js";
|
|
@@ -25,7 +25,7 @@ export class RunEvalStep {
|
|
|
25
25
|
}
|
|
26
26
|
async execute(ctx, state) {
|
|
27
27
|
if (ctx.config.skipEval) {
|
|
28
|
-
return { status: "skipped", reason: "--
|
|
28
|
+
return { status: "skipped", reason: "--no-eval" };
|
|
29
29
|
}
|
|
30
30
|
const start = Date.now();
|
|
31
31
|
const { rootDir, debug, concurrency, noCache } = ctx.config;
|
package/dist/pipeline/cache.d.ts
CHANGED
|
@@ -48,7 +48,7 @@ export interface CacheStats {
|
|
|
48
48
|
hits: number;
|
|
49
49
|
/** Steps where cache was missed (executed normally) */
|
|
50
50
|
misses: number;
|
|
51
|
-
/** Steps that were skipped for other reasons (--
|
|
51
|
+
/** Steps that were skipped for other reasons (--no-fetch, etc.) */
|
|
52
52
|
skipped: number;
|
|
53
53
|
/** Per-step detail */
|
|
54
54
|
steps: Record<string, "disabled" | "hit" | "miss" | "skipped">;
|
|
@@ -47,8 +47,6 @@ export function mapRequestToConfig(request, rootDir) {
|
|
|
47
47
|
compareThreshold: request.compareThreshold,
|
|
48
48
|
compareBaseline: request.compareBaseline,
|
|
49
49
|
gapAnalysisEnabled: request.gapAnalysis ?? true,
|
|
50
|
-
readinessEnabled: request.readiness ?? false,
|
|
51
|
-
discoveryReportEnabled: request.discoveryReport ?? false,
|
|
52
50
|
publishEnabled: request.publish ?? publishDefault,
|
|
53
51
|
publishTag: request.publishTag,
|
|
54
52
|
noAutoScope: request.noAutoScope ?? false,
|
package/dist/pipeline/plan.d.ts
CHANGED
|
@@ -61,7 +61,7 @@ export interface DebugPlan {
|
|
|
61
61
|
export interface ExecutionPlan {
|
|
62
62
|
/** Cache prediction per step */
|
|
63
63
|
cacheStatus: Record<string, "hit" | "miss" | "skipped" | "unknown">;
|
|
64
|
-
/** Command name (e.g., "pipeline", "compare", "validate") */
|
|
64
|
+
/** Command name (e.g., "run", "compare", "validate") */
|
|
65
65
|
command: string;
|
|
66
66
|
/** Comparison plan (when --compare is set) */
|
|
67
67
|
comparison?: ComparisonPlan;
|
|
@@ -134,7 +134,6 @@ export interface PlanOptions {
|
|
|
134
134
|
compareThreshold?: number;
|
|
135
135
|
concurrency?: number;
|
|
136
136
|
debug?: DebugOptions;
|
|
137
|
-
discoveryReportEnabled: boolean;
|
|
138
137
|
dryRun: boolean;
|
|
139
138
|
gapAnalysisEnabled: boolean;
|
|
140
139
|
graderReplications?: number;
|
|
@@ -143,7 +142,6 @@ export interface PlanOptions {
|
|
|
143
142
|
variant?: string;
|
|
144
143
|
noCache: boolean;
|
|
145
144
|
publishEnabled: boolean;
|
|
146
|
-
readinessEnabled: boolean;
|
|
147
145
|
repoTasksPath?: string;
|
|
148
146
|
skipEval: boolean;
|
|
149
147
|
skipFetch: boolean;
|
|
@@ -152,7 +150,7 @@ export interface PlanOptions {
|
|
|
152
150
|
taskOption?: string;
|
|
153
151
|
}
|
|
154
152
|
/**
|
|
155
|
-
* Build a complete execution plan for the `pipeline` command.
|
|
153
|
+
* Build a complete execution plan for the `run` command.
|
|
156
154
|
*
|
|
157
155
|
* This is a read-only operation — it computes the plan by calling existing
|
|
158
156
|
* pure functions (task expansion, model loading, cache lookup, pricing)
|
package/dist/pipeline/plan.js
CHANGED
|
@@ -99,7 +99,7 @@ const AVG_TOKENS = {
|
|
|
99
99
|
// Cache prediction
|
|
100
100
|
// ---------------------------------------------------------------------------
|
|
101
101
|
/**
|
|
102
|
-
* Build a complete execution plan for the `pipeline` command.
|
|
102
|
+
* Build a complete execution plan for the `run` command.
|
|
103
103
|
*
|
|
104
104
|
* This is a read-only operation — it computes the plan by calling existing
|
|
105
105
|
* pure functions (task expansion, model loading, cache lookup, pricing)
|
|
@@ -322,13 +322,11 @@ export async function buildPipelinePlan(opts, rootDir) {
|
|
|
322
322
|
// 7. Build step plan
|
|
323
323
|
const steps = buildStepPlan({
|
|
324
324
|
compareEnabled: opts.compareEnabled,
|
|
325
|
-
discoveryReportEnabled: opts.discoveryReportEnabled,
|
|
326
325
|
dryRun: opts.dryRun,
|
|
327
326
|
gapAnalysisEnabled: opts.gapAnalysisEnabled,
|
|
328
327
|
graderReplications: opts.graderReplications,
|
|
329
328
|
noCache: opts.noCache,
|
|
330
329
|
publishEnabled: opts.publishEnabled,
|
|
331
|
-
readinessEnabled: opts.readinessEnabled,
|
|
332
330
|
skipEval: opts.skipEval,
|
|
333
331
|
skipFetch: opts.skipFetch,
|
|
334
332
|
}, cachePrediction);
|
|
@@ -338,14 +336,12 @@ export async function buildPipelinePlan(opts, rootDir) {
|
|
|
338
336
|
const filesRead = collectFilesRead(rootDir, opts.mode);
|
|
339
337
|
const filesCreated = collectFilesCreated({
|
|
340
338
|
compareEnabled: opts.compareEnabled,
|
|
341
|
-
discoveryReportEnabled: opts.discoveryReportEnabled,
|
|
342
339
|
gapAnalysisEnabled: opts.gapAnalysisEnabled,
|
|
343
340
|
publishEnabled: opts.publishEnabled,
|
|
344
|
-
readinessEnabled: opts.readinessEnabled,
|
|
345
341
|
});
|
|
346
342
|
return {
|
|
347
343
|
cacheStatus: cachePrediction.predictions,
|
|
348
|
-
command: "
|
|
344
|
+
command: "run",
|
|
349
345
|
comparison,
|
|
350
346
|
costEstimate,
|
|
351
347
|
debug: debugPlan,
|
|
@@ -486,7 +482,7 @@ function buildStepPlan(opts, cachePrediction) {
|
|
|
486
482
|
estimatedSavedMs: fetchStatus === "hit" ? cachePrediction.estimatedSavedMs : undefined,
|
|
487
483
|
name: "Fetch documentation",
|
|
488
484
|
reason: fetchStatus === "skipped"
|
|
489
|
-
? "--
|
|
485
|
+
? "--no-fetch: reuse cached contexts"
|
|
490
486
|
: fetchStatus === "hit"
|
|
491
487
|
? "CACHED (inputs unchanged)"
|
|
492
488
|
: "Fetch from Sanity Content Lake",
|
|
@@ -508,7 +504,7 @@ function buildStepPlan(opts, cachePrediction) {
|
|
|
508
504
|
cacheStatus: evalStatus,
|
|
509
505
|
name: "Run evaluation",
|
|
510
506
|
reason: opts.skipEval
|
|
511
|
-
? "--
|
|
507
|
+
? "--no-eval: reuse existing results"
|
|
512
508
|
: evalStatus === "hit"
|
|
513
509
|
? "CACHED (inputs unchanged)"
|
|
514
510
|
: "Execute Promptfoo evaluation against all models",
|
|
@@ -567,24 +563,6 @@ function buildStepPlan(opts, cachePrediction) {
|
|
|
567
563
|
willRun: true,
|
|
568
564
|
});
|
|
569
565
|
}
|
|
570
|
-
// Step 6: Readiness report (optional)
|
|
571
|
-
if (opts.readinessEnabled) {
|
|
572
|
-
steps.push({
|
|
573
|
-
cacheStatus: "miss",
|
|
574
|
-
name: "Readiness report",
|
|
575
|
-
reason: "Generate launch readiness checklist",
|
|
576
|
-
willRun: true,
|
|
577
|
-
});
|
|
578
|
-
}
|
|
579
|
-
// Step 6c: Discovery report (optional)
|
|
580
|
-
if (opts.discoveryReportEnabled) {
|
|
581
|
-
steps.push({
|
|
582
|
-
cacheStatus: "miss",
|
|
583
|
-
name: "Discovery report",
|
|
584
|
-
reason: "Analyze agent discoverability from retrieval metrics",
|
|
585
|
-
willRun: true,
|
|
586
|
-
});
|
|
587
|
-
}
|
|
588
566
|
return steps;
|
|
589
567
|
}
|
|
590
568
|
function collectFilesCreated(opts) {
|
|
@@ -601,12 +579,6 @@ function collectFilesCreated(opts) {
|
|
|
601
579
|
files.push("results/latest/failure-modes.json");
|
|
602
580
|
files.push("results/latest/gap-analysis.json");
|
|
603
581
|
}
|
|
604
|
-
if (opts.readinessEnabled) {
|
|
605
|
-
files.push("results/latest/readiness-report.md");
|
|
606
|
-
}
|
|
607
|
-
if (opts.discoveryReportEnabled) {
|
|
608
|
-
files.push("results/latest/discovery-report.md");
|
|
609
|
-
}
|
|
610
582
|
return files.sort();
|
|
611
583
|
}
|
|
612
584
|
// ---------------------------------------------------------------------------
|
|
@@ -85,7 +85,7 @@ export interface RunContextInput {
|
|
|
85
85
|
export declare function buildRunContext(input: RunContextInput): RunContext;
|
|
86
86
|
/**
|
|
87
87
|
* Resolve `classification` from `AILF_CLASSIFICATION`, validated against
|
|
88
|
-
* the closed enum. Defaults to `"
|
|
88
|
+
* the closed enum. Defaults to `"adhoc"` so unannotated runs never leak
|
|
89
89
|
* into the canonical `"official"` series.
|
|
90
90
|
*/
|
|
91
91
|
export declare function detectClassification(log: Logger): RunClassification;
|
|
@@ -169,17 +169,17 @@ function detectTrigger() {
|
|
|
169
169
|
// ---------------------------------------------------------------------------
|
|
170
170
|
/**
|
|
171
171
|
* Resolve `classification` from `AILF_CLASSIFICATION`, validated against
|
|
172
|
-
* the closed enum. Defaults to `"
|
|
172
|
+
* the closed enum. Defaults to `"adhoc"` so unannotated runs never leak
|
|
173
173
|
* into the canonical `"official"` series.
|
|
174
174
|
*/
|
|
175
175
|
export function detectClassification(log) {
|
|
176
176
|
const raw = process.env.AILF_CLASSIFICATION?.trim();
|
|
177
177
|
if (!raw)
|
|
178
|
-
return "
|
|
178
|
+
return "adhoc";
|
|
179
179
|
if (isRunClassification(raw))
|
|
180
180
|
return raw;
|
|
181
|
-
log.warn(`AILF_CLASSIFICATION="${raw}" is not a recognized value; defaulting to "
|
|
182
|
-
return "
|
|
181
|
+
log.warn(`AILF_CLASSIFICATION="${raw}" is not a recognized value; defaulting to "adhoc"`);
|
|
182
|
+
return "adhoc";
|
|
183
183
|
}
|
|
184
184
|
/**
|
|
185
185
|
* Resolve `owner` from `AILF_OWNER_TEAM` (+ optional
|
|
@@ -63,6 +63,6 @@ export declare function validateTaskFiles(rootDir: string): ValidationIssue[];
|
|
|
63
63
|
*
|
|
64
64
|
* Returns warnings (not errors) if the file is missing — thresholds are
|
|
65
65
|
* optional and don't block evaluation. They only activate when
|
|
66
|
-
*
|
|
66
|
+
* `ailf report readiness` or severity-aware sink routing is used.
|
|
67
67
|
*/
|
|
68
68
|
export declare function validateThresholdsYaml(rootDir: string): ValidationIssue[];
|
|
@@ -272,7 +272,7 @@ export function validateTaskFiles(rootDir) {
|
|
|
272
272
|
*
|
|
273
273
|
* Returns warnings (not errors) if the file is missing — thresholds are
|
|
274
274
|
* optional and don't block evaluation. They only activate when
|
|
275
|
-
*
|
|
275
|
+
* `ailf report readiness` or severity-aware sink routing is used.
|
|
276
276
|
*/
|
|
277
277
|
export function validateThresholdsYaml(rootDir) {
|
|
278
278
|
const source = "validateThresholdsYaml";
|