@sanity/ailf 2.8.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/artifact-capture/association.d.ts +35 -0
- package/dist/_vendor/ailf-core/artifact-capture/association.js +28 -0
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +124 -23
- package/dist/_vendor/ailf-core/artifact-registry.js +708 -64
- package/dist/_vendor/ailf-core/batch-signing.d.ts +64 -0
- package/dist/_vendor/ailf-core/batch-signing.js +23 -0
- package/dist/_vendor/ailf-core/index.d.ts +3 -2
- package/dist/_vendor/ailf-core/index.js +3 -2
- package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +59 -20
- package/dist/_vendor/ailf-core/ports/artifact-writer.js +33 -10
- package/dist/_vendor/ailf-core/ports/context.d.ts +20 -17
- package/dist/_vendor/ailf-core/ports/index.d.ts +0 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +6 -6
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/services/index.js +1 -0
- package/dist/_vendor/ailf-core/services/slim-report-summary.d.ts +31 -0
- package/dist/_vendor/ailf-core/services/slim-report-summary.js +217 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +33 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +202 -23
- package/dist/adapters/config-sources/file-config-adapter.js +0 -4
- package/dist/artifact-capture/accumulating-artifact-writer.d.ts +50 -0
- package/dist/artifact-capture/accumulating-artifact-writer.js +111 -0
- package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +17 -4
- package/dist/artifact-capture/api-gateway-artifact-writer.js +58 -7
- package/dist/artifact-capture/emit-file.d.ts +28 -0
- package/dist/artifact-capture/emit-file.js +56 -0
- package/dist/artifact-capture/fanout-artifact-writer.d.ts +39 -0
- package/dist/artifact-capture/fanout-artifact-writer.js +76 -0
- package/dist/artifact-capture/gcs-artifact-writer.d.ts +40 -3
- package/dist/artifact-capture/gcs-artifact-writer.js +238 -14
- package/dist/artifact-capture/local-fs-artifact-writer.d.ts +71 -0
- package/dist/artifact-capture/local-fs-artifact-writer.js +273 -0
- package/dist/artifact-capture/redact-artifact.d.ts +3 -5
- package/dist/artifact-capture/redact-artifact.js +3 -5
- package/dist/cli.js +56 -2
- package/dist/commands/explain-handler.js +4 -4
- package/dist/commands/pipeline-action.d.ts +5 -4
- package/dist/commands/pipeline-action.js +33 -16
- package/dist/commands/pipeline.d.ts +4 -4
- package/dist/commands/pipeline.js +4 -4
- package/dist/commands/publish.js +4 -1
- package/dist/commands/runs.d.ts +18 -0
- package/dist/commands/runs.js +71 -0
- package/dist/composition-root.d.ts +13 -10
- package/dist/composition-root.js +74 -46
- package/dist/orchestration/build-app-context.js +4 -7
- package/dist/orchestration/pipeline-orchestrator.d.ts +1 -1
- package/dist/orchestration/pipeline-orchestrator.js +37 -46
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -1
- package/dist/orchestration/steps/calculate-scores-step.js +19 -19
- package/dist/orchestration/steps/callback-step.d.ts +1 -1
- package/dist/orchestration/steps/callback-step.js +6 -4
- package/dist/orchestration/steps/compare-step.d.ts +1 -1
- package/dist/orchestration/steps/compare-step.js +4 -2
- package/dist/orchestration/steps/discovery-report-step.d.ts +1 -1
- package/dist/orchestration/steps/discovery-report-step.js +4 -1
- package/dist/orchestration/steps/fetch-docs-step.js +9 -15
- package/dist/orchestration/steps/finalize-run-step.js +21 -7
- package/dist/orchestration/steps/gap-analysis-step.js +34 -6
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -1
- package/dist/orchestration/steps/generate-configs-step.js +11 -11
- package/dist/orchestration/steps/publish-report-step.d.ts +1 -1
- package/dist/orchestration/steps/publish-report-step.js +24 -19
- package/dist/orchestration/steps/readiness-step.d.ts +1 -1
- package/dist/orchestration/steps/readiness-step.js +4 -1
- package/dist/orchestration/steps/report-step.d.ts +1 -1
- package/dist/orchestration/steps/report-step.js +6 -3
- package/dist/orchestration/steps/run-eval-step.js +14 -9
- package/dist/pipeline/compare.d.ts +2 -2
- package/dist/pipeline/emit-eval-results.d.ts +38 -0
- package/dist/pipeline/emit-eval-results.js +100 -0
- package/dist/pipeline/map-request-to-config.js +0 -4
- package/package.json +1 -1
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +0 -14
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +0 -25
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +0 -94
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +0 -13
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +0 -138
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +0 -10
- package/dist/artifact-capture/comparator.d.ts +0 -22
- package/dist/artifact-capture/comparator.js +0 -493
- package/dist/artifact-capture/filesystem-collector.d.ts +0 -42
- package/dist/artifact-capture/filesystem-collector.js +0 -237
- package/dist/artifact-capture/gcs-collector.d.ts +0 -55
- package/dist/artifact-capture/gcs-collector.js +0 -117
- package/dist/commands/capture-compare.d.ts +0 -15
- package/dist/commands/capture-compare.js +0 -253
- package/dist/commands/capture-list.d.ts +0 -12
- package/dist/commands/capture-list.js +0 -150
- package/dist/commands/capture.d.ts +0 -9
- package/dist/commands/capture.js +0 -16
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* Calls generatePrComment() from pipeline/pr-comment.ts with typed options.
|
|
5
5
|
* No env bridge or process.argv manipulation needed.
|
|
6
6
|
*/
|
|
7
|
-
import type
|
|
7
|
+
import { type AppContext, type PipelineStep, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
8
8
|
export declare class ReportStep implements PipelineStep {
|
|
9
9
|
readonly name = "report";
|
|
10
10
|
check(): ValidationIssue[];
|
|
@@ -6,6 +6,8 @@
|
|
|
6
6
|
*/
|
|
7
7
|
import { existsSync, mkdirSync } from "node:fs";
|
|
8
8
|
import { dirname, resolve } from "path";
|
|
9
|
+
import { assoc, } from "../../_vendor/ailf-core/index.js";
|
|
10
|
+
import { emitFileContents } from "../../artifact-capture/emit-file.js";
|
|
9
11
|
import { checkScoreSummaryValid } from "../../pipeline/checks.js";
|
|
10
12
|
import { generatePrComment } from "../../pipeline/pr-comment.js";
|
|
11
13
|
export class ReportStep {
|
|
@@ -45,13 +47,14 @@ export class ReportStep {
|
|
|
45
47
|
status: "failed",
|
|
46
48
|
};
|
|
47
49
|
}
|
|
48
|
-
//
|
|
50
|
+
// W0050 — captureFile → emitFileContents. Both are run-scoped bulk
|
|
51
|
+
// artifacts; the writer handles redaction + excluded-types gating.
|
|
49
52
|
if (existsSync(resolvedOutput)) {
|
|
50
|
-
ctx.
|
|
53
|
+
await emitFileContents(ctx.artifactWriter, "prComment", assoc(ctx), resolvedOutput);
|
|
51
54
|
}
|
|
52
55
|
const pipelineResultPath = resolve(ctx.config.outputDir, "pipeline-result.json");
|
|
53
56
|
if (existsSync(pipelineResultPath)) {
|
|
54
|
-
ctx.
|
|
57
|
+
await emitFileContents(ctx.artifactWriter, "pipelineResult", assoc(ctx), pipelineResultPath);
|
|
55
58
|
}
|
|
56
59
|
return {
|
|
57
60
|
durationMs: Date.now() - start,
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
*/
|
|
8
8
|
import { existsSync, mkdirSync, writeFileSync } from "fs";
|
|
9
9
|
import { resolve } from "path";
|
|
10
|
+
import { emitPerEntryEvalResults } from "../../pipeline/emit-eval-results.js";
|
|
10
11
|
import { getStepInputPaths } from "../../pipeline/cache.js";
|
|
11
12
|
import { buildCacheContext } from "../cache-context.js";
|
|
12
13
|
import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../../pipeline/checks.js";
|
|
@@ -118,11 +119,11 @@ export class RunEvalStep {
|
|
|
118
119
|
state.promptfooUrls ??= [];
|
|
119
120
|
state.promptfooUrls.push(...remoteCacheResult.promptfooUrls);
|
|
120
121
|
}
|
|
121
|
-
//
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
122
|
+
// W0050 — score-summary-cached was an unregistered capture;
|
|
123
|
+
// scoreSummary is already emitted by calculate-scores-step on the
|
|
124
|
+
// non-cached path, which also runs when we have a remote cache hit
|
|
125
|
+
// (populating state.remoteCacheHits → CalculateScoresStep still
|
|
126
|
+
// invokes for the score-summary emit). Dropped here.
|
|
126
127
|
return {
|
|
127
128
|
durationMs: Date.now() - start,
|
|
128
129
|
status: "success",
|
|
@@ -187,12 +188,16 @@ export class RunEvalStep {
|
|
|
187
188
|
console.log();
|
|
188
189
|
console.log(errorSummary);
|
|
189
190
|
}
|
|
190
|
-
//
|
|
191
|
+
// W0050 — decompose the promptfoo aggregate into the per-entry
|
|
192
|
+
// descriptors the W0049 registry expects: rawResults / renderedPrompts
|
|
193
|
+
// per (run, mode, task, model); graderPrompts / graderJudgments per
|
|
194
|
+
// (run, mode, task, model, grader). See pipeline/emit-eval-results.ts.
|
|
195
|
+
// `testOutputs` still flows through uploadTestOutputs() in
|
|
196
|
+
// calculate-scores-step. `traces` ships via agent-observer (out of
|
|
197
|
+
// scope for the promptfoo shape parser — follow-up).
|
|
191
198
|
const resultsPath = resolve(rootDir, resultsFileForMode(this.mode));
|
|
192
199
|
if (existsSync(resultsPath)) {
|
|
193
|
-
ctx.
|
|
194
|
-
mode: this.mode,
|
|
195
|
-
});
|
|
200
|
+
await emitPerEntryEvalResults(ctx.artifactWriter, ctx, this.mode, resultsPath);
|
|
196
201
|
}
|
|
197
202
|
// Extract Promptfoo share URL from eval results (Step 3b)
|
|
198
203
|
if (ctx.evalRunner.extractShareUrl) {
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
* @see docs/ideas/evaluation-roadmap.md — BP5: Make comparison a primitive
|
|
16
16
|
* @see docs/ideas/metrics-design.md — Tier 4: Comparison results
|
|
17
17
|
*/
|
|
18
|
-
import { type ChangeClass, type
|
|
18
|
+
import { type ChangeClass, type ComparableSummary, type CompareOptions, type ComparisonReport } from "./types.js";
|
|
19
19
|
/** Classify a delta as improved, regressed, or unchanged given a threshold */
|
|
20
20
|
export declare function classifyChange(delta: number, threshold: number): ChangeClass;
|
|
21
21
|
/**
|
|
@@ -28,4 +28,4 @@ export declare function classifyChange(delta: number, threshold: number): Change
|
|
|
28
28
|
* @param options Optional configuration (noise threshold, etc.)
|
|
29
29
|
* @returns A ComparisonReport with deltas, classifications, and breakdowns
|
|
30
30
|
*/
|
|
31
|
-
export declare function compare(baseline:
|
|
31
|
+
export declare function compare(baseline: ComparableSummary, experiment: ComparableSummary, options?: CompareOptions): ComparisonReport;
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* emit-eval-results.ts — decompose the promptfoo results file into the
|
|
3
|
+
* per-entry descriptors that W0049's registry expects.
|
|
4
|
+
*
|
|
5
|
+
* Replaces the Phase-B-stopgap "route the aggregated JSON through the
|
|
6
|
+
* deprecated `evalResults` bulk descriptor" path. For each test in the
|
|
7
|
+
* promptfoo output we emit:
|
|
8
|
+
*
|
|
9
|
+
* - `rawResults` per (run, mode, task, model) — the full result
|
|
10
|
+
* - `renderedPrompts` per (run, mode, task, model) — prompt the model saw
|
|
11
|
+
* - `graderPrompts` per (run, mode, task, model, grader) — rubric text
|
|
12
|
+
* - `graderJudgments` per (run, mode, task, model, grader) — {score, reason, pass}
|
|
13
|
+
*
|
|
14
|
+
* `testOutputs` is still emitted separately by `calculate-scores-step`
|
|
15
|
+
* via `uploadTestOutputs()` (carried forward from W0048 for byte-
|
|
16
|
+
* equivalence with the original rollout).
|
|
17
|
+
*
|
|
18
|
+
* `traces` is NOT produced here — agentic trace data flows through the
|
|
19
|
+
* agent-observer, not through the promptfoo result shape. Traces
|
|
20
|
+
* emission is out of scope for this helper and lands when the observer
|
|
21
|
+
* integration migrates (follow-up; not in W0050).
|
|
22
|
+
*
|
|
23
|
+
* The "grader" axis value is the rubric dimension string produced by
|
|
24
|
+
* `classifyRubric` (e.g. "task-completion", "code-correctness"). Non-
|
|
25
|
+
* LLM-rubric component assertions (javascript, contains, etc.) don't
|
|
26
|
+
* have a natural grader identifier and are skipped — their outcomes
|
|
27
|
+
* still live inside the full `rawResults` object.
|
|
28
|
+
*/
|
|
29
|
+
import { type ArtifactWriter, type RunId } from "../_vendor/ailf-core/index.d.ts";
|
|
30
|
+
/**
|
|
31
|
+
* Parse a promptfoo results file and emit the per-entry artifacts.
|
|
32
|
+
*
|
|
33
|
+
* Non-blocking: any individual emit failure warns but does not halt.
|
|
34
|
+
* File read/parse errors are caught and logged; the caller keeps going.
|
|
35
|
+
*/
|
|
36
|
+
export declare function emitPerEntryEvalResults(writer: ArtifactWriter, ctx: {
|
|
37
|
+
runId: RunId;
|
|
38
|
+
}, mode: string, resultsPath: string): Promise<void>;
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* emit-eval-results.ts — decompose the promptfoo results file into the
|
|
3
|
+
* per-entry descriptors that W0049's registry expects.
|
|
4
|
+
*
|
|
5
|
+
* Replaces the Phase-B-stopgap "route the aggregated JSON through the
|
|
6
|
+
* deprecated `evalResults` bulk descriptor" path. For each test in the
|
|
7
|
+
* promptfoo output we emit:
|
|
8
|
+
*
|
|
9
|
+
* - `rawResults` per (run, mode, task, model) — the full result
|
|
10
|
+
* - `renderedPrompts` per (run, mode, task, model) — prompt the model saw
|
|
11
|
+
* - `graderPrompts` per (run, mode, task, model, grader) — rubric text
|
|
12
|
+
* - `graderJudgments` per (run, mode, task, model, grader) — {score, reason, pass}
|
|
13
|
+
*
|
|
14
|
+
* `testOutputs` is still emitted separately by `calculate-scores-step`
|
|
15
|
+
* via `uploadTestOutputs()` (carried forward from W0048 for byte-
|
|
16
|
+
* equivalence with the original rollout).
|
|
17
|
+
*
|
|
18
|
+
* `traces` is NOT produced here — agentic trace data flows through the
|
|
19
|
+
* agent-observer, not through the promptfoo result shape. Traces
|
|
20
|
+
* emission is out of scope for this helper and lands when the observer
|
|
21
|
+
* integration migrates (follow-up; not in W0050).
|
|
22
|
+
*
|
|
23
|
+
* The "grader" axis value is the rubric dimension string produced by
|
|
24
|
+
* `classifyRubric` (e.g. "task-completion", "code-correctness"). Non-
|
|
25
|
+
* LLM-rubric component assertions (javascript, contains, etc.) don't
|
|
26
|
+
* have a natural grader identifier and are skipped — their outcomes
|
|
27
|
+
* still live inside the full `rawResults` object.
|
|
28
|
+
*/
|
|
29
|
+
import { readFileSync } from "node:fs";
|
|
30
|
+
import { classifyRubric, parseRubricScore, } from "../_vendor/ailf-core/index.js";
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
// Public entry point
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
/**
|
|
35
|
+
* Parse a promptfoo results file and emit the per-entry artifacts.
|
|
36
|
+
*
|
|
37
|
+
* Non-blocking: any individual emit failure warns but does not halt.
|
|
38
|
+
* File read/parse errors are caught and logged; the caller keeps going.
|
|
39
|
+
*/
|
|
40
|
+
export async function emitPerEntryEvalResults(writer, ctx, mode, resultsPath) {
|
|
41
|
+
let raw;
|
|
42
|
+
try {
|
|
43
|
+
raw = JSON.parse(readFileSync(resultsPath, "utf-8"));
|
|
44
|
+
}
|
|
45
|
+
catch (err) {
|
|
46
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
47
|
+
console.warn(` ⚠️ emitPerEntryEvalResults: failed to read ${resultsPath} — ${message}`);
|
|
48
|
+
return;
|
|
49
|
+
}
|
|
50
|
+
// Promptfoo wraps results in either `{ results: { results: [...] } }`
|
|
51
|
+
// (older shape) or directly as `{ results: [...] }` (some adapters).
|
|
52
|
+
const wrapper = raw.results && "results" in raw.results
|
|
53
|
+
? raw.results
|
|
54
|
+
: raw;
|
|
55
|
+
const rows = wrapper?.results ?? [];
|
|
56
|
+
if (rows.length === 0) {
|
|
57
|
+
console.warn(` ⚠️ emitPerEntryEvalResults: ${resultsPath} has no results[]`);
|
|
58
|
+
return;
|
|
59
|
+
}
|
|
60
|
+
for (const result of rows) {
|
|
61
|
+
const taskId = result.testCase?.description ?? "unknown-task";
|
|
62
|
+
const modelId = result.provider?.id ?? result.provider?.label ?? "unknown-model";
|
|
63
|
+
const baseAssoc = {
|
|
64
|
+
run: ctx.runId,
|
|
65
|
+
mode,
|
|
66
|
+
task: taskId,
|
|
67
|
+
model: modelId,
|
|
68
|
+
};
|
|
69
|
+
// rawResults — full raw entry (bounded by descriptor capBytes: 1 MB)
|
|
70
|
+
await writer.emit("rawResults", baseAssoc, result);
|
|
71
|
+
// renderedPrompts — what the model saw + which provider it went to
|
|
72
|
+
if (result.prompt !== undefined) {
|
|
73
|
+
await writer.emit("renderedPrompts", baseAssoc, {
|
|
74
|
+
prompt: result.prompt,
|
|
75
|
+
provider: result.provider,
|
|
76
|
+
});
|
|
77
|
+
}
|
|
78
|
+
// Per-grader decomposition — only LLM-rubric assertions have a
|
|
79
|
+
// natural grader identity. Code assertions (javascript/contains/…)
|
|
80
|
+
// show up in rawResults but not as standalone graderJudgments.
|
|
81
|
+
const components = result.gradingResult?.componentResults ?? [];
|
|
82
|
+
for (const comp of components) {
|
|
83
|
+
if (comp.assertion?.type !== "llm-rubric")
|
|
84
|
+
continue;
|
|
85
|
+
const dimension = classifyRubric(comp);
|
|
86
|
+
if (!dimension)
|
|
87
|
+
continue;
|
|
88
|
+
const graderAssoc = { ...baseAssoc, grader: dimension };
|
|
89
|
+
await writer.emit("graderPrompts", graderAssoc, {
|
|
90
|
+
dimension,
|
|
91
|
+
assertion: comp.assertion,
|
|
92
|
+
});
|
|
93
|
+
await writer.emit("graderJudgments", graderAssoc, {
|
|
94
|
+
score: parseRubricScore(comp) ?? 0,
|
|
95
|
+
reason: comp.reason ?? "",
|
|
96
|
+
pass: comp.pass,
|
|
97
|
+
});
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
}
|
|
@@ -74,10 +74,6 @@ export function mapRequestToConfig(request, rootDir) {
|
|
|
74
74
|
callerGit: request.callerGit,
|
|
75
75
|
callback: request.callback,
|
|
76
76
|
jobId: request.jobId,
|
|
77
|
-
captureEnabled: false,
|
|
78
|
-
captureDir: undefined,
|
|
79
|
-
captureCompress: true,
|
|
80
|
-
captureExtras: true,
|
|
81
77
|
remote: false,
|
|
82
78
|
apiUrl: "https://ailf-api.sanity.build",
|
|
83
79
|
presets: request.presets,
|
package/package.json
CHANGED
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* No-op artifact collector — used when --capture is not set.
|
|
3
|
-
*
|
|
4
|
-
* All methods are constant-time stubs. Zero overhead on the
|
|
5
|
-
* default pipeline path.
|
|
6
|
-
*/
|
|
7
|
-
import type { ArtifactCollector, CaptureFlushResult } from "../ports/artifact-collector.js";
|
|
8
|
-
export declare class NoOpArtifactCollector implements ArtifactCollector {
|
|
9
|
-
readonly enabled = false;
|
|
10
|
-
readonly extrasEnabled = false;
|
|
11
|
-
capture(): void;
|
|
12
|
-
captureFile(): void;
|
|
13
|
-
flush(): Promise<CaptureFlushResult>;
|
|
14
|
-
}
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* No-op artifact collector — used when --capture is not set.
|
|
3
|
-
*
|
|
4
|
-
* All methods are constant-time stubs. Zero overhead on the
|
|
5
|
-
* default pipeline path.
|
|
6
|
-
*/
|
|
7
|
-
const EMPTY_RESULT = Object.freeze({
|
|
8
|
-
artifactCount: 0,
|
|
9
|
-
compressed: false,
|
|
10
|
-
destination: "",
|
|
11
|
-
totalBytes: 0,
|
|
12
|
-
});
|
|
13
|
-
export class NoOpArtifactCollector {
|
|
14
|
-
enabled = false;
|
|
15
|
-
extrasEnabled = false;
|
|
16
|
-
capture() {
|
|
17
|
-
// no-op
|
|
18
|
-
}
|
|
19
|
-
captureFile() {
|
|
20
|
-
// no-op
|
|
21
|
-
}
|
|
22
|
-
async flush() {
|
|
23
|
-
return EMPTY_RESULT;
|
|
24
|
-
}
|
|
25
|
-
}
|
|
@@ -1,94 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Port: ArtifactCollector — captures pipeline artifacts during execution.
|
|
3
|
-
*
|
|
4
|
-
* Injected into AppContext. When capture is disabled (default), the
|
|
5
|
-
* composition root provides NoOpArtifactCollector. When --capture is
|
|
6
|
-
* set, provides FilesystemArtifactCollector.
|
|
7
|
-
*
|
|
8
|
-
* Design principles:
|
|
9
|
-
* - P1: Zero-cost when off (no-op stub)
|
|
10
|
-
* - P2: Capture, don't intercept (steps call capture() explicitly)
|
|
11
|
-
* - P5: Non-blocking (failures swallowed, never block the pipeline)
|
|
12
|
-
*/
|
|
13
|
-
/**
|
|
14
|
-
* The contract for artifact capture during pipeline execution.
|
|
15
|
-
*
|
|
16
|
-
* Steps call capture() for in-memory data and captureFile() for
|
|
17
|
-
* artifacts already on disk. The orchestrator calls flush() once
|
|
18
|
-
* at pipeline end to write everything to the configured destination.
|
|
19
|
-
*/
|
|
20
|
-
export interface ArtifactCollector {
|
|
21
|
-
/**
|
|
22
|
-
* Record an in-memory artifact produced during pipeline execution.
|
|
23
|
-
*
|
|
24
|
-
* Callers need not check `enabled` before calling — the NoOp
|
|
25
|
-
* implementation is zero-cost, so unconditional calls are safe.
|
|
26
|
-
*
|
|
27
|
-
* @param step - Pipeline step name (e.g., "run-eval")
|
|
28
|
-
* @param type - Artifact type identifier (e.g., "eval-results")
|
|
29
|
-
* @param data - Content to serialize (JSON or text)
|
|
30
|
-
* @param meta - Optional metadata (variant, model, etc.)
|
|
31
|
-
*/
|
|
32
|
-
capture(step: string, type: string, data: unknown, meta?: Record<string, unknown>): void;
|
|
33
|
-
/**
|
|
34
|
-
* Record a file reference for an artifact already on disk.
|
|
35
|
-
* The file is copied into the capture directory on flush().
|
|
36
|
-
*
|
|
37
|
-
* @param step - Pipeline step name
|
|
38
|
-
* @param type - Artifact type identifier
|
|
39
|
-
* @param filePath - Absolute path to the existing file
|
|
40
|
-
* @param meta - Optional metadata
|
|
41
|
-
*/
|
|
42
|
-
captureFile(step: string, type: string, filePath: string, meta?: Record<string, unknown>): void;
|
|
43
|
-
/**
|
|
44
|
-
* Flush all captured artifacts to the configured destination.
|
|
45
|
-
* Called once at pipeline end by the orchestrator.
|
|
46
|
-
*/
|
|
47
|
-
flush(): Promise<CaptureFlushResult>;
|
|
48
|
-
/** Whether capture is active */
|
|
49
|
-
readonly enabled: boolean;
|
|
50
|
-
/** Whether mode-specific extras are being captured */
|
|
51
|
-
readonly extrasEnabled: boolean;
|
|
52
|
-
}
|
|
53
|
-
/** Result of flushing captured artifacts to the destination. */
|
|
54
|
-
export interface CaptureFlushResult {
|
|
55
|
-
/** Total number of artifacts captured */
|
|
56
|
-
artifactCount: number;
|
|
57
|
-
/** Output path (directory or .tar.gz) */
|
|
58
|
-
destination: string;
|
|
59
|
-
/** Total bytes written (uncompressed) */
|
|
60
|
-
totalBytes: number;
|
|
61
|
-
/** Whether output was compressed */
|
|
62
|
-
compressed: boolean;
|
|
63
|
-
}
|
|
64
|
-
/** A single entry in the capture manifest. */
|
|
65
|
-
export interface CaptureManifestEntry {
|
|
66
|
-
/** Pipeline step that produced this artifact */
|
|
67
|
-
step: string;
|
|
68
|
-
/** Artifact type identifier */
|
|
69
|
-
type: string;
|
|
70
|
-
/** Relative path within the capture directory */
|
|
71
|
-
path: string;
|
|
72
|
-
/** ISO 8601 timestamp of when capture() was called */
|
|
73
|
-
capturedAt: string;
|
|
74
|
-
/** Byte size of the artifact */
|
|
75
|
-
bytes: number;
|
|
76
|
-
/** Content format */
|
|
77
|
-
format: "json" | "markdown" | "text";
|
|
78
|
-
/** Optional metadata */
|
|
79
|
-
meta?: Record<string, unknown>;
|
|
80
|
-
}
|
|
81
|
-
/** The manifest.json written to each capture directory. */
|
|
82
|
-
export interface CaptureManifest {
|
|
83
|
-
version: 1;
|
|
84
|
-
captureId: string;
|
|
85
|
-
startedAt: string;
|
|
86
|
-
completedAt: string;
|
|
87
|
-
pipeline: {
|
|
88
|
-
mode: string;
|
|
89
|
-
variant?: string;
|
|
90
|
-
source?: string;
|
|
91
|
-
areas?: string[];
|
|
92
|
-
};
|
|
93
|
-
artifacts: CaptureManifestEntry[];
|
|
94
|
-
}
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Port: ArtifactCollector — captures pipeline artifacts during execution.
|
|
3
|
-
*
|
|
4
|
-
* Injected into AppContext. When capture is disabled (default), the
|
|
5
|
-
* composition root provides NoOpArtifactCollector. When --capture is
|
|
6
|
-
* set, provides FilesystemArtifactCollector.
|
|
7
|
-
*
|
|
8
|
-
* Design principles:
|
|
9
|
-
* - P1: Zero-cost when off (no-op stub)
|
|
10
|
-
* - P2: Capture, don't intercept (steps call capture() explicitly)
|
|
11
|
-
* - P5: Non-blocking (failures swallowed, never block the pipeline)
|
|
12
|
-
*/
|
|
13
|
-
export {};
|
|
@@ -1,138 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Types for cross-run capture comparison.
|
|
3
|
-
*
|
|
4
|
-
* The CaptureComparator reads two capture directories (baseline + experiment)
|
|
5
|
-
* and produces a CaptureDiffReport. Types are defined in core so external
|
|
6
|
-
* tooling can consume diff reports without depending on the eval package.
|
|
7
|
-
*
|
|
8
|
-
* Implementation lives in packages/eval/src/artifact-capture/comparator.ts.
|
|
9
|
-
*/
|
|
10
|
-
/** How deeply to compare artifacts. */
|
|
11
|
-
export type ComparisonMode = "strict" | "structural" | "inventory";
|
|
12
|
-
/** Configurable thresholds for comparison. */
|
|
13
|
-
export interface ComparisonOptions {
|
|
14
|
-
/** Comparison depth: inventory (existence), structural (shape), strict (content) */
|
|
15
|
-
mode: ComparisonMode;
|
|
16
|
-
/** Score regression thresholds */
|
|
17
|
-
scoreThresholds?: {
|
|
18
|
-
/** Maximum allowed aggregate score delta (percentage points, default 5) */
|
|
19
|
-
aggregate: number;
|
|
20
|
-
/** Maximum allowed per-task score drop (points, default 10) */
|
|
21
|
-
perTask: number;
|
|
22
|
-
};
|
|
23
|
-
/** Timing regression thresholds */
|
|
24
|
-
timingThresholds?: {
|
|
25
|
-
/** Multiplier — flag steps exceeding this ratio (default 2.0) */
|
|
26
|
-
multiplier: number;
|
|
27
|
-
/** Per-step overrides (step name → custom multiplier) */
|
|
28
|
-
perStep?: Record<string, number>;
|
|
29
|
-
};
|
|
30
|
-
/** JSON structural diff depth (default 3) */
|
|
31
|
-
jsonDiffDepth?: number;
|
|
32
|
-
/** Additional ephemeral fields to ignore (merged with defaults) */
|
|
33
|
-
ephemeralFields?: string[];
|
|
34
|
-
}
|
|
35
|
-
/** Inventory diff — which artifacts exist in each capture. */
|
|
36
|
-
export interface InventoryDiff {
|
|
37
|
-
/** Artifact types in experiment but not in baseline */
|
|
38
|
-
added: string[];
|
|
39
|
-
/** Artifact types in baseline but not in experiment */
|
|
40
|
-
removed: string[];
|
|
41
|
-
/** Artifact types present in both */
|
|
42
|
-
common: string[];
|
|
43
|
-
}
|
|
44
|
-
/** A single structural change in a JSON artifact. */
|
|
45
|
-
export interface JsonDiffEntry {
|
|
46
|
-
/** JSON pointer path (e.g., "config.mode") */
|
|
47
|
-
path: string;
|
|
48
|
-
/** Value in baseline (undefined if key is added) */
|
|
49
|
-
baseline?: unknown;
|
|
50
|
-
/** Value in experiment (undefined if key is removed) */
|
|
51
|
-
experiment?: unknown;
|
|
52
|
-
}
|
|
53
|
-
/** Content diff for a single artifact. */
|
|
54
|
-
export interface ArtifactContentDiff {
|
|
55
|
-
/** Artifact type identifier (step/type) */
|
|
56
|
-
artifactKey: string;
|
|
57
|
-
/** Content format */
|
|
58
|
-
format: "json" | "markdown" | "text";
|
|
59
|
-
/** Structural changes (JSON) or line diff summary (text/markdown) */
|
|
60
|
-
changes: JsonDiffEntry[] | {
|
|
61
|
-
addedLines: number;
|
|
62
|
-
removedLines: number;
|
|
63
|
-
};
|
|
64
|
-
}
|
|
65
|
-
/** Score comparison between two captures. */
|
|
66
|
-
export interface ScoreComparison {
|
|
67
|
-
/** Baseline aggregate score */
|
|
68
|
-
baselineMean: number;
|
|
69
|
-
/** Experiment aggregate score */
|
|
70
|
-
currentMean: number;
|
|
71
|
-
/** Absolute delta (current - baseline) */
|
|
72
|
-
delta: number;
|
|
73
|
-
/** Per-task score deltas */
|
|
74
|
-
perTask: {
|
|
75
|
-
task: string;
|
|
76
|
-
baseline: number;
|
|
77
|
-
current: number;
|
|
78
|
-
delta: number;
|
|
79
|
-
}[];
|
|
80
|
-
/** Tasks that breached configured thresholds */
|
|
81
|
-
breaches: string[];
|
|
82
|
-
}
|
|
83
|
-
/** Timing comparison between two captures. */
|
|
84
|
-
export interface TimingComparison {
|
|
85
|
-
/** Total pipeline duration delta in ms */
|
|
86
|
-
totalDeltaMs: number;
|
|
87
|
-
/** Per-step timing */
|
|
88
|
-
perStep: {
|
|
89
|
-
step: string;
|
|
90
|
-
baselineMs: number;
|
|
91
|
-
currentMs: number;
|
|
92
|
-
ratio: number;
|
|
93
|
-
}[];
|
|
94
|
-
/** Steps that breached the timing multiplier threshold */
|
|
95
|
-
breaches: string[];
|
|
96
|
-
}
|
|
97
|
-
/** Metadata comparison between two captures. */
|
|
98
|
-
export interface MetadataComparison {
|
|
99
|
-
/** Whether pipeline modes match */
|
|
100
|
-
modeMatch: boolean;
|
|
101
|
-
/** Whether pipeline variants match */
|
|
102
|
-
variantMatch: boolean;
|
|
103
|
-
/** Config key differences */
|
|
104
|
-
configDiffs: JsonDiffEntry[];
|
|
105
|
-
}
|
|
106
|
-
/** Security scan results. */
|
|
107
|
-
export interface SecurityScan {
|
|
108
|
-
/** Whether any potential secret leaks were found */
|
|
109
|
-
leaksFound: boolean;
|
|
110
|
-
/** Details of each violation */
|
|
111
|
-
violations: {
|
|
112
|
-
/** Relative artifact file path */
|
|
113
|
-
file: string;
|
|
114
|
-
/** Description of the finding */
|
|
115
|
-
detail: string;
|
|
116
|
-
}[];
|
|
117
|
-
}
|
|
118
|
-
/** The full diff report produced by CaptureComparator. */
|
|
119
|
-
export interface CaptureDiffReport {
|
|
120
|
-
/** Are the two captures semantically equivalent? */
|
|
121
|
-
equivalent: boolean;
|
|
122
|
-
/** Human-readable summary (1-3 sentences) */
|
|
123
|
-
summary: string;
|
|
124
|
-
/** Comparison mode used */
|
|
125
|
-
mode: ComparisonMode;
|
|
126
|
-
/** Artifact inventory diff */
|
|
127
|
-
inventory: InventoryDiff;
|
|
128
|
-
/** Content diffs for common artifacts (structural/strict modes only) */
|
|
129
|
-
content?: ArtifactContentDiff[];
|
|
130
|
-
/** Score comparison (if score-summary exists in both captures) */
|
|
131
|
-
scores?: ScoreComparison;
|
|
132
|
-
/** Timing comparison (if pipeline-context exists in both captures) */
|
|
133
|
-
timing?: TimingComparison;
|
|
134
|
-
/** Metadata comparison */
|
|
135
|
-
metadata?: MetadataComparison;
|
|
136
|
-
/** Security scan results */
|
|
137
|
-
security: SecurityScan;
|
|
138
|
-
}
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Types for cross-run capture comparison.
|
|
3
|
-
*
|
|
4
|
-
* The CaptureComparator reads two capture directories (baseline + experiment)
|
|
5
|
-
* and produces a CaptureDiffReport. Types are defined in core so external
|
|
6
|
-
* tooling can consume diff reports without depending on the eval package.
|
|
7
|
-
*
|
|
8
|
-
* Implementation lives in packages/eval/src/artifact-capture/comparator.ts.
|
|
9
|
-
*/
|
|
10
|
-
export {};
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* CaptureComparator — compares two capture directories and produces a diff report.
|
|
3
|
-
*
|
|
4
|
-
* Reads manifest.json from both directories and computes:
|
|
5
|
-
* - Inventory diff (added/removed/common artifacts)
|
|
6
|
-
* - Content diff (structural or strict, for common artifacts)
|
|
7
|
-
* - Score comparison (from score-summary.json)
|
|
8
|
-
* - Timing comparison (from pipeline-context.json)
|
|
9
|
-
* - Metadata comparison (mode, variant, config keys)
|
|
10
|
-
* - Security scan (regex for leaked secrets)
|
|
11
|
-
*
|
|
12
|
-
* Implementation for the types defined in @sanity/ailf-core.
|
|
13
|
-
*/
|
|
14
|
-
import type { CaptureDiffReport, ComparisonOptions } from "../_vendor/ailf-core/index.d.ts";
|
|
15
|
-
/**
|
|
16
|
-
* Compare two capture directories and produce a structured diff report.
|
|
17
|
-
*
|
|
18
|
-
* @param baselineDir - Path to the baseline capture directory (contains manifest.json)
|
|
19
|
-
* @param experimentDir - Path to the experiment capture directory
|
|
20
|
-
* @param opts - Comparison options (mode, thresholds, etc.)
|
|
21
|
-
*/
|
|
22
|
-
export declare function compareCaptures(baselineDir: string, experimentDir: string, opts?: Partial<ComparisonOptions>): CaptureDiffReport;
|