@sanity/ailf 2.8.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. package/dist/_vendor/ailf-core/artifact-capture/association.d.ts +35 -0
  2. package/dist/_vendor/ailf-core/artifact-capture/association.js +28 -0
  3. package/dist/_vendor/ailf-core/artifact-registry.d.ts +124 -23
  4. package/dist/_vendor/ailf-core/artifact-registry.js +708 -64
  5. package/dist/_vendor/ailf-core/batch-signing.d.ts +64 -0
  6. package/dist/_vendor/ailf-core/batch-signing.js +23 -0
  7. package/dist/_vendor/ailf-core/index.d.ts +3 -2
  8. package/dist/_vendor/ailf-core/index.js +3 -2
  9. package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +59 -20
  10. package/dist/_vendor/ailf-core/ports/artifact-writer.js +33 -10
  11. package/dist/_vendor/ailf-core/ports/context.d.ts +20 -17
  12. package/dist/_vendor/ailf-core/ports/index.d.ts +0 -2
  13. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +6 -6
  14. package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
  15. package/dist/_vendor/ailf-core/services/index.js +1 -0
  16. package/dist/_vendor/ailf-core/services/slim-report-summary.d.ts +31 -0
  17. package/dist/_vendor/ailf-core/services/slim-report-summary.js +217 -0
  18. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +33 -0
  19. package/dist/_vendor/ailf-core/types/index.d.ts +202 -23
  20. package/dist/adapters/config-sources/file-config-adapter.js +0 -4
  21. package/dist/artifact-capture/accumulating-artifact-writer.d.ts +50 -0
  22. package/dist/artifact-capture/accumulating-artifact-writer.js +111 -0
  23. package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +17 -4
  24. package/dist/artifact-capture/api-gateway-artifact-writer.js +58 -7
  25. package/dist/artifact-capture/emit-file.d.ts +28 -0
  26. package/dist/artifact-capture/emit-file.js +56 -0
  27. package/dist/artifact-capture/fanout-artifact-writer.d.ts +39 -0
  28. package/dist/artifact-capture/fanout-artifact-writer.js +76 -0
  29. package/dist/artifact-capture/gcs-artifact-writer.d.ts +40 -3
  30. package/dist/artifact-capture/gcs-artifact-writer.js +238 -14
  31. package/dist/artifact-capture/local-fs-artifact-writer.d.ts +71 -0
  32. package/dist/artifact-capture/local-fs-artifact-writer.js +273 -0
  33. package/dist/artifact-capture/redact-artifact.d.ts +3 -5
  34. package/dist/artifact-capture/redact-artifact.js +3 -5
  35. package/dist/cli.js +56 -2
  36. package/dist/commands/explain-handler.js +4 -4
  37. package/dist/commands/pipeline-action.d.ts +5 -4
  38. package/dist/commands/pipeline-action.js +33 -16
  39. package/dist/commands/pipeline.d.ts +4 -4
  40. package/dist/commands/pipeline.js +4 -4
  41. package/dist/commands/publish.js +4 -1
  42. package/dist/commands/runs.d.ts +18 -0
  43. package/dist/commands/runs.js +71 -0
  44. package/dist/composition-root.d.ts +13 -10
  45. package/dist/composition-root.js +74 -46
  46. package/dist/orchestration/build-app-context.js +4 -7
  47. package/dist/orchestration/pipeline-orchestrator.d.ts +1 -1
  48. package/dist/orchestration/pipeline-orchestrator.js +37 -46
  49. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -1
  50. package/dist/orchestration/steps/calculate-scores-step.js +19 -19
  51. package/dist/orchestration/steps/callback-step.d.ts +1 -1
  52. package/dist/orchestration/steps/callback-step.js +6 -4
  53. package/dist/orchestration/steps/compare-step.d.ts +1 -1
  54. package/dist/orchestration/steps/compare-step.js +4 -2
  55. package/dist/orchestration/steps/discovery-report-step.d.ts +1 -1
  56. package/dist/orchestration/steps/discovery-report-step.js +4 -1
  57. package/dist/orchestration/steps/fetch-docs-step.js +9 -15
  58. package/dist/orchestration/steps/finalize-run-step.js +21 -7
  59. package/dist/orchestration/steps/gap-analysis-step.js +34 -6
  60. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -1
  61. package/dist/orchestration/steps/generate-configs-step.js +11 -11
  62. package/dist/orchestration/steps/publish-report-step.d.ts +1 -1
  63. package/dist/orchestration/steps/publish-report-step.js +24 -19
  64. package/dist/orchestration/steps/readiness-step.d.ts +1 -1
  65. package/dist/orchestration/steps/readiness-step.js +4 -1
  66. package/dist/orchestration/steps/report-step.d.ts +1 -1
  67. package/dist/orchestration/steps/report-step.js +6 -3
  68. package/dist/orchestration/steps/run-eval-step.js +14 -9
  69. package/dist/pipeline/compare.d.ts +2 -2
  70. package/dist/pipeline/emit-eval-results.d.ts +38 -0
  71. package/dist/pipeline/emit-eval-results.js +100 -0
  72. package/dist/pipeline/map-request-to-config.js +0 -4
  73. package/package.json +1 -1
  74. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +0 -14
  75. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +0 -25
  76. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +0 -94
  77. package/dist/_vendor/ailf-core/ports/artifact-collector.js +0 -13
  78. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +0 -138
  79. package/dist/_vendor/ailf-core/ports/capture-comparator.js +0 -10
  80. package/dist/artifact-capture/comparator.d.ts +0 -22
  81. package/dist/artifact-capture/comparator.js +0 -493
  82. package/dist/artifact-capture/filesystem-collector.d.ts +0 -42
  83. package/dist/artifact-capture/filesystem-collector.js +0 -237
  84. package/dist/artifact-capture/gcs-collector.d.ts +0 -55
  85. package/dist/artifact-capture/gcs-collector.js +0 -117
  86. package/dist/commands/capture-compare.d.ts +0 -15
  87. package/dist/commands/capture-compare.js +0 -253
  88. package/dist/commands/capture-list.d.ts +0 -12
  89. package/dist/commands/capture-list.js +0 -150
  90. package/dist/commands/capture.d.ts +0 -9
  91. package/dist/commands/capture.js +0 -16
@@ -4,7 +4,7 @@
4
4
  * Calls generatePrComment() from pipeline/pr-comment.ts with typed options.
5
5
  * No env bridge or process.argv manipulation needed.
6
6
  */
7
- import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
7
+ import { type AppContext, type PipelineStep, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
8
8
  export declare class ReportStep implements PipelineStep {
9
9
  readonly name = "report";
10
10
  check(): ValidationIssue[];
@@ -6,6 +6,8 @@
6
6
  */
7
7
  import { existsSync, mkdirSync } from "node:fs";
8
8
  import { dirname, resolve } from "path";
9
+ import { assoc, } from "../../_vendor/ailf-core/index.js";
10
+ import { emitFileContents } from "../../artifact-capture/emit-file.js";
9
11
  import { checkScoreSummaryValid } from "../../pipeline/checks.js";
10
12
  import { generatePrComment } from "../../pipeline/pr-comment.js";
11
13
  export class ReportStep {
@@ -45,13 +47,14 @@ export class ReportStep {
45
47
  status: "failed",
46
48
  };
47
49
  }
48
- // Capture report artifacts
50
+ // W0050 captureFile → emitFileContents. Both are run-scoped bulk
51
+ // artifacts; the writer handles redaction + excluded-types gating.
49
52
  if (existsSync(resolvedOutput)) {
50
- ctx.collector.captureFile("report", "pr-comment", resolvedOutput);
53
+ await emitFileContents(ctx.artifactWriter, "prComment", assoc(ctx), resolvedOutput);
51
54
  }
52
55
  const pipelineResultPath = resolve(ctx.config.outputDir, "pipeline-result.json");
53
56
  if (existsSync(pipelineResultPath)) {
54
- ctx.collector.captureFile("report", "pipeline-result", pipelineResultPath);
57
+ await emitFileContents(ctx.artifactWriter, "pipelineResult", assoc(ctx), pipelineResultPath);
55
58
  }
56
59
  return {
57
60
  durationMs: Date.now() - start,
@@ -7,6 +7,7 @@
7
7
  */
8
8
  import { existsSync, mkdirSync, writeFileSync } from "fs";
9
9
  import { resolve } from "path";
10
+ import { emitPerEntryEvalResults } from "../../pipeline/emit-eval-results.js";
10
11
  import { getStepInputPaths } from "../../pipeline/cache.js";
11
12
  import { buildCacheContext } from "../cache-context.js";
12
13
  import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../../pipeline/checks.js";
@@ -118,11 +119,11 @@ export class RunEvalStep {
118
119
  state.promptfooUrls ??= [];
119
120
  state.promptfooUrls.push(...remoteCacheResult.promptfooUrls);
120
121
  }
121
- // Capture the restored score-summary from remote cache
122
- const cachedSummaryPath = resolve(rootDir, "results", "latest", "score-summary.json");
123
- if (existsSync(cachedSummaryPath)) {
124
- ctx.collector.captureFile("run-eval", "score-summary-cached", cachedSummaryPath, { source: "remote-cache", mode: this.mode });
125
- }
122
+ // W0050: score-summary-cached was an unregistered capture, so it is
123
+ // dropped here. scoreSummary is already emitted by calculate-scores-step,
124
+ // which runs on the non-cached path and also after a remote cache hit
125
+ // (state.remoteCacheHits is populated; CalculateScoresStep is still
126
+ // invoked for the score-summary emit).
126
127
  return {
127
128
  durationMs: Date.now() - start,
128
129
  status: "success",
@@ -187,12 +188,16 @@ export class RunEvalStep {
187
188
  console.log();
188
189
  console.log(errorSummary);
189
190
  }
190
- // Capture eval results
191
+ // W0050 decompose the promptfoo aggregate into the per-entry
192
+ // descriptors the W0049 registry expects: rawResults / renderedPrompts
193
+ // per (run, mode, task, model); graderPrompts / graderJudgments per
194
+ // (run, mode, task, model, grader). See pipeline/emit-eval-results.ts.
195
+ // `testOutputs` still flows through uploadTestOutputs() in
196
+ // calculate-scores-step. `traces` ships via agent-observer (out of
197
+ // scope for the promptfoo shape parser — follow-up).
191
198
  const resultsPath = resolve(rootDir, resultsFileForMode(this.mode));
192
199
  if (existsSync(resultsPath)) {
193
- ctx.collector.captureFile("run-eval", `eval-results-${this.mode}`, resultsPath, {
194
- mode: this.mode,
195
- });
200
+ await emitPerEntryEvalResults(ctx.artifactWriter, ctx, this.mode, resultsPath);
196
201
  }
197
202
  // Extract Promptfoo share URL from eval results (Step 3b)
198
203
  if (ctx.evalRunner.extractShareUrl) {
@@ -15,7 +15,7 @@
15
15
  * @see docs/ideas/evaluation-roadmap.md — BP5: Make comparison a primitive
16
16
  * @see docs/ideas/metrics-design.md — Tier 4: Comparison results
17
17
  */
18
- import { type ChangeClass, type CompareOptions, type ComparisonReport, type ScoreSummary } from "./types.js";
18
+ import { type ChangeClass, type ComparableSummary, type CompareOptions, type ComparisonReport } from "./types.js";
19
19
  /** Classify a delta as improved, regressed, or unchanged given a threshold */
20
20
  export declare function classifyChange(delta: number, threshold: number): ChangeClass;
21
21
  /**
@@ -28,4 +28,4 @@ export declare function classifyChange(delta: number, threshold: number): Change
28
28
  * @param options Optional configuration (noise threshold, etc.)
29
29
  * @returns A ComparisonReport with deltas, classifications, and breakdowns
30
30
  */
31
- export declare function compare(baseline: ScoreSummary, experiment: ScoreSummary, options?: CompareOptions): ComparisonReport;
31
+ export declare function compare(baseline: ComparableSummary, experiment: ComparableSummary, options?: CompareOptions): ComparisonReport;
@@ -0,0 +1,38 @@
1
+ /**
2
+ * emit-eval-results.ts — decompose the promptfoo results file into the
3
+ * per-entry descriptors that W0049's registry expects.
4
+ *
5
+ * Replaces the Phase-B-stopgap "route the aggregated JSON through the
6
+ * deprecated `evalResults` bulk descriptor" path. For each test in the
7
+ * promptfoo output we emit:
8
+ *
9
+ * - `rawResults` per (run, mode, task, model) — the full result
10
+ * - `renderedPrompts` per (run, mode, task, model) — prompt the model saw
11
+ * - `graderPrompts` per (run, mode, task, model, grader) — rubric text
12
+ * - `graderJudgments` per (run, mode, task, model, grader) — {score, reason, pass}
13
+ *
14
+ * `testOutputs` is still emitted separately by `calculate-scores-step`
15
+ * via `uploadTestOutputs()` (carried forward from W0048 for byte-
16
+ * equivalence with the original rollout).
17
+ *
18
+ * `traces` is NOT produced here — agentic trace data flows through the
19
+ * agent-observer, not through the promptfoo result shape. Traces
20
+ * emission is out of scope for this helper and lands when the observer
21
+ * integration migrates (follow-up; not in W0050).
22
+ *
23
+ * The "grader" axis value is the rubric dimension string produced by
24
+ * `classifyRubric` (e.g. "task-completion", "code-correctness"). Non-
25
+ * LLM-rubric component assertions (javascript, contains, etc.) don't
26
+ * have a natural grader identifier and are skipped — their outcomes
27
+ * still live inside the full `rawResults` object.
28
+ */
29
+ import { type ArtifactWriter, type RunId } from "../_vendor/ailf-core/index.d.ts";
30
+ /**
31
+ * Parse a promptfoo results file and emit the per-entry artifacts.
32
+ *
33
+ * Non-blocking: any individual emit failure warns but does not halt.
34
+ * File read/parse errors are caught and logged; the caller keeps going.
35
+ */
36
+ export declare function emitPerEntryEvalResults(writer: ArtifactWriter, ctx: {
37
+ runId: RunId;
38
+ }, mode: string, resultsPath: string): Promise<void>;
@@ -0,0 +1,100 @@
1
+ /**
2
+ * emit-eval-results.ts — decompose the promptfoo results file into the
3
+ * per-entry descriptors that W0049's registry expects.
4
+ *
5
+ * Replaces the Phase-B-stopgap "route the aggregated JSON through the
6
+ * deprecated `evalResults` bulk descriptor" path. For each test in the
7
+ * promptfoo output we emit:
8
+ *
9
+ * - `rawResults` per (run, mode, task, model) — the full result
10
+ * - `renderedPrompts` per (run, mode, task, model) — prompt the model saw
11
+ * - `graderPrompts` per (run, mode, task, model, grader) — rubric text
12
+ * - `graderJudgments` per (run, mode, task, model, grader) — {score, reason, pass}
13
+ *
14
+ * `testOutputs` is still emitted separately by `calculate-scores-step`
15
+ * via `uploadTestOutputs()` (carried forward from W0048 for byte-
16
+ * equivalence with the original rollout).
17
+ *
18
+ * `traces` is NOT produced here — agentic trace data flows through the
19
+ * agent-observer, not through the promptfoo result shape. Traces
20
+ * emission is out of scope for this helper and lands when the observer
21
+ * integration migrates (follow-up; not in W0050).
22
+ *
23
+ * The "grader" axis value is the rubric dimension string produced by
24
+ * `classifyRubric` (e.g. "task-completion", "code-correctness"). Non-
25
+ * LLM-rubric component assertions (javascript, contains, etc.) don't
26
+ * have a natural grader identifier and are skipped — their outcomes
27
+ * still live inside the full `rawResults` object.
28
+ */
29
+ import { readFileSync } from "node:fs";
30
+ import { classifyRubric, parseRubricScore, } from "../_vendor/ailf-core/index.js";
31
+ // ---------------------------------------------------------------------------
32
+ // Public entry point
33
+ // ---------------------------------------------------------------------------
34
+ /**
35
+ * Parse a promptfoo results file and emit the per-entry artifacts.
36
+ *
37
+ * Non-blocking: any individual emit failure warns but does not halt.
38
+ * File read/parse errors are caught and logged; the caller keeps going.
39
+ */
40
+ export async function emitPerEntryEvalResults(writer, ctx, mode, resultsPath) {
41
+ let raw;
42
+ try {
43
+ raw = JSON.parse(readFileSync(resultsPath, "utf-8"));
44
+ }
45
+ catch (err) {
46
+ const message = err instanceof Error ? err.message : String(err);
47
+ console.warn(` ⚠️ emitPerEntryEvalResults: failed to read ${resultsPath} — ${message}`);
48
+ return;
49
+ }
50
+ // Promptfoo wraps results in either `{ results: { results: [...] } }`
51
+ // (older shape) or directly as `{ results: [...] }` (some adapters).
52
+ const wrapper = raw.results && "results" in raw.results
53
+ ? raw.results
54
+ : raw;
55
+ const rows = wrapper?.results ?? [];
56
+ if (rows.length === 0) {
57
+ console.warn(` ⚠️ emitPerEntryEvalResults: ${resultsPath} has no results[]`);
58
+ return;
59
+ }
60
+ for (const result of rows) {
61
+ const taskId = result.testCase?.description ?? "unknown-task";
62
+ const modelId = result.provider?.id ?? result.provider?.label ?? "unknown-model";
63
+ const baseAssoc = {
64
+ run: ctx.runId,
65
+ mode,
66
+ task: taskId,
67
+ model: modelId,
68
+ };
69
+ // rawResults — full raw entry (bounded by descriptor capBytes: 1 MB)
70
+ await writer.emit("rawResults", baseAssoc, result);
71
+ // renderedPrompts — what the model saw + which provider it went to
72
+ if (result.prompt !== undefined) {
73
+ await writer.emit("renderedPrompts", baseAssoc, {
74
+ prompt: result.prompt,
75
+ provider: result.provider,
76
+ });
77
+ }
78
+ // Per-grader decomposition — only LLM-rubric assertions have a
79
+ // natural grader identity. Code assertions (javascript/contains/…)
80
+ // show up in rawResults but not as standalone graderJudgments.
81
+ const components = result.gradingResult?.componentResults ?? [];
82
+ for (const comp of components) {
83
+ if (comp.assertion?.type !== "llm-rubric")
84
+ continue;
85
+ const dimension = classifyRubric(comp);
86
+ if (!dimension)
87
+ continue;
88
+ const graderAssoc = { ...baseAssoc, grader: dimension };
89
+ await writer.emit("graderPrompts", graderAssoc, {
90
+ dimension,
91
+ assertion: comp.assertion,
92
+ });
93
+ await writer.emit("graderJudgments", graderAssoc, {
94
+ score: parseRubricScore(comp) ?? 0,
95
+ reason: comp.reason ?? "",
96
+ pass: comp.pass,
97
+ });
98
+ }
99
+ }
100
+ }
@@ -74,10 +74,6 @@ export function mapRequestToConfig(request, rootDir) {
74
74
  callerGit: request.callerGit,
75
75
  callback: request.callback,
76
76
  jobId: request.jobId,
77
- captureEnabled: false,
78
- captureDir: undefined,
79
- captureCompress: true,
80
- captureExtras: true,
81
77
  remote: false,
82
78
  apiUrl: "https://ailf-api.sanity.build",
83
79
  presets: request.presets,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "2.8.0",
3
+ "version": "3.0.0",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"
@@ -1,14 +0,0 @@
1
- /**
2
- * No-op artifact collector — used when --capture is not set.
3
- *
4
- * All methods are constant-time stubs. Zero overhead on the
5
- * default pipeline path.
6
- */
7
- import type { ArtifactCollector, CaptureFlushResult } from "../ports/artifact-collector.js";
8
- export declare class NoOpArtifactCollector implements ArtifactCollector {
9
- readonly enabled = false;
10
- readonly extrasEnabled = false;
11
- capture(): void;
12
- captureFile(): void;
13
- flush(): Promise<CaptureFlushResult>;
14
- }
@@ -1,25 +0,0 @@
1
- /**
2
- * No-op artifact collector — used when --capture is not set.
3
- *
4
- * All methods are constant-time stubs. Zero overhead on the
5
- * default pipeline path.
6
- */
7
- const EMPTY_RESULT = Object.freeze({
8
- artifactCount: 0,
9
- compressed: false,
10
- destination: "",
11
- totalBytes: 0,
12
- });
13
- export class NoOpArtifactCollector {
14
- enabled = false;
15
- extrasEnabled = false;
16
- capture() {
17
- // no-op
18
- }
19
- captureFile() {
20
- // no-op
21
- }
22
- async flush() {
23
- return EMPTY_RESULT;
24
- }
25
- }
@@ -1,94 +0,0 @@
1
- /**
2
- * Port: ArtifactCollector — captures pipeline artifacts during execution.
3
- *
4
- * Injected into AppContext. When capture is disabled (default), the
5
- * composition root provides NoOpArtifactCollector. When --capture is
6
- * set, provides FilesystemArtifactCollector.
7
- *
8
- * Design principles:
9
- * - P1: Zero-cost when off (no-op stub)
10
- * - P2: Capture, don't intercept (steps call capture() explicitly)
11
- * - P5: Non-blocking (failures swallowed, never block the pipeline)
12
- */
13
- /**
14
- * The contract for artifact capture during pipeline execution.
15
- *
16
- * Steps call capture() for in-memory data and captureFile() for
17
- * artifacts already on disk. The orchestrator calls flush() once
18
- * at pipeline end to write everything to the configured destination.
19
- */
20
- export interface ArtifactCollector {
21
- /**
22
- * Record an in-memory artifact produced during pipeline execution.
23
- *
24
- * Callers need not check `enabled` before calling — the NoOp
25
- * implementation is zero-cost, so unconditional calls are safe.
26
- *
27
- * @param step - Pipeline step name (e.g., "run-eval")
28
- * @param type - Artifact type identifier (e.g., "eval-results")
29
- * @param data - Content to serialize (JSON or text)
30
- * @param meta - Optional metadata (variant, model, etc.)
31
- */
32
- capture(step: string, type: string, data: unknown, meta?: Record<string, unknown>): void;
33
- /**
34
- * Record a file reference for an artifact already on disk.
35
- * The file is copied into the capture directory on flush().
36
- *
37
- * @param step - Pipeline step name
38
- * @param type - Artifact type identifier
39
- * @param filePath - Absolute path to the existing file
40
- * @param meta - Optional metadata
41
- */
42
- captureFile(step: string, type: string, filePath: string, meta?: Record<string, unknown>): void;
43
- /**
44
- * Flush all captured artifacts to the configured destination.
45
- * Called once at pipeline end by the orchestrator.
46
- */
47
- flush(): Promise<CaptureFlushResult>;
48
- /** Whether capture is active */
49
- readonly enabled: boolean;
50
- /** Whether mode-specific extras are being captured */
51
- readonly extrasEnabled: boolean;
52
- }
53
- /** Result of flushing captured artifacts to the destination. */
54
- export interface CaptureFlushResult {
55
- /** Total number of artifacts captured */
56
- artifactCount: number;
57
- /** Output path (directory or .tar.gz) */
58
- destination: string;
59
- /** Total bytes written (uncompressed) */
60
- totalBytes: number;
61
- /** Whether output was compressed */
62
- compressed: boolean;
63
- }
64
- /** A single entry in the capture manifest. */
65
- export interface CaptureManifestEntry {
66
- /** Pipeline step that produced this artifact */
67
- step: string;
68
- /** Artifact type identifier */
69
- type: string;
70
- /** Relative path within the capture directory */
71
- path: string;
72
- /** ISO 8601 timestamp of when capture() was called */
73
- capturedAt: string;
74
- /** Byte size of the artifact */
75
- bytes: number;
76
- /** Content format */
77
- format: "json" | "markdown" | "text";
78
- /** Optional metadata */
79
- meta?: Record<string, unknown>;
80
- }
81
- /** The manifest.json written to each capture directory. */
82
- export interface CaptureManifest {
83
- version: 1;
84
- captureId: string;
85
- startedAt: string;
86
- completedAt: string;
87
- pipeline: {
88
- mode: string;
89
- variant?: string;
90
- source?: string;
91
- areas?: string[];
92
- };
93
- artifacts: CaptureManifestEntry[];
94
- }
@@ -1,13 +0,0 @@
1
- /**
2
- * Port: ArtifactCollector — captures pipeline artifacts during execution.
3
- *
4
- * Injected into AppContext. When capture is disabled (default), the
5
- * composition root provides NoOpArtifactCollector. When --capture is
6
- * set, provides FilesystemArtifactCollector.
7
- *
8
- * Design principles:
9
- * - P1: Zero-cost when off (no-op stub)
10
- * - P2: Capture, don't intercept (steps call capture() explicitly)
11
- * - P5: Non-blocking (failures swallowed, never block the pipeline)
12
- */
13
- export {};
@@ -1,138 +0,0 @@
1
- /**
2
- * Types for cross-run capture comparison.
3
- *
4
- * The CaptureComparator reads two capture directories (baseline + experiment)
5
- * and produces a CaptureDiffReport. Types are defined in core so external
6
- * tooling can consume diff reports without depending on the eval package.
7
- *
8
- * Implementation lives in packages/eval/src/artifact-capture/comparator.ts.
9
- */
10
- /** How deeply to compare artifacts. */
11
- export type ComparisonMode = "strict" | "structural" | "inventory";
12
- /** Configurable thresholds for comparison. */
13
- export interface ComparisonOptions {
14
- /** Comparison depth: inventory (existence), structural (shape), strict (content) */
15
- mode: ComparisonMode;
16
- /** Score regression thresholds */
17
- scoreThresholds?: {
18
- /** Maximum allowed aggregate score delta (percentage points, default 5) */
19
- aggregate: number;
20
- /** Maximum allowed per-task score drop (points, default 10) */
21
- perTask: number;
22
- };
23
- /** Timing regression thresholds */
24
- timingThresholds?: {
25
- /** Multiplier — flag steps exceeding this ratio (default 2.0) */
26
- multiplier: number;
27
- /** Per-step overrides (step name → custom multiplier) */
28
- perStep?: Record<string, number>;
29
- };
30
- /** JSON structural diff depth (default 3) */
31
- jsonDiffDepth?: number;
32
- /** Additional ephemeral fields to ignore (merged with defaults) */
33
- ephemeralFields?: string[];
34
- }
35
- /** Inventory diff — which artifacts exist in each capture. */
36
- export interface InventoryDiff {
37
- /** Artifact types in experiment but not in baseline */
38
- added: string[];
39
- /** Artifact types in baseline but not in experiment */
40
- removed: string[];
41
- /** Artifact types present in both */
42
- common: string[];
43
- }
44
- /** A single structural change in a JSON artifact. */
45
- export interface JsonDiffEntry {
46
- /** JSON pointer path (e.g., "config.mode") */
47
- path: string;
48
- /** Value in baseline (undefined if key is added) */
49
- baseline?: unknown;
50
- /** Value in experiment (undefined if key is removed) */
51
- experiment?: unknown;
52
- }
53
- /** Content diff for a single artifact. */
54
- export interface ArtifactContentDiff {
55
- /** Artifact type identifier (step/type) */
56
- artifactKey: string;
57
- /** Content format */
58
- format: "json" | "markdown" | "text";
59
- /** Structural changes (JSON) or line diff summary (text/markdown) */
60
- changes: JsonDiffEntry[] | {
61
- addedLines: number;
62
- removedLines: number;
63
- };
64
- }
65
- /** Score comparison between two captures. */
66
- export interface ScoreComparison {
67
- /** Baseline aggregate score */
68
- baselineMean: number;
69
- /** Experiment aggregate score */
70
- currentMean: number;
71
- /** Absolute delta (current - baseline) */
72
- delta: number;
73
- /** Per-task score deltas */
74
- perTask: {
75
- task: string;
76
- baseline: number;
77
- current: number;
78
- delta: number;
79
- }[];
80
- /** Tasks that breached configured thresholds */
81
- breaches: string[];
82
- }
83
- /** Timing comparison between two captures. */
84
- export interface TimingComparison {
85
- /** Total pipeline duration delta in ms */
86
- totalDeltaMs: number;
87
- /** Per-step timing */
88
- perStep: {
89
- step: string;
90
- baselineMs: number;
91
- currentMs: number;
92
- ratio: number;
93
- }[];
94
- /** Steps that breached the timing multiplier threshold */
95
- breaches: string[];
96
- }
97
- /** Metadata comparison between two captures. */
98
- export interface MetadataComparison {
99
- /** Whether pipeline modes match */
100
- modeMatch: boolean;
101
- /** Whether pipeline variants match */
102
- variantMatch: boolean;
103
- /** Config key differences */
104
- configDiffs: JsonDiffEntry[];
105
- }
106
- /** Security scan results. */
107
- export interface SecurityScan {
108
- /** Whether any potential secret leaks were found */
109
- leaksFound: boolean;
110
- /** Details of each violation */
111
- violations: {
112
- /** Relative artifact file path */
113
- file: string;
114
- /** Description of the finding */
115
- detail: string;
116
- }[];
117
- }
118
- /** The full diff report produced by CaptureComparator. */
119
- export interface CaptureDiffReport {
120
- /** Are the two captures semantically equivalent? */
121
- equivalent: boolean;
122
- /** Human-readable summary (1-3 sentences) */
123
- summary: string;
124
- /** Comparison mode used */
125
- mode: ComparisonMode;
126
- /** Artifact inventory diff */
127
- inventory: InventoryDiff;
128
- /** Content diffs for common artifacts (structural/strict modes only) */
129
- content?: ArtifactContentDiff[];
130
- /** Score comparison (if score-summary exists in both captures) */
131
- scores?: ScoreComparison;
132
- /** Timing comparison (if pipeline-context exists in both captures) */
133
- timing?: TimingComparison;
134
- /** Metadata comparison */
135
- metadata?: MetadataComparison;
136
- /** Security scan results */
137
- security: SecurityScan;
138
- }
@@ -1,10 +0,0 @@
1
- /**
2
- * Types for cross-run capture comparison.
3
- *
4
- * The CaptureComparator reads two capture directories (baseline + experiment)
5
- * and produces a CaptureDiffReport. Types are defined in core so external
6
- * tooling can consume diff reports without depending on the eval package.
7
- *
8
- * Implementation lives in packages/eval/src/artifact-capture/comparator.ts.
9
- */
10
- export {};
@@ -1,22 +0,0 @@
1
- /**
2
- * CaptureComparator — compares two capture directories and produces a diff report.
3
- *
4
- * Reads manifest.json from both directories and computes:
5
- * - Inventory diff (added/removed/common artifacts)
6
- * - Content diff (structural or strict, for common artifacts)
7
- * - Score comparison (from score-summary.json)
8
- * - Timing comparison (from pipeline-context.json)
9
- * - Metadata comparison (mode, variant, config keys)
10
- * - Security scan (regex for leaked secrets)
11
- *
12
- * Implementation for the types defined in @sanity/ailf-core.
13
- */
14
- import type { CaptureDiffReport, ComparisonOptions } from "../_vendor/ailf-core/index.d.ts";
15
- /**
16
- * Compare two capture directories and produce a structured diff report.
17
- *
18
- * @param baselineDir - Path to the baseline capture directory (contains manifest.json)
19
- * @param experimentDir - Path to the experiment capture directory
20
- * @param opts - Comparison options (mode, thresholds, etc.)
21
- */
22
- export declare function compareCaptures(baselineDir: string, experimentDir: string, opts?: Partial<ComparisonOptions>): CaptureDiffReport;