@sanity/ailf 3.0.0 → 3.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/dist/_vendor/ailf-core/artifact-capture/association.d.ts +37 -0
  2. package/dist/_vendor/ailf-core/artifact-capture/association.js +19 -0
  3. package/dist/_vendor/ailf-core/index.d.ts +1 -1
  4. package/dist/_vendor/ailf-core/index.js +1 -1
  5. package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
  6. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  7. package/dist/_vendor/ailf-core/ports/index.js +1 -0
  8. package/dist/_vendor/ailf-core/ports/progress-reporter.d.ts +74 -0
  9. package/dist/_vendor/ailf-core/ports/progress-reporter.js +26 -0
  10. package/dist/_vendor/ailf-core/services/slim-report-summary.js +1 -16
  11. package/dist/adapters/progress/console-progress-reporter.d.ts +35 -0
  12. package/dist/adapters/progress/console-progress-reporter.js +110 -0
  13. package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +8 -1
  14. package/dist/artifact-capture/api-gateway-artifact-writer.js +79 -42
  15. package/dist/artifact-capture/batching-api-gateway-artifact-writer.d.ts +108 -0
  16. package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +492 -0
  17. package/dist/artifact-capture/fanout-artifact-writer.d.ts +14 -2
  18. package/dist/artifact-capture/fanout-artifact-writer.js +25 -4
  19. package/dist/artifact-capture/gcs-artifact-writer.d.ts +27 -1
  20. package/dist/artifact-capture/gcs-artifact-writer.js +168 -38
  21. package/dist/artifact-capture/instrumented-artifact-writer.d.ts +32 -0
  22. package/dist/artifact-capture/instrumented-artifact-writer.js +151 -0
  23. package/dist/artifact-capture/local-fs-artifact-writer.d.ts +8 -1
  24. package/dist/artifact-capture/local-fs-artifact-writer.js +23 -4
  25. package/dist/artifact-capture/parallel-emit.d.ts +43 -0
  26. package/dist/artifact-capture/parallel-emit.js +84 -0
  27. package/dist/artifact-capture/upload-metrics.d.ts +62 -0
  28. package/dist/artifact-capture/upload-metrics.js +125 -0
  29. package/dist/composition-root.d.ts +2 -2
  30. package/dist/composition-root.js +97 -11
  31. package/dist/orchestration/pipeline-orchestrator.js +97 -1
  32. package/dist/orchestration/steps/calculate-scores-step.js +9 -7
  33. package/dist/orchestration/steps/finalize-run-step.js +40 -8
  34. package/dist/pipeline/emit-eval-results.js +29 -11
  35. package/dist/pipeline/upload-test-outputs.d.ts +12 -5
  36. package/dist/pipeline/upload-test-outputs.js +27 -10
  37. package/package.json +1 -1
@@ -143,16 +143,18 @@ export class CalculateScoresStep {
143
143
  // The full responseOutput lives in the GCS artifact; PublishReportStep
144
144
  // later strips it from the inline Content Lake document when this
145
145
  // upload succeeds.
146
+ //
147
+ // The emits flow through `ctx.artifactWriter`, which the composition
148
+ // root wraps in `AccumulatingArtifactWriter`. That's where the
149
+ // authoritative merged ref is built; `FinalizeRunStep` reads it
150
+ // straight from the accumulator, so producer-side registration on
151
+ // `state.artifactRefs` would only clobber the accumulator's full set
152
+ // with a partial single-entry ref.
153
+ //
146
154
  // W0050 — ctx.artifactWriter is always present; no guard needed.
147
155
  const testResults = tryReadTestResults(ctx.config.rootDir);
148
156
  if (testResults?.length) {
149
- const artifactRef = await uploadTestOutputs(ctx.artifactWriter, ctx.runId, testResults);
150
- if (artifactRef) {
151
- state.artifactRefs = {
152
- ...state.artifactRefs,
153
- testOutputs: artifactRef,
154
- };
155
- }
157
+ await uploadTestOutputs(ctx.artifactWriter, ctx.runId, testResults, ctx.config.mode);
156
158
  }
157
159
  const criticalSuffix = belowCritical.length > 0
158
160
  ? ` (${belowCritical.length} area(s) below critical threshold: ${belowCritical.join(", ")})`
@@ -18,9 +18,36 @@
18
18
  import { existsSync, readFileSync } from "node:fs";
19
19
  import { resolve } from "node:path";
20
20
  import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
21
+ import { InstrumentedArtifactWriter } from "../../artifact-capture/instrumented-artifact-writer.js";
21
22
  import { buildRunContext } from "../../pipeline/run-context.js";
22
23
  import { loadSource } from "../../sources.js";
23
24
  import { configToSourceOverrides } from "../config-to-source-overrides.js";
25
+ /**
26
+ * Walk a writer's `inner` decorator chain looking for an
27
+ * `AccumulatingArtifactWriter`. Composition root wraps the accumulator
28
+ * in `InstrumentedArtifactWriter` when `AILF_UPLOAD_METRICS=1`, so a
29
+ * naive `instanceof AccumulatingArtifactWriter` check misses it and the
30
+ * manifest comes out empty. Rather than teaching every caller about the
31
+ * instrumentation wrapper, unwrap once here.
32
+ *
33
+ * `MAX_DEPTH` is a safety belt against a future decorator chain
34
+ * accidentally introducing a cycle — the current writers can't, but one
35
+ * `inner` self-reference would otherwise spin forever.
36
+ */
37
+ const FIND_ACCUMULATOR_MAX_DEPTH = 8;
38
+ function findAccumulator(writer) {
39
+ let cursor = writer;
40
+ for (let depth = 0; cursor && depth < FIND_ACCUMULATOR_MAX_DEPTH; depth++) {
41
+ if (cursor instanceof AccumulatingArtifactWriter)
42
+ return cursor;
43
+ if (cursor instanceof InstrumentedArtifactWriter) {
44
+ cursor = cursor.inner;
45
+ continue;
46
+ }
47
+ return null;
48
+ }
49
+ return null;
50
+ }
24
51
  export class FinalizeRunStep {
25
52
  pipelineStart;
26
53
  options;
@@ -59,17 +86,22 @@ export class FinalizeRunStep {
59
86
  });
60
87
  // W0051 revisit: the composition-root wraps `ctx.artifactWriter` in
61
88
  // `AccumulatingArtifactWriter`, which keeps a map of every ref any
62
- // producer emitted this run. Merge that into `state.artifactRefs` so
63
- // the manifest reflects the FULL set — not just the subset producers
64
- // happened to register manually. When the writer is a NoOp / plain
65
- // decorator without accumulation, `aggregated` stays empty and the
66
- // manifest falls back to the producer-side registration.
67
- const aggregated = ctx.artifactWriter instanceof AccumulatingArtifactWriter
68
- ? ctx.artifactWriter.getAccumulatedArtifactRefs()
89
+ // producer emitted this run — the authoritative FULL set, merged by
90
+ // entry key. `state.artifactRefs` is a producer-side fallback for
91
+ // writers that don't accumulate (NoOp / plain decorators). The
92
+ // accumulator wins per type when both exist, because producer-side
93
+ // registrations tend to capture only the last ref from a parallel
94
+ // batch and would otherwise clobber the merged entries list.
95
+ //
96
+ // W0058: `findAccumulator` unwraps `InstrumentedArtifactWriter` too
97
+ // so the manifest stays fully populated when `AILF_UPLOAD_METRICS=1`.
98
+ const accumulator = findAccumulator(ctx.artifactWriter);
99
+ const aggregated = accumulator
100
+ ? accumulator.getAccumulatedArtifactRefs()
69
101
  : {};
70
102
  const artifacts = {
71
- ...aggregated,
72
103
  ...(state.artifactRefs ?? {}),
104
+ ...aggregated,
73
105
  };
74
106
  const manifest = {
75
107
  version: 1,
@@ -27,7 +27,7 @@
27
27
  * still live inside the full `rawResults` object.
28
28
  */
29
29
  import { readFileSync } from "node:fs";
30
- import { classifyRubric, parseRubricScore, } from "../_vendor/ailf-core/index.js";
30
+ import { classifyRubric, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
31
31
  // ---------------------------------------------------------------------------
32
32
  // Public entry point
33
33
  // ---------------------------------------------------------------------------
@@ -57,23 +57,40 @@ export async function emitPerEntryEvalResults(writer, ctx, mode, resultsPath) {
57
57
  console.warn(` ⚠️ emitPerEntryEvalResults: ${resultsPath} has no results[]`);
58
58
  return;
59
59
  }
60
+ // W0058: fire every emit synchronously and `Promise.all` once at the
61
+ // end. The previous `parallelMap` pattern created producer-side
62
+ // backpressure (each worker was blocked on its own `await emit(...)`),
63
+ // which kept the batching writer's queue shallow (≤ producer
64
+ // concurrency). With queueing delegated entirely to the writer, the
65
+ // batching API-Gateway writer gets a fully-populated pending queue
66
+ // and can pack ~hundreds of entries into a single batch-sign RTT;
67
+ // the GCS-direct writer's own `ConcurrencyLimiter` caps the PUT
68
+ // fan-out so the 1 500-concurrent-PUT scenario cannot happen.
69
+ const emits = [];
60
70
  for (const result of rows) {
61
- const taskId = result.testCase?.description ?? "unknown-task";
71
+ const rawTaskId = result.testCase?.description ?? "unknown-task";
62
72
  const modelId = result.provider?.id ?? result.provider?.label ?? "unknown-model";
73
+ // D0033 axis convention: literacy-mode task descriptions carry a
74
+ // `(gold)` / `(baseline)` suffix; that variant IS the `mode` axis
75
+ // value. Stripping the suffix here keeps the writer's key aligned
76
+ // with what `slim-report-summary#slimJudgments` and the Studio
77
+ // `testOutputsKeyFor` hook compute on the read side. Without this,
78
+ // signed-URL lookups for grader/judgment artifacts 404.
79
+ const { mode: axisMode, task: axisTask } = resolveVariantMode(rawTaskId, mode);
63
80
  const baseAssoc = {
64
81
  run: ctx.runId,
65
- mode,
66
- task: taskId,
82
+ mode: axisMode,
83
+ task: axisTask,
67
84
  model: modelId,
68
85
  };
69
86
  // rawResults — full raw entry (bounded by descriptor capBytes: 1 MB)
70
- await writer.emit("rawResults", baseAssoc, result);
87
+ emits.push(writer.emit("rawResults", baseAssoc, result));
71
88
  // renderedPrompts — what the model saw + which provider it went to
72
89
  if (result.prompt !== undefined) {
73
- await writer.emit("renderedPrompts", baseAssoc, {
90
+ emits.push(writer.emit("renderedPrompts", baseAssoc, {
74
91
  prompt: result.prompt,
75
92
  provider: result.provider,
76
- });
93
+ }));
77
94
  }
78
95
  // Per-grader decomposition — only LLM-rubric assertions have a
79
96
  // natural grader identity. Code assertions (javascript/contains/…)
@@ -86,15 +103,16 @@ export async function emitPerEntryEvalResults(writer, ctx, mode, resultsPath) {
86
103
  if (!dimension)
87
104
  continue;
88
105
  const graderAssoc = { ...baseAssoc, grader: dimension };
89
- await writer.emit("graderPrompts", graderAssoc, {
106
+ emits.push(writer.emit("graderPrompts", graderAssoc, {
90
107
  dimension,
91
108
  assertion: comp.assertion,
92
- });
93
- await writer.emit("graderJudgments", graderAssoc, {
109
+ }));
110
+ emits.push(writer.emit("graderJudgments", graderAssoc, {
94
111
  score: parseRubricScore(comp) ?? 0,
95
112
  reason: comp.reason ?? "",
96
113
  pass: comp.pass,
97
- });
114
+ }));
98
115
  }
99
116
  }
117
+ await Promise.all(emits);
100
118
  }
@@ -15,12 +15,19 @@
15
15
  *
16
16
  * @see docs/decisions/D0032-run-anchored-artifact-store.md
17
17
  */
18
- import type { ArtifactRef, ArtifactWriter, RunId, StoredTestResult } from "../_vendor/ailf-core/index.d.ts";
18
+ import { type ArtifactRef, type ArtifactWriter, type RunId, type StoredTestResult } from "../_vendor/ailf-core/index.d.ts";
19
19
  /**
20
20
  * Upload testOutputs as per-entry GCS objects under
21
- * `runs/{runId}/test-outputs/`, one per `{taskId}::{modelId}` pair.
21
+ * `runs/{runId}/test-outputs/`, one per `(mode, task, model)` triple.
22
22
  *
23
- * Returns the `ArtifactRef` on success, or `null` when upload is skipped or
24
- * fails (P5: non-blocking).
23
+ * D0033 axis convention: literacy-mode taskIds carry a `(gold)` / `(baseline)`
24
+ * suffix whose value is the `mode` axis on the artifact — stripped via
25
+ * `resolveVariantMode`. This aligns the on-disk key with what the Studio
26
+ * hover-prefetch (`testOutputsKeyFor`) and slim-report readers compute; the
27
+ * legacy 2-segment form produced by the pre-D0033 writer 404'd on the new
28
+ * 3-segment read path.
29
+ *
30
+ * Returns the last non-null `ArtifactRef` emitted, or `null` when upload is
31
+ * skipped / every emit fails (P5: non-blocking).
25
32
  */
26
- export declare function uploadTestOutputs(writer: ArtifactWriter, runId: RunId, testResults: StoredTestResult[]): Promise<ArtifactRef | null>;
33
+ export declare function uploadTestOutputs(writer: ArtifactWriter, runId: RunId, testResults: StoredTestResult[], defaultMode: string): Promise<ArtifactRef | null>;
@@ -15,20 +15,37 @@
15
15
  *
16
16
  * @see docs/decisions/D0032-run-anchored-artifact-store.md
17
17
  */
18
+ import { resolveVariantMode, } from "../_vendor/ailf-core/index.js";
18
19
  /**
19
20
  * Upload testOutputs as per-entry GCS objects under
20
- * `runs/{runId}/test-outputs/`, one per `{taskId}::{modelId}` pair.
21
+ * `runs/{runId}/test-outputs/`, one per `(mode, task, model)` triple.
21
22
  *
22
- * Returns the `ArtifactRef` on success, or `null` when upload is skipped or
23
- * fails (P5: non-blocking).
23
+ * D0033 axis convention: literacy-mode taskIds carry a `(gold)` / `(baseline)`
24
+ * suffix whose value is the `mode` axis on the artifact — stripped via
25
+ * `resolveVariantMode`. This aligns the on-disk key with what the Studio
26
+ * hover-prefetch (`testOutputsKeyFor`) and slim-report readers compute; the
27
+ * legacy 2-segment form produced by the pre-D0033 writer 404'd on the new
28
+ * 3-segment read path.
29
+ *
30
+ * Returns the last non-null `ArtifactRef` emitted, or `null` when upload is
31
+ * skipped / every emit fails (P5: non-blocking).
24
32
  */
25
- export async function uploadTestOutputs(writer, runId, testResults) {
26
- const entries = testResults.map((tr) => ({
27
- key: `${tr.taskId}::${tr.modelId}`,
28
- data: {
33
+ export async function uploadTestOutputs(writer, runId, testResults, defaultMode) {
34
+ // W0058: fire every emit synchronously and `Promise.all` once — the
35
+ // writer (batching or GCS-direct) owns concurrency bounds. See the
36
+ // equivalent rationale in `emit-eval-results.ts`.
37
+ const emits = testResults.map((tr) => {
38
+ const { mode: axisMode, task: axisTask } = resolveVariantMode(tr.taskId, defaultMode);
39
+ return writer.emit("testOutputs", { run: runId, mode: axisMode, task: axisTask, model: tr.modelId }, {
29
40
  responseOutput: tr.responseOutput ?? "",
30
41
  responseOutputTruncated: tr.responseOutputTruncated ?? false,
31
- },
32
- }));
33
- return writer.writePerEntry("testOutputs", runId, entries);
42
+ });
43
+ });
44
+ const refs = await Promise.all(emits);
45
+ let lastRef = null;
46
+ for (const ref of refs) {
47
+ if (ref)
48
+ lastRef = ref;
49
+ }
50
+ return lastRef;
34
51
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "3.0.0",
3
+ "version": "3.1.1",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"