@sanity/ailf 3.0.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/dist/_vendor/ailf-core/artifact-capture/association.d.ts +37 -0
  2. package/dist/_vendor/ailf-core/artifact-capture/association.js +19 -0
  3. package/dist/_vendor/ailf-core/index.d.ts +1 -1
  4. package/dist/_vendor/ailf-core/index.js +1 -1
  5. package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
  6. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  7. package/dist/_vendor/ailf-core/ports/index.js +1 -0
  8. package/dist/_vendor/ailf-core/ports/progress-reporter.d.ts +74 -0
  9. package/dist/_vendor/ailf-core/ports/progress-reporter.js +26 -0
  10. package/dist/_vendor/ailf-core/services/slim-report-summary.js +1 -16
  11. package/dist/adapters/progress/console-progress-reporter.d.ts +35 -0
  12. package/dist/adapters/progress/console-progress-reporter.js +110 -0
  13. package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +8 -1
  14. package/dist/artifact-capture/api-gateway-artifact-writer.js +79 -42
  15. package/dist/artifact-capture/batching-api-gateway-artifact-writer.d.ts +108 -0
  16. package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +492 -0
  17. package/dist/artifact-capture/fanout-artifact-writer.d.ts +14 -2
  18. package/dist/artifact-capture/fanout-artifact-writer.js +25 -4
  19. package/dist/artifact-capture/gcs-artifact-writer.d.ts +27 -1
  20. package/dist/artifact-capture/gcs-artifact-writer.js +168 -38
  21. package/dist/artifact-capture/instrumented-artifact-writer.d.ts +32 -0
  22. package/dist/artifact-capture/instrumented-artifact-writer.js +151 -0
  23. package/dist/artifact-capture/local-fs-artifact-writer.d.ts +8 -1
  24. package/dist/artifact-capture/local-fs-artifact-writer.js +23 -4
  25. package/dist/artifact-capture/parallel-emit.d.ts +43 -0
  26. package/dist/artifact-capture/parallel-emit.js +84 -0
  27. package/dist/artifact-capture/upload-metrics.d.ts +62 -0
  28. package/dist/artifact-capture/upload-metrics.js +125 -0
  29. package/dist/composition-root.d.ts +2 -2
  30. package/dist/composition-root.js +97 -11
  31. package/dist/orchestration/pipeline-orchestrator.js +97 -1
  32. package/dist/orchestration/steps/calculate-scores-step.js +1 -1
  33. package/dist/orchestration/steps/finalize-run-step.js +33 -2
  34. package/dist/pipeline/emit-eval-results.js +29 -11
  35. package/dist/pipeline/upload-test-outputs.d.ts +12 -5
  36. package/dist/pipeline/upload-test-outputs.js +27 -10
  37. package/package.json +3 -3
@@ -18,9 +18,36 @@
18
18
  import { existsSync, readFileSync } from "node:fs";
19
19
  import { resolve } from "node:path";
20
20
  import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
21
+ import { InstrumentedArtifactWriter } from "../../artifact-capture/instrumented-artifact-writer.js";
21
22
  import { buildRunContext } from "../../pipeline/run-context.js";
22
23
  import { loadSource } from "../../sources.js";
23
24
  import { configToSourceOverrides } from "../config-to-source-overrides.js";
25
+ /**
26
+ * Walk a writer's `inner` decorator chain looking for an
27
+ * `AccumulatingArtifactWriter`. Composition root wraps the accumulator
28
+ * in `InstrumentedArtifactWriter` when `AILF_UPLOAD_METRICS=1`, so a
29
+ * naive `instanceof AccumulatingArtifactWriter` check misses it and the
30
+ * manifest comes out empty. Rather than teaching every caller about the
31
+ * instrumentation wrapper, unwrap once here.
32
+ *
33
+ * `FIND_ACCUMULATOR_MAX_DEPTH` is a safety belt against a future decorator chain
34
+ * accidentally introducing a cycle — the current writers can't, but one
35
+ * `inner` self-reference would otherwise spin forever.
36
+ */
37
+ const FIND_ACCUMULATOR_MAX_DEPTH = 8;
38
+ function findAccumulator(writer) {
39
+ let cursor = writer;
40
+ for (let depth = 0; cursor && depth < FIND_ACCUMULATOR_MAX_DEPTH; depth++) {
41
+ if (cursor instanceof AccumulatingArtifactWriter)
42
+ return cursor;
43
+ if (cursor instanceof InstrumentedArtifactWriter) {
44
+ cursor = cursor.inner;
45
+ continue;
46
+ }
47
+ return null;
48
+ }
49
+ return null;
50
+ }
24
51
  export class FinalizeRunStep {
25
52
  pipelineStart;
26
53
  options;
@@ -64,8 +91,12 @@ export class FinalizeRunStep {
64
91
  // happened to register manually. When the writer is a NoOp / plain
65
92
  // decorator without accumulation, `aggregated` stays empty and the
66
93
  // manifest falls back to the producer-side registration.
67
- const aggregated = ctx.artifactWriter instanceof AccumulatingArtifactWriter
68
- ? ctx.artifactWriter.getAccumulatedArtifactRefs()
94
+ //
95
+ // W0058: `findAccumulator` unwraps `InstrumentedArtifactWriter` too
96
+ // so the manifest stays fully populated when `AILF_UPLOAD_METRICS=1`.
97
+ const accumulator = findAccumulator(ctx.artifactWriter);
98
+ const aggregated = accumulator
99
+ ? accumulator.getAccumulatedArtifactRefs()
69
100
  : {};
70
101
  const artifacts = {
71
102
  ...aggregated,
@@ -27,7 +27,7 @@
27
27
  * still live inside the full `rawResults` object.
28
28
  */
29
29
  import { readFileSync } from "node:fs";
30
- import { classifyRubric, parseRubricScore, } from "../_vendor/ailf-core/index.js";
30
+ import { classifyRubric, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
31
31
  // ---------------------------------------------------------------------------
32
32
  // Public entry point
33
33
  // ---------------------------------------------------------------------------
@@ -57,23 +57,40 @@ export async function emitPerEntryEvalResults(writer, ctx, mode, resultsPath) {
57
57
  console.warn(` ⚠️ emitPerEntryEvalResults: ${resultsPath} has no results[]`);
58
58
  return;
59
59
  }
60
+ // W0058: fire every emit synchronously and `Promise.all` once at the
61
+ // end. The previous `parallelMap` pattern created producer-side
62
+ // backpressure (each worker was blocked on its own `await emit(...)`),
63
+ // which kept the batching writer's queue shallow (≤ producer
64
+ // concurrency). With queueing delegated entirely to the writer, the
65
+ // batching API-Gateway writer gets a fully-populated pending queue
66
+ // and can pack ~hundreds of entries into a single batch-sign RTT;
67
+ // the GCS-direct writer's own `ConcurrencyLimiter` caps the PUT
68
+ // fan-out so the 1 500-concurrent-PUT scenario cannot happen.
69
+ const emits = [];
60
70
  for (const result of rows) {
61
- const taskId = result.testCase?.description ?? "unknown-task";
71
+ const rawTaskId = result.testCase?.description ?? "unknown-task";
62
72
  const modelId = result.provider?.id ?? result.provider?.label ?? "unknown-model";
73
+ // D0033 axis convention: literacy-mode task descriptions carry a
74
+ // `(gold)` / `(baseline)` suffix; that variant IS the `mode` axis
75
+ // value. Stripping the suffix here keeps the writer's key aligned
76
+ // with what `slim-report-summary#slimJudgments` and the Studio
77
+ // `testOutputsKeyFor` hook compute on the read side. Without this,
78
+ // signed-URL lookups for grader/judgment artifacts 404.
79
+ const { mode: axisMode, task: axisTask } = resolveVariantMode(rawTaskId, mode);
63
80
  const baseAssoc = {
64
81
  run: ctx.runId,
65
- mode,
66
- task: taskId,
82
+ mode: axisMode,
83
+ task: axisTask,
67
84
  model: modelId,
68
85
  };
69
86
  // rawResults — full raw entry (bounded by descriptor capBytes: 1 MB)
70
- await writer.emit("rawResults", baseAssoc, result);
87
+ emits.push(writer.emit("rawResults", baseAssoc, result));
71
88
  // renderedPrompts — what the model saw + which provider it went to
72
89
  if (result.prompt !== undefined) {
73
- await writer.emit("renderedPrompts", baseAssoc, {
90
+ emits.push(writer.emit("renderedPrompts", baseAssoc, {
74
91
  prompt: result.prompt,
75
92
  provider: result.provider,
76
- });
93
+ }));
77
94
  }
78
95
  // Per-grader decomposition — only LLM-rubric assertions have a
79
96
  // natural grader identity. Code assertions (javascript/contains/…)
@@ -86,15 +103,16 @@ export async function emitPerEntryEvalResults(writer, ctx, mode, resultsPath) {
86
103
  if (!dimension)
87
104
  continue;
88
105
  const graderAssoc = { ...baseAssoc, grader: dimension };
89
- await writer.emit("graderPrompts", graderAssoc, {
106
+ emits.push(writer.emit("graderPrompts", graderAssoc, {
90
107
  dimension,
91
108
  assertion: comp.assertion,
92
- });
93
- await writer.emit("graderJudgments", graderAssoc, {
109
+ }));
110
+ emits.push(writer.emit("graderJudgments", graderAssoc, {
94
111
  score: parseRubricScore(comp) ?? 0,
95
112
  reason: comp.reason ?? "",
96
113
  pass: comp.pass,
97
- });
114
+ }));
98
115
  }
99
116
  }
117
+ await Promise.all(emits);
100
118
  }
@@ -15,12 +15,19 @@
15
15
  *
16
16
  * @see docs/decisions/D0032-run-anchored-artifact-store.md
17
17
  */
18
- import type { ArtifactRef, ArtifactWriter, RunId, StoredTestResult } from "../_vendor/ailf-core/index.d.ts";
18
+ import { type ArtifactRef, type ArtifactWriter, type RunId, type StoredTestResult } from "../_vendor/ailf-core/index.d.ts";
19
19
  /**
20
20
  * Upload testOutputs as per-entry GCS objects under
21
- * `runs/{runId}/test-outputs/`, one per `{taskId}::{modelId}` pair.
21
+ * `runs/{runId}/test-outputs/`, one per `(mode, task, model)` triple.
22
22
  *
23
- * Returns the `ArtifactRef` on success, or `null` when upload is skipped or
24
- * fails (P5: non-blocking).
23
+ * D0033 axis convention: literacy-mode taskIds carry a `(gold)` / `(baseline)`
24
+ * suffix whose value is the `mode` axis on the artifact — stripped via
25
+ * `resolveVariantMode`. This aligns the on-disk key with what the Studio
26
+ * hover-prefetch (`testOutputsKeyFor`) and slim-report readers compute; the
27
+ * legacy 2-segment form produced by the pre-D0033 writer 404'd on the new
28
+ * 3-segment read path.
29
+ *
30
+ * Returns the last non-null `ArtifactRef` emitted, or `null` when upload is
31
+ * skipped / every emit fails (P5: non-blocking).
25
32
  */
26
- export declare function uploadTestOutputs(writer: ArtifactWriter, runId: RunId, testResults: StoredTestResult[]): Promise<ArtifactRef | null>;
33
+ export declare function uploadTestOutputs(writer: ArtifactWriter, runId: RunId, testResults: StoredTestResult[], defaultMode: string): Promise<ArtifactRef | null>;
@@ -15,20 +15,37 @@
15
15
  *
16
16
  * @see docs/decisions/D0032-run-anchored-artifact-store.md
17
17
  */
18
+ import { resolveVariantMode, } from "../_vendor/ailf-core/index.js";
18
19
  /**
19
20
  * Upload testOutputs as per-entry GCS objects under
20
- * `runs/{runId}/test-outputs/`, one per `{taskId}::{modelId}` pair.
21
+ * `runs/{runId}/test-outputs/`, one per `(mode, task, model)` triple.
21
22
  *
22
- * Returns the `ArtifactRef` on success, or `null` when upload is skipped or
23
- * fails (P5: non-blocking).
23
+ * D0033 axis convention: literacy-mode taskIds carry a `(gold)` / `(baseline)`
24
+ * suffix whose value is the `mode` axis on the artifact — stripped via
25
+ * `resolveVariantMode`. This aligns the on-disk key with what the Studio
26
+ * hover-prefetch (`testOutputsKeyFor`) and slim-report readers compute; the
27
+ * legacy 2-segment form produced by the pre-D0033 writer 404'd on the new
28
+ * 3-segment read path.
29
+ *
30
+ * Returns the last non-null `ArtifactRef` emitted, or `null` when upload is
31
+ * skipped / every emit fails (P5: non-blocking).
24
32
  */
25
- export async function uploadTestOutputs(writer, runId, testResults) {
26
- const entries = testResults.map((tr) => ({
27
- key: `${tr.taskId}::${tr.modelId}`,
28
- data: {
33
+ export async function uploadTestOutputs(writer, runId, testResults, defaultMode) {
34
+ // W0058: fire every emit synchronously and `Promise.all` once — the
35
+ // writer (batching or GCS-direct) owns concurrency bounds. See the
36
+ // equivalent rationale in `emit-eval-results.ts`.
37
+ const emits = testResults.map((tr) => {
38
+ const { mode: axisMode, task: axisTask } = resolveVariantMode(tr.taskId, defaultMode);
39
+ return writer.emit("testOutputs", { run: runId, mode: axisMode, task: axisTask, model: tr.modelId }, {
29
40
  responseOutput: tr.responseOutput ?? "",
30
41
  responseOutputTruncated: tr.responseOutputTruncated ?? false,
31
- },
32
- }));
33
- return writer.writePerEntry("testOutputs", runId, entries);
42
+ });
43
+ });
44
+ const refs = await Promise.all(emits);
45
+ let lastRef = null;
46
+ for (const ref of refs) {
47
+ if (ref)
48
+ lastRef = ref;
49
+ }
50
+ return lastRef;
34
51
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "3.0.0",
3
+ "version": "3.1.0",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"
@@ -52,8 +52,8 @@
52
52
  "@types/node": "^22.13.1",
53
53
  "tsx": "^4.19.2",
54
54
  "typescript": "^5.7.3",
55
- "@sanity/ailf-core": "0.1.0",
56
- "@sanity/ailf-shared": "0.1.0"
55
+ "@sanity/ailf-shared": "0.1.0",
56
+ "@sanity/ailf-core": "0.1.0"
57
57
  },
58
58
  "scripts": {
59
59
  "build": "tsc && tsx scripts/bundle-workspace-deps.ts",