npm - @sanity/ailf - Versions diffs - 3.0.0 → 3.1.0 - Mend

@sanity/ailf 3.0.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

package/dist/orchestration/steps/finalize-run-step.js CHANGED Viewed

@@ -18,9 +18,36 @@
 import { existsSync, readFileSync } from "node:fs";
 import { resolve } from "node:path";
 import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
+import { InstrumentedArtifactWriter } from "../../artifact-capture/instrumented-artifact-writer.js";
 import { buildRunContext } from "../../pipeline/run-context.js";
 import { loadSource } from "../../sources.js";
 import { configToSourceOverrides } from "../config-to-source-overrides.js";
+/**
+ * Walk a writer's `inner` decorator chain looking for an
+ * `AccumulatingArtifactWriter`. Composition root wraps the accumulator
+ * in `InstrumentedArtifactWriter` when `AILF_UPLOAD_METRICS=1`, so a
+ * naive `instanceof AccumulatingArtifactWriter` check misses it and the
+ * manifest comes out empty. Rather than teaching every caller about the
+ * instrumentation wrapper, unwrap once here.
+ *
+ * `FIND_ACCUMULATOR_MAX_DEPTH` is a safety belt against a future decorator chain
+ * accidentally introducing a cycle — the current writers can't, but one
+ * `inner` self-reference would otherwise spin forever.
+ */
+const FIND_ACCUMULATOR_MAX_DEPTH = 8;
+function findAccumulator(writer) {
+    let cursor = writer;
+    for (let depth = 0; cursor && depth < FIND_ACCUMULATOR_MAX_DEPTH; depth++) {
+        if (cursor instanceof AccumulatingArtifactWriter)
+            return cursor;
+        if (cursor instanceof InstrumentedArtifactWriter) {
+            cursor = cursor.inner;
+            continue;
+        }
+        return null;
+    }
+    return null;
+}
 export class FinalizeRunStep {
     pipelineStart;
     options;
@@ -64,8 +91,12 @@ export class FinalizeRunStep {
         // happened to register manually. When the writer is a NoOp / plain
         // decorator without accumulation, `aggregated` stays empty and the
         // manifest falls back to the producer-side registration.
-        const aggregated = ctx.artifactWriter instanceof AccumulatingArtifactWriter
-            ? ctx.artifactWriter.getAccumulatedArtifactRefs()
+        //
+        // W0058: `findAccumulator` unwraps `InstrumentedArtifactWriter` too
+        // so the manifest stays fully populated when `AILF_UPLOAD_METRICS=1`.
+        const accumulator = findAccumulator(ctx.artifactWriter);
+        const aggregated = accumulator
+            ? accumulator.getAccumulatedArtifactRefs()
             : {};
         const artifacts = {
             ...aggregated,

package/dist/pipeline/emit-eval-results.js CHANGED Viewed

@@ -27,7 +27,7 @@
  * still live inside the full `rawResults` object.
  */
 import { readFileSync } from "node:fs";
-import { classifyRubric, parseRubricScore, } from "../_vendor/ailf-core/index.js";
+import { classifyRubric, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
 // ---------------------------------------------------------------------------
 // Public entry point
 // ---------------------------------------------------------------------------
@@ -57,23 +57,40 @@ export async function emitPerEntryEvalResults(writer, ctx, mode, resultsPath) {
         console.warn(`  ⚠️  emitPerEntryEvalResults: ${resultsPath} has no results[]`);
         return;
     }
+    // W0058: fire every emit synchronously and `Promise.all` once at the
+    // end. The previous `parallelMap` pattern created producer-side
+    // backpressure (each worker was blocked on its own `await emit(...)`),
+    // which kept the batching writer's queue shallow (≤ producer
+    // concurrency). With queueing delegated entirely to the writer, the
+    // batching API-Gateway writer gets a fully-populated pending queue
+    // and can pack ~hundreds of entries into a single batch-sign RTT;
+    // the GCS-direct writer's own `ConcurrencyLimiter` caps the PUT
+    // fan-out so the 1 500-concurrent-PUT scenario cannot happen.
+    const emits = [];
     for (const result of rows) {
-        const taskId = result.testCase?.description ?? "unknown-task";
+        const rawTaskId = result.testCase?.description ?? "unknown-task";
         const modelId = result.provider?.id ?? result.provider?.label ?? "unknown-model";
+        // D0033 axis convention: literacy-mode task descriptions carry a
+        // `(gold)` / `(baseline)` suffix; that variant IS the `mode` axis
+        // value. Stripping the suffix here keeps the writer's key aligned
+        // with what `slim-report-summary#slimJudgments` and the Studio
+        // `testOutputsKeyFor` hook compute on the read side. Without this,
+        // signed-URL lookups for grader/judgment artifacts 404.
+        const { mode: axisMode, task: axisTask } = resolveVariantMode(rawTaskId, mode);
         const baseAssoc = {
             run: ctx.runId,
-            mode,
-            task: taskId,
+            mode: axisMode,
+            task: axisTask,
             model: modelId,
         };
         // rawResults — full raw entry (bounded by descriptor capBytes: 1 MB)
-        await writer.emit("rawResults", baseAssoc, result);
+        emits.push(writer.emit("rawResults", baseAssoc, result));
         // renderedPrompts — what the model saw + which provider it went to
         if (result.prompt !== undefined) {
-            await writer.emit("renderedPrompts", baseAssoc, {
+            emits.push(writer.emit("renderedPrompts", baseAssoc, {
                 prompt: result.prompt,
                 provider: result.provider,
-            });
+            }));
         }
         // Per-grader decomposition — only LLM-rubric assertions have a
         // natural grader identity. Code assertions (javascript/contains/…)
@@ -86,15 +103,16 @@ export async function emitPerEntryEvalResults(writer, ctx, mode, resultsPath) {
             if (!dimension)
                 continue;
             const graderAssoc = { ...baseAssoc, grader: dimension };
-            await writer.emit("graderPrompts", graderAssoc, {
+            emits.push(writer.emit("graderPrompts", graderAssoc, {
                 dimension,
                 assertion: comp.assertion,
-            });
-            await writer.emit("graderJudgments", graderAssoc, {
+            }));
+            emits.push(writer.emit("graderJudgments", graderAssoc, {
                 score: parseRubricScore(comp) ?? 0,
                 reason: comp.reason ?? "",
                 pass: comp.pass,
-            });
+            }));
         }
     }
+    await Promise.all(emits);
 }

package/dist/pipeline/upload-test-outputs.d.ts CHANGED Viewed

@@ -15,12 +15,19 @@
  *
  * @see docs/decisions/D0032-run-anchored-artifact-store.md
  */
-import type { ArtifactRef, ArtifactWriter, RunId, StoredTestResult } from "../_vendor/ailf-core/index.d.ts";
+import { type ArtifactRef, type ArtifactWriter, type RunId, type StoredTestResult } from "../_vendor/ailf-core/index.d.ts";
 /**
  * Upload testOutputs as per-entry GCS objects under
- * `runs/{runId}/test-outputs/`, one per `{taskId}::{modelId}` pair.
+ * `runs/{runId}/test-outputs/`, one per `(mode, task, model)` triple.
  *
- * Returns the `ArtifactRef` on success, or `null` when upload is skipped or
- * fails (P5: non-blocking).
+ * D0033 axis convention: literacy-mode taskIds carry a `(gold)` / `(baseline)`
+ * suffix whose value is the `mode` axis on the artifact — stripped via
+ * `resolveVariantMode`. This aligns the on-disk key with what the Studio
+ * hover-prefetch (`testOutputsKeyFor`) and slim-report readers compute; the
+ * legacy 2-segment form produced by the pre-D0033 writer 404'd on the new
+ * 3-segment read path.
+ *
+ * Returns the last non-null `ArtifactRef` emitted, or `null` when upload is
+ * skipped / every emit fails (P5: non-blocking).
  */
-export declare function uploadTestOutputs(writer: ArtifactWriter, runId: RunId, testResults: StoredTestResult[]): Promise<ArtifactRef | null>;
+export declare function uploadTestOutputs(writer: ArtifactWriter, runId: RunId, testResults: StoredTestResult[], defaultMode: string): Promise<ArtifactRef | null>;

package/dist/pipeline/upload-test-outputs.js CHANGED Viewed

@@ -15,20 +15,37 @@
  *
  * @see docs/decisions/D0032-run-anchored-artifact-store.md
  */
+import { resolveVariantMode, } from "../_vendor/ailf-core/index.js";
 /**
  * Upload testOutputs as per-entry GCS objects under
- * `runs/{runId}/test-outputs/`, one per `{taskId}::{modelId}` pair.
+ * `runs/{runId}/test-outputs/`, one per `(mode, task, model)` triple.
  *
- * Returns the `ArtifactRef` on success, or `null` when upload is skipped or
- * fails (P5: non-blocking).
+ * D0033 axis convention: literacy-mode taskIds carry a `(gold)` / `(baseline)`
+ * suffix whose value is the `mode` axis on the artifact — stripped via
+ * `resolveVariantMode`. This aligns the on-disk key with what the Studio
+ * hover-prefetch (`testOutputsKeyFor`) and slim-report readers compute; the
+ * legacy 2-segment form produced by the pre-D0033 writer 404'd on the new
+ * 3-segment read path.
+ *
+ * Returns the last non-null `ArtifactRef` emitted, or `null` when upload is
+ * skipped / every emit fails (P5: non-blocking).
  */
-export async function uploadTestOutputs(writer, runId, testResults) {
-    const entries = testResults.map((tr) => ({
-        key: `${tr.taskId}::${tr.modelId}`,
-        data: {
+export async function uploadTestOutputs(writer, runId, testResults, defaultMode) {
+    // W0058: fire every emit synchronously and `Promise.all` once — the
+    // writer (batching or GCS-direct) owns concurrency bounds. See the
+    // equivalent rationale in `emit-eval-results.ts`.
+    const emits = testResults.map((tr) => {
+        const { mode: axisMode, task: axisTask } = resolveVariantMode(tr.taskId, defaultMode);
+        return writer.emit("testOutputs", { run: runId, mode: axisMode, task: axisTask, model: tr.modelId }, {
             responseOutput: tr.responseOutput ?? "",
             responseOutputTruncated: tr.responseOutputTruncated ?? false,
-        },
-    }));
-    return writer.writePerEntry("testOutputs", runId, entries);
+        });
+    });
+    const refs = await Promise.all(emits);
+    let lastRef = null;
+    for (const ref of refs) {
+        if (ref)
+            lastRef = ref;
+    }
+    return lastRef;
 }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@sanity/ailf",
-  "version": "3.0.0",
+  "version": "3.1.0",
   "private": false,
   "publishConfig": {
     "access": "public"
@@ -52,8 +52,8 @@
     "@types/node": "^22.13.1",
     "tsx": "^4.19.2",
     "typescript": "^5.7.3",
-    "@sanity/ailf-core": "0.1.0",
-    "@sanity/ailf-shared": "0.1.0"
+    "@sanity/ailf-shared": "0.1.0",
+    "@sanity/ailf-core": "0.1.0"
   },
   "scripts": {
     "build": "tsc && tsx scripts/bundle-workspace-deps.ts",