@sanity/ailf 3.0.0 → 3.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/artifact-capture/association.d.ts +37 -0
- package/dist/_vendor/ailf-core/artifact-capture/association.js +19 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/index.js +1 -1
- package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/progress-reporter.d.ts +74 -0
- package/dist/_vendor/ailf-core/ports/progress-reporter.js +26 -0
- package/dist/_vendor/ailf-core/services/slim-report-summary.js +1 -16
- package/dist/adapters/progress/console-progress-reporter.d.ts +35 -0
- package/dist/adapters/progress/console-progress-reporter.js +110 -0
- package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +8 -1
- package/dist/artifact-capture/api-gateway-artifact-writer.js +79 -42
- package/dist/artifact-capture/batching-api-gateway-artifact-writer.d.ts +108 -0
- package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +492 -0
- package/dist/artifact-capture/fanout-artifact-writer.d.ts +14 -2
- package/dist/artifact-capture/fanout-artifact-writer.js +25 -4
- package/dist/artifact-capture/gcs-artifact-writer.d.ts +27 -1
- package/dist/artifact-capture/gcs-artifact-writer.js +168 -38
- package/dist/artifact-capture/instrumented-artifact-writer.d.ts +32 -0
- package/dist/artifact-capture/instrumented-artifact-writer.js +151 -0
- package/dist/artifact-capture/local-fs-artifact-writer.d.ts +8 -1
- package/dist/artifact-capture/local-fs-artifact-writer.js +23 -4
- package/dist/artifact-capture/parallel-emit.d.ts +43 -0
- package/dist/artifact-capture/parallel-emit.js +84 -0
- package/dist/artifact-capture/upload-metrics.d.ts +62 -0
- package/dist/artifact-capture/upload-metrics.js +125 -0
- package/dist/composition-root.d.ts +2 -2
- package/dist/composition-root.js +97 -11
- package/dist/orchestration/pipeline-orchestrator.js +97 -1
- package/dist/orchestration/steps/calculate-scores-step.js +9 -7
- package/dist/orchestration/steps/finalize-run-step.js +40 -8
- package/dist/pipeline/emit-eval-results.js +29 -11
- package/dist/pipeline/upload-test-outputs.d.ts +12 -5
- package/dist/pipeline/upload-test-outputs.js +27 -10
- package/package.json +1 -1
|
@@ -143,16 +143,18 @@ export class CalculateScoresStep {
|
|
|
143
143
|
// The full responseOutput lives in the GCS artifact; PublishReportStep
|
|
144
144
|
// later strips it from the inline Content Lake document when this
|
|
145
145
|
// upload succeeds.
|
|
146
|
+
//
|
|
147
|
+
// The emits flow through `ctx.artifactWriter`, which the composition
|
|
148
|
+
// root wraps in `AccumulatingArtifactWriter`. That's where the
|
|
149
|
+
// authoritative merged ref is built; `FinalizeRunStep` reads it
|
|
150
|
+
// straight from the accumulator, so producer-side registration on
|
|
151
|
+
// `state.artifactRefs` would only clobber the accumulator's full set
|
|
152
|
+
// with a partial single-entry ref.
|
|
153
|
+
//
|
|
146
154
|
// W0050 — ctx.artifactWriter is always present; no guard needed.
|
|
147
155
|
const testResults = tryReadTestResults(ctx.config.rootDir);
|
|
148
156
|
if (testResults?.length) {
|
|
149
|
-
|
|
150
|
-
if (artifactRef) {
|
|
151
|
-
state.artifactRefs = {
|
|
152
|
-
...state.artifactRefs,
|
|
153
|
-
testOutputs: artifactRef,
|
|
154
|
-
};
|
|
155
|
-
}
|
|
157
|
+
await uploadTestOutputs(ctx.artifactWriter, ctx.runId, testResults, ctx.config.mode);
|
|
156
158
|
}
|
|
157
159
|
const criticalSuffix = belowCritical.length > 0
|
|
158
160
|
? ` (${belowCritical.length} area(s) below critical threshold: ${belowCritical.join(", ")})`
|
|
@@ -18,9 +18,36 @@
|
|
|
18
18
|
import { existsSync, readFileSync } from "node:fs";
|
|
19
19
|
import { resolve } from "node:path";
|
|
20
20
|
import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
|
|
21
|
+
import { InstrumentedArtifactWriter } from "../../artifact-capture/instrumented-artifact-writer.js";
|
|
21
22
|
import { buildRunContext } from "../../pipeline/run-context.js";
|
|
22
23
|
import { loadSource } from "../../sources.js";
|
|
23
24
|
import { configToSourceOverrides } from "../config-to-source-overrides.js";
|
|
25
|
+
/**
|
|
26
|
+
* Walk a writer's `inner` decorator chain looking for an
|
|
27
|
+
* `AccumulatingArtifactWriter`. Composition root wraps the accumulator
|
|
28
|
+
* in `InstrumentedArtifactWriter` when `AILF_UPLOAD_METRICS=1`, so a
|
|
29
|
+
* naive `instanceof AccumulatingArtifactWriter` check misses it and the
|
|
30
|
+
* manifest comes out empty. Rather than teaching every caller about the
|
|
31
|
+
* instrumentation wrapper, unwrap once here.
|
|
32
|
+
*
|
|
33
|
+
* `FIND_ACCUMULATOR_MAX_DEPTH` is a safety belt against a future decorator chain
|
|
34
|
+
* accidentally introducing a cycle — the current writers can't, but one
|
|
35
|
+
* `inner` self-reference would otherwise spin forever.
|
|
36
|
+
*/
|
|
37
|
+
const FIND_ACCUMULATOR_MAX_DEPTH = 8;
|
|
38
|
+
function findAccumulator(writer) {
|
|
39
|
+
let cursor = writer;
|
|
40
|
+
for (let depth = 0; cursor && depth < FIND_ACCUMULATOR_MAX_DEPTH; depth++) {
|
|
41
|
+
if (cursor instanceof AccumulatingArtifactWriter)
|
|
42
|
+
return cursor;
|
|
43
|
+
if (cursor instanceof InstrumentedArtifactWriter) {
|
|
44
|
+
cursor = cursor.inner;
|
|
45
|
+
continue;
|
|
46
|
+
}
|
|
47
|
+
return null;
|
|
48
|
+
}
|
|
49
|
+
return null;
|
|
50
|
+
}
|
|
24
51
|
export class FinalizeRunStep {
|
|
25
52
|
pipelineStart;
|
|
26
53
|
options;
|
|
@@ -59,17 +86,22 @@ export class FinalizeRunStep {
|
|
|
59
86
|
});
|
|
60
87
|
// W0051 revisit: the composition-root wraps `ctx.artifactWriter` in
|
|
61
88
|
// `AccumulatingArtifactWriter`, which keeps a map of every ref any
|
|
62
|
-
// producer emitted this run
|
|
63
|
-
//
|
|
64
|
-
//
|
|
65
|
-
//
|
|
66
|
-
//
|
|
67
|
-
|
|
68
|
-
|
|
89
|
+
// producer emitted this run — the authoritative FULL set, merged by
|
|
90
|
+
// entry key. `state.artifactRefs` is a producer-side fallback for
|
|
91
|
+
// writers that don't accumulate (NoOp / plain decorators). The
|
|
92
|
+
// accumulator wins per type when both exist, because producer-side
|
|
93
|
+
// registrations tend to capture only the last ref from a parallel
|
|
94
|
+
// batch and would otherwise clobber the merged entries list.
|
|
95
|
+
//
|
|
96
|
+
// W0058: `findAccumulator` unwraps `InstrumentedArtifactWriter` too
|
|
97
|
+
// so the manifest stays fully populated when `AILF_UPLOAD_METRICS=1`.
|
|
98
|
+
const accumulator = findAccumulator(ctx.artifactWriter);
|
|
99
|
+
const aggregated = accumulator
|
|
100
|
+
? accumulator.getAccumulatedArtifactRefs()
|
|
69
101
|
: {};
|
|
70
102
|
const artifacts = {
|
|
71
|
-
...aggregated,
|
|
72
103
|
...(state.artifactRefs ?? {}),
|
|
104
|
+
...aggregated,
|
|
73
105
|
};
|
|
74
106
|
const manifest = {
|
|
75
107
|
version: 1,
|
|
@@ -27,7 +27,7 @@
|
|
|
27
27
|
* still live inside the full `rawResults` object.
|
|
28
28
|
*/
|
|
29
29
|
import { readFileSync } from "node:fs";
|
|
30
|
-
import { classifyRubric, parseRubricScore, } from "../_vendor/ailf-core/index.js";
|
|
30
|
+
import { classifyRubric, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
|
|
31
31
|
// ---------------------------------------------------------------------------
|
|
32
32
|
// Public entry point
|
|
33
33
|
// ---------------------------------------------------------------------------
|
|
@@ -57,23 +57,40 @@ export async function emitPerEntryEvalResults(writer, ctx, mode, resultsPath) {
|
|
|
57
57
|
console.warn(` ⚠️ emitPerEntryEvalResults: ${resultsPath} has no results[]`);
|
|
58
58
|
return;
|
|
59
59
|
}
|
|
60
|
+
// W0058: fire every emit synchronously and `Promise.all` once at the
|
|
61
|
+
// end. The previous `parallelMap` pattern created producer-side
|
|
62
|
+
// backpressure (each worker was blocked on its own `await emit(...)`),
|
|
63
|
+
// which kept the batching writer's queue shallow (≤ producer
|
|
64
|
+
// concurrency). With queueing delegated entirely to the writer, the
|
|
65
|
+
// batching API-Gateway writer gets a fully-populated pending queue
|
|
66
|
+
// and can pack ~hundreds of entries into a single batch-sign RTT;
|
|
67
|
+
// the GCS-direct writer's own `ConcurrencyLimiter` caps the PUT
|
|
68
|
+
// fan-out so the 1 500-concurrent-PUT scenario cannot happen.
|
|
69
|
+
const emits = [];
|
|
60
70
|
for (const result of rows) {
|
|
61
|
-
const
|
|
71
|
+
const rawTaskId = result.testCase?.description ?? "unknown-task";
|
|
62
72
|
const modelId = result.provider?.id ?? result.provider?.label ?? "unknown-model";
|
|
73
|
+
// D0033 axis convention: literacy-mode task descriptions carry a
|
|
74
|
+
// `(gold)` / `(baseline)` suffix; that variant IS the `mode` axis
|
|
75
|
+
// value. Stripping the suffix here keeps the writer's key aligned
|
|
76
|
+
// with what `slim-report-summary#slimJudgments` and the Studio
|
|
77
|
+
// `testOutputsKeyFor` hook compute on the read side. Without this,
|
|
78
|
+
// signed-URL lookups for grader/judgment artifacts 404.
|
|
79
|
+
const { mode: axisMode, task: axisTask } = resolveVariantMode(rawTaskId, mode);
|
|
63
80
|
const baseAssoc = {
|
|
64
81
|
run: ctx.runId,
|
|
65
|
-
mode,
|
|
66
|
-
task:
|
|
82
|
+
mode: axisMode,
|
|
83
|
+
task: axisTask,
|
|
67
84
|
model: modelId,
|
|
68
85
|
};
|
|
69
86
|
// rawResults — full raw entry (bounded by descriptor capBytes: 1 MB)
|
|
70
|
-
|
|
87
|
+
emits.push(writer.emit("rawResults", baseAssoc, result));
|
|
71
88
|
// renderedPrompts — what the model saw + which provider it went to
|
|
72
89
|
if (result.prompt !== undefined) {
|
|
73
|
-
|
|
90
|
+
emits.push(writer.emit("renderedPrompts", baseAssoc, {
|
|
74
91
|
prompt: result.prompt,
|
|
75
92
|
provider: result.provider,
|
|
76
|
-
});
|
|
93
|
+
}));
|
|
77
94
|
}
|
|
78
95
|
// Per-grader decomposition — only LLM-rubric assertions have a
|
|
79
96
|
// natural grader identity. Code assertions (javascript/contains/…)
|
|
@@ -86,15 +103,16 @@ export async function emitPerEntryEvalResults(writer, ctx, mode, resultsPath) {
|
|
|
86
103
|
if (!dimension)
|
|
87
104
|
continue;
|
|
88
105
|
const graderAssoc = { ...baseAssoc, grader: dimension };
|
|
89
|
-
|
|
106
|
+
emits.push(writer.emit("graderPrompts", graderAssoc, {
|
|
90
107
|
dimension,
|
|
91
108
|
assertion: comp.assertion,
|
|
92
|
-
});
|
|
93
|
-
|
|
109
|
+
}));
|
|
110
|
+
emits.push(writer.emit("graderJudgments", graderAssoc, {
|
|
94
111
|
score: parseRubricScore(comp) ?? 0,
|
|
95
112
|
reason: comp.reason ?? "",
|
|
96
113
|
pass: comp.pass,
|
|
97
|
-
});
|
|
114
|
+
}));
|
|
98
115
|
}
|
|
99
116
|
}
|
|
117
|
+
await Promise.all(emits);
|
|
100
118
|
}
|
|
@@ -15,12 +15,19 @@
|
|
|
15
15
|
*
|
|
16
16
|
* @see docs/decisions/D0032-run-anchored-artifact-store.md
|
|
17
17
|
*/
|
|
18
|
-
import type
|
|
18
|
+
import { type ArtifactRef, type ArtifactWriter, type RunId, type StoredTestResult } from "../_vendor/ailf-core/index.d.ts";
|
|
19
19
|
/**
|
|
20
20
|
* Upload testOutputs as per-entry GCS objects under
|
|
21
|
-
* `runs/{runId}/test-outputs/`, one per `
|
|
21
|
+
* `runs/{runId}/test-outputs/`, one per `(mode, task, model)` triple.
|
|
22
22
|
*
|
|
23
|
-
*
|
|
24
|
-
*
|
|
23
|
+
* D0033 axis convention: literacy-mode taskIds carry a `(gold)` / `(baseline)`
|
|
24
|
+
* suffix whose value is the `mode` axis on the artifact — stripped via
|
|
25
|
+
* `resolveVariantMode`. This aligns the on-disk key with what the Studio
|
|
26
|
+
* hover-prefetch (`testOutputsKeyFor`) and slim-report readers compute; the
|
|
27
|
+
* legacy 2-segment form produced by the pre-D0033 writer 404'd on the new
|
|
28
|
+
* 3-segment read path.
|
|
29
|
+
*
|
|
30
|
+
* Returns the last non-null `ArtifactRef` emitted, or `null` when upload is
|
|
31
|
+
* skipped / every emit fails (P5: non-blocking).
|
|
25
32
|
*/
|
|
26
|
-
export declare function uploadTestOutputs(writer: ArtifactWriter, runId: RunId, testResults: StoredTestResult[]): Promise<ArtifactRef | null>;
|
|
33
|
+
export declare function uploadTestOutputs(writer: ArtifactWriter, runId: RunId, testResults: StoredTestResult[], defaultMode: string): Promise<ArtifactRef | null>;
|
|
@@ -15,20 +15,37 @@
|
|
|
15
15
|
*
|
|
16
16
|
* @see docs/decisions/D0032-run-anchored-artifact-store.md
|
|
17
17
|
*/
|
|
18
|
+
import { resolveVariantMode, } from "../_vendor/ailf-core/index.js";
|
|
18
19
|
/**
|
|
19
20
|
* Upload testOutputs as per-entry GCS objects under
|
|
20
|
-
* `runs/{runId}/test-outputs/`, one per `
|
|
21
|
+
* `runs/{runId}/test-outputs/`, one per `(mode, task, model)` triple.
|
|
21
22
|
*
|
|
22
|
-
*
|
|
23
|
-
*
|
|
23
|
+
* D0033 axis convention: literacy-mode taskIds carry a `(gold)` / `(baseline)`
|
|
24
|
+
* suffix whose value is the `mode` axis on the artifact — stripped via
|
|
25
|
+
* `resolveVariantMode`. This aligns the on-disk key with what the Studio
|
|
26
|
+
* hover-prefetch (`testOutputsKeyFor`) and slim-report readers compute; the
|
|
27
|
+
* legacy 2-segment form produced by the pre-D0033 writer 404'd on the new
|
|
28
|
+
* 3-segment read path.
|
|
29
|
+
*
|
|
30
|
+
* Returns the last non-null `ArtifactRef` emitted, or `null` when upload is
|
|
31
|
+
* skipped / every emit fails (P5: non-blocking).
|
|
24
32
|
*/
|
|
25
|
-
export async function uploadTestOutputs(writer, runId, testResults) {
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
33
|
+
export async function uploadTestOutputs(writer, runId, testResults, defaultMode) {
|
|
34
|
+
// W0058: fire every emit synchronously and `Promise.all` once — the
|
|
35
|
+
// writer (batching or GCS-direct) owns concurrency bounds. See the
|
|
36
|
+
// equivalent rationale in `emit-eval-results.ts`.
|
|
37
|
+
const emits = testResults.map((tr) => {
|
|
38
|
+
const { mode: axisMode, task: axisTask } = resolveVariantMode(tr.taskId, defaultMode);
|
|
39
|
+
return writer.emit("testOutputs", { run: runId, mode: axisMode, task: axisTask, model: tr.modelId }, {
|
|
29
40
|
responseOutput: tr.responseOutput ?? "",
|
|
30
41
|
responseOutputTruncated: tr.responseOutputTruncated ?? false,
|
|
31
|
-
}
|
|
32
|
-
})
|
|
33
|
-
|
|
42
|
+
});
|
|
43
|
+
});
|
|
44
|
+
const refs = await Promise.all(emits);
|
|
45
|
+
let lastRef = null;
|
|
46
|
+
for (const ref of refs) {
|
|
47
|
+
if (ref)
|
|
48
|
+
lastRef = ref;
|
|
49
|
+
}
|
|
50
|
+
return lastRef;
|
|
34
51
|
}
|