@sanity/ailf 3.0.0 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/artifact-capture/association.d.ts +37 -0
- package/dist/_vendor/ailf-core/artifact-capture/association.js +19 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/index.js +1 -1
- package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/progress-reporter.d.ts +74 -0
- package/dist/_vendor/ailf-core/ports/progress-reporter.js +26 -0
- package/dist/_vendor/ailf-core/services/slim-report-summary.js +1 -16
- package/dist/adapters/progress/console-progress-reporter.d.ts +35 -0
- package/dist/adapters/progress/console-progress-reporter.js +110 -0
- package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +8 -1
- package/dist/artifact-capture/api-gateway-artifact-writer.js +79 -42
- package/dist/artifact-capture/batching-api-gateway-artifact-writer.d.ts +108 -0
- package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +492 -0
- package/dist/artifact-capture/fanout-artifact-writer.d.ts +14 -2
- package/dist/artifact-capture/fanout-artifact-writer.js +25 -4
- package/dist/artifact-capture/gcs-artifact-writer.d.ts +27 -1
- package/dist/artifact-capture/gcs-artifact-writer.js +168 -38
- package/dist/artifact-capture/instrumented-artifact-writer.d.ts +32 -0
- package/dist/artifact-capture/instrumented-artifact-writer.js +151 -0
- package/dist/artifact-capture/local-fs-artifact-writer.d.ts +8 -1
- package/dist/artifact-capture/local-fs-artifact-writer.js +23 -4
- package/dist/artifact-capture/parallel-emit.d.ts +43 -0
- package/dist/artifact-capture/parallel-emit.js +84 -0
- package/dist/artifact-capture/upload-metrics.d.ts +62 -0
- package/dist/artifact-capture/upload-metrics.js +125 -0
- package/dist/composition-root.d.ts +2 -2
- package/dist/composition-root.js +97 -11
- package/dist/orchestration/pipeline-orchestrator.js +97 -1
- package/dist/orchestration/steps/calculate-scores-step.js +1 -1
- package/dist/orchestration/steps/finalize-run-step.js +33 -2
- package/dist/pipeline/emit-eval-results.js +29 -11
- package/dist/pipeline/upload-test-outputs.d.ts +12 -5
- package/dist/pipeline/upload-test-outputs.js +27 -10
- package/package.json +3 -3
|
@@ -18,9 +18,36 @@
|
|
|
18
18
|
import { existsSync, readFileSync } from "node:fs";
|
|
19
19
|
import { resolve } from "node:path";
|
|
20
20
|
import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
|
|
21
|
+
import { InstrumentedArtifactWriter } from "../../artifact-capture/instrumented-artifact-writer.js";
|
|
21
22
|
import { buildRunContext } from "../../pipeline/run-context.js";
|
|
22
23
|
import { loadSource } from "../../sources.js";
|
|
23
24
|
import { configToSourceOverrides } from "../config-to-source-overrides.js";
|
|
25
|
+
/**
|
|
26
|
+
* Walk a writer's `inner` decorator chain looking for an
|
|
27
|
+
* `AccumulatingArtifactWriter`. Composition root wraps the accumulator
|
|
28
|
+
* in `InstrumentedArtifactWriter` when `AILF_UPLOAD_METRICS=1`, so a
|
|
29
|
+
* naive `instanceof AccumulatingArtifactWriter` check misses it and the
|
|
30
|
+
* manifest comes out empty. Rather than teaching every caller about the
|
|
31
|
+
* instrumentation wrapper, unwrap once here.
|
|
32
|
+
*
|
|
33
|
+
* `FIND_ACCUMULATOR_MAX_DEPTH` is a safety belt against a future decorator chain
|
|
34
|
+
* accidentally introducing a cycle — the current writers can't, but one
|
|
35
|
+
* `inner` self-reference would otherwise spin forever.
|
|
36
|
+
*/
|
|
37
|
+
const FIND_ACCUMULATOR_MAX_DEPTH = 8;
|
|
38
|
+
function findAccumulator(writer) {
|
|
39
|
+
let cursor = writer;
|
|
40
|
+
for (let depth = 0; cursor && depth < FIND_ACCUMULATOR_MAX_DEPTH; depth++) {
|
|
41
|
+
if (cursor instanceof AccumulatingArtifactWriter)
|
|
42
|
+
return cursor;
|
|
43
|
+
if (cursor instanceof InstrumentedArtifactWriter) {
|
|
44
|
+
cursor = cursor.inner;
|
|
45
|
+
continue;
|
|
46
|
+
}
|
|
47
|
+
return null;
|
|
48
|
+
}
|
|
49
|
+
return null;
|
|
50
|
+
}
|
|
24
51
|
export class FinalizeRunStep {
|
|
25
52
|
pipelineStart;
|
|
26
53
|
options;
|
|
@@ -64,8 +91,12 @@ export class FinalizeRunStep {
|
|
|
64
91
|
// happened to register manually. When the writer is a NoOp / plain
|
|
65
92
|
// decorator without accumulation, `aggregated` stays empty and the
|
|
66
93
|
// manifest falls back to the producer-side registration.
|
|
67
|
-
|
|
68
|
-
|
|
94
|
+
//
|
|
95
|
+
// W0058: `findAccumulator` unwraps `InstrumentedArtifactWriter` too
|
|
96
|
+
// so the manifest stays fully populated when `AILF_UPLOAD_METRICS=1`.
|
|
97
|
+
const accumulator = findAccumulator(ctx.artifactWriter);
|
|
98
|
+
const aggregated = accumulator
|
|
99
|
+
? accumulator.getAccumulatedArtifactRefs()
|
|
69
100
|
: {};
|
|
70
101
|
const artifacts = {
|
|
71
102
|
...aggregated,
|
|
@@ -27,7 +27,7 @@
|
|
|
27
27
|
* still live inside the full `rawResults` object.
|
|
28
28
|
*/
|
|
29
29
|
import { readFileSync } from "node:fs";
|
|
30
|
-
import { classifyRubric, parseRubricScore, } from "../_vendor/ailf-core/index.js";
|
|
30
|
+
import { classifyRubric, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
|
|
31
31
|
// ---------------------------------------------------------------------------
|
|
32
32
|
// Public entry point
|
|
33
33
|
// ---------------------------------------------------------------------------
|
|
@@ -57,23 +57,40 @@ export async function emitPerEntryEvalResults(writer, ctx, mode, resultsPath) {
|
|
|
57
57
|
console.warn(` ⚠️ emitPerEntryEvalResults: ${resultsPath} has no results[]`);
|
|
58
58
|
return;
|
|
59
59
|
}
|
|
60
|
+
// W0058: fire every emit synchronously and `Promise.all` once at the
|
|
61
|
+
// end. The previous `parallelMap` pattern created producer-side
|
|
62
|
+
// backpressure (each worker was blocked on its own `await emit(...)`),
|
|
63
|
+
// which kept the batching writer's queue shallow (≤ producer
|
|
64
|
+
// concurrency). With queueing delegated entirely to the writer, the
|
|
65
|
+
// batching API-Gateway writer gets a fully-populated pending queue
|
|
66
|
+
// and can pack ~hundreds of entries into a single batch-sign RTT;
|
|
67
|
+
// the GCS-direct writer's own `ConcurrencyLimiter` caps the PUT
|
|
68
|
+
// fan-out so the 1 500-concurrent-PUT scenario cannot happen.
|
|
69
|
+
const emits = [];
|
|
60
70
|
for (const result of rows) {
|
|
61
|
-
const
|
|
71
|
+
const rawTaskId = result.testCase?.description ?? "unknown-task";
|
|
62
72
|
const modelId = result.provider?.id ?? result.provider?.label ?? "unknown-model";
|
|
73
|
+
// D0033 axis convention: literacy-mode task descriptions carry a
|
|
74
|
+
// `(gold)` / `(baseline)` suffix; that variant IS the `mode` axis
|
|
75
|
+
// value. Stripping the suffix here keeps the writer's key aligned
|
|
76
|
+
// with what `slim-report-summary#slimJudgments` and the Studio
|
|
77
|
+
// `testOutputsKeyFor` hook compute on the read side. Without this,
|
|
78
|
+
// signed-URL lookups for grader/judgment artifacts 404.
|
|
79
|
+
const { mode: axisMode, task: axisTask } = resolveVariantMode(rawTaskId, mode);
|
|
63
80
|
const baseAssoc = {
|
|
64
81
|
run: ctx.runId,
|
|
65
|
-
mode,
|
|
66
|
-
task:
|
|
82
|
+
mode: axisMode,
|
|
83
|
+
task: axisTask,
|
|
67
84
|
model: modelId,
|
|
68
85
|
};
|
|
69
86
|
// rawResults — full raw entry (bounded by descriptor capBytes: 1 MB)
|
|
70
|
-
|
|
87
|
+
emits.push(writer.emit("rawResults", baseAssoc, result));
|
|
71
88
|
// renderedPrompts — what the model saw + which provider it went to
|
|
72
89
|
if (result.prompt !== undefined) {
|
|
73
|
-
|
|
90
|
+
emits.push(writer.emit("renderedPrompts", baseAssoc, {
|
|
74
91
|
prompt: result.prompt,
|
|
75
92
|
provider: result.provider,
|
|
76
|
-
});
|
|
93
|
+
}));
|
|
77
94
|
}
|
|
78
95
|
// Per-grader decomposition — only LLM-rubric assertions have a
|
|
79
96
|
// natural grader identity. Code assertions (javascript/contains/…)
|
|
@@ -86,15 +103,16 @@ export async function emitPerEntryEvalResults(writer, ctx, mode, resultsPath) {
|
|
|
86
103
|
if (!dimension)
|
|
87
104
|
continue;
|
|
88
105
|
const graderAssoc = { ...baseAssoc, grader: dimension };
|
|
89
|
-
|
|
106
|
+
emits.push(writer.emit("graderPrompts", graderAssoc, {
|
|
90
107
|
dimension,
|
|
91
108
|
assertion: comp.assertion,
|
|
92
|
-
});
|
|
93
|
-
|
|
109
|
+
}));
|
|
110
|
+
emits.push(writer.emit("graderJudgments", graderAssoc, {
|
|
94
111
|
score: parseRubricScore(comp) ?? 0,
|
|
95
112
|
reason: comp.reason ?? "",
|
|
96
113
|
pass: comp.pass,
|
|
97
|
-
});
|
|
114
|
+
}));
|
|
98
115
|
}
|
|
99
116
|
}
|
|
117
|
+
await Promise.all(emits);
|
|
100
118
|
}
|
|
@@ -15,12 +15,19 @@
|
|
|
15
15
|
*
|
|
16
16
|
* @see docs/decisions/D0032-run-anchored-artifact-store.md
|
|
17
17
|
*/
|
|
18
|
-
import type
|
|
18
|
+
import { type ArtifactRef, type ArtifactWriter, type RunId, type StoredTestResult } from "../_vendor/ailf-core/index.d.ts";
|
|
19
19
|
/**
|
|
20
20
|
* Upload testOutputs as per-entry GCS objects under
|
|
21
|
-
* `runs/{runId}/test-outputs/`, one per `
|
|
21
|
+
* `runs/{runId}/test-outputs/`, one per `(mode, task, model)` triple.
|
|
22
22
|
*
|
|
23
|
-
*
|
|
24
|
-
*
|
|
23
|
+
* D0033 axis convention: literacy-mode taskIds carry a `(gold)` / `(baseline)`
|
|
24
|
+
* suffix whose value is the `mode` axis on the artifact — stripped via
|
|
25
|
+
* `resolveVariantMode`. This aligns the on-disk key with what the Studio
|
|
26
|
+
* hover-prefetch (`testOutputsKeyFor`) and slim-report readers compute; the
|
|
27
|
+
* legacy 2-segment form produced by the pre-D0033 writer 404'd on the new
|
|
28
|
+
* 3-segment read path.
|
|
29
|
+
*
|
|
30
|
+
* Returns the last non-null `ArtifactRef` emitted (in input order), or `null` when upload is
|
|
31
|
+
* skipped / every emit fails (P5: non-blocking).
|
|
25
32
|
*/
|
|
26
|
-
export declare function uploadTestOutputs(writer: ArtifactWriter, runId: RunId, testResults: StoredTestResult[]): Promise<ArtifactRef | null>;
|
|
33
|
+
export declare function uploadTestOutputs(writer: ArtifactWriter, runId: RunId, testResults: StoredTestResult[], defaultMode: string): Promise<ArtifactRef | null>;
|
|
@@ -15,20 +15,37 @@
|
|
|
15
15
|
*
|
|
16
16
|
* @see docs/decisions/D0032-run-anchored-artifact-store.md
|
|
17
17
|
*/
|
|
18
|
+
import { resolveVariantMode, } from "../_vendor/ailf-core/index.js";
|
|
18
19
|
/**
|
|
19
20
|
* Upload testOutputs as per-entry GCS objects under
|
|
20
|
-
* `runs/{runId}/test-outputs/`, one per `
|
|
21
|
+
* `runs/{runId}/test-outputs/`, one per `(mode, task, model)` triple.
|
|
21
22
|
*
|
|
22
|
-
*
|
|
23
|
-
*
|
|
23
|
+
* D0033 axis convention: literacy-mode taskIds carry a `(gold)` / `(baseline)`
|
|
24
|
+
* suffix whose value is the `mode` axis on the artifact — stripped via
|
|
25
|
+
* `resolveVariantMode`. This aligns the on-disk key with what the Studio
|
|
26
|
+
* hover-prefetch (`testOutputsKeyFor`) and slim-report readers compute; the
|
|
27
|
+
* legacy 2-segment form produced by the pre-D0033 writer 404'd on the new
|
|
28
|
+
* 3-segment read path.
|
|
29
|
+
*
|
|
30
|
+
* Returns the last non-null `ArtifactRef` emitted (in input order), or `null` when upload is
|
|
31
|
+
* skipped / every emit fails (P5: non-blocking).
|
|
24
32
|
*/
|
|
25
|
-
export async function uploadTestOutputs(writer, runId, testResults) {
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
33
|
+
export async function uploadTestOutputs(writer, runId, testResults, defaultMode) {
|
|
34
|
+
// W0058: fire every emit synchronously and `Promise.all` once — the
|
|
35
|
+
// writer (batching or GCS-direct) owns concurrency bounds. See the
|
|
36
|
+
// equivalent rationale in `emit-eval-results.ts`.
|
|
37
|
+
const emits = testResults.map((tr) => {
|
|
38
|
+
const { mode: axisMode, task: axisTask } = resolveVariantMode(tr.taskId, defaultMode);
|
|
39
|
+
return writer.emit("testOutputs", { run: runId, mode: axisMode, task: axisTask, model: tr.modelId }, {
|
|
29
40
|
responseOutput: tr.responseOutput ?? "",
|
|
30
41
|
responseOutputTruncated: tr.responseOutputTruncated ?? false,
|
|
31
|
-
}
|
|
32
|
-
})
|
|
33
|
-
|
|
42
|
+
});
|
|
43
|
+
});
|
|
44
|
+
const refs = await Promise.all(emits);
|
|
45
|
+
let lastRef = null;
|
|
46
|
+
for (const ref of refs) {
|
|
47
|
+
if (ref)
|
|
48
|
+
lastRef = ref;
|
|
49
|
+
}
|
|
50
|
+
return lastRef;
|
|
34
51
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sanity/ailf",
|
|
3
|
-
"version": "3.
|
|
3
|
+
"version": "3.1.0",
|
|
4
4
|
"private": false,
|
|
5
5
|
"publishConfig": {
|
|
6
6
|
"access": "public"
|
|
@@ -52,8 +52,8 @@
|
|
|
52
52
|
"@types/node": "^22.13.1",
|
|
53
53
|
"tsx": "^4.19.2",
|
|
54
54
|
"typescript": "^5.7.3",
|
|
55
|
-
"@sanity/ailf-
|
|
56
|
-
"@sanity/ailf-
|
|
55
|
+
"@sanity/ailf-shared": "0.1.0",
|
|
56
|
+
"@sanity/ailf-core": "0.1.0"
|
|
57
57
|
},
|
|
58
58
|
"scripts": {
|
|
59
59
|
"build": "tsc && tsx scripts/bundle-workspace-deps.ts",
|