@sanity/ailf-studio 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +24 -4
- package/dist/index.js +177 -83
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -318,6 +318,8 @@ interface ProvenanceData {
|
|
|
318
318
|
id: string;
|
|
319
319
|
label: string;
|
|
320
320
|
}[];
|
|
321
|
+
/** Identity of the pipeline run that produced this report (D0032) */
|
|
322
|
+
runId: string;
|
|
321
323
|
/** @deprecated Use `promptfooUrls` when available */
|
|
322
324
|
promptfooUrl?: string;
|
|
323
325
|
/** Per-mode Promptfoo share URLs (one per sub-eval) */
|
|
@@ -441,6 +443,16 @@ interface ArtifactRef {
|
|
|
441
443
|
path: string;
|
|
442
444
|
bytes?: number;
|
|
443
445
|
entryCount?: number;
|
|
446
|
+
/**
|
|
447
|
+
* Added in W0047 / D0032. Missing on pre-W0047 legacy refs, in which case
|
|
448
|
+
* dispatchers must treat it as `"bulk"` (the only layout that existed then).
|
|
449
|
+
*/
|
|
450
|
+
layout?: "bulk" | "per-entry";
|
|
451
|
+
/** Per-entry index (populated for `layout: "per-entry"` refs only). */
|
|
452
|
+
entries?: {
|
|
453
|
+
key: string;
|
|
454
|
+
bytes: number;
|
|
455
|
+
}[];
|
|
444
456
|
}
|
|
445
457
|
/** A single gap/recommendation from gap analysis */
|
|
446
458
|
interface RecommendationGap {
|
|
@@ -461,7 +473,12 @@ interface RecommendationsData {
|
|
|
461
473
|
}
|
|
462
474
|
/**
|
|
463
475
|
* Per-test result stored in reports for drill-down and audit.
|
|
464
|
-
* Mirrors StoredTestResult from @sanity/ailf-core.
|
|
476
|
+
* Mirrors StoredTestResult from @sanity/ailf-core.
|
|
477
|
+
*
|
|
478
|
+
* Per D0030, new reports omit `responseOutput` / `responseOutputTruncated`
|
|
479
|
+
* inline — the full output lives in the `testOutputs` GCS artifact and is
|
|
480
|
+
* fetched via `useArtifactCache`. Both fields remain optional so the
|
|
481
|
+
* reader path tolerates legacy reports that were published before W0045.
|
|
465
482
|
*/
|
|
466
483
|
interface StoredTestResultData {
|
|
467
484
|
area: string;
|
|
@@ -476,7 +493,7 @@ interface StoredTestResultData {
|
|
|
476
493
|
latencyMs?: number;
|
|
477
494
|
modelId: string;
|
|
478
495
|
outputFailure?: boolean;
|
|
479
|
-
responseOutput
|
|
496
|
+
responseOutput?: string;
|
|
480
497
|
responseOutputTruncated?: boolean;
|
|
481
498
|
taskId: string;
|
|
482
499
|
tokenUsage?: {
|
|
@@ -534,11 +551,14 @@ interface PerModelData {
|
|
|
534
551
|
interface SummaryData {
|
|
535
552
|
/** Per-feature agent behavior data (only present when agentic mode ran) */
|
|
536
553
|
agentBehavior?: FeatureAgentBehaviorData[] | null;
|
|
537
|
-
/** External artifact references — present when pipeline uploads to GCS (
|
|
538
|
-
|
|
554
|
+
/** External artifact references — present when pipeline uploads to GCS (D0032) */
|
|
555
|
+
artifactManifest?: {
|
|
539
556
|
testOutputs?: ArtifactRef;
|
|
540
557
|
renderedPrompts?: ArtifactRef;
|
|
541
558
|
rawResults?: ArtifactRef;
|
|
559
|
+
graderPrompts?: ArtifactRef;
|
|
560
|
+
taskDefinitions?: ArtifactRef;
|
|
561
|
+
evalResults?: ArtifactRef;
|
|
542
562
|
traces?: ArtifactRef;
|
|
543
563
|
};
|
|
544
564
|
belowCritical: string[];
|
package/dist/index.js
CHANGED
|
@@ -10,7 +10,7 @@ import { useClient, useCurrentUser } from "sanity";
|
|
|
10
10
|
// src/lib/constants.ts
|
|
11
11
|
var API_VERSION = "2026-03-11";
|
|
12
12
|
var isProduction = process.env.NODE_ENV === "production";
|
|
13
|
-
var ARTIFACT_API_BASE_URL = isProduction ? "https://ailf-api.sanity.build/v1" : "http://localhost:
|
|
13
|
+
var ARTIFACT_API_BASE_URL = isProduction ? "https://ailf-api.sanity.build/v1" : "http://localhost:3000/v1";
|
|
14
14
|
|
|
15
15
|
// src/actions/GraduateToNativeAction.tsx
|
|
16
16
|
import { jsx, jsxs } from "react/jsx-runtime";
|
|
@@ -734,6 +734,30 @@ function artifactRefSchema() {
|
|
|
734
734
|
name: "entryCount",
|
|
735
735
|
title: "Entry Count",
|
|
736
736
|
type: "number"
|
|
737
|
+
}),
|
|
738
|
+
defineField4({
|
|
739
|
+
name: "layout",
|
|
740
|
+
title: "Layout",
|
|
741
|
+
type: "string",
|
|
742
|
+
options: { list: ["bulk", "per-entry"] }
|
|
743
|
+
}),
|
|
744
|
+
defineField4({
|
|
745
|
+
name: "entries",
|
|
746
|
+
title: "Per-Entry Index",
|
|
747
|
+
type: "array",
|
|
748
|
+
of: [
|
|
749
|
+
{
|
|
750
|
+
fields: [
|
|
751
|
+
defineField4({ name: "key", title: "Key", type: "string" }),
|
|
752
|
+
defineField4({
|
|
753
|
+
name: "bytes",
|
|
754
|
+
title: "Size (bytes)",
|
|
755
|
+
type: "number"
|
|
756
|
+
})
|
|
757
|
+
],
|
|
758
|
+
type: "object"
|
|
759
|
+
}
|
|
760
|
+
]
|
|
737
761
|
})
|
|
738
762
|
],
|
|
739
763
|
type: "object"
|
|
@@ -1418,17 +1442,6 @@ var reportSchema = defineType4({
|
|
|
1418
1442
|
title: "Composite Score",
|
|
1419
1443
|
type: "number"
|
|
1420
1444
|
}),
|
|
1421
|
-
defineField4({
|
|
1422
|
-
description: "The model's generated code/response (truncated to 8000 chars).",
|
|
1423
|
-
name: "responseOutput",
|
|
1424
|
-
title: "Response Output",
|
|
1425
|
-
type: "text"
|
|
1426
|
-
}),
|
|
1427
|
-
defineField4({
|
|
1428
|
-
name: "responseOutputTruncated",
|
|
1429
|
-
title: "Truncated",
|
|
1430
|
-
type: "boolean"
|
|
1431
|
-
}),
|
|
1432
1445
|
defineField4({
|
|
1433
1446
|
name: "latencyMs",
|
|
1434
1447
|
title: "Latency (ms)",
|
|
@@ -1661,7 +1674,7 @@ var reportSchema = defineType4({
|
|
|
1661
1674
|
type: "array"
|
|
1662
1675
|
}),
|
|
1663
1676
|
defineField4({
|
|
1664
|
-
description: "External artifact references \u2014 points to large data in GCS that was too big for inline storage (
|
|
1677
|
+
description: "External artifact references \u2014 points to large data in GCS that was too big for inline storage (D0032).",
|
|
1665
1678
|
fields: [
|
|
1666
1679
|
defineField4({
|
|
1667
1680
|
...artifactRefSchema(),
|
|
@@ -1678,14 +1691,29 @@ var reportSchema = defineType4({
|
|
|
1678
1691
|
name: "rawResults",
|
|
1679
1692
|
title: "Raw Results"
|
|
1680
1693
|
}),
|
|
1694
|
+
defineField4({
|
|
1695
|
+
...artifactRefSchema(),
|
|
1696
|
+
name: "graderPrompts",
|
|
1697
|
+
title: "Grader Prompts"
|
|
1698
|
+
}),
|
|
1699
|
+
defineField4({
|
|
1700
|
+
...artifactRefSchema(),
|
|
1701
|
+
name: "taskDefinitions",
|
|
1702
|
+
title: "Task Definitions"
|
|
1703
|
+
}),
|
|
1704
|
+
defineField4({
|
|
1705
|
+
...artifactRefSchema(),
|
|
1706
|
+
name: "evalResults",
|
|
1707
|
+
title: "Eval Results"
|
|
1708
|
+
}),
|
|
1681
1709
|
defineField4({
|
|
1682
1710
|
...artifactRefSchema(),
|
|
1683
1711
|
name: "traces",
|
|
1684
1712
|
title: "Traces"
|
|
1685
1713
|
})
|
|
1686
1714
|
],
|
|
1687
|
-
name: "
|
|
1688
|
-
title: "
|
|
1715
|
+
name: "artifactManifest",
|
|
1716
|
+
title: "Artifact Manifest",
|
|
1689
1717
|
type: "object"
|
|
1690
1718
|
})
|
|
1691
1719
|
],
|
|
@@ -5901,71 +5929,128 @@ import { useClient as useClient10 } from "sanity";
|
|
|
5901
5929
|
|
|
5902
5930
|
// src/lib/useArtifactCache.ts
|
|
5903
5931
|
import { useCallback as useCallback14, useRef as useRef5, useState as useState10 } from "react";
|
|
5904
|
-
function useArtifactCache(
|
|
5932
|
+
/**
 * React hook that lazily downloads and caches the entries of one pipeline
 * artifact, addressed by `runId` + `type` on the artifact API.
 *
 * The layout is chosen from `artifactRef.layout`:
 * - `"bulk"` (also assumed when `layout` is missing): one document whose
 *   `entries` object holds every entry; `fetchAll` loads it in one request.
 * - anything else (i.e. `"per-entry"`): each key listed in
 *   `artifactRef.entries` is requested on its own via `fetchEntry`.
 *
 * Nothing is fetched while `runId` or `artifactRef` is missing.
 *
 * NOTE(review): the cache lives as long as the hook instance and is not
 * cleared when `runId` / `artifactRef` change — confirm callers never reuse
 * one instance across different runs.
 *
 * @param opts - `{ runId, artifactRef, type }`.
 * @returns `{ status, error, availableEntries, getEntry, fetchEntry, fetchAll }`.
 *   `status` starts as `"idle"` and reflects the most recent fetch transition
 *   (`"loading"`, `"ready"` or `"error"`); `error` is the last failure message
 *   or `null`; `availableEntries` lists the keys in `artifactRef.entries`;
 *   `getEntry(key)` returns the cached entry or `null`.
 */
function useArtifactCache(opts) {
  const { runId, artifactRef, type } = opts;
  const cacheRef = useRef5(/* @__PURE__ */ new Map());
  const [status, setStatus] = useState10("idle");
  const [error, setError] = useState10(null);
  // Cache writes mutate a ref, which React cannot observe. `status` alone is
  // not a reliable signal: when several fetches overlap, every completion after
  // the first sets "ready" again and React skips the identical state update.
  // Bumping this counter guarantees a re-render for every cache write.
  const [, setRevision] = useState10(0);
  const inFlight = useRef5(/* @__PURE__ */ new Set());
  const BULK_KEY = "__bulk__";
  const availableEntries = artifactRef?.entries?.map((e) => e.key) ?? [];
  const fetchEntry = useCallback14(
    async (key) => {
      if (!artifactRef || !runId) return;
      // Bulk artifacts have no per-entry endpoint; callers must use fetchAll.
      if (!artifactRef.layout || artifactRef.layout === "bulk") return;
      // Skip keys that are already cached or currently being downloaded.
      if (cacheRef.current.has(key) || inFlight.current.has(key)) return;
      inFlight.current.add(key);
      setStatus("loading");
      setError(null);
      try {
        const url = `${ARTIFACT_API_BASE_URL}/runs/${encodeURIComponent(runId)}/artifacts/${encodeURIComponent(type)}/${encodeURIComponent(key)}`;
        const entry = await signAndFetch(url);
        cacheRef.current.set(key, entry);
        setStatus("ready");
        setRevision((rev) => rev + 1);
      } catch (err) {
        setError(err instanceof Error ? err.message : String(err));
        setStatus("error");
      } finally {
        inFlight.current.delete(key);
      }
    },
    [runId, artifactRef, type]
  );
  const fetchAll = useCallback14(async () => {
    if (!artifactRef || !runId) return;
    if (inFlight.current.has(BULK_KEY)) return;
    if (!artifactRef.layout || artifactRef.layout === "bulk") {
      // A non-empty cache means the bulk document was already loaded.
      if (cacheRef.current.size > 0) return;
      inFlight.current.add(BULK_KEY);
      setStatus("loading");
      setError(null);
      try {
        const url = `${ARTIFACT_API_BASE_URL}/runs/${encodeURIComponent(runId)}/artifacts/${encodeURIComponent(type)}`;
        const body = await signAndFetch(url);
        const next = /* @__PURE__ */ new Map();
        for (const [key, entry] of Object.entries(body.entries)) {
          next.set(key, entry);
        }
        cacheRef.current = next;
        setStatus("ready");
        setRevision((rev) => rev + 1);
      } catch (err) {
        setError(err instanceof Error ? err.message : String(err));
        setStatus("error");
      } finally {
        inFlight.current.delete(BULK_KEY);
      }
      return;
    }
    // Per-entry layout: request every indexed key in parallel. fetchEntry
    // handles its own errors and de-duplication.
    const keys = artifactRef.entries?.map((e) => e.key) ?? [];
    await Promise.all(keys.map((k) => fetchEntry(k)));
  }, [runId, artifactRef, type, fetchEntry]);
  const getEntry = useCallback14(
    (key) => cacheRef.current.get(key) ?? null,
    []
  );
  return {
    status,
    error,
    availableEntries,
    getEntry,
    fetchEntry,
    fetchAll
  };
}
|
|
6003
|
+
async function signAndFetch(signingUrl) {
|
|
6004
|
+
const signingRes = await fetch(signingUrl, {
|
|
6005
|
+
credentials: "omit",
|
|
6006
|
+
headers: { Accept: "application/json" }
|
|
6007
|
+
});
|
|
6008
|
+
if (!signingRes.ok) {
|
|
6009
|
+
const body = await signingRes.text().catch(() => "");
|
|
6010
|
+
throw new Error(
|
|
6011
|
+
`Artifact signing failed: ${signingRes.status} ${signingRes.statusText}${body ? ` \u2014 ${body.slice(0, 200)}` : ""}`
|
|
6012
|
+
);
|
|
6013
|
+
}
|
|
6014
|
+
const envelope = await signingRes.json();
|
|
6015
|
+
if (envelope.object === "error" || !envelope.url) {
|
|
6016
|
+
throw new Error(
|
|
6017
|
+
envelope.error?.message ?? "Invalid signing response \u2014 missing signed URL"
|
|
6018
|
+
);
|
|
6019
|
+
}
|
|
6020
|
+
const artifactRes = await fetch(envelope.url, { credentials: "omit" });
|
|
6021
|
+
if (!artifactRes.ok) {
|
|
6022
|
+
throw new Error(
|
|
6023
|
+
`GCS artifact fetch failed: ${artifactRes.status} ${artifactRes.statusText}`
|
|
6024
|
+
);
|
|
6025
|
+
}
|
|
6026
|
+
return await artifactRes.json();
|
|
6027
|
+
}
|
|
6028
|
+
|
|
6029
|
+
// src/lib/use-test-outputs-artifact.ts
|
|
6030
|
+
/**
 * Convenience wrapper around `useArtifactCache` for the `"testOutputs"`
 * artifact, where each entry is addressed by the key `<taskId>::<modelId>`.
 *
 * @param runId - Pipeline run id forwarded to `useArtifactCache`.
 * @param artifactRef - Manifest reference for the test-outputs artifact; may
 *   be missing, in which case nothing is fetched and no output is reported.
 * @returns `{ status, error, getOutput, fetchOutput, hasOutput }`:
 *   - `getOutput(taskId, modelId)` reads the cached entry (or `null`).
 *   - `fetchOutput(taskId, modelId)` loads the whole artifact for bulk refs
 *     (a missing `layout` counts as bulk) and the single entry otherwise.
 *   - `hasOutput(taskId, modelId)` is `false` without a ref, `true` for bulk
 *     refs without consulting an index, and otherwise whether the key appears
 *     in the ref's entry index.
 */
function useTestOutputsArtifact(runId, artifactRef) {
  const cache = useArtifactCache({
    runId,
    artifactRef,
    type: "testOutputs"
  });
  // Cache key shared by all three accessors.
  const keyFor = (taskId, modelId) => `${taskId}::${modelId}`;
  // Refs without a `layout` are handled as bulk.
  const isBulk = (ref) => !ref.layout || ref.layout === "bulk";

  const getOutput = (taskId, modelId) => cache.getEntry(keyFor(taskId, modelId));

  const fetchOutput = async (taskId, modelId) => {
    if (!artifactRef) return;
    if (isBulk(artifactRef)) {
      await cache.fetchAll();
    } else {
      await cache.fetchEntry(keyFor(taskId, modelId));
    }
  };

  const hasOutput = (taskId, modelId) => {
    if (!artifactRef) return false;
    return isBulk(artifactRef) || cache.availableEntries.includes(keyFor(taskId, modelId));
  };

  return {
    status: cache.status,
    error: cache.error,
    getOutput,
    fetchOutput,
    hasOutput
  };
}
|
|
5970
6055
|
|
|
5971
6056
|
// src/lib/thresholds.ts
|
|
@@ -7175,8 +7260,10 @@ function JudgmentCard({
|
|
|
7175
7260
|
);
|
|
7176
7261
|
const resolvedOutput = inlineOutput ?? artifactEntry?.responseOutput ?? null;
|
|
7177
7262
|
const resolvedTruncated = testResult?.responseOutputTruncated ?? artifactEntry?.responseOutputTruncated ?? false;
|
|
7178
|
-
const
|
|
7179
|
-
const
|
|
7263
|
+
const entryKnownToManifest = artifactCache?.hasOutput(judgment.taskId, judgment.modelId) ?? false;
|
|
7264
|
+
const canFetchArtifact = !inlineOutput && !artifactEntry && artifactCache != null && entryKnownToManifest;
|
|
7265
|
+
const entryUnavailable = !inlineOutput && !artifactEntry && artifactCache != null && !entryKnownToManifest;
|
|
7266
|
+
const hasOutputOrCanFetch = resolvedOutput != null || canFetchArtifact || entryUnavailable;
|
|
7180
7267
|
useEffect8(() => {
|
|
7181
7268
|
if (focused) {
|
|
7182
7269
|
setExpanded(true);
|
|
@@ -7239,12 +7326,19 @@ function JudgmentCard({
|
|
|
7239
7326
|
const handleToggleOutput = useCallback16(
|
|
7240
7327
|
(e) => {
|
|
7241
7328
|
e.stopPropagation();
|
|
7242
|
-
if (!outputExpanded && !resolvedOutput && artifactCache) {
|
|
7243
|
-
artifactCache.
|
|
7329
|
+
if (!outputExpanded && !resolvedOutput && artifactCache && entryKnownToManifest) {
|
|
7330
|
+
artifactCache.fetchOutput(judgment.taskId, judgment.modelId);
|
|
7244
7331
|
}
|
|
7245
7332
|
setOutputExpanded((prev) => !prev);
|
|
7246
7333
|
},
|
|
7247
|
-
[
|
|
7334
|
+
[
|
|
7335
|
+
outputExpanded,
|
|
7336
|
+
resolvedOutput,
|
|
7337
|
+
artifactCache,
|
|
7338
|
+
entryKnownToManifest,
|
|
7339
|
+
judgment.taskId,
|
|
7340
|
+
judgment.modelId
|
|
7341
|
+
]
|
|
7248
7342
|
);
|
|
7249
7343
|
return /* @__PURE__ */ jsx28(
|
|
7250
7344
|
Box16,
|
|
@@ -7383,8 +7477,9 @@ function JudgmentCard({
|
|
|
7383
7477
|
padding: 12
|
|
7384
7478
|
},
|
|
7385
7479
|
children: [
|
|
7386
|
-
!resolvedOutput &&
|
|
7387
|
-
!resolvedOutput && artifactCache?.status === "
|
|
7480
|
+
!resolvedOutput && entryUnavailable && /* @__PURE__ */ jsx28(Text25, { muted: true, size: 1, children: "Model output not available for this entry." }),
|
|
7481
|
+
!resolvedOutput && !entryUnavailable && artifactCache?.status === "loading" && /* @__PURE__ */ jsx28(Text25, { muted: true, size: 1, children: "Fetching model output\u2026" }),
|
|
7482
|
+
!resolvedOutput && !entryUnavailable && artifactCache?.status === "error" && /* @__PURE__ */ jsxs23(Text25, { muted: true, size: 1, style: { color: "#f87171" }, children: [
|
|
7388
7483
|
"Failed to load model output",
|
|
7389
7484
|
artifactCache.error ? `: ${artifactCache.error}` : ""
|
|
7390
7485
|
] }),
|
|
@@ -9338,10 +9433,9 @@ function ReportDetail({
|
|
|
9338
9433
|
cancelled = true;
|
|
9339
9434
|
};
|
|
9340
9435
|
}, [client, reportId]);
|
|
9341
|
-
const artifactCache =
|
|
9342
|
-
|
|
9343
|
-
report?.summary?.
|
|
9344
|
-
client
|
|
9436
|
+
const artifactCache = useTestOutputsArtifact(
|
|
9437
|
+
report?.provenance?.runId,
|
|
9438
|
+
report?.summary?.artifactManifest?.testOutputs
|
|
9345
9439
|
);
|
|
9346
9440
|
const { summary } = report ?? {};
|
|
9347
9441
|
const hasWeaknesses = Boolean(
|