@tangle-network/agent-eval 0.72.0 → 0.72.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +39 -0
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +3 -2
- package/dist/agent-profile-DYRboYWu.d.ts +364 -0
- package/dist/analyst/index.d.ts +221 -0
- package/dist/analyst/index.js +371 -0
- package/dist/analyst/index.js.map +1 -0
- package/dist/analyst-t7zZS3TV.d.ts +88 -0
- package/dist/campaign/index.d.ts +518 -9
- package/dist/campaign/index.js +672 -22
- package/dist/campaign/index.js.map +1 -1
- package/dist/chunk-7W4SM7FD.js +1075 -0
- package/dist/chunk-7W4SM7FD.js.map +1 -0
- package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
- package/dist/chunk-JHA3ZGSO.js +1496 -0
- package/dist/chunk-JHA3ZGSO.js.map +1 -0
- package/dist/{chunk-4QJN7RDX.js → chunk-JYE3WOTE.js} +55 -7
- package/dist/{chunk-4QJN7RDX.js.map → chunk-JYE3WOTE.js.map} +1 -1
- package/dist/chunk-LB2UOI5F.js +412 -0
- package/dist/chunk-LB2UOI5F.js.map +1 -0
- package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
- package/dist/chunk-VUINJM5M.js.map +1 -0
- package/dist/chunk-WYIHD6EB.js +1044 -0
- package/dist/chunk-WYIHD6EB.js.map +1 -0
- package/dist/{chunk-UD6EF73X.js → chunk-XPILG2CA.js} +119 -2
- package/dist/chunk-XPILG2CA.js.map +1 -0
- package/dist/contract/index.d.ts +17 -13
- package/dist/contract/index.js +13 -7
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
- package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
- package/dist/hosted/index.d.ts +223 -2
- package/dist/index.d.ts +49 -1323
- package/dist/index.js +353 -2496
- package/dist/index.js.map +1 -1
- package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
- package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
- package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
- package/dist/openapi.json +1 -1
- package/dist/pareto-E-pembql.d.ts +81 -0
- package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
- package/dist/redact-B40YG2M_.d.ts +45 -0
- package/dist/registry-DuVYiTvw.d.ts +128 -0
- package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
- package/dist/rl.d.ts +4 -3
- package/dist/rl.js +4 -4
- package/dist/run-critic-BAIjX99r.d.ts +56 -0
- package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
- package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
- package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
- package/dist/traces.d.ts +371 -308
- package/dist/traces.js +43 -18
- package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
- package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
- package/dist/wire/index.d.ts +1 -1
- package/dist/workflow/index.d.ts +494 -0
- package/dist/workflow/index.js +2177 -0
- package/dist/workflow/index.js.map +1 -0
- package/docs/design/self-improvement-roadmap.md +106 -0
- package/package.json +36 -12
- package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
- package/dist/chunk-ODGETRTM.js.map +0 -1
- package/dist/chunk-SL55X4VN.js +0 -186
- package/dist/chunk-SL55X4VN.js.map +0 -1
- package/dist/chunk-UD6EF73X.js.map +0 -1
- /package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
|
@@ -1,82 +1,116 @@
|
|
|
1
1
|
import {
|
|
2
|
-
|
|
3
|
-
hashJson
|
|
4
|
-
} from "./chunk-VSMTAMNK.js";
|
|
5
|
-
import {
|
|
6
|
-
defaultProviderRedactor,
|
|
7
|
-
providerFromBaseUrl
|
|
8
|
-
} from "./chunk-PC4UYEBM.js";
|
|
9
|
-
import {
|
|
10
|
-
NotFoundError,
|
|
11
|
-
ReplayError
|
|
2
|
+
NotFoundError
|
|
12
3
|
} from "./chunk-3BFEG2F6.js";
|
|
13
4
|
|
|
14
|
-
// src/trace-analyst/
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
5
|
+
// src/trace-analyst/otlp-span.ts
|
|
6
|
+
function projectOtlpFlatLine(raw) {
|
|
7
|
+
const trace_id = stringField(raw, "trace_id") ?? stringField(raw, "traceId");
|
|
8
|
+
const span_id = stringField(raw, "span_id") ?? stringField(raw, "spanId");
|
|
9
|
+
if (!trace_id || !span_id) return null;
|
|
10
|
+
const parent_id = stringField(raw, "parent_span_id") ?? stringField(raw, "parentSpanId") ?? null;
|
|
11
|
+
const name = stringField(raw, "name") ?? "unknown";
|
|
12
|
+
const start_time = stringField(raw, "start_time") ?? stringField(raw, "startTime") ?? "";
|
|
13
|
+
const end_time = stringField(raw, "end_time") ?? stringField(raw, "endTime") ?? start_time;
|
|
14
|
+
const status = readOtlpStatus(raw);
|
|
15
|
+
const attributes = extractOtlpAttributes(raw);
|
|
16
|
+
const service_name = asString(attributes["service.name"]) ?? asString(attributes["resource.attributes.service.name"]) ?? null;
|
|
17
|
+
const agent_name = asString(attributes["agent.name"]) ?? asString(attributes["inference.agent.name"]) ?? asString(attributes["inference.agent_name"]) ?? null;
|
|
18
|
+
const model_name = asString(attributes["llm.model_name"]) ?? asString(attributes["inference.llm.model_name"]) ?? asString(attributes["llm.model"]) ?? null;
|
|
19
|
+
const tool_name = asString(attributes["tool.name"]) ?? asString(attributes["inference.tool.name"]) ?? null;
|
|
20
|
+
const kind = inferOtlpKind(attributes);
|
|
21
|
+
let duration_ms = 0;
|
|
22
|
+
if (start_time && end_time) {
|
|
23
|
+
const a = Date.parse(start_time);
|
|
24
|
+
const b = Date.parse(end_time);
|
|
25
|
+
if (!Number.isNaN(a) && !Number.isNaN(b)) duration_ms = Math.max(0, b - a);
|
|
26
|
+
}
|
|
27
|
+
return {
|
|
28
|
+
trace_id,
|
|
29
|
+
span_id,
|
|
30
|
+
parent_span_id: parent_id && parent_id.length > 0 ? parent_id : null,
|
|
31
|
+
name,
|
|
32
|
+
kind,
|
|
33
|
+
start_time,
|
|
34
|
+
end_time,
|
|
35
|
+
duration_ms,
|
|
36
|
+
status: status.code,
|
|
37
|
+
status_message: status.message,
|
|
38
|
+
service_name,
|
|
39
|
+
agent_name,
|
|
40
|
+
model_name,
|
|
41
|
+
tool_name,
|
|
42
|
+
attributes
|
|
43
|
+
};
|
|
44
|
+
}
|
|
45
|
+
function readOtlpStatus(raw) {
|
|
46
|
+
const status = raw.status;
|
|
47
|
+
if (status && typeof status === "object" && !Array.isArray(status)) {
|
|
48
|
+
const codeRaw = status.code;
|
|
49
|
+
const code = codeRaw === "STATUS_CODE_OK" || codeRaw === "OK" ? "OK" : codeRaw === "STATUS_CODE_ERROR" || codeRaw === "ERROR" ? "ERROR" : "UNSET";
|
|
50
|
+
const messageRaw = status.message;
|
|
51
|
+
const message = typeof messageRaw === "string" && messageRaw.length > 0 ? messageRaw : void 0;
|
|
52
|
+
return { code, message };
|
|
53
|
+
}
|
|
54
|
+
return { code: "UNSET", message: void 0 };
|
|
55
|
+
}
|
|
56
|
+
function inferOtlpKind(attrs) {
|
|
57
|
+
const opik = asString(attrs["openinference.span.kind"]) ?? asString(attrs["inference.observation_kind"]);
|
|
58
|
+
if (opik) {
|
|
59
|
+
const upper = opik.toUpperCase();
|
|
60
|
+
if (upper === "AGENT" || upper === "LLM" || upper === "TOOL" || upper === "CHAIN" || upper === "GUARDRAIL" || upper === "SPAN") {
|
|
61
|
+
return upper;
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
return "UNKNOWN";
|
|
65
|
+
}
|
|
66
|
+
function extractOtlpAttributes(raw) {
|
|
67
|
+
const out = {};
|
|
68
|
+
const resource = raw.resource;
|
|
69
|
+
if (resource && typeof resource === "object" && !Array.isArray(resource)) {
|
|
70
|
+
const ra = resource.attributes;
|
|
71
|
+
if (ra && typeof ra === "object" && !Array.isArray(ra)) {
|
|
72
|
+
for (const [k, v] of Object.entries(ra)) {
|
|
73
|
+
out[k] = v;
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
const spanAttrs = raw.attributes;
|
|
78
|
+
if (spanAttrs && typeof spanAttrs === "object" && !Array.isArray(spanAttrs)) {
|
|
79
|
+
for (const [k, v] of Object.entries(spanAttrs)) {
|
|
80
|
+
out[k] = v;
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
return out;
|
|
84
|
+
}
|
|
85
|
+
function stringField(raw, key) {
|
|
86
|
+
const v = raw[key];
|
|
87
|
+
return typeof v === "string" ? v : void 0;
|
|
88
|
+
}
|
|
89
|
+
function asString(v) {
|
|
90
|
+
return typeof v === "string" && v.length > 0 ? v : null;
|
|
91
|
+
}
|
|
92
|
+
function asNumber(v) {
|
|
93
|
+
if (typeof v === "number") return Number.isFinite(v) ? v : null;
|
|
94
|
+
if (typeof v === "string" && v.length > 0) {
|
|
95
|
+
const n = Number(v);
|
|
96
|
+
return Number.isFinite(n) ? n : null;
|
|
97
|
+
}
|
|
98
|
+
return null;
|
|
99
|
+
}
|
|
100
|
+
function firstNumberAttr(attrs, keys) {
|
|
101
|
+
for (const k of keys) {
|
|
102
|
+
const n = asNumber(attrs[k]);
|
|
103
|
+
if (n !== null) return n;
|
|
104
|
+
}
|
|
105
|
+
return null;
|
|
106
|
+
}
|
|
107
|
+
function firstStringAttr(attrs, keys) {
|
|
108
|
+
for (const k of keys) {
|
|
109
|
+
const s = asString(attrs[k]);
|
|
110
|
+
if (s !== null) return s;
|
|
111
|
+
}
|
|
112
|
+
return null;
|
|
113
|
+
}
|
|
80
114
|
|
|
81
115
|
// src/trace-analyst/types.ts
|
|
82
116
|
var DEFAULT_TRACE_ANALYST_BUDGETS = {
|
|
@@ -385,7 +419,7 @@ var OtlpFileTraceStore = class {
|
|
|
385
419
|
continue;
|
|
386
420
|
}
|
|
387
421
|
if (!parsed || typeof parsed !== "object") continue;
|
|
388
|
-
const span =
|
|
422
|
+
const span = projectOtlpFlatLine(parsed);
|
|
389
423
|
if (!span) continue;
|
|
390
424
|
let entry = byTrace.get(span.trace_id);
|
|
391
425
|
if (!entry) {
|
|
@@ -508,7 +542,7 @@ var OtlpFileTraceStore = class {
|
|
|
508
542
|
if (parsed && typeof parsed === "object") raw = parsed;
|
|
509
543
|
} catch {
|
|
510
544
|
}
|
|
511
|
-
const attrs =
|
|
545
|
+
const attrs = extractOtlpAttributes(raw);
|
|
512
546
|
const projected = {};
|
|
513
547
|
for (const [k, v] of Object.entries(attrs)) {
|
|
514
548
|
if (typeof v === "string") {
|
|
@@ -617,91 +651,6 @@ var SpanNotFoundError = class extends NotFoundError {
|
|
|
617
651
|
this.span_id = span_id;
|
|
618
652
|
}
|
|
619
653
|
};
|
|
620
|
-
function readOtlpSpan(raw) {
|
|
621
|
-
const trace_id = stringField(raw, "trace_id") ?? stringField(raw, "traceId");
|
|
622
|
-
const span_id = stringField(raw, "span_id") ?? stringField(raw, "spanId");
|
|
623
|
-
if (!trace_id || !span_id) return null;
|
|
624
|
-
const parent_id = stringField(raw, "parent_span_id") ?? stringField(raw, "parentSpanId") ?? null;
|
|
625
|
-
const name = stringField(raw, "name") ?? "unknown";
|
|
626
|
-
const start_time = stringField(raw, "start_time") ?? stringField(raw, "startTime") ?? "";
|
|
627
|
-
const end_time = stringField(raw, "end_time") ?? stringField(raw, "endTime") ?? start_time;
|
|
628
|
-
const status = readStatus(raw);
|
|
629
|
-
const attrs = extractAttributes(raw);
|
|
630
|
-
const service_name = asString(attrs["service.name"]) ?? asString(attrs["resource.attributes.service.name"]) ?? null;
|
|
631
|
-
const agent_name = asString(attrs["agent.name"]) ?? asString(attrs["inference.agent.name"]) ?? null;
|
|
632
|
-
const model_name = asString(attrs["llm.model_name"]) ?? asString(attrs["inference.llm.model_name"]) ?? null;
|
|
633
|
-
const tool_name = asString(attrs["tool.name"]) ?? asString(attrs["inference.tool.name"]) ?? null;
|
|
634
|
-
const kind = inferKind(attrs);
|
|
635
|
-
let duration_ms = 0;
|
|
636
|
-
if (start_time && end_time) {
|
|
637
|
-
const a = Date.parse(start_time);
|
|
638
|
-
const b = Date.parse(end_time);
|
|
639
|
-
if (!Number.isNaN(a) && !Number.isNaN(b)) duration_ms = Math.max(0, b - a);
|
|
640
|
-
}
|
|
641
|
-
return {
|
|
642
|
-
trace_id,
|
|
643
|
-
span_id,
|
|
644
|
-
parent_span_id: parent_id && parent_id.length > 0 ? parent_id : null,
|
|
645
|
-
name,
|
|
646
|
-
kind,
|
|
647
|
-
start_time,
|
|
648
|
-
end_time,
|
|
649
|
-
duration_ms,
|
|
650
|
-
status: status.code,
|
|
651
|
-
status_message: status.message,
|
|
652
|
-
service_name,
|
|
653
|
-
agent_name,
|
|
654
|
-
model_name,
|
|
655
|
-
tool_name
|
|
656
|
-
};
|
|
657
|
-
}
|
|
658
|
-
function readStatus(raw) {
|
|
659
|
-
const status = raw.status;
|
|
660
|
-
if (status && typeof status === "object" && !Array.isArray(status)) {
|
|
661
|
-
const codeRaw = status.code;
|
|
662
|
-
const code = codeRaw === "STATUS_CODE_OK" || codeRaw === "OK" ? "OK" : codeRaw === "STATUS_CODE_ERROR" || codeRaw === "ERROR" ? "ERROR" : "UNSET";
|
|
663
|
-
const messageRaw = status.message;
|
|
664
|
-
const message = typeof messageRaw === "string" && messageRaw.length > 0 ? messageRaw : void 0;
|
|
665
|
-
return { code, message };
|
|
666
|
-
}
|
|
667
|
-
return { code: "UNSET", message: void 0 };
|
|
668
|
-
}
|
|
669
|
-
function inferKind(attrs) {
|
|
670
|
-
const opik = asString(attrs["openinference.span.kind"]) ?? asString(attrs["inference.observation_kind"]);
|
|
671
|
-
if (opik) {
|
|
672
|
-
const upper = opik.toUpperCase();
|
|
673
|
-
if (upper === "AGENT" || upper === "LLM" || upper === "TOOL" || upper === "CHAIN" || upper === "GUARDRAIL" || upper === "SPAN") {
|
|
674
|
-
return upper;
|
|
675
|
-
}
|
|
676
|
-
}
|
|
677
|
-
return "UNKNOWN";
|
|
678
|
-
}
|
|
679
|
-
function extractAttributes(raw) {
|
|
680
|
-
const out = {};
|
|
681
|
-
const resource = raw.resource;
|
|
682
|
-
if (resource && typeof resource === "object" && !Array.isArray(resource)) {
|
|
683
|
-
const ra = resource.attributes;
|
|
684
|
-
if (ra && typeof ra === "object" && !Array.isArray(ra)) {
|
|
685
|
-
for (const [k, v] of Object.entries(ra)) {
|
|
686
|
-
out[k] = v;
|
|
687
|
-
}
|
|
688
|
-
}
|
|
689
|
-
}
|
|
690
|
-
const spanAttrs = raw.attributes;
|
|
691
|
-
if (spanAttrs && typeof spanAttrs === "object" && !Array.isArray(spanAttrs)) {
|
|
692
|
-
for (const [k, v] of Object.entries(spanAttrs)) {
|
|
693
|
-
out[k] = v;
|
|
694
|
-
}
|
|
695
|
-
}
|
|
696
|
-
return out;
|
|
697
|
-
}
|
|
698
|
-
function stringField(raw, key) {
|
|
699
|
-
const v = raw[key];
|
|
700
|
-
return typeof v === "string" ? v : void 0;
|
|
701
|
-
}
|
|
702
|
-
function asString(v) {
|
|
703
|
-
return typeof v === "string" && v.length > 0 ? v : null;
|
|
704
|
-
}
|
|
705
654
|
function isPresent(v) {
|
|
706
655
|
return v !== void 0;
|
|
707
656
|
}
|
|
@@ -944,42 +893,109 @@ function assertStringArray(v, label) {
|
|
|
944
893
|
return v;
|
|
945
894
|
}
|
|
946
895
|
|
|
947
|
-
// src/trace-analyst/
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
896
|
+
// src/trace-analyst/prompts.ts
|
|
897
|
+
var TRACE_ANALYST_ACTOR_DESCRIPTION = `You answer questions about an OTLP-shaped JSONL trace dataset using the trace tools provided in the \`traces\` namespace.
|
|
898
|
+
|
|
899
|
+
DISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol \u2014 follow exactly:
|
|
900
|
+
|
|
901
|
+
1. ALWAYS call \`traces.getDatasetOverview({})\` FIRST without a regex_pattern. The result tells you total_traces, raw_jsonl_bytes, services, agents, models, and sample_trace_ids (real ids \u2014 never fabricate one).
|
|
902
|
+
|
|
903
|
+
2. Use raw_jsonl_bytes to gauge how expensive raw scans will be. \`filters.regex_pattern\` is the one scan-heavy filter on getDatasetOverview / queryTraces / countTraces \u2014 narrow with indexed fields (has_errors, model_names, service_names, agent_names, time bounds) BEFORE adding a regex on a large dataset.
|
|
904
|
+
|
|
905
|
+
3. To list more traces than the sample, call \`traces.queryTraces({ filters?, limit, offset? })\`. Each summary carries raw_jsonl_bytes \u2014 use it to choose between viewTrace and searchTrace BEFORE calling either.
|
|
906
|
+
|
|
907
|
+
4. Per-trace inspection:
|
|
908
|
+
- SMALL trace (raw_jsonl_bytes well under 150_000): call \`traces.viewTrace({ trace_id })\`. Returns all spans. Per-attribute payloads are head-capped at ~4KB; large \`input.value\` / \`output.value\` / \`llm.input_messages\` will show a \`[trace-analyst truncated: N bytes]\` marker.
|
|
909
|
+
- LARGE trace (raw_jsonl_bytes near or above 150_000, or you saw an \`oversized\` response): use \`traces.searchTrace({ trace_id, regex_pattern })\` to get bounded SpanMatchRecords (span metadata + matched text + surrounding context). Then call \`traces.viewSpans({ trace_id, span_ids: [...] })\` for surgical reads (~16KB cap, 4\xD7 higher than discovery), or \`traces.searchSpan({ trace_id, span_id, regex_pattern })\` for one large span. Stays bounded regardless of trace size.
|
|
910
|
+
- Useful regex patterns: \`STATUS_CODE_ERROR\` (failures), tool names like \`grep\` or \`view_trace\`, error strings like \`MaxTurnsExceeded\`, model names, attribute keys.
|
|
911
|
+
|
|
912
|
+
5. ONLY call viewTrace / viewSpans / searchTrace / searchSpan with trace/span ids you have already seen in sample_trace_ids, a queryTraces page, or a previous search result. Never invent ids.
|
|
913
|
+
|
|
914
|
+
5a. **Result-shape contract** \u2014 searchTrace and searchSpan return \`{ trace_id, hits, total_matches, has_more }\`. Iterate \`result.hits\` (NOT result.matches). Each hit has \`{ span_id, span_name, span_kind, attribute_path, matched_text, context_before, context_after, match_offset }\`. viewTrace returns \`{ trace_id, spans }\` (or \`oversized\`). viewSpans returns \`{ trace_id, spans, missing_span_ids, truncated_attribute_count }\`. Never assume a field name \u2014 log the result shape first if unsure.
|
|
915
|
+
|
|
916
|
+
6. If viewTrace returns an \`oversized\` summary instead of \`spans\`, DO NOT retry the same call. Read the summary's top_span_names, span_count, span_response_bytes_max, error_span_count to plan a follow-up: switch to searchTrace (or searchSpan for one large span), then viewSpans on a smaller, surgical span_ids set.
|
|
917
|
+
|
|
918
|
+
7. If searchTrace or searchSpan returns has_more=true, REFINE the regex to be more specific rather than blindly raising max_matches.
|
|
919
|
+
|
|
920
|
+
8. If a tool errors (invalid regex, range error), STOP and reconsider \u2014 don't retry with a guessed id or argument. Use the discovery tools above to recover.
|
|
921
|
+
|
|
922
|
+
9. If a ~4KB-truncated payload from viewTrace / searchTrace matters for your answer, first try viewSpans on that span id (~16KB cap). If a 16KB-truncated payload from viewSpans still matters, narrow further with searchSpan against a more specific regex rather than asking for the full payload again.
|
|
923
|
+
|
|
924
|
+
10. If maxDepth > 0 and the question splits into independent semantic branches, delegate well-defined subtasks to subagents using \`await llmQuery(...)\`. Pass narrow context and a focused query. Examples:
|
|
925
|
+
|
|
926
|
+
const reviews = await llmQuery([
|
|
927
|
+
{ query: 'Drill into trace abc123 \u2014 what tool calls preceded the failure?', context: { trace_id: 'abc123' } },
|
|
928
|
+
{ query: 'Drill into trace def456 \u2014 same failure mode?', context: { trace_id: 'def456' } },
|
|
929
|
+
]);
|
|
930
|
+
|
|
931
|
+
OBSERVABILITY rules:
|
|
932
|
+
- Each non-final actor turn must emit at least one \`console.log(...)\` for evidence. Up to 3 logs per turn is fine when correlating multiple data sources (e.g. one log for findings list, one for source-file content, one for derived analysis).
|
|
933
|
+
- Do NOT combine \`console.log\` with \`final(...)\` or \`askClarification(...)\` in the same turn \u2014 finish gathering data first, then call final on its own turn.
|
|
934
|
+
- Reuse runtime variables across turns; don't recompute.
|
|
935
|
+
- When done, call \`await final(answer)\` with the fully-formed report. The responder rewrites the answer into output fields; if you only pass a vague summary string the responder has nothing concrete to format.
|
|
936
|
+
|
|
937
|
+
CRITICAL \u2014 \`final()\` payload contract for evidence-grounded analysis tasks:
|
|
938
|
+
- Pass a STRUCTURED object as the second arg with the actual data the responder needs to format the answer. Do NOT pass abstract instructions; pass evidence.
|
|
939
|
+
- Example for per-item verdict tasks:
|
|
940
|
+
\`\`\`js
|
|
941
|
+
await final("Format the per-item verdict report from the evidence below.", {
|
|
942
|
+
findings: [
|
|
943
|
+
{ id: 'sub-1-finding-1', claim: '...', verdict: 'TRUE-POSITIVE', evidence: 'lines 42-45 of contracts/X.sol show ...' },
|
|
944
|
+
...all items
|
|
945
|
+
],
|
|
946
|
+
systemic_summary: '3 sentences I wrote based on the evidence above'
|
|
947
|
+
});
|
|
948
|
+
\`\`\`
|
|
949
|
+
- Calling \`final("answer", {})\` with no evidence is a failure mode \u2014 the responder will hallucinate or echo back the field names. Always include the gathered data.
|
|
950
|
+
- Premature final after a single viewSpans call is INSUFFICIENT for per-finding analysis tasks. Read the requested attributes (e.g. \`spans[i].attributes['redteam.finding.title']\`), and for each one perform the requested cross-reference (e.g. read the source SPAN's \`attributes['source.content']\`).
|
|
951
|
+
|
|
952
|
+
OUTPUT contract \u2014 your final answer must include:
|
|
953
|
+
- A clear prose conclusion answering the user's question.
|
|
954
|
+
- Trace ids and span ids cited as evidence for each claim.
|
|
955
|
+
- Failure modes named in the user's domain language, with frequency and concrete examples.
|
|
956
|
+
|
|
957
|
+
Do NOT invent trace ids, span ids, error messages, or model names. Every fact must be traceable to a tool result.`;
|
|
958
|
+
var TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION = "trace-analyst-actor-v5-2026-05-06";
|
|
959
|
+
var TRACE_ANALYST_SUBAGENT_DESCRIPTION = `You are a trace-analyst subagent. Your parent has delegated a focused trace-inspection question. Use the same DISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol but stay tightly scoped: do exactly what was asked, return a concise compact answer, do NOT spawn further subagents unless the parent's question is genuinely multi-branch.
|
|
960
|
+
|
|
961
|
+
Cite trace ids and span ids for every claim. Do NOT invent ids.`;
|
|
962
|
+
|
|
963
|
+
// src/trace-analyst/analyst.ts
|
|
964
|
+
import { AxJSRuntime, agent } from "@ax-llm/ax";
|
|
965
|
+
async function analyzeTraces(input, options) {
|
|
966
|
+
if (!input.question || typeof input.question !== "string") {
|
|
967
|
+
throw new TypeError("analyzeTraces: input.question must be a non-empty string");
|
|
968
|
+
}
|
|
969
|
+
const store = typeof options.source === "string" ? new OtlpFileTraceStore({ path: options.source }) : options.source;
|
|
970
|
+
if (store instanceof OtlpFileTraceStore) {
|
|
971
|
+
await store.ensureIndexed();
|
|
972
|
+
}
|
|
973
|
+
const tools = buildTraceAnalystTools({ store });
|
|
974
|
+
const turns = [];
|
|
975
|
+
let progressFs;
|
|
976
|
+
if (options.progressLogPath) {
|
|
977
|
+
const { createWriteStream } = await import("fs");
|
|
978
|
+
const { mkdir } = await import("fs/promises");
|
|
979
|
+
const { dirname } = await import("path");
|
|
980
|
+
await mkdir(dirname(options.progressLogPath), { recursive: true });
|
|
981
|
+
progressFs = createWriteStream(options.progressLogPath, { flags: "a" });
|
|
982
|
+
}
|
|
983
|
+
const actorTurnCallback = async (turn) => {
|
|
984
|
+
const snap = {
|
|
985
|
+
turn: turn.turn,
|
|
986
|
+
isError: turn.isError,
|
|
987
|
+
code: turn.code,
|
|
988
|
+
output: turn.output,
|
|
989
|
+
thought: turn.thought
|
|
990
|
+
};
|
|
991
|
+
turns.push(snap);
|
|
992
|
+
if (progressFs) {
|
|
993
|
+
try {
|
|
994
|
+
progressFs.write(`${JSON.stringify({ ...snap, ts: Date.now() })}
|
|
995
|
+
`);
|
|
996
|
+
} catch {
|
|
997
|
+
}
|
|
998
|
+
}
|
|
983
999
|
if (options.onTurn) await options.onTurn(snap);
|
|
984
1000
|
};
|
|
985
1001
|
const maxDepth = options.maxDepth ?? 1;
|
|
@@ -987,7 +1003,11 @@ async function analyzeTraces(input, options) {
|
|
|
987
1003
|
const maxParallelSubagents = options.maxParallelSubagents ?? 2;
|
|
988
1004
|
const maxRuntimeChars = options.maxRuntimeChars ?? 6e3;
|
|
989
1005
|
const analyst = agent(
|
|
990
|
-
|
|
1006
|
+
// `reasoning!` is an internal (Ax `!`) scratchpad field: generated first to
|
|
1007
|
+
// force reason-before-conclude, stripped from the returned output — so the
|
|
1008
|
+
// consumed shape stays { answer, findings }. Brings the trace-analyst to the
|
|
1009
|
+
// same prose-first CoT ordering the kind-factory gets from its `report` field.
|
|
1010
|
+
"question:string -> reasoning!:string, answer:string, findings:string[]",
|
|
991
1011
|
{
|
|
992
1012
|
agentIdentity: {
|
|
993
1013
|
name: "TraceAnalyst",
|
|
@@ -1060,1197 +1080,16 @@ function normalizeRecordArray(value) {
|
|
|
1060
1080
|
);
|
|
1061
1081
|
}
|
|
1062
1082
|
|
|
1063
|
-
// src/trace-analyst/hook.ts
|
|
1064
|
-
var DEFAULT_QUESTION = "Summarise what happened in this run. Surface any failure modes, surprising findings, or evidence that the run's verdict is wrong.";
|
|
1065
|
-
function traceAnalystOnRunComplete(opts) {
|
|
1066
|
-
return async (ctx) => {
|
|
1067
|
-
if (opts.shouldRun && !opts.shouldRun(ctx)) return;
|
|
1068
|
-
const source = opts.analyze.source;
|
|
1069
|
-
if (source === void 0) {
|
|
1070
|
-
await ctx.store.appendEvent({
|
|
1071
|
-
eventId: `analyst-skip-${ctx.runId}`,
|
|
1072
|
-
runId: ctx.runId,
|
|
1073
|
-
kind: "log",
|
|
1074
|
-
timestamp: Date.now(),
|
|
1075
|
-
payload: { source: "trace_analyst_hook", reason: "no source configured" }
|
|
1076
|
-
});
|
|
1077
|
-
return;
|
|
1078
|
-
}
|
|
1079
|
-
const result = await analyzeTraces({ question: opts.question ?? DEFAULT_QUESTION }, {
|
|
1080
|
-
...opts.analyze,
|
|
1081
|
-
source
|
|
1082
|
-
});
|
|
1083
|
-
if (opts.save) await opts.save(result, ctx);
|
|
1084
|
-
if (opts.gateOn && !opts.gateOn(result, ctx)) {
|
|
1085
|
-
await ctx.store.appendEvent({
|
|
1086
|
-
eventId: `analyst-gate-${ctx.runId}`,
|
|
1087
|
-
runId: ctx.runId,
|
|
1088
|
-
kind: "log",
|
|
1089
|
-
timestamp: Date.now(),
|
|
1090
|
-
payload: {
|
|
1091
|
-
source: "trace_analyst_hook",
|
|
1092
|
-
reason: "analyst_gate_failed",
|
|
1093
|
-
findings: result.findings
|
|
1094
|
-
}
|
|
1095
|
-
});
|
|
1096
|
-
}
|
|
1097
|
-
};
|
|
1098
|
-
}
|
|
1099
|
-
|
|
1100
|
-
// src/trace-analyst/insights.ts
|
|
1101
|
-
var DOMAIN_STOP_WORDS = /* @__PURE__ */ new Set([
|
|
1102
|
-
"and",
|
|
1103
|
-
"advanced",
|
|
1104
|
-
"app",
|
|
1105
|
-
"build",
|
|
1106
|
-
"create",
|
|
1107
|
-
"easy",
|
|
1108
|
-
"expert",
|
|
1109
|
-
"extreme",
|
|
1110
|
-
"for",
|
|
1111
|
-
"from",
|
|
1112
|
-
"hard",
|
|
1113
|
-
"implementation",
|
|
1114
|
-
"integrate",
|
|
1115
|
-
"medium",
|
|
1116
|
-
"project",
|
|
1117
|
-
"task",
|
|
1118
|
-
"the",
|
|
1119
|
-
"this",
|
|
1120
|
-
"with",
|
|
1121
|
-
"workflow"
|
|
1122
|
-
]);
|
|
1123
|
-
function tokenizeDomainWords(value) {
|
|
1124
|
-
return [...value.matchAll(/[A-Za-z][A-Za-z0-9.+#-]{2,}/g)].map((match) => match[0].toLowerCase()).filter((word) => !DOMAIN_STOP_WORDS.has(word));
|
|
1125
|
-
}
|
|
1126
|
-
function inferDomainKeywords(suite) {
|
|
1127
|
-
const suiteWords = new Set(tokenizeDomainWords(`${suite.name} ${suite.collectionId ?? ""}`));
|
|
1128
|
-
const source = [
|
|
1129
|
-
suite.name,
|
|
1130
|
-
suite.collectionId ?? "",
|
|
1131
|
-
...suite.tasks.flatMap((task) => [
|
|
1132
|
-
task.id,
|
|
1133
|
-
task.name,
|
|
1134
|
-
task.prompt ?? "",
|
|
1135
|
-
task.difficulty ?? "",
|
|
1136
|
-
...task.tags ?? [],
|
|
1137
|
-
...task.gaps ?? []
|
|
1138
|
-
])
|
|
1139
|
-
].join(" ");
|
|
1140
|
-
const counts = /* @__PURE__ */ new Map();
|
|
1141
|
-
for (const word of tokenizeDomainWords(source)) counts.set(word, (counts.get(word) ?? 0) + 1);
|
|
1142
|
-
return [...counts.entries()].filter(([word, count]) => count >= 2 || suiteWords.has(word)).sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0])).map(([word]) => word).slice(0, 18);
|
|
1143
|
-
}
|
|
1144
|
-
function domainEvidencePattern(keywords) {
|
|
1145
|
-
const escaped = keywords.filter((keyword) => keyword.length >= 3).map((keyword) => keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
|
|
1146
|
-
return escaped.length > 0 ? new RegExp(`(?<![A-Za-z0-9])(?:${escaped.join("|")})(?![A-Za-z0-9])`, "i") : /(?<![A-Za-z0-9])(?:sdk|api|css|dns|xml|provider|client|service|integration|webhook|transaction|auth|oauth|graphql|rest)(?![A-Za-z0-9])/i;
|
|
1147
|
-
}
|
|
1148
|
-
function describeTraceInsightScope(suite) {
|
|
1149
|
-
const taskLabel = suite.tasks.length === 1 ? "1 implementation task" : `${suite.tasks.length} implementation tasks`;
|
|
1150
|
-
const tags = /* @__PURE__ */ new Map();
|
|
1151
|
-
for (const task of suite.tasks) {
|
|
1152
|
-
for (const tag of task.tags ?? []) tags.set(tag, (tags.get(tag) ?? 0) + 1);
|
|
1153
|
-
}
|
|
1154
|
-
const topTags = [...tags.entries()].sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0])).slice(0, 8).map(([tag]) => tag);
|
|
1155
|
-
if (topTags.length > 0) return `${taskLabel} across ${topTags.join(", ")}.`;
|
|
1156
|
-
const difficulties = [
|
|
1157
|
-
...new Set(
|
|
1158
|
-
suite.tasks.map((task) => task.difficulty).filter((value) => Boolean(value))
|
|
1159
|
-
)
|
|
1160
|
-
].join(", ");
|
|
1161
|
-
return `${taskLabel} across ${difficulties || "the selected benchmark scope"}.`;
|
|
1162
|
-
}
|
|
1163
|
-
function planTraceInsightQuestions(input) {
|
|
1164
|
-
const hasFailures = input.suite.tasks.some((task) => task.outcome && task.outcome !== "satisfied");
|
|
1165
|
-
const hasMultipleShots = input.suite.tasks.some(
|
|
1166
|
-
(task) => (task.gaps ?? []).some((gap) => /shot|review|retry|continue/i.test(gap))
|
|
1167
|
-
);
|
|
1168
|
-
const questions = [
|
|
1169
|
-
{
|
|
1170
|
-
id: "execution-path",
|
|
1171
|
-
question: "What did the worker actually do before the first meaningful implementation edit?",
|
|
1172
|
-
why: "Separates grounded execution from polished but shallow output."
|
|
1173
|
-
},
|
|
1174
|
-
{
|
|
1175
|
-
id: "research-grounding",
|
|
1176
|
-
question: "Did the worker inspect docs, source, examples, or package references before committing to an implementation path?",
|
|
1177
|
-
why: "Identifies whether failures came from weak retrieval, weak examples, or premature coding."
|
|
1178
|
-
},
|
|
1179
|
-
{
|
|
1180
|
-
id: "domain-proof",
|
|
1181
|
-
question: "Which tasks produced executable domain proof versus UI copy, placeholders, or inferred behavior?",
|
|
1182
|
-
why: "Keeps product-quality claims tied to concrete evidence."
|
|
1183
|
-
},
|
|
1184
|
-
{
|
|
1185
|
-
id: "root-cause",
|
|
1186
|
-
question: "For each major failure cluster, is the likely root cause prompt/scaffold, docs/examples, SDK/API ergonomics, evaluator, runtime, or model behavior?",
|
|
1187
|
-
why: "Turns trace observations into actionable ownership."
|
|
1188
|
-
},
|
|
1189
|
-
{
|
|
1190
|
-
id: "evidence-quality",
|
|
1191
|
-
question: "Which external-facing claims are directly supported by trace ids, span ids, verifier findings, reviewer notes, or generated code?",
|
|
1192
|
-
why: "Prevents unsupported customer-report conclusions."
|
|
1193
|
-
}
|
|
1194
|
-
];
|
|
1195
|
-
if (hasMultipleShots) {
|
|
1196
|
-
questions.push({
|
|
1197
|
-
id: "reviewer-lift",
|
|
1198
|
-
question: "Where did reviewer feedback improve score, stall, or regress across shots?",
|
|
1199
|
-
why: "Shows whether the driver loop is learning or merely repeating work."
|
|
1200
|
-
});
|
|
1201
|
-
}
|
|
1202
|
-
if (hasFailures) {
|
|
1203
|
-
questions.push({
|
|
1204
|
-
id: "optimization-targets",
|
|
1205
|
-
question: "Which prompt, evaluator, scaffold, or workflow changes should feed the next GEPA/autoresearch optimization run?",
|
|
1206
|
-
why: "Connects benchmark evidence to the optimization loop."
|
|
1207
|
-
});
|
|
1208
|
-
}
|
|
1209
|
-
return questions;
|
|
1210
|
-
}
|
|
1211
|
-
/**
 * Assemble the shared context object consumed by trace-insight scoring and
 * prompt building.
 *
 * @param {object} input - Must carry `suite`; `findings`, `agent` and `totals`
 *   are optional.
 * @returns {object} The suite plus its described scope, inferred keywords,
 *   planned questions and default panel, with `findings` normalised to `[]`
 *   and `agent` / `totals` normalised to `null` when nullish.
 */
function buildTraceInsightContext(input) {
  const { suite } = input;
  const scope = describeTraceInsightScope(suite);
  const keywords = inferDomainKeywords(suite);
  // Question planning receives the whole input, not just the suite.
  const questions = planTraceInsightQuestions(input);
  const panel = defaultTraceInsightPanel();
  const findings = input.findings ?? [];
  const agent = input.agent ?? null;
  const totals = input.totals ?? null;
  return { suite, scope, keywords, questions, panel, findings, agent, totals };
}
|
|
1223
|
-
/**
 * Score how ready a trace-insight context is for reporting.
 *
 * Four gates are evaluated; each failed gate subtracts a severity-weighted
 * penalty (critical 35, high 20, medium 10, anything else 5) from a perfect
 * score of 1, and the result is clamped to the range [0, 1].
 *
 * @param {object} context - Reads `suite.tasks`, `findings`, `keywords`,
 *   `panel` and `questions`.
 * @returns {{score: number, grade: string, gates: object[]}} The score, its
 *   grade ("external-ready" at >= 0.9, "internal-review" at >= 0.7, otherwise
 *   "raw-analysis") and the evaluated gates.
 */
function scoreTraceInsightReadiness(context) {
  const { tasks } = context.suite;
  // A task is "failed" only when it reports an outcome other than "satisfied".
  const failedTasks = tasks.filter((task) => task.outcome && task.outcome !== "satisfied");
  const coveredTaskIds = new Set(context.findings.flatMap(({ taskIds }) => taskIds));
  const failedTasksWithFindings = failedTasks.filter(({ id }) => coveredTaskIds.has(id));
  // NOTE(review): this counts gaps on ALL tasks, yet the gate below divides by
  // the failed-task count — confirm whether failed tasks only were intended.
  const tasksWithGaps = tasks.filter(({ gaps }) => (gaps ?? []).length > 0);

  const noFailures = failedTasks.length === 0;
  const hasKeywords = context.keywords.length > 0;

  const domainContextGate = {
    id: "domain-context",
    label: "Domain context inferred",
    passed: hasKeywords,
    severity: "high",
    detail: hasKeywords
      ? `${context.keywords.length} domain terms inferred: ${context.keywords.slice(0, 8).join(", ")}`
      : "No domain terms were inferred from suite, tasks, prompts, tags, or gaps."
  };
  const panelCoverageGate = {
    id: "panel-coverage",
    label: "Analyst panel planned",
    passed: context.panel.length >= 4 && context.questions.length >= 5,
    severity: "high",
    detail: `${context.panel.length} panel roles and ${context.questions.length} investigation questions planned.`
  };
  const failureCoverageGate = {
    id: "failure-coverage",
    label: "Failures mapped to findings",
    passed: noFailures || failedTasksWithFindings.length / failedTasks.length >= 0.5,
    severity: "critical",
    detail: noFailures
      ? "No failed tasks in suite."
      : `${failedTasksWithFindings.length}/${failedTasks.length} failed tasks appear in finding clusters.`
  };
  const gapEvidenceGate = {
    id: "gap-evidence",
    label: "Task gaps captured",
    passed: noFailures || tasksWithGaps.length / failedTasks.length >= 0.5,
    severity: "medium",
    detail: `${tasksWithGaps.length} tasks include explicit evaluator or analyst gaps.`
  };
  const gates = [domainContextGate, panelCoverageGate, failureCoverageGate, gapEvidenceGate];

  const penaltyBySeverity = new Map([
    ["critical", 35],
    ["high", 20],
    ["medium", 10]
  ]);
  let penalty = 0;
  for (const gate of gates) {
    if (gate.passed) continue;
    penalty += penaltyBySeverity.get(gate.severity) ?? 5;
  }

  const score = Math.max(0, Math.min(1, 1 - penalty / 100));
  let grade = "raw-analysis";
  if (score >= 0.9) {
    grade = "external-ready";
  } else if (score >= 0.7) {
    grade = "internal-review";
  }
  return { score, grade, gates };
}
|
|
1274
|
-
/**
 * Build the default four-role analyst panel.
 *
 * @returns {{id: string, name: string, responsibility: string}[]} A freshly
 *   allocated array on every call, so callers may mutate it freely.
 */
function defaultTraceInsightPanel() {
  const roles = [
    [
      "trace-forensics",
      "Trace Forensics",
      "Reconstruct what the worker did in order, including research, edits, reviewer interventions, verifier feedback, and stop reason."
    ],
    [
      "root-cause",
      "Root Cause",
      "Map failures to prompt/scaffold, docs/examples, SDK/API/product ergonomics, evaluator, runtime, or model behavior."
    ],
    [
      "optimization",
      "Optimization",
      "Identify prompt, reviewer, evaluator, scaffold, and GEPA/autoresearch changes that should be tested next."
    ],
    [
      "external-evidence",
      "External Evidence",
      "Separate customer-safe claims from internal harness findings and reject conclusions without task, trace, span, code, reviewer, or verifier evidence."
    ]
  ];
  return roles.map(([id, name, responsibility]) => ({ id, name, responsibility }));
}
|
|
1298
|
-
/**
 * Render the analyst prompt for a benchmark run.
 *
 * The prompt lists the planned investigation questions, the analyst panel, the
 * required output sections, an inspection budget, and a JSON run summary.
 *
 * @param {object} input - Same input accepted by `buildTraceInsightContext`;
 *   `maxRepresentativeTraces` (default 6) caps the trace budget line.
 * @returns {string} The complete prompt text.
 */
function buildTraceInsightPrompt(input) {
  const context = buildTraceInsightContext(input);
  const traceBudget = input.maxRepresentativeTraces ?? 6;

  const investigationPlan = context.questions
    .map((item, index) => `${index + 1}. ${item.question} (${item.why})`)
    .join("\n");
  const panelRoster = context.panel
    .map((role) => `- ${role.name}: ${role.responsibility}`)
    .join("\n");

  // Findings are summarised by count only; task ids stay out of the prompt here.
  const findingSummaries = context.findings.map((finding) => ({
    kind: finding.kind,
    severity: finding.severity,
    taskCount: finding.taskIds.length,
    proposedFixClass: finding.proposedFixClass
  }));
  const failureSummaries = input.suite.tasks
    .filter((task) => task.outcome && task.outcome !== "satisfied")
    .map((task) => ({
      task: task.id,
      difficulty: task.difficulty,
      outcome: task.outcome,
      score: task.score,
      gaps: task.gaps ?? []
    }));
  const runSummary = JSON.stringify(
    {
      suite: input.suite.name,
      scope: context.scope,
      inferredKeywords: context.keywords,
      agent: context.agent,
      totals: context.totals,
      findings: findingSummaries,
      failures: failureSummaries
    },
    null,
    2
  );

  return `Analyze this benchmark run and produce evidence-backed trace intelligence.

Audience:
- internal AI/product leadership
- possible customer-facing report for ${input.suite.name}

Investigation plan:
${investigationPlan}

Analyst panel:
${panelRoster}

If the task branches are independent, use subagents for the panel roles above and aggregate their findings. Do not run a panel role unless its answer will change the final report.

Required output:
1. Executive verdict: what this run proves and does not prove.
2. The investigation questions you answered and the evidence used.
3. Failure taxonomy: agent prompting, evaluator/harness, docs/examples, SDK/API/product integration, infra.
4. Evidence-backed examples with trace ids/task ids and concrete verifier findings.
5. Highest-ROI fixes for the benchmark harness, prompt/GEPA optimization, and customer-facing product/docs surface.
6. What is safe for an external report versus what must stay internal.
7. One rerun plan that would validate lift after optimization.

Budget:
- Inspect the dataset overview, the failure summary, and at most ${traceBudget} representative traces.
- Prefer traces named in the failure summary over broad exploration.
- Do not do exhaustive trace sweeps.
- Return the final report as soon as the taxonomy and examples are supported.

Run summary:
${runSummary}

Use the trace tools. Do not invent facts. Cite task ids. Separate customer-facing claims from internal harness/model findings.`;
}
|
|
1358
|
-
|
|
1359
|
-
// src/trace-analyst/otlp-flatten.ts
|
|
1360
|
-
// Numeric span-kind codes mapped to their SPAN_KIND_* names; the array index
// is the numeric code.
var DEFAULT_KIND_MAP = Object.fromEntries(
  ["UNSPECIFIED", "INTERNAL", "SERVER", "CLIENT", "PRODUCER", "CONSUMER"].map(
    (suffix, code) => [code, `SPAN_KIND_${suffix}`]
  )
);

// Numeric status codes mapped to their STATUS_CODE_* names; the array index
// is the numeric code.
var STATUS_MAP = Object.fromEntries(
  ["UNSET", "OK", "ERROR"].map((suffix, code) => [code, `STATUS_CODE_${suffix}`])
);
|
|
1373
|
-
/**
 * Unwrap a typed attribute value into a plain JavaScript value.
 *
 * Fields are probed in the order string, int, double, bool; the first one that
 * is not `undefined` wins. `intValue` is converted with `Number()`.
 *
 * @param {object} v - Wrapper such as `{ stringValue: "x" }`.
 * @returns {string|number|boolean} The unwrapped value, or `""` when none of
 *   the four fields is set.
 */
function attrValue(v) {
  for (const field of ["stringValue", "intValue", "doubleValue", "boolValue"]) {
    const raw = v[field];
    if (raw === undefined) continue;
    return field === "intValue" ? Number(raw) : raw;
  }
  return "";
}
|
|
1380
|
-
/**
 * Collapse a list of `{ key, value }` attributes into a plain object.
 *
 * @param {Iterable<{key: string, value: object}>} attrs - Attribute entries;
 *   each value is unwrapped with `attrValue`.
 * @returns {object} Key/value record; a later duplicate key overwrites an
 *   earlier one.
 */
function attrsToRecord(attrs) {
  const record = {};
  for (const { key, value } of attrs) {
    record[key] = attrValue(value);
  }
  return record;
}
|
|
1385
|
-
/**
 * Convert a Unix timestamp in nanoseconds to an ISO-8601 string.
 *
 * @param {string|number|bigint} nano - Nanoseconds since the Unix epoch.
 * @returns {string} ISO timestamp. The epoch (`1970-01-01T00:00:00.000Z`) is
 *   returned whenever the input cannot be represented as a Date: missing,
 *   non-numeric, infinite, or beyond the +/-8.64e15 ms range a Date supports.
 */
function nanoToIso(nano) {
  const date = new Date(Number(nano) / 1e6);
  // getTime() is NaN for every invalid Date, which also covers finite values
  // outside the supported range; toISOString() would throw RangeError on those.
  const usable = Number.isNaN(date.getTime()) ? /* @__PURE__ */ new Date(0) : date;
  return usable.toISOString();
}
|
|
1389
|
-
/**
 * Add alias keys to a span attribute record, in place.
 *
 * Each alias is written only when its source key is present and the alias key
 * is not already set, so existing values are never overwritten.
 *
 * @param {object} attrs - Attribute record to mutate.
 * @returns {void}
 */
function applyOpenInference(attrs) {
  const identity = (value) => value;
  const aliases = [
    ["llm.model", "llm.model_name", identity],
    ["tool.name", "inference.tool.name", identity],
    // The span kind alias is stored upper-cased.
    ["span.kind", "openinference.span.kind", (value) => String(value).toUpperCase()]
  ];
  for (const [source, alias, convert] of aliases) {
    if (!(source in attrs) || alias in attrs) continue;
    attrs[alias] = convert(attrs[source]);
  }
}
|
|
1400
|
-
/**
 * Flatten an OTLP export into one plain record per span.
 *
 * @param {object} otlpExport - Object with an optional `resourceSpans` list.
 * @param {object} [opts]
 * @param {string} [opts.attributeVocabulary="openinference"] - When equal to
 *   "openinference", alias attributes are added via `applyOpenInference`.
 * @param {object} [opts.kindMap] - Overrides merged over `DEFAULT_KIND_MAP`.
 * @returns {object[]} Flat span records (objects, not serialised strings), in
 *   document order. Spans under the same resource share one `resource` object.
 */
function flattenOtlpExportToNdjson(otlpExport, opts = {}) {
  const useOpenInference = (opts.attributeVocabulary ?? "openinference") === "openinference";
  const kindMap = { ...DEFAULT_KIND_MAP, ...opts.kindMap };

  const flattenEvent = (event) => {
    const flat = { name: event.name, timeUnixNano: event.timeUnixNano };
    if (event.attributes) flat.attributes = attrsToRecord(event.attributes);
    return flat;
  };

  const flattenSpan = (span, resource) => {
    const attributes = attrsToRecord(span.attributes ?? []);
    if (useOpenInference) applyOpenInference(attributes);

    const status = { code: STATUS_MAP[span.status?.code ?? 0] ?? "STATUS_CODE_UNSET" };
    if (span.status?.message) status.message = span.status.message;

    const line = {
      trace_id: span.traceId,
      span_id: span.spanId,
      parent_span_id: span.parentSpanId ?? null,
      name: span.name,
      kind: kindMap[span.kind] ?? "SPAN_KIND_UNSPECIFIED",
      start_time: nanoToIso(span.startTimeUnixNano),
      end_time: nanoToIso(span.endTimeUnixNano),
      status,
      resource,
      attributes
    };
    // `events` is only attached when the span actually carries events.
    if (span.events && span.events.length > 0) {
      line.events = span.events.map((event) => flattenEvent(event));
    }
    return line;
  };

  const lines = [];
  for (const resourceSpan of otlpExport.resourceSpans ?? []) {
    const resource = { attributes: attrsToRecord(resourceSpan.resource?.attributes ?? []) };
    for (const scope of resourceSpan.scopeSpans ?? []) {
      for (const span of scope.spans ?? []) {
        lines.push(flattenSpan(span, resource));
      }
    }
  }
  return lines;
}
|
|
1438
|
-
|
|
1439
|
-
// src/trace/store.ts
|
|
1440
|
-
/**
 * Trace store that keeps runs, spans, events, artifacts and budget entries in
 * process memory.
 *
 * Records are shallow-copied on the way in and on the way out, so callers
 * cannot mutate stored state through the references they pass or receive.
 */
var InMemoryTraceStore = class {
  runs = /* @__PURE__ */ new Map();
  allSpans = [];
  allEvents = [];
  allArtifacts = [];
  allBudget = [];

  /**
   * Store a new run.
   * @throws {Error} When a run with the same `runId` already exists.
   */
  async appendRun(run) {
    if (this.runs.has(run.runId)) throw new Error(`run ${run.runId} already exists`);
    this.runs.set(run.runId, { ...run });
  }

  /**
   * Shallow-merge `patch` into an existing run.
   * @throws {Error} When the run is unknown.
   */
  async updateRun(runId, patch) {
    const existing = this.runs.get(runId);
    if (!existing) throw new Error(`run ${runId} not found`);
    this.runs.set(runId, { ...existing, ...patch });
  }

  /** Store a span; no uniqueness check is performed. */
  async appendSpan(span) {
    this.allSpans.push({ ...span });
  }

  /**
   * Shallow-merge `patch` into the first span with the given id.
   * @throws {Error} When no span has that id.
   */
  async updateSpan(spanId, patch) {
    const idx = this.allSpans.findIndex((s) => s.spanId === spanId);
    if (idx < 0) throw new Error(`span ${spanId} not found`);
    this.allSpans[idx] = { ...this.allSpans[idx], ...patch };
  }

  async appendEvent(event) {
    this.allEvents.push({ ...event });
  }

  async appendArtifact(artifact) {
    this.allArtifacts.push({ ...artifact });
  }

  async appendBudgetEntry(entry) {
    this.allBudget.push({ ...entry });
  }

  /** @returns {Promise<object|undefined>} A copy of the run, or `undefined`. */
  async getRun(runId) {
    const r = this.runs.get(runId);
    return r ? { ...r } : void 0;
  }

  /**
   * List runs accepted by `matchesRun`.
   *
   * Returns copies, like every other read method on this class; handing out
   * the stored objects would let callers change store state by accident.
   */
  async listRuns(filter = {}) {
    return [...this.runs.values()].filter((r) => matchesRun(r, filter)).map((r) => ({ ...r }));
  }

  /** List copies of the spans accepted by `matchesSpan`. */
  async spans(filter = {}) {
    return this.allSpans.filter((s) => matchesSpan(s, filter)).map((s) => ({ ...s }));
  }

  /** List copies of the events accepted by `matchesEvent`. */
  async events(filter = {}) {
    return this.allEvents.filter((e) => matchesEvent(e, filter)).map((e) => ({ ...e }));
  }

  /** List copies of the budget entries recorded for a run. */
  async budget(runId) {
    return this.allBudget.filter((b) => b.runId === runId).map((b) => ({ ...b }));
  }

  /** List copies of the artifacts recorded for a run. */
  async artifacts(runId) {
    return this.allArtifacts.filter((a) => a.runId === runId).map((a) => ({ ...a }));
  }
};
|
|
1492
|
-
/**
 * Decide whether a run satisfies a filter.
 *
 * Truthy filter fields must equal the run's field exactly; `since` / `until`
 * bound `startedAt` inclusively; `tag` requires `tags[tag.key] === tag.value`.
 *
 * @param {object} r - Run record.
 * @param {object} f2 - Filter; unset or falsy fields are ignored.
 * @returns {boolean} True when every active criterion matches.
 */
function matchesRun(r, f2) {
  const exactFields = ["scenarioId", "variantId", "status", "parentRunId", "projectId", "chatId", "layer"];
  const exactFieldsMatch = exactFields.every((field) => !f2[field] || r[field] === f2[field]);
  if (!exactFieldsMatch) return false;

  // Bounds are tested with `undefined`, so a bound of 0 is still applied.
  const tooEarly = f2.since !== undefined && r.startedAt < f2.since;
  const tooLate = f2.until !== undefined && r.startedAt > f2.until;
  if (tooEarly || tooLate) return false;

  return !f2.tag || r.tags?.[f2.tag.key] === f2.tag.value;
}
|
|
1505
|
-
/**
 * Decide whether a span satisfies a filter.
 *
 * `toolName` only matches spans of kind "tool" and `judgeId` only matches
 * spans of kind "judge"; `since` / `until` bound `startedAt` inclusively.
 *
 * @param {object} s - Span record.
 * @param {object} f2 - Filter; unset or falsy fields are ignored.
 * @returns {boolean} True when every active criterion matches.
 */
function matchesSpan(s, f2) {
  const exactFieldsMatch = ["runId", "parentSpanId", "kind", "name"].every(
    (field) => !f2[field] || s[field] === f2[field]
  );
  if (!exactFieldsMatch) return false;

  const toolMatches = !f2.toolName || (s.kind === "tool" && s.toolName === f2.toolName);
  const judgeMatches = !f2.judgeId || (s.kind === "judge" && s.judgeId === f2.judgeId);
  if (!toolMatches || !judgeMatches) return false;

  const tooEarly = f2.since !== undefined && s.startedAt < f2.since;
  const tooLate = f2.until !== undefined && s.startedAt > f2.until;
  return !(tooEarly || tooLate);
}
|
|
1516
|
-
/**
 * Decide whether an event satisfies a filter.
 *
 * @param {object} e - Event record.
 * @param {object} f2 - Filter; truthy `runId`, `spanId` and `kind` must match
 *   exactly, while `since` / `until` bound `timestamp` inclusively.
 * @returns {boolean} True when every active criterion matches.
 */
function matchesEvent(e, f2) {
  for (const field of ["runId", "spanId", "kind"]) {
    if (f2[field] && e[field] !== f2[field]) return false;
  }
  const tooEarly = f2.since !== undefined && e.timestamp < f2.since;
  const tooLate = f2.until !== undefined && e.timestamp > f2.until;
  return !(tooEarly || tooLate);
}
|
|
1524
|
-
/**
 * Trace store persisted as append-only NDJSON files, one per record family
 * (`runs`, `spans`, `events`, `artifacts`, `budget`), inside a directory.
 *
 * Writes append a line to `<name>.ndjson`; once that file reaches `maxBytes`
 * it is renamed to `<name>.<timestamp>.ndjson` and a new file is started.
 * Reads replay every `*.ndjson` file into an `InMemoryTraceStore` on first use
 * and then answer from that index, which later writes keep up to date.
 *
 * Updates are persisted as lines carrying `_update: true`; the marker exists
 * only on disk and is removed again when the line is replayed.
 */
var FileSystemTraceStore = class {
  dir;
  maxBytes;
  /** Lazy in-memory index for queries — populated on first read. */
  index;
  loaded = false;

  /**
   * @param {{dir: string, maxBytes?: number}} options - Target directory and
   *   the rollover threshold in bytes (default 32 MiB).
   */
  constructor(options) {
    this.dir = options.dir;
    this.maxBytes = options.maxBytes ?? 32 * 1024 * 1024;
  }

  /** Create the store directory (and parents) when missing. */
  async ensureDir() {
    const fs = await import("fs/promises");
    await fs.mkdir(this.dir, { recursive: true });
  }

  /**
   * Append one record to `<name>.ndjson`, rolling the file over first when it
   * has reached `maxBytes`, and mirror non-update records into the index.
   */
  async append(name, record) {
    await this.ensureDir();
    const fs = await import("fs/promises");
    const path = await import("path");
    const active = path.join(this.dir, `${name}.ndjson`);
    try {
      const stat2 = await fs.stat(active);
      if (stat2.size >= this.maxBytes) {
        const rolled = path.join(this.dir, `${name}.${Date.now()}.ndjson`);
        await fs.rename(active, rolled);
      }
    } catch {
      // Rollover is best-effort: the active file does not exist before the
      // first write, and a failed rename must not block the append below.
    }
    await fs.appendFile(active, `${JSON.stringify(record)}\n`, "utf8");
    if (this.index && !record?._update) {
      // Awaited so an index failure reaches the caller instead of becoming an
      // unhandled promise rejection.
      await this.insertInto(name, record);
    }
  }

  /**
   * Mirror a freshly appended record into the loaded index.
   *
   * A run whose id is already indexed is merged into the existing run, which
   * is exactly what replaying the file in `load()` would produce.
   */
  async insertInto(name, record) {
    if (!this.index) return;
    switch (name) {
      case "runs":
        try {
          await this.index.appendRun(record);
        } catch {
          await this.index.updateRun(record.runId, record);
        }
        break;
      case "spans":
        await this.index.appendSpan(record);
        break;
      case "events":
        await this.index.appendEvent(record);
        break;
      case "artifacts":
        await this.index.appendArtifact(record);
        break;
      case "budget":
        await this.index.appendBudgetEntry(record);
        break;
    }
  }

  /**
   * Apply one parsed NDJSON record to `store` while loading.
   *
   * @param {object} store - Index being rebuilt.
   * @param {string} base - Record family taken from the file name.
   * @param {object} record - Parsed line.
   */
  async replayRecord(store, base, record) {
    // `_update` is a persistence marker, not part of the record itself.
    const { _update: isUpdate, ...data } = record;
    switch (base) {
      case "runs":
        try {
          await store.appendRun(data);
        } catch {
          await store.updateRun(data.runId, data);
        }
        break;
      case "spans":
        if (!isUpdate) {
          await store.appendSpan(data);
          break;
        }
        try {
          await store.updateSpan(data.spanId, data);
        } catch {
          // Update for a span that was never appended: keep it as a new span.
          await store.appendSpan(data);
        }
        break;
      case "events":
        await store.appendEvent(data);
        break;
      case "artifacts":
        await store.appendArtifact(data);
        break;
      case "budget":
        await store.appendBudgetEntry(data);
        break;
    }
  }

  /**
   * Build (once) and return the in-memory index.
   *
   * Loading is tolerant: a missing directory yields an empty index, and an
   * unreadable file or a malformed line (for example one truncated by a
   * crash) is skipped without discarding the remaining data.
   */
  async load() {
    if (this.loaded && this.index) return this.index;
    const fs = await import("fs/promises");
    const path = await import("path");
    const store = new InMemoryTraceStore();

    let entries = [];
    try {
      entries = await fs.readdir(this.dir);
    } catch {
      // Directory missing or unreadable: start from an empty index.
    }

    // readdir order is unspecified. Sorting puts rolled files
    // (`<name>.<timestamp>.ndjson`) before the active `<name>.ndjson`, so
    // records replay oldest first and updates follow the records they patch.
    const files = entries.filter((file) => file.endsWith(".ndjson")).sort();
    for (const file of files) {
      let content;
      try {
        content = await fs.readFile(path.join(this.dir, file), "utf8");
      } catch {
        continue;
      }
      const base = file.split(".")[0];
      for (const line of content.split("\n")) {
        if (!line.trim()) continue;
        let record;
        try {
          record = JSON.parse(line);
        } catch {
          continue;
        }
        if (record === null || typeof record !== "object") continue;
        await this.replayRecord(store, base, record);
      }
    }

    this.index = store;
    this.loaded = true;
    return store;
  }

  async appendRun(run) {
    await this.append("runs", run);
  }

  /** Persist a run patch and apply it to the index when one is loaded. */
  async updateRun(runId, patch) {
    await this.append("runs", { runId, ...patch, _update: true });
    if (this.index) await this.index.updateRun(runId, patch);
  }

  async appendSpan(span) {
    await this.append("spans", span);
  }

  /** Persist a span patch and apply it to the index when one is loaded. */
  async updateSpan(spanId, patch) {
    await this.append("spans", { spanId, ...patch, _update: true });
    if (this.index) await this.index.updateSpan(spanId, patch);
  }

  async appendEvent(event) {
    await this.append("events", event);
  }

  async appendArtifact(artifact) {
    await this.append("artifacts", artifact);
  }

  async appendBudgetEntry(entry) {
    await this.append("budget", entry);
  }

  async getRun(runId) {
    return (await this.load()).getRun(runId);
  }

  async listRuns(filter) {
    return (await this.load()).listRuns(filter);
  }

  async spans(filter) {
    return (await this.load()).spans(filter);
  }

  async events(filter) {
    return (await this.load()).events(filter);
  }

  async budget(runId) {
    return (await this.load()).budget(runId);
  }

  async artifacts(runId) {
    return (await this.load()).artifacts(runId);
  }
};
|
|
1665
|
-
|
|
1666
|
-
// src/trace/capture-fetch.ts
|
|
1667
|
-
// 2 MiB; used as the response-body cap when `responseBodyByteCap` is not given.
var DEFAULT_BODY_CAP = 2 * 1024 ** 2;
|
|
1668
|
-
/**
 * Copy a headers collection into a plain object with lower-cased names.
 *
 * @param {{forEach: Function}|null|undefined} headers - Anything exposing a
 *   `forEach((value, key) => ...)` method, such as a `Headers` instance.
 * @returns {object|undefined} The record, or `undefined` when `headers` is
 *   falsy or contributes no entries.
 */
function headersToRecord(headers) {
  if (!headers) {
    return undefined;
  }
  const record = {};
  headers.forEach((headerValue, headerName) => {
    record[headerName.toLowerCase()] = headerValue;
  });
  if (Object.keys(record).length > 0) {
    return record;
  }
  return undefined;
}
|
|
1676
|
-
/**
 * Parse text as JSON when possible.
 *
 * @param {string} text - Raw body text.
 * @returns {*} `undefined` for empty text, the parsed value when `text` is
 *   valid JSON, otherwise `text` unchanged.
 */
function parseMaybeJson(text) {
  if (text.length === 0) {
    return undefined;
  }
  let parsed;
  try {
    parsed = JSON.parse(text);
  } catch {
    // Not JSON: hand the raw text back to the caller.
    return text;
  }
  return parsed;
}
|
|
1684
|
-
/**
 * Extract a loggable request body from fetch arguments.
 *
 * @param {*} input - First fetch argument (URL string, `URL` or `Request`).
 * @param {object} [init] - Second fetch argument.
 * @returns {Promise<*>} The parsed string body from `init`; `undefined` for
 *   any other non-null `init.body`; otherwise the parsed text of a cloned
 *   `Request`, or `undefined` when there is nothing readable.
 */
async function readRequestBody(input, init) {
  const body = init?.body;
  if (typeof body === "string") return parseMaybeJson(body);
  // Non-string bodies are not captured.
  if (body != null) return undefined;
  if (!(input instanceof Request)) return undefined;
  try {
    // Read from a clone so the original request body stays usable.
    const text = await input.clone().text();
    return parseMaybeJson(text);
  } catch {
    return undefined;
  }
}
|
|
1696
|
-
/**
 * Derive the endpoint (path relative to the base URL) of a request URL.
 *
 * @param {string} url - Full request URL.
 * @param {string} baseUrl - Provider base URL; trailing slashes are ignored.
 * @returns {string} The part of `url` after the base (including any query
 *   string), `"/"` when `url` equals the base, the URL's pathname when `url`
 *   is not under the base, or `url` itself when it cannot be parsed.
 */
function endpointFromUrl(url, baseUrl) {
  const normalisedBase = baseUrl.replace(/\/+$/, "");
  if (url.startsWith(normalisedBase)) {
    const remainder = url.slice(normalisedBase.length);
    if (remainder === "") return "/";
    // Only accept the prefix when it ends on a URL boundary. Without this
    // check, base ".../v1" would match ".../v10/models" and yield "0/models".
    if (/^[/?#]/.test(remainder)) return remainder;
  }
  try {
    return new URL(url).pathname;
  } catch {
    return url;
  }
}
|
|
1705
|
-
/**
 * Wrap a fetch implementation so each call is mirrored to a raw-event sink.
 *
 * Every call records a "request" event, then either a "response" event or,
 * when the wrapped fetch rejects, an "error" event before rethrowing. Events
 * are passed through the redactor before they reach `sink.record`.
 *
 * @param {Function} fetch2 - Fetch implementation to wrap.
 * @param {{record: Function}} sink - Receives the redacted events.
 * @param {object} ctx - Supplies `runId`, `spanId`, `model`, `baseUrl` and an
 *   optional `provider` (otherwise derived from `baseUrl`).
 * @param {object} [opts]
 * @param {Function} [opts.redactor] - Defaults to `defaultProviderRedactor`.
 * @param {number} [opts.responseBodyByteCap] - Longest response text kept;
 *   compared against the text's string length. Defaults to `DEFAULT_BODY_CAP`.
 * @param {boolean} [opts.failClosed] - When true, sink failures are rethrown;
 *   otherwise they are swallowed and warned about once.
 * @returns {Function} Async `(input, init)` function that resolves to the
 *   original response.
 */
function captureFetchToRawSink(fetch2, sink, ctx, opts = {}) {
  const provider = ctx.provider ?? providerFromBaseUrl(ctx.baseUrl);
  const redactor = opts.redactor ?? defaultProviderRedactor;
  const bodyCap = opts.responseBodyByteCap ?? DEFAULT_BODY_CAP;
  let hasWarned = false;

  // Fields shared by every event of this wrapper.
  function envelope(direction, endpoint) {
    return {
      eventId: crypto.randomUUID(),
      runId: ctx.runId,
      spanId: ctx.spanId,
      provider,
      model: ctx.model,
      endpoint,
      baseUrl: ctx.baseUrl,
      attemptIndex: 0,
      // retries are re-invocations one layer up; documented in 0.x
      direction,
      timestamp: Date.now(),
      redactedFields: []
    };
  }

  // Send one event to the sink, honouring the fail-closed / best-effort mode.
  async function emit(event) {
    try {
      await sink.record(redactor(event));
    } catch (err) {
      if (opts.failClosed) throw err;
      if (hasWarned) return;
      hasWarned = true;
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(
        `captureFetchToRawSink: sink.record failed (capture is best-effort) \u2014 ${reason}`
      );
    }
  }

  return async (input, init) => {
    let url;
    if (typeof input === "string") {
      url = input;
    } else if (input instanceof URL) {
      url = input.toString();
    } else {
      url = input.url;
    }
    const rawMethod = init?.method ?? (input instanceof Request ? input.method : "GET");
    const method = rawMethod.toUpperCase();
    const endpoint = endpointFromUrl(url, ctx.baseUrl);
    const headerSource = init?.headers ?? (input instanceof Request ? input.headers : undefined);
    const reqHeaders = new Headers(headerSource);

    await emit({
      ...envelope("request", endpoint),
      // The HTTP method travels as a pseudo-header next to the real headers.
      requestHeaders: { ...headersToRecord(reqHeaders), "x-http-method": method },
      requestBody: await readRequestBody(input, init)
    });

    const startedAt = Date.now();
    let response;
    try {
      response = await fetch2(input, init);
    } catch (err) {
      await emit({
        ...envelope("error", endpoint),
        durationMs: Date.now() - startedAt,
        errorMessage: err instanceof Error ? err.message : String(err)
      });
      throw err;
    }

    const redactedFields = [];
    let responseBody;
    try {
      // Read a clone so the caller still receives an unconsumed response.
      const raw = await response.clone().text();
      const overCap = raw.length > bodyCap;
      responseBody = overCap ? raw.slice(0, bodyCap) : parseMaybeJson(raw);
      if (overCap) redactedFields.push("body_truncated");
    } catch {
      responseBody = undefined;
    }

    await emit({
      ...envelope("response", endpoint),
      durationMs: Date.now() - startedAt,
      statusCode: response.status,
      responseHeaders: headersToRecord(response.headers),
      responseBody,
      redactedFields
    });
    return response;
  };
}
|
|
1785
|
-
|
|
1786
|
-
// src/trace/otel.ts
|
|
1787
|
-
// Instrumentation scope attached to exported spans.
// NOTE(review): the version is a fixed literal — confirm whether it is meant
// to track the package version.
var OTEL_AGENT_EVAL_SCOPE = {
  name: "@tangle-network/agent-eval",
  version: "0.3.0"
};
|
|
1788
|
-
/**
 * Export one stored run as an OTLP-shaped `resourceSpans` document.
 *
 * @param {object} store - Trace store exposing `getRun`, `spans` and `events`.
 * @param {string} runId - Run to export.
 * @param {object} [resourceAttrs] - Extra resource attributes; they override
 *   the built-in `service.name` / `run.*` attributes on key collision.
 * @returns {Promise<object>} Document with a single resource and a single
 *   scope holding every span of the run.
 * @throws {Error} When the run does not exist.
 */
async function exportRunAsOtlp(store, runId, resourceAttrs = {}) {
  const run = await store.getRun(runId);
  if (!run) throw new Error(`run ${runId} not found`);
  const spans = await store.spans({ runId });
  const events = await store.events({ runId });

  // Group events under their span; events without a span id are left out.
  const eventsBySpan = /* @__PURE__ */ new Map();
  for (const event of events) {
    const { spanId } = event;
    if (!spanId) continue;
    const bucket = eventsBySpan.get(spanId);
    if (bucket) {
      bucket.push(event);
    } else {
      eventsBySpan.set(spanId, [event]);
    }
  }

  const traceId = runToTraceId(run);
  const otlpSpans = spans.map((span) => spanToOtlp(span, traceId, eventsBySpan.get(span.spanId) ?? []));

  const resource = {
    attributes: toAttributes({
      "service.name": "agent-eval",
      "run.id": run.runId,
      "run.scenario_id": run.scenarioId,
      "run.variant_id": run.variantId ?? "",
      "run.dataset_version": run.datasetVersion ?? "",
      "run.code_sha": run.codeSha ?? "",
      "run.model_fingerprint": run.modelFingerprint ?? "",
      ...resourceAttrs
    })
  };
  const scopeSpans = [{ scope: OTEL_AGENT_EVAL_SCOPE, spans: otlpSpans }];
  return { resourceSpans: [{ resource, scopeSpans }] };
}
|
|
1824
|
-
/**
 * Convert a stored span and its events into an OTLP-shaped span object.
 *
 * @param {object} span - Stored span.
 * @param {string} traceId - Trace id assigned to the span.
 * @param {object[]} events - Events belonging to the span.
 * @returns {object} OTLP span. A span without `endedAt` ends at its start
 *   time; status is code 2 with the span's error message when
 *   `span.status === "error"`, otherwise code 1.
 */
function spanToOtlp(span, traceId, events) {
  const endedAt = span.endedAt ?? span.startedAt;
  const toOtlpEvent = (event) => ({
    timeUnixNano: msToNs(event.timestamp),
    name: event.kind,
    attributes: toAttributes(flattenPayload(event.payload))
  });
  const failed = span.status === "error";
  return {
    traceId,
    spanId: padSpanId(span.spanId),
    parentSpanId: span.parentSpanId ? padSpanId(span.parentSpanId) : undefined,
    name: span.name,
    // SPAN_KIND_INTERNAL
    kind: 1,
    startTimeUnixNano: msToNs(span.startedAt),
    endTimeUnixNano: msToNs(endedAt),
    attributes: toAttributes(flattenSpanAttributes(span)),
    events: events.map((event) => toOtlpEvent(event)),
    status: failed ? { code: 2, message: span.error } : { code: 1 }
  };
}
|
|
1844
|
-
/**
 * Build the flat attribute record exported for a span.
 *
 * The record always starts with `span.kind`, followed by kind-specific keys
 * (`llm.*`, `tool.*`, `retrieval.*`, `judge.*`, `sandbox.*`) and finally the
 * span's own `attributes`, of which only string, number and boolean values
 * are kept. Custom attributes overwrite built-in keys of the same name.
 *
 * @param {object} span - Stored span.
 * @returns {object} Flat attribute record.
 */
function flattenSpanAttributes(span) {
  const flat = { "span.kind": span.kind };
  // Optional numeric fields are copied whenever they are defined, 0 included.
  const copyIfDefined = (key, value) => {
    if (value !== undefined) flat[key] = value;
  };

  switch (span.kind) {
    case "llm":
      flat["llm.model"] = span.model;
      copyIfDefined("llm.input_tokens", span.inputTokens);
      copyIfDefined("llm.output_tokens", span.outputTokens);
      copyIfDefined("llm.cost_usd", span.costUsd);
      if (span.finishReason) flat["llm.finish_reason"] = span.finishReason;
      break;
    case "tool":
      flat["tool.name"] = span.toolName;
      copyIfDefined("tool.latency_ms", span.latencyMs);
      break;
    case "retrieval":
      flat["retrieval.query"] = span.query;
      flat["retrieval.hits"] = span.hits.length;
      break;
    case "judge":
      flat["judge.id"] = span.judgeId;
      flat["judge.dimension"] = span.dimension;
      flat["judge.score"] = span.score;
      flat["judge.target_span_id"] = span.targetSpanId;
      break;
    case "sandbox":
      if (span.image) flat["sandbox.image"] = span.image;
      copyIfDefined("sandbox.exit_code", span.exitCode);
      copyIfDefined("sandbox.tests_passed", span.testsPassed);
      copyIfDefined("sandbox.tests_total", span.testsTotal);
      break;
    default:
      break;
  }

  if (span.attributes) {
    const primitiveTypes = ["string", "number", "boolean"];
    for (const [key, value] of Object.entries(span.attributes)) {
      if (primitiveTypes.includes(typeof value)) flat[key] = value;
    }
  }
  return flat;
}
|
|
1878
|
-
/**
 * Flatten an event payload into values that can be exported as attributes.
 *
 * Strings, numbers and booleans are kept as they are; any other value is
 * JSON-encoded.
 *
 * @param {object|null|undefined} payload - Event payload. A missing payload
 *   is treated as empty instead of throwing.
 * @returns {object} Flat record. Entries that JSON cannot represent
 *   (`undefined`, functions, symbols) are omitted rather than stored as
 *   `undefined`.
 */
function flattenPayload(payload) {
  const out = {};
  for (const [k, v] of Object.entries(payload ?? {})) {
    if (typeof v === "string" || typeof v === "number" || typeof v === "boolean") {
      out[k] = v;
      continue;
    }
    const encoded = JSON.stringify(v);
    // JSON.stringify returns undefined for values it cannot represent.
    if (encoded !== undefined) out[k] = encoded;
  }
  return out;
}
|
|
1886
|
-
/**
 * Convert a flat record into a list of typed `{ key, value }` attributes.
 *
 * @param {object} record - Flat key/value record.
 * @returns {{key: string, value: object}[]} One attribute per own enumerable
 *   entry. Integers become `intValue` (as a string), other numbers
 *   `doubleValue`, booleans `boolValue`, and everything else `stringValue`.
 */
function toAttributes(record) {
  const wrap = (value) => {
    if (typeof value === "number") {
      return Number.isInteger(value) ? { intValue: value.toString() } : { doubleValue: value };
    }
    if (typeof value === "boolean") {
      return { boolValue: value };
    }
    return { stringValue: value };
  };
  const attributes = [];
  for (const [key, value] of Object.entries(record)) {
    attributes.push({ key, value: wrap(value) });
  }
  return attributes;
}
|
|
1892
|
-
/**
 * Convert a millisecond timestamp into a nanosecond decimal string.
 *
 * @param {number} ms - Milliseconds; any fractional part is floored.
 * @returns {string} Nanoseconds as a decimal string. A missing or non-finite
 *   timestamp yields `"0"` (the epoch), so one bad timestamp cannot abort a
 *   whole export with a RangeError.
 */
function msToNs(ms) {
  const wholeMs = Math.floor(ms);
  // BigInt() throws RangeError for NaN and +/-Infinity.
  if (!Number.isFinite(wholeMs)) return "0";
  return (BigInt(wholeMs) * 1000000n).toString();
}
|
|
1895
|
-
/**
 * Normalise a span id to exactly 16 characters.
 *
 * @param {string} id - Span id, possibly containing dashes.
 * @returns {string} The id with dashes removed, cut to 16 characters and
 *   right-padded with "0".
 */
function padSpanId(id) {
  const WIDTH = 16;
  const withoutDashes = id.split("-").join("");
  return withoutDashes.substring(0, WIDTH).padEnd(WIDTH, "0");
}
|
|
1899
|
-
/**
 * Derive a 32-character trace id from a run.
 *
 * @param {{runId: string}} run - Run whose id seeds the trace id.
 * @returns {string} `run.runId` with dashes removed, cut to 32 characters and
 *   right-padded with "0".
 */
function runToTraceId(run) {
  const WIDTH = 32;
  const withoutDashes = run.runId.split("-").join("");
  return withoutDashes.substring(0, WIDTH).padEnd(WIDTH, "0");
}
|
|
1903
|
-
|
|
1904
|
-
// src/trace/otel-bridge.ts
|
|
1905
|
-
/**
 * Create a run-complete hook that exports a run's finished spans.
 *
 * @param {{exportSpan: Function, flush: Function}} exporter - Span exporter.
 * @returns {Function} Async hook taking `{ store, runId }`. It exports every
 *   span of the run that has a truthy `endedAt`, using the run id as trace
 *   id, and then awaits `exporter.flush()`.
 */
function otelRunCompleteHook(exporter) {
  return async (ctx) => {
    const runSpans = await ctx.store.spans({ runId: ctx.runId });
    for (const span of runSpans) {
      // Spans that have not ended are not exported.
      if (!span.endedAt) continue;
      const exportable = storeSpanToExportable(span, ctx.runId);
      exporter.exportSpan(exportable);
    }
    await exporter.flush();
  };
}
|
|
1916
|
-
/**
 * Decorate a trace store so finished spans are also handed to an exporter.
 *
 * Every operation is delegated to `inner`. In addition, a span is exported
 * when it is appended with a truthy `endedAt`, or when an update sets a truthy
 * `endedAt` and the span can be found again in `inner`.
 *
 * @param {object} inner - Store that actually persists the data.
 * @param {{exportSpan: Function}} exporter - Receives exportable spans.
 * @param {string} traceId - Trace id stamped on exported spans; it is also
 *   used as the `runId` filter when re-reading an updated span.
 * @returns {object} Store with the same methods as `inner`.
 */
function createOtelTracingStore(inner, exporter, traceId) {
  const exportFinished = (span) => {
    exporter.exportSpan(storeSpanToExportable(span, traceId));
  };

  return {
    async appendRun(run) {
      return inner.appendRun(run);
    },
    async updateRun(runId, patch) {
      return inner.updateRun(runId, patch);
    },
    async appendSpan(span) {
      if (span.endedAt) exportFinished(span);
      return inner.appendSpan(span);
    },
    async updateSpan(spanId, patch) {
      await inner.updateSpan(spanId, patch);
      if (!patch.endedAt) return;
      // Re-read the span so the export reflects the merged, stored state.
      const stored = await inner.spans({ runId: traceId });
      const updated = stored.find((candidate) => candidate.spanId === spanId);
      if (updated) exportFinished(updated);
    },
    async appendEvent(event) {
      return inner.appendEvent(event);
    },
    async appendBudgetEntry(entry) {
      return inner.appendBudgetEntry(entry);
    },
    async appendArtifact(artifact) {
      return inner.appendArtifact(artifact);
    },
    // Read methods are forwarded unchanged, bound to the inner store.
    getRun: inner.getRun.bind(inner),
    listRuns: inner.listRuns.bind(inner),
    spans: inner.spans.bind(inner),
    events: inner.events.bind(inner),
    budget: inner.budget.bind(inner),
    artifacts: inner.artifacts.bind(inner)
  };
}
|
|
1957
|
-
/**
 * Project a stored span onto the flat shape the exporter consumes.
 *
 * The LLM-only fields (`model`, `inputTokens`, `outputTokens`, `costUsd`)
 * are copied only when `span.kind === "llm"`; for any other kind they are
 * present on the result with the value `undefined`.
 *
 * @param {object} span stored span
 * @param {string} traceId trace id to stamp on the result
 * @returns {object} exportable span
 */
function storeSpanToExportable(span, traceId) {
  const isLlm = span.kind === "llm";
  const llmField = (field) => (isLlm ? span[field] : void 0);
  const {
    spanId,
    parentSpanId,
    name,
    kind,
    startedAt,
    endedAt,
    status,
    error,
    attributes
  } = span;

  return {
    traceId,
    spanId,
    parentSpanId,
    name,
    kind,
    startedAt,
    endedAt,
    status,
    error,
    model: llmField("model"),
    inputTokens: llmField("inputTokens"),
    outputTokens: llmField("outputTokens"),
    costUsd: llmField("costUsd"),
    attributes
  };
}
|
|
1976
|
-
|
|
1977
|
-
// src/trace/otel-export.ts
|
|
1978
|
-
/**
 * Create a batching OTLP/HTTP (JSON) span exporter.
 *
 * The endpoint comes from `config.endpoint`, falling back to the
 * `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable when `process` exists.
 * When no endpoint is resolved the function returns `undefined`.
 *
 * Spans are queued by `exportSpan` and POSTed to `<endpoint>/v1/traces`
 * when the queue reaches `batchSize`, on every `flushIntervalMs` tick, or
 * on `flush()` / `shutdown()`. Delivery is best-effort: a failed POST is
 * dropped and never surfaces to the caller.
 *
 * `flush()` and `shutdown()` resolve only after every batch that is already
 * on the wire (including ones started by a size- or timer-triggered flush)
 * has settled, so awaiting them at the end of a run covers everything
 * exported so far.
 *
 * @param {object} [config]
 * @param {string} [config.endpoint] collector base URL
 * @param {Record<string, string>} [config.headers] extra request headers (default: parsed from env)
 * @param {number} [config.batchSize=64] queue length that triggers a flush
 * @param {number} [config.flushIntervalMs=5000] periodic flush interval
 * @param {string} [config.serviceName="agent-eval"] `service.name` resource attribute
 * @param {Record<string, unknown>} [config.resourceAttributes] extra resource attributes
 * @returns {{ exportSpan: Function, flush: Function, shutdown: Function } | undefined}
 */
function createOtelExporter(config) {
  const resolvedEndpoint = config?.endpoint ?? (typeof process !== "undefined" ? process.env.OTEL_EXPORTER_OTLP_ENDPOINT : void 0);
  if (!resolvedEndpoint) return void 0;

  const url = `${resolvedEndpoint.replace(/\/+$/, "")}/v1/traces`;
  const headers = config?.headers ?? parseHeadersFromEnv();
  const batchSize = config?.batchSize ?? 64;
  const flushIntervalMs = config?.flushIntervalMs ?? 5e3;
  const serviceName = config?.serviceName ?? "agent-eval";
  const resourceAttrs = config?.resourceAttributes ?? {};

  const pending = [];
  // Requests that have been started but have not settled yet.
  const inFlight = new Set();
  let timer;
  let stopped = false;

  /** POST one batch. Never rejects: export is best-effort by design. */
  async function sendBatch(batch) {
    try {
      const body = {
        resourceSpans: [
          {
            resource: {
              attributes: toAttributes2({
                "service.name": serviceName,
                ...resourceAttrs
              })
            },
            scopeSpans: [{ scope: OTEL_AGENT_EVAL_SCOPE, spans: batch }]
          }
        ]
      };
      await fetch(url, {
        method: "POST",
        headers: {
          "content-type": "application/json",
          ...headers
        },
        body: JSON.stringify(body)
      });
    } catch {
      // Deliberately swallowed: telemetry must not break the caller.
    }
  }

  /** Start sending whatever is queued, then wait for all outstanding requests. */
  function doFlush() {
    if (pending.length > 0) {
      const request = sendBatch(pending.splice(0));
      inFlight.add(request);
      void request.then(() => inFlight.delete(request));
    }
    return Promise.all(inFlight).then(() => void 0);
  }

  const exporter = {
    exportSpan(span) {
      if (stopped) return;
      pending.push(toOtlpSpan(span));
      if (pending.length >= batchSize) {
        void doFlush();
      }
    },
    async flush() {
      await doFlush();
    },
    async shutdown() {
      stopped = true;
      if (timer !== void 0) {
        clearInterval(timer);
        timer = void 0;
      }
      await doFlush();
    }
  };

  timer = setInterval(() => {
    if (pending.length > 0) void doFlush();
  }, flushIntervalMs);
  // Where the timer handle exposes unref(), call it so the timer alone
  // does not keep the process alive.
  if (typeof timer === "object" && "unref" in timer) {
    timer.unref();
  }

  return exporter;
}
|
|
2048
|
-
/**
 * Read request headers from the `OTEL_EXPORTER_OTLP_HEADERS` environment
 * variable, formatted as comma-separated `key=value` pairs.
 *
 * Each pair is split at its first "="; key and value are trimmed. Pairs
 * without "=" or with an empty key are ignored. Returns `{}` when `process`
 * is unavailable or the variable is unset/empty.
 *
 * @returns {Record<string, string>} parsed headers
 */
function parseHeadersFromEnv() {
  const raw = typeof process === "undefined" ? void 0 : process.env.OTEL_EXPORTER_OTLP_HEADERS;
  if (!raw) return {};

  const headers = {};
  raw.split(",").forEach((entry) => {
    const separator = entry.indexOf("=");
    if (separator < 0) return;
    const name = entry.slice(0, separator).trim();
    if (!name) return;
    headers[name] = entry.slice(separator + 1).trim();
  });
  return headers;
}
|
|
2062
|
-
/**
 * Convert an exportable span into an OTLP/JSON span object.
 *
 * Attributes are assembled from `span.kind` (as "span.kind"), the LLM
 * fields when present (`llm.model`, `llm.input_tokens`,
 * `llm.output_tokens`, `llm.cost_usd`), and every entry of
 * `span.attributes` whose value is a string, number or boolean; other
 * attribute values are dropped. When `endedAt` is nullish the start time
 * is used as the end time. The OTLP `kind` is always 1.
 *
 * @param {object} span exportable span
 * @returns {object} OTLP span
 */
function toOtlpSpan(span) {
  const PRIMITIVE_TYPES = ["string", "number", "boolean"];
  const attrs = { "span.kind": span.kind };

  if (span.model) attrs["llm.model"] = span.model;
  const llmMetrics = [
    ["llm.input_tokens", span.inputTokens],
    ["llm.output_tokens", span.outputTokens],
    ["llm.cost_usd", span.costUsd]
  ];
  for (const [attrName, attrValue] of llmMetrics) {
    if (attrValue !== void 0) attrs[attrName] = attrValue;
  }

  if (span.attributes) {
    for (const [attrName, attrValue] of Object.entries(span.attributes)) {
      if (PRIMITIVE_TYPES.includes(typeof attrValue)) attrs[attrName] = attrValue;
    }
  }

  let status;
  if (span.status === "error") {
    status = { code: 2, message: span.error };
  } else {
    status = { code: 1 };
  }

  const finishedAt = span.endedAt ?? span.startedAt;
  return {
    traceId: padTraceId(span.traceId),
    spanId: padSpanId2(span.spanId),
    parentSpanId: span.parentSpanId ? padSpanId2(span.parentSpanId) : void 0,
    name: span.name,
    // SPAN_KIND_INTERNAL
    kind: 1,
    startTimeUnixNano: msToNs2(span.startedAt),
    endTimeUnixNano: msToNs2(finishedAt),
    attributes: toAttributes2(attrs),
    status
  };
}
|
|
2089
|
-
/**
 * Convert a flat attribute record into the OTLP/JSON `KeyValue[]` shape
 * (`[{ key, value: { <kind>Value: ... } }]`).
 *
 * Value mapping:
 * - integers that fit in a signed 64-bit range -> `intValue` as a decimal string
 * - every other number -> `doubleValue`; NaN / ±Infinity are emitted as the
 *   strings "NaN" / "Infinity" / "-Infinity" because `JSON.stringify` would
 *   otherwise turn them into `null`
 * - booleans -> `boolValue`
 * - strings -> `stringValue` unchanged
 * - anything else (null, undefined, objects, ...) -> `stringValue` holding a
 *   text rendering, so `stringValue` is always a string
 *
 * @param {Record<string, unknown>} record attribute name -> value
 * @returns {Array<{ key: string, value: object }>} one entry per own enumerable key, in key order
 */
function toAttributes2(record) {
  // Above this magnitude Number#toString may switch to exponent notation,
  // and the value no longer fits an int64 anyway.
  const INT64_LIMIT = 2 ** 63;

  const renderText = (value) => {
    if (typeof value === "string") return value;
    if (value !== null && typeof value === "object") {
      try {
        const json = JSON.stringify(value);
        if (json !== void 0) return json;
      } catch {
        // Cyclic structure or embedded BigInt: fall back to the tag string.
      }
      return Object.prototype.toString.call(value);
    }
    return String(value);
  };

  const toAnyValue = (value) => {
    if (typeof value === "number") {
      if (Number.isInteger(value) && Math.abs(value) < INT64_LIMIT) {
        return { intValue: value.toString() };
      }
      return { doubleValue: Number.isFinite(value) ? value : String(value) };
    }
    if (typeof value === "boolean") return { boolValue: value };
    return { stringValue: renderText(value) };
  };

  return Object.entries(record).map(([key, value]) => ({ key, value: toAnyValue(value) }));
}
|
|
2095
|
-
/**
 * Convert a millisecond timestamp to nanoseconds, returned as a decimal
 * string. The input is floored to a whole millisecond first.
 *
 * @param {number} ms timestamp in milliseconds
 * @returns {string} nanoseconds as a base-10 string
 */
function msToNs2(ms) {
  const wholeMillis = Math.floor(ms);
  return String(BigInt(wholeMillis) * 1000000n);
}
|
|
2098
|
-
/**
 * Produce a 16-character id: strip dashes, keep at most the first 16
 * characters, and pad on the right with "0" up to 16.
 *
 * @param {string} id source identifier
 * @returns {string} 16-character identifier
 */
function padSpanId2(id) {
  const WIDTH = 16;
  const compact = id.replace(/-/g, "");
  return compact.length >= WIDTH ? compact.slice(0, WIDTH) : compact.padEnd(WIDTH, "0");
}
|
|
2102
|
-
/**
 * Produce a 32-character id: strip dashes, keep at most the first 32
 * characters, and pad on the right with "0" up to 32.
 *
 * @param {string} id source identifier
 * @returns {string} 32-character identifier
 */
function padTraceId(id) {
  const WIDTH = 32;
  const compact = id.replace(/-/g, "");
  return compact.length >= WIDTH ? compact.slice(0, WIDTH) : compact.padEnd(WIDTH, "0");
}
|
|
2106
|
-
|
|
2107
|
-
// src/replay.ts
|
|
2108
|
-
/**
 * Error raised when a replayed request has no cached response.
 *
 * Carries the requested `url` and the computed `requestKey`. When no
 * explicit message is supplied, one is generated from those two values.
 */
var ReplayCacheMissError = class extends ReplayError {
  url;
  requestKey;
  /**
   * @param {string} url URL of the request that missed
   * @param {string} requestKey2 cache key computed for the request
   * @param {string} [message] overrides the generated message
   */
  constructor(url, requestKey2, message) {
    const text = message ?? `replay cache miss for ${url} (key=${requestKey2})`;
    super(text);
    this.url = url;
    this.requestKey = requestKey2;
  }
};
|
|
2117
|
-
/**
 * In-memory index of recorded `(request, response)` event pairs, keyed by
 * a hash of the request body, used to serve replays.
 */
var ReplayCache = class _ReplayCache {
  byKey = new Map();
  orphans = 0;
  byProvider = {};
  byModel = {};

  /**
   * Build a cache from the events returned by `sink.list(filter)`.
   * Throws a `ReplayError` when the sink has no `list` method.
   */
  static async fromSink(sink, filter = {}) {
    if (!sink.list) {
      throw new ReplayError("ReplayCache.fromSink: sink must implement list() to be replayable.");
    }
    return _ReplayCache.fromEvents(await sink.list(filter));
  }

  /**
   * Build a cache from an in-memory event list.
   *
   * Events are grouped by (runId, spanId, attemptIndex). Within a group the
   * event with `direction === "request"` is the request and any other event
   * is the response; later events of the same role replace earlier ones.
   * Groups without a request are ignored, and groups without a response are
   * counted as orphans.
   */
  static async fromEvents(events) {
    const cache = new _ReplayCache();
    const pairs = new Map();

    for (const event of events) {
      const groupId = `${event.runId ?? ""}::${event.spanId ?? ""}::${event.attemptIndex}`;
      let pair = pairs.get(groupId);
      if (pair === void 0) {
        pair = {};
        pairs.set(groupId, pair);
      }
      const role = event.direction === "request" ? "req" : "res";
      pair[role] = event;
    }

    for (const { req, res } of pairs.values()) {
      if (!req) continue;
      if (!res) {
        cache.orphans += 1;
        continue;
      }
      const key = await requestKey(req);
      cache.byKey.set(key, { request: req, response: res });
      const { provider, model } = req;
      cache.byProvider[provider] = (cache.byProvider[provider] ?? 0) + 1;
      cache.byModel[model] = (cache.byModel[model] ?? 0) + 1;
    }
    return cache;
  }

  /** Number of `(request, response)` pairs held in the cache. */
  size() {
    return this.byKey.size;
  }

  /** Snapshot of the counters; the per-provider and per-model maps are copies. */
  stats() {
    const { byKey, byProvider, byModel, orphans } = this;
    return {
      total: byKey.size,
      byProvider: { ...byProvider },
      byModel: { ...byModel },
      orphanRequests: orphans
    };
  }

  /** Yield every cached `(request, response)` pair in insertion order. */
  *entries() {
    yield* this.byKey.values();
  }

  /**
   * Find the cached pair for a request body by computing its key with
   * `keyFromBody`. Resolves to `undefined` on a miss; the caller decides
   * what a miss means.
   */
  async lookup(requestBody) {
    return this.byKey.get(await keyFromBody(requestBody));
  }
};
|
|
2183
|
-
/**
 * Build a `fetch`-compatible function that answers chat-completion
 * requests from a replay cache.
 *
 * - URLs that do not end in `/chat/completions` (optionally followed by a
 *   query or fragment) are passed to the fallback fetch; without one a
 *   `ReplayError` is thrown.
 * - For completion URLs the request body is looked up in `cache` only when
 *   `init.body` is a string containing valid JSON; anything else is a miss.
 * - On a hit, a `Response` is rebuilt from the recorded status (default
 *   200), headers (default `Content-Type: application/json`) and body.
 *   Recorded statuses that forbid a body (204, 205, 304) are replayed with
 *   a null body, since the `Response` constructor rejects a body for them.
 * - On a miss, `opts.onMissNotify` is called, then `opts.onMiss` decides:
 *   "throw" (default) throws `ReplayCacheMissError`; "fail-closed" returns
 *   a 599 response with `{"error":"replay_cache_miss"}`; any other value
 *   delegates to the fallback fetch (throwing `ReplayError` if there is none).
 *
 * @param {{ lookup: (body: unknown) => Promise<object | undefined> }} cache replay cache
 * @param {object} [opts]
 * @param {string} [opts.onMiss="throw"] miss policy
 * @param {Function} [opts.fallbackFetch] defaults to the global `fetch` when present
 * @param {Function} [opts.onHit] called with `{ url, provider, model }` on a hit
 * @param {Function} [opts.onMissNotify] called with `{ url, requestBody }` on a miss
 * @returns {(input: string | URL | Request, init?: object) => Promise<Response>}
 */
function createReplayFetch(cache, opts = {}) {
  // Statuses for which `new Response(body, …)` throws unless body is null.
  const NULL_BODY_STATUSES = new Set([204, 205, 304]);
  const onMiss = opts.onMiss ?? "throw";
  const fallback = opts.fallbackFetch ?? globalThis.fetch?.bind(globalThis);
  return (async (input, init) => {
    const url = typeof input === "string" ? input : input instanceof URL ? input.toString() : input.url;
    if (!/\/chat\/completions(?:[?#].*)?$/.test(url)) {
      if (!fallback)
        throw new ReplayError(
          `replay fetch: non-completions URL ${url} but no fallbackFetch configured`
        );
      return fallback(input, init);
    }
    let bodyParsed;
    if (init?.body && typeof init.body === "string") {
      try {
        bodyParsed = JSON.parse(init.body);
      } catch {
        // Unparseable body: leave `bodyParsed` undefined so it is handled as a miss.
      }
    }
    const hit = bodyParsed === void 0 ? void 0 : await cache.lookup(bodyParsed);
    if (hit) {
      opts.onHit?.({ url, provider: hit.request.provider, model: hit.request.model });
      const status = hit.response.statusCode ?? 200;
      const headers = new Headers(
        Object.entries(hit.response.responseHeaders ?? { "Content-Type": "application/json" })
      );
      const bodyText = typeof hit.response.responseBody === "string" ? hit.response.responseBody : JSON.stringify(hit.response.responseBody ?? {});
      return new Response(NULL_BODY_STATUSES.has(status) ? null : bodyText, { status, headers });
    }
    opts.onMissNotify?.({ url, requestBody: bodyParsed });
    if (onMiss === "throw") {
      const key = bodyParsed === void 0 ? "<unparseable>" : await keyFromBody(bodyParsed);
      throw new ReplayCacheMissError(url, key);
    }
    if (onMiss === "fail-closed") {
      return new Response(JSON.stringify({ error: "replay_cache_miss" }), { status: 599 });
    }
    if (!fallback)
      throw new ReplayError("replay fetch: onMiss=fallback but no fallbackFetch configured");
    return fallback(input, init);
  });
}
|
|
2225
|
-
/**
 * Asynchronously iterate the `(request, response)` pairs recorded in a sink.
 *
 * The events returned by `sink.list(filter)` are loaded into a
 * `ReplayCache`, whose entries are then yielded one by one. Throws a
 * `ReplayError` on first iteration when the sink has no `list` method.
 *
 * @param {{ list?: Function }} sink event sink
 * @param {object} [filter={}] passed through to `sink.list`
 */
async function* iterateRawCalls(sink, filter = {}) {
  if (!sink.list) {
    throw new ReplayError("iterateRawCalls: sink must implement list().");
  }
  const cache = await ReplayCache.fromEvents(await sink.list(filter));
  yield* cache.entries();
}
|
|
2233
|
-
/**
 * Compute the cache key of a recorded request event by passing its
 * `requestBody` to `keyFromBody`.
 *
 * @param {{ requestBody: unknown }} event recorded request event
 * @returns {Promise<string>} whatever `keyFromBody` resolves to
 */
async function requestKey(event) {
  const { requestBody } = event;
  return keyFromBody(requestBody);
}
|
|
2236
|
-
/**
 * Compute a cache key for a request body.
 *
 * Non-object bodies (including null/undefined) are keyed by hashing
 * `{ raw: String(body) }`. Object bodies are reduced to the fields
 * `model`, `messages`, `temperature`, `max_tokens`,
 * `max_completion_tokens` and `response_format` (missing or nullish
 * fields become `null`), passed through `canonicalize`, and hashed with
 * `hashJson`. All other fields of the body are ignored.
 *
 * @param {unknown} body parsed request body
 * @returns {Promise<string>} whatever `hashJson` produces
 */
async function keyFromBody(body) {
  const isObject = body != null && typeof body === "object";
  if (!isObject) return hashJson({ raw: String(body) });

  const KEYED_FIELDS = [
    "model",
    "messages",
    "temperature",
    "max_tokens",
    "max_completion_tokens",
    "response_format"
  ];
  const keyed = Object.fromEntries(KEYED_FIELDS.map((field) => [field, body[field] ?? null]));
  return hashJson(canonicalize(keyed));
}
|
|
2249
|
-
|
|
2250
1083
|
export {
|
|
2251
|
-
|
|
2252
|
-
|
|
2253
|
-
|
|
1084
|
+
projectOtlpFlatLine,
|
|
1085
|
+
readOtlpStatus,
|
|
1086
|
+
inferOtlpKind,
|
|
1087
|
+
extractOtlpAttributes,
|
|
1088
|
+
stringField,
|
|
1089
|
+
asString,
|
|
1090
|
+
asNumber,
|
|
1091
|
+
firstNumberAttr,
|
|
1092
|
+
firstStringAttr,
|
|
2254
1093
|
DEFAULT_TRACE_ANALYST_BUDGETS,
|
|
2255
1094
|
TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
|
|
2256
1095
|
OtlpFileTraceStore,
|
|
@@ -2259,29 +1098,9 @@ export {
|
|
|
2259
1098
|
SpanNotFoundError,
|
|
2260
1099
|
buildTraceAnalystTools,
|
|
2261
1100
|
traceAnalystFunctionGroup,
|
|
2262
|
-
|
|
2263
|
-
|
|
2264
|
-
|
|
2265
|
-
|
|
2266
|
-
domainEvidencePattern,
|
|
2267
|
-
describeTraceInsightScope,
|
|
2268
|
-
planTraceInsightQuestions,
|
|
2269
|
-
buildTraceInsightContext,
|
|
2270
|
-
scoreTraceInsightReadiness,
|
|
2271
|
-
defaultTraceInsightPanel,
|
|
2272
|
-
buildTraceInsightPrompt,
|
|
2273
|
-
flattenOtlpExportToNdjson,
|
|
2274
|
-
InMemoryTraceStore,
|
|
2275
|
-
FileSystemTraceStore,
|
|
2276
|
-
captureFetchToRawSink,
|
|
2277
|
-
OTEL_AGENT_EVAL_SCOPE,
|
|
2278
|
-
exportRunAsOtlp,
|
|
2279
|
-
otelRunCompleteHook,
|
|
2280
|
-
createOtelTracingStore,
|
|
2281
|
-
createOtelExporter,
|
|
2282
|
-
ReplayCacheMissError,
|
|
2283
|
-
ReplayCache,
|
|
2284
|
-
createReplayFetch,
|
|
2285
|
-
iterateRawCalls
|
|
1101
|
+
TRACE_ANALYST_ACTOR_DESCRIPTION,
|
|
1102
|
+
TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
|
|
1103
|
+
TRACE_ANALYST_SUBAGENT_DESCRIPTION,
|
|
1104
|
+
analyzeTraces
|
|
2286
1105
|
};
|
|
2287
|
-
//# sourceMappingURL=chunk-
|
|
1106
|
+
//# sourceMappingURL=chunk-VUINJM5M.js.map
|