@tangle-network/agent-eval 0.71.0 → 0.72.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/CHANGELOG.md +63 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/otel.d.ts +3 -2
  5. package/dist/agent-profile-DYRboYWu.d.ts +364 -0
  6. package/dist/analyst/index.d.ts +221 -0
  7. package/dist/analyst/index.js +371 -0
  8. package/dist/analyst/index.js.map +1 -0
  9. package/dist/analyst-t7zZS3TV.d.ts +88 -0
  10. package/dist/campaign/index.d.ts +485 -9
  11. package/dist/campaign/index.js +618 -30
  12. package/dist/campaign/index.js.map +1 -1
  13. package/dist/chunk-7W4SM7FD.js +1075 -0
  14. package/dist/chunk-7W4SM7FD.js.map +1 -0
  15. package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
  16. package/dist/chunk-JHA3ZGSO.js +1496 -0
  17. package/dist/chunk-JHA3ZGSO.js.map +1 -0
  18. package/dist/{chunk-VMAYE3LM.js → chunk-JYE3WOTE.js} +57 -9
  19. package/dist/{chunk-VMAYE3LM.js.map → chunk-JYE3WOTE.js.map} +1 -1
  20. package/dist/chunk-LB2UOI5F.js +412 -0
  21. package/dist/chunk-LB2UOI5F.js.map +1 -0
  22. package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
  23. package/dist/chunk-VUINJM5M.js.map +1 -0
  24. package/dist/chunk-WYIHD6EB.js +1044 -0
  25. package/dist/chunk-WYIHD6EB.js.map +1 -0
  26. package/dist/{chunk-6QZUCFKM.js → chunk-XPILG2CA.js} +120 -3
  27. package/dist/chunk-XPILG2CA.js.map +1 -0
  28. package/dist/{chunk-6XQIEUQ2.js → chunk-ZPSKPT3V.js} +5 -3
  29. package/dist/{chunk-6XQIEUQ2.js.map → chunk-ZPSKPT3V.js.map} +1 -1
  30. package/dist/contract/index.d.ts +17 -13
  31. package/dist/contract/index.js +14 -8
  32. package/dist/contract/index.js.map +1 -1
  33. package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
  34. package/dist/control.d.ts +2 -2
  35. package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
  36. package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
  37. package/dist/hosted/index.d.ts +223 -2
  38. package/dist/index.d.ts +49 -1323
  39. package/dist/index.js +339 -2627
  40. package/dist/index.js.map +1 -1
  41. package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
  42. package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
  43. package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
  44. package/dist/openapi.json +1 -1
  45. package/dist/pareto-E-pembql.d.ts +81 -0
  46. package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
  47. package/dist/redact-B40YG2M_.d.ts +45 -0
  48. package/dist/registry-DuVYiTvw.d.ts +128 -0
  49. package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
  50. package/dist/rl.d.ts +4 -3
  51. package/dist/rl.js +4 -4
  52. package/dist/{run-campaign-BVY3RGAZ.js → run-campaign-OVEZF24D.js} +2 -2
  53. package/dist/run-critic-BAIjX99r.d.ts +56 -0
  54. package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
  55. package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
  56. package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
  57. package/dist/traces.d.ts +371 -308
  58. package/dist/traces.js +43 -18
  59. package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
  60. package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
  61. package/dist/wire/index.d.ts +1 -1
  62. package/dist/workflow/index.d.ts +494 -0
  63. package/dist/workflow/index.js +2177 -0
  64. package/dist/workflow/index.js.map +1 -0
  65. package/docs/design/self-improvement-roadmap.md +106 -0
  66. package/package.json +36 -12
  67. package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
  68. package/dist/chunk-6QZUCFKM.js.map +0 -1
  69. package/dist/chunk-ODGETRTM.js.map +0 -1
  70. package/dist/chunk-PQV2TKC3.js +0 -27
  71. package/dist/chunk-PQV2TKC3.js.map +0 -1
  72. /package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
  73. /package/dist/{run-campaign-BVY3RGAZ.js.map → run-campaign-OVEZF24D.js.map} +0 -0
@@ -0,0 +1,1496 @@
1
+ import {
2
+ analyzeTraces,
3
+ firstNumberAttr,
4
+ firstStringAttr,
5
+ projectOtlpFlatLine
6
+ } from "./chunk-VUINJM5M.js";
7
+ import {
8
+ defaultProviderRedactor,
9
+ providerFromBaseUrl
10
+ } from "./chunk-PC4UYEBM.js";
11
+ import {
12
+ validateRunRecord
13
+ } from "./chunk-F3SRAAZO.js";
14
+ import {
15
+ canonicalize,
16
+ hashJson
17
+ } from "./chunk-VSMTAMNK.js";
18
+ import {
19
+ ReplayError
20
+ } from "./chunk-3BFEG2F6.js";
21
+
22
+ // src/trace-analyst/hook.ts
23
var DEFAULT_QUESTION = "Summarise what happened in this run. Surface any failure modes, surprising findings, or evidence that the run's verdict is wrong.";

/**
 * Build a run-completion hook that asks the trace analyst about the run.
 *
 * The returned async function:
 *  - returns immediately when `opts.shouldRun` exists and rejects the context;
 *  - appends a "log" event and returns when `opts.analyze.source` is undefined;
 *  - otherwise calls `analyzeTraces`, hands the result to `opts.save` (if any),
 *    and appends a "log" event when `opts.gateOn` exists and returns falsy.
 *
 * @param {object} opts - Hook options (`analyze`, optional `question`,
 *   `shouldRun`, `save`, `gateOn`).
 * @returns {(ctx: object) => Promise<void>} The hook.
 */
function traceAnalystOnRunComplete(opts) {
  // Every diagnostic this hook emits is a "log" event keyed by the run id.
  const appendLog = (ctx, idPrefix, payload) => ctx.store.appendEvent({
    eventId: `${idPrefix}-${ctx.runId}`,
    runId: ctx.runId,
    kind: "log",
    timestamp: Date.now(),
    payload
  });

  return async (ctx) => {
    if (opts.shouldRun && !opts.shouldRun(ctx)) return;

    const { source } = opts.analyze;
    if (source === void 0) {
      await appendLog(ctx, "analyst-skip", {
        source: "trace_analyst_hook",
        reason: "no source configured"
      });
      return;
    }

    const question = opts.question ?? DEFAULT_QUESTION;
    const result = await analyzeTraces({ question }, { ...opts.analyze, source });

    if (opts.save) await opts.save(result, ctx);

    const gateRejected = Boolean(opts.gateOn) && !opts.gateOn(result, ctx);
    if (!gateRejected) return;
    await appendLog(ctx, "analyst-gate", {
      source: "trace_analyst_hook",
      reason: "analyst_gate_failed",
      findings: result.findings
    });
  };
}
58
+
59
+ // src/trace-analyst/insights.ts
60
// Generic benchmark/task vocabulary that should never count as a domain keyword.
var DOMAIN_STOP_WORDS = /* @__PURE__ */ new Set([
  "and", "advanced", "app", "build", "create", "easy", "expert",
  "extreme", "for", "from", "hard", "implementation", "integrate", "medium",
  "project", "task", "the", "this", "with", "workflow"
]);

/**
 * Split free text into lower-cased candidate domain words.
 *
 * A word starts with an ASCII letter and continues with at least two more
 * characters from letters, digits, `.`, `+`, `#` or `-`. Stop words are removed.
 *
 * @param {string} value - Text to tokenize.
 * @returns {string[]} Lower-cased words in order of appearance (duplicates kept).
 */
function tokenizeDomainWords(value) {
  const words = [];
  for (const [token] of value.matchAll(/[A-Za-z][A-Za-z0-9.+#-]{2,}/g)) {
    const word = token.toLowerCase();
    if (DOMAIN_STOP_WORDS.has(word)) continue;
    words.push(word);
  }
  return words;
}
85
/**
 * Infer up to 18 domain keywords for a suite.
 *
 * Words are collected from the suite name/collection id and from each task's
 * id, name, prompt, difficulty, tags and gaps. A word is kept when it occurs at
 * least twice overall or appears in the suite name/collection id. Results are
 * ordered by descending frequency, then alphabetically.
 *
 * @param {object} suite - Suite with `name`, optional `collectionId`, and `tasks`.
 * @returns {string[]} Ranked keywords (at most 18).
 */
function inferDomainKeywords(suite) {
  const suiteWords = new Set(tokenizeDomainWords(`${suite.name} ${suite.collectionId ?? ""}`));

  const fragments = [suite.name, suite.collectionId ?? ""];
  for (const task of suite.tasks) {
    fragments.push(
      task.id,
      task.name,
      task.prompt ?? "",
      task.difficulty ?? "",
      ...task.tags ?? [],
      ...task.gaps ?? []
    );
  }

  const frequency = /* @__PURE__ */ new Map();
  for (const word of tokenizeDomainWords(fragments.join(" "))) {
    frequency.set(word, (frequency.get(word) ?? 0) + 1);
  }

  const kept = [...frequency.entries()].filter(
    ([word, count]) => count >= 2 || suiteWords.has(word)
  );
  kept.sort((left, right) => right[1] - left[1] || left[0].localeCompare(right[0]));
  return kept.map(([word]) => word).slice(0, 18);
}
103
/**
 * Build a case-insensitive regex that matches any keyword as a standalone token
 * (not preceded or followed by an ASCII letter or digit).
 *
 * Keywords shorter than three characters are ignored. When none remain, a
 * built-in pattern of generic integration terms is returned instead.
 *
 * @param {string[]} keywords - Candidate keywords; regex metacharacters are escaped.
 * @returns {RegExp} The evidence pattern.
 */
function domainEvidencePattern(keywords) {
  const alternatives = [];
  for (const keyword of keywords) {
    if (keyword.length < 3) continue;
    alternatives.push(keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
  }
  if (alternatives.length === 0) {
    return /(?<![A-Za-z0-9])(?:sdk|api|css|dns|xml|provider|client|service|integration|webhook|transaction|auth|oauth|graphql|rest)(?![A-Za-z0-9])/i;
  }
  return new RegExp(`(?<![A-Za-z0-9])(?:${alternatives.join("|")})(?![A-Za-z0-9])`, "i");
}
107
/**
 * Produce a one-sentence description of what a suite covers.
 *
 * Uses the eight most frequent task tags (ties broken alphabetically) when any
 * task has tags; otherwise the distinct task difficulties; otherwise a generic
 * phrase.
 *
 * @param {object} suite - Suite whose `tasks` may carry `tags` and `difficulty`.
 * @returns {string} Sentence such as "3 implementation tasks across a, b.".
 */
function describeTraceInsightScope(suite) {
  const { tasks } = suite;
  const taskLabel = tasks.length === 1 ? "1 implementation task" : `${tasks.length} implementation tasks`;

  const tagCounts = /* @__PURE__ */ new Map();
  tasks.forEach((task) => {
    (task.tags ?? []).forEach((tag) => {
      tagCounts.set(tag, (tagCounts.get(tag) ?? 0) + 1);
    });
  });

  const rankedTags = [...tagCounts.entries()];
  rankedTags.sort((left, right) => right[1] - left[1] || left[0].localeCompare(right[0]));
  const topTags = rankedTags.slice(0, 8).map(([tag]) => tag);
  if (topTags.length > 0) {
    return `${taskLabel} across ${topTags.join(", ")}.`;
  }

  const distinctDifficulties = new Set();
  for (const task of tasks) {
    if (Boolean(task.difficulty)) distinctDifficulties.add(task.difficulty);
  }
  const difficulties = [...distinctDifficulties].join(", ");
  return `${taskLabel} across ${difficulties || "the selected benchmark scope"}.`;
}
122
/**
 * Plan the investigation questions for a trace-insight report.
 *
 * Five core questions are always returned. A "reviewer-lift" question is added
 * when any task gap mentions shot/review/retry/continue, and an
 * "optimization-targets" question is added when any task has an outcome other
 * than "satisfied".
 *
 * @param {object} input - Object whose `suite.tasks` is inspected.
 * @returns {Array<{id: string, question: string, why: string}>} A fresh array.
 */
function planTraceInsightQuestions(input) {
  const ask = (id, question, why) => ({ id, question, why });
  const multiShotHint = /shot|review|retry|continue/i;

  const anyFailure = input.suite.tasks.some(
    (task) => task.outcome && task.outcome !== "satisfied"
  );
  const anyMultiShot = input.suite.tasks.some(
    (task) => (task.gaps ?? []).some((gap) => multiShotHint.test(gap))
  );

  const plan = [
    ask(
      "execution-path",
      "What did the worker actually do before the first meaningful implementation edit?",
      "Separates grounded execution from polished but shallow output."
    ),
    ask(
      "research-grounding",
      "Did the worker inspect docs, source, examples, or package references before committing to an implementation path?",
      "Identifies whether failures came from weak retrieval, weak examples, or premature coding."
    ),
    ask(
      "domain-proof",
      "Which tasks produced executable domain proof versus UI copy, placeholders, or inferred behavior?",
      "Keeps product-quality claims tied to concrete evidence."
    ),
    ask(
      "root-cause",
      "For each major failure cluster, is the likely root cause prompt/scaffold, docs/examples, SDK/API ergonomics, evaluator, runtime, or model behavior?",
      "Turns trace observations into actionable ownership."
    ),
    ask(
      "evidence-quality",
      "Which external-facing claims are directly supported by trace ids, span ids, verifier findings, reviewer notes, or generated code?",
      "Prevents unsupported customer-report conclusions."
    )
  ];

  if (anyMultiShot) {
    plan.push(ask(
      "reviewer-lift",
      "Where did reviewer feedback improve score, stall, or regress across shots?",
      "Shows whether the driver loop is learning or merely repeating work."
    ));
  }
  if (anyFailure) {
    plan.push(ask(
      "optimization-targets",
      "Which prompt, evaluator, scaffold, or workflow changes should feed the next GEPA/autoresearch optimization run?",
      "Connects benchmark evidence to the optimization loop."
    ));
  }
  return plan;
}
170
/**
 * Assemble the context object consumed by the trace-insight prompt and the
 * readiness scorer: the suite plus derived scope, keywords, questions and
 * panel, with `findings` defaulting to `[]` and `agent`/`totals` to `null`.
 *
 * @param {object} input - `{ suite, findings?, agent?, totals? }`.
 * @returns {object} The trace-insight context.
 */
function buildTraceInsightContext(input) {
  const { suite } = input;
  const scope = describeTraceInsightScope(suite);
  const keywords = inferDomainKeywords(suite);
  const questions = planTraceInsightQuestions(input);
  const panel = defaultTraceInsightPanel();
  const findings = input.findings ?? [];
  const agent = input.agent ?? null;
  const totals = input.totals ?? null;
  return { suite, scope, keywords, questions, panel, findings, agent, totals };
}
182
/**
 * Score how ready a trace-insight context is for reporting.
 *
 * Four gates are evaluated; each failed gate subtracts a severity-based penalty
 * (critical 35, high 20, medium 10, anything else 5) from a starting score of
 * 100, and the result is normalised to the range 0..1.
 *
 * Grades: `>= 0.9` "external-ready", `>= 0.7` "internal-review", otherwise
 * "raw-analysis".
 *
 * @param {object} context - Context with `suite.tasks`, `findings`, `keywords`,
 *   `panel` and `questions`.
 * @returns {{score: number, grade: string, gates: object[]}} Readiness verdict.
 */
function scoreTraceInsightReadiness(context) {
  const SEVERITY_PENALTY = { critical: 35, high: 20, medium: 10 };
  const isFailed = (task) => task.outcome && task.outcome !== "satisfied";
  const hasGaps = (task) => (task.gaps ?? []).length > 0;

  const failedTasks = context.suite.tasks.filter(isFailed);
  const findingTaskIds = new Set(context.findings.flatMap((finding) => finding.taskIds));
  const failedTasksWithFindings = failedTasks.filter((task) => findingTaskIds.has(task.id));
  const tasksWithGaps = context.suite.tasks.filter(hasGaps);
  // The gap gate compares like with like: only gaps recorded on FAILED tasks
  // count toward failed-task coverage. Previously gaps on satisfied tasks were
  // divided by the number of failed tasks, which could pass the gate (and even
  // exceed a ratio of 1) while no failed task had any gap recorded.
  const failedTasksWithGaps = failedTasks.filter(hasGaps);

  const gates = [
    {
      id: "domain-context",
      label: "Domain context inferred",
      passed: context.keywords.length > 0,
      severity: "high",
      detail: context.keywords.length > 0 ? `${context.keywords.length} domain terms inferred: ${context.keywords.slice(0, 8).join(", ")}` : "No domain terms were inferred from suite, tasks, prompts, tags, or gaps."
    },
    {
      id: "panel-coverage",
      label: "Analyst panel planned",
      passed: context.panel.length >= 4 && context.questions.length >= 5,
      severity: "high",
      detail: `${context.panel.length} panel roles and ${context.questions.length} investigation questions planned.`
    },
    {
      id: "failure-coverage",
      label: "Failures mapped to findings",
      passed: failedTasks.length === 0 || failedTasksWithFindings.length / failedTasks.length >= 0.5,
      severity: "critical",
      detail: failedTasks.length === 0 ? "No failed tasks in suite." : `${failedTasksWithFindings.length}/${failedTasks.length} failed tasks appear in finding clusters.`
    },
    {
      id: "gap-evidence",
      label: "Task gaps captured",
      passed: failedTasks.length === 0 || failedTasksWithGaps.length / failedTasks.length >= 0.5,
      severity: "medium",
      // The detail text still reports the suite-wide count, as before.
      detail: `${tasksWithGaps.length} tasks include explicit evaluator or analyst gaps.`
    }
  ];

  let penalty = 0;
  for (const gate of gates) {
    if (gate.passed) continue;
    penalty += SEVERITY_PENALTY[gate.severity] ?? 5;
  }

  const score = Math.max(0, Math.min(1, 1 - penalty / 100));
  let grade = "raw-analysis";
  if (score >= 0.9) grade = "external-ready";
  else if (score >= 0.7) grade = "internal-review";
  return { score, grade, gates };
}
233
/**
 * Return the default four-role analyst panel used for trace insight reports.
 *
 * @returns {Array<{id: string, name: string, responsibility: string}>} A fresh
 *   array on every call.
 */
function defaultTraceInsightPanel() {
  const roles = [
    [
      "trace-forensics",
      "Trace Forensics",
      "Reconstruct what the worker did in order, including research, edits, reviewer interventions, verifier feedback, and stop reason."
    ],
    [
      "root-cause",
      "Root Cause",
      "Map failures to prompt/scaffold, docs/examples, SDK/API/product ergonomics, evaluator, runtime, or model behavior."
    ],
    [
      "optimization",
      "Optimization",
      "Identify prompt, reviewer, evaluator, scaffold, and GEPA/autoresearch changes that should be tested next."
    ],
    [
      "external-evidence",
      "External Evidence",
      "Separate customer-safe claims from internal harness findings and reject conclusions without task, trace, span, code, reviewer, or verifier evidence."
    ]
  ];
  return roles.map(([id, name, responsibility]) => ({ id, name, responsibility }));
}
257
/**
 * Render the analyst prompt for a benchmark run.
 *
 * The prompt lists the planned investigation questions and panel roles, the
 * required output sections, an inspection budget (at most
 * `input.maxRepresentativeTraces` traces, default 6), and a JSON run summary
 * that includes every task whose outcome is set and is not "satisfied".
 *
 * @param {object} input - `{ suite, findings?, agent?, totals?, maxRepresentativeTraces? }`.
 * @returns {string} The complete prompt text.
 */
function buildTraceInsightPrompt(input) {
  const context = buildTraceInsightContext(input);
  const maxRepresentativeTraces = input.maxRepresentativeTraces ?? 6;

  const investigationPlan = context.questions
    .map((item, index) => `${index + 1}. ${item.question} (${item.why})`)
    .join("\n");
  const panelRoster = context.panel
    .map((role) => `- ${role.name}: ${role.responsibility}`)
    .join("\n");

  const findingSummaries = context.findings.map((finding) => ({
    kind: finding.kind,
    severity: finding.severity,
    taskCount: finding.taskIds.length,
    proposedFixClass: finding.proposedFixClass
  }));
  const failureSummaries = input.suite.tasks
    .filter((task) => task.outcome && task.outcome !== "satisfied")
    .map((task) => ({
      task: task.id,
      difficulty: task.difficulty,
      outcome: task.outcome,
      score: task.score,
      gaps: task.gaps ?? []
    }));
  const runSummary = JSON.stringify(
    {
      suite: input.suite.name,
      scope: context.scope,
      inferredKeywords: context.keywords,
      agent: context.agent,
      totals: context.totals,
      findings: findingSummaries,
      failures: failureSummaries
    },
    null,
    2
  );

  // Assembled line by line so source indentation can never leak into the prompt.
  return [
    "Analyze this benchmark run and produce evidence-backed trace intelligence.",
    "",
    "Audience:",
    "- internal AI/product leadership",
    `- possible customer-facing report for ${input.suite.name}`,
    "",
    "Investigation plan:",
    investigationPlan,
    "",
    "Analyst panel:",
    panelRoster,
    "",
    "If the task branches are independent, use subagents for the panel roles above and aggregate their findings. Do not run a panel role unless its answer will change the final report.",
    "",
    "Required output:",
    "1. Executive verdict: what this run proves and does not prove.",
    "2. The investigation questions you answered and the evidence used.",
    "3. Failure taxonomy: agent prompting, evaluator/harness, docs/examples, SDK/API/product integration, infra.",
    "4. Evidence-backed examples with trace ids/task ids and concrete verifier findings.",
    "5. Highest-ROI fixes for the benchmark harness, prompt/GEPA optimization, and customer-facing product/docs surface.",
    "6. What is safe for an external report versus what must stay internal.",
    "7. One rerun plan that would validate lift after optimization.",
    "",
    "Budget:",
    `- Inspect the dataset overview, the failure summary, and at most ${maxRepresentativeTraces} representative traces.`,
    "- Prefer traces named in the failure summary over broad exploration.",
    "- Do not do exhaustive trace sweeps.",
    "- Return the final report as soon as the taxonomy and examples are supported.",
    "",
    "Run summary:",
    runSummary,
    "",
    "Use the trace tools. Do not invent facts. Cite task ids. Separate customer-facing claims from internal harness/model findings."
  ].join("\n");
}
317
+
318
+ // src/trace-analyst/otlp-flatten.ts
319
// Numeric span-kind code -> enum name; the array index is the numeric code.
var DEFAULT_KIND_MAP = Object.fromEntries(
  [
    "SPAN_KIND_UNSPECIFIED",
    "SPAN_KIND_INTERNAL",
    "SPAN_KIND_SERVER",
    "SPAN_KIND_CLIENT",
    "SPAN_KIND_PRODUCER",
    "SPAN_KIND_CONSUMER"
  ].map((name, code) => [code, name])
);
// Numeric status code -> enum name; the array index is the numeric code.
var STATUS_MAP = Object.fromEntries(
  ["STATUS_CODE_UNSET", "STATUS_CODE_OK", "STATUS_CODE_ERROR"].map((name, code) => [code, name])
);
332
/**
 * Unwrap an OTLP attribute value wrapper into a plain primitive.
 *
 * Checks `stringValue`, `intValue`, `doubleValue`, `boolValue` in that order.
 * `intValue` is converted with `Number(...)`.
 *
 * @param {object|null|undefined} v - The attribute's value wrapper.
 * @returns {string|number|boolean} The primitive, or `""` when the wrapper is
 *   missing or carries none of the four supported fields.
 */
function attrValue(v) {
  // An attribute may arrive with no value object at all; treat it like an
  // empty wrapper instead of throwing on the property access below.
  if (v === null || v === undefined) return "";
  if (v.stringValue !== undefined) return v.stringValue;
  if (v.intValue !== undefined) return Number(v.intValue);
  if (v.doubleValue !== undefined) return v.doubleValue;
  if (v.boolValue !== undefined) return v.boolValue;
  return "";
}
339
/**
 * Convert a list of `{ key, value }` attributes into a plain object keyed by
 * attribute key. A later duplicate key overwrites an earlier one.
 *
 * @param {Iterable<{key: string, value: object}>} attrs - Attribute list.
 * @returns {Record<string, string|number|boolean>} Flat attribute record.
 */
function attrsToRecord(attrs) {
  const record = {};
  for (const { key, value } of attrs) {
    record[key] = attrValue(value);
  }
  return record;
}
344
/**
 * Convert a nanosecond Unix timestamp to an ISO-8601 string.
 *
 * @param {string|number|bigint} nano - Nanoseconds since the Unix epoch.
 * @returns {string} ISO timestamp; the epoch (`1970-01-01T00:00:00.000Z`) when
 *   the input is not a number or does not map to a representable Date.
 */
function nanoToIso(nano) {
  const date = new Date(Number(nano) / 1e6);
  // Validity is checked on the Date itself: a finite millisecond value can
  // still be outside the representable Date range, and toISOString() throws
  // RangeError for such a Date.
  if (Number.isNaN(date.getTime())) return new Date(0).toISOString();
  return date.toISOString();
}
348
/**
 * Add alias attributes in place, never overwriting an alias that is already set:
 *  - `llm.model`  -> `llm.model_name`
 *  - `tool.name`  -> `inference.tool.name`
 *  - `span.kind`  -> `openinference.span.kind` (stringified and upper-cased)
 *
 * @param {Record<string, unknown>} attrs - Attribute record; mutated.
 * @returns {void}
 */
function applyOpenInference(attrs) {
  const keep = (value) => value;
  const aliases = [
    ["llm.model", "llm.model_name", keep],
    ["tool.name", "inference.tool.name", keep],
    ["span.kind", "openinference.span.kind", (value) => String(value).toUpperCase()]
  ];
  for (const [from, to, convert] of aliases) {
    if (!(from in attrs) || to in attrs) continue;
    attrs[to] = convert(attrs[from]);
  }
}
359
/**
 * Flatten an OTLP export object (`resourceSpans[].scopeSpans[].spans[]`) into
 * one flat record per span.
 *
 * @param {object} otlpExport - Parsed OTLP export.
 * @param {object} [opts] - Options.
 * @param {string} [opts.attributeVocabulary="openinference"] - When
 *   "openinference", alias attributes are added via `applyOpenInference`.
 * @param {object} [opts.kindMap] - Overrides merged over the default kind map.
 * @returns {object[]} Flat span records in document order.
 */
function flattenOtlpExportToNdjson(otlpExport, opts = {}) {
  const useOpenInference = (opts.attributeVocabulary ?? "openinference") === "openinference";
  const kindNames = { ...DEFAULT_KIND_MAP, ...opts.kindMap };

  const flattenEvent = (event) => {
    const flat = { name: event.name, timeUnixNano: event.timeUnixNano };
    if (event.attributes) flat.attributes = attrsToRecord(event.attributes);
    return flat;
  };

  const flattenSpan = (span, resource) => {
    const attributes = attrsToRecord(span.attributes ?? []);
    if (useOpenInference) applyOpenInference(attributes);

    const status = { code: STATUS_MAP[span.status?.code ?? 0] ?? "STATUS_CODE_UNSET" };
    if (span.status?.message) status.message = span.status.message;

    const line = {
      trace_id: span.traceId,
      span_id: span.spanId,
      parent_span_id: span.parentSpanId ?? null,
      name: span.name,
      kind: kindNames[span.kind] ?? "SPAN_KIND_UNSPECIFIED",
      start_time: nanoToIso(span.startTimeUnixNano),
      end_time: nanoToIso(span.endTimeUnixNano),
      status,
      resource,
      attributes
    };
    if (span.events && span.events.length > 0) {
      line.events = span.events.map((event) => flattenEvent(event));
    }
    return line;
  };

  const lines = [];
  for (const resourceSpan of otlpExport.resourceSpans ?? []) {
    // One resource object is shared by every span under this resourceSpans entry.
    const resource = { attributes: attrsToRecord(resourceSpan.resource?.attributes ?? []) };
    for (const scope of resourceSpan.scopeSpans ?? []) {
      for (const span of scope.spans ?? []) {
        lines.push(flattenSpan(span, resource));
      }
    }
  }
  return lines;
}
397
+
398
+ // src/trace-analyst/otlp-to-run-records.ts
399
// Attribute-key lookup tables. Each list is passed to firstNumberAttr /
// firstStringAttr together with a span's attributes; element order is preserved.

// Prompt/input token counts.
var INPUT_TOKEN_KEYS = ["llm.input_tokens", "inference.llm.input_tokens", "llm.token_count.prompt", "gen_ai.usage.input_tokens", "gen_ai.usage.prompt_tokens"];

// Completion/output token counts.
var OUTPUT_TOKEN_KEYS = ["llm.output_tokens", "inference.llm.output_tokens", "llm.token_count.completion", "gen_ai.usage.output_tokens", "gen_ai.usage.completion_tokens"];

// Cached prompt token counts.
var CACHED_TOKEN_KEYS = ["llm.cached_tokens", "inference.llm.cached_tokens", "llm.token_count.prompt_cache_hit", "gen_ai.usage.cached_tokens"];

// Per-span cost.
var COST_KEYS = ["llm.cost_usd", "inference.llm.cost.total", "llm.cost.total", "gen_ai.usage.cost"];

// Model name.
var MODEL_KEYS = ["llm.model_name", "inference.llm.model_name", "llm.model", "gen_ai.request.model", "gen_ai.response.model"];
432
/**
 * Convert flattened OTLP JSONL into run records, dropping the prompt and
 * completion text that `otlpToTraceRunRecords` returns alongside each record.
 *
 * @param {string} otlpJsonl - Newline-delimited flat span JSON.
 * @param {object} opts - Options forwarded to `otlpToTraceRunRecords`.
 * @returns {object[]} One run record per trace.
 */
function otlpToRunRecords(otlpJsonl, opts) {
  const records = [];
  for (const { record } of otlpToTraceRunRecords(otlpJsonl, opts)) {
    records.push(record);
  }
  return records;
}
435
/**
 * Convert flattened OTLP JSONL into one validated run record per trace, with
 * the trace's prompt and completion text attached when they can be extracted.
 *
 * Traces are processed in sorted trace-id order. Missing options fall back to:
 * `splitTag` "holdout", `commitSha`/`promptHash`/`configHash` "unknown",
 * `seed` 0, `fallbackModel` "unknown@otlp".
 *
 * @param {string} otlpJsonl - Newline-delimited flat span JSON.
 * @param {object} opts - Requires `experimentId` and `candidateId`; optional
 *   `scoreForTrace`, `judgeMetadataForTrace`, `priceUsdPerToken` and the
 *   fallbacks listed above.
 * @returns {Array<{record: object, promptText?: string, completionText?: string}>}
 * @throws {Error} When the input yields no valid spans.
 */
function otlpToTraceRunRecords(otlpJsonl, opts) {
  const splitTag = opts.splitTag ?? "holdout";
  const commitSha = opts.commitSha ?? "unknown";
  const promptHash = opts.promptHash ?? "unknown";
  const configHash = opts.configHash ?? "unknown";
  const seed = opts.seed ?? 0;
  const fallbackModel = opts.fallbackModel ?? "unknown@otlp";

  const spansByTrace = groupSpansByTrace(otlpJsonl);
  if (spansByTrace.size === 0) {
    throw new Error(
      "otlpToRunRecords: OTLP input produced zero valid spans \u2014 every line was empty, malformed, or missing trace_id/span_id"
    );
  }

  const scoreKey = splitTag === "holdout" ? "holdoutScore" : "searchScore";
  const results = [];
  for (const traceId of [...spansByTrace.keys()].sort()) {
    const spans = spansByTrace.get(traceId);
    const agg = aggregateTrace(traceId, spans, fallbackModel);
    const score = resolveScore(opts, traceId, agg);
    const { costUsd, costUnpriced } = resolveCost(opts, agg, spans);

    // Optional raw counters are only present when they carry information.
    const raw = {
      span_count: agg.spanCount,
      llm_span_count: agg.llmSpanCount,
      tool_span_count: agg.toolSpanCount,
      agent_span_count: agg.agentSpanCount,
      error_span_count: agg.errorSpanCount,
      prompt_tokens: agg.tokenUsage.input,
      completion_tokens: agg.tokenUsage.output,
      ...(agg.tokenUsage.cached !== void 0 ? { cached_tokens: agg.tokenUsage.cached } : {}),
      ...(costUnpriced ? { cost_unpriced: 1 } : {})
    };
    const outcome = { [scoreKey]: score, raw };

    const { promptText, completionText } = extractPromptCompletion(spans);
    const judgeMetadata = opts.judgeMetadataForTrace?.(traceId);

    const record = validateRunRecord({
      runId: `otlp:${opts.experimentId}:${opts.candidateId}:${traceId}`,
      experimentId: opts.experimentId,
      candidateId: opts.candidateId,
      seed,
      model: ensureSnapshot(agg.model, fallbackModel),
      promptHash,
      configHash,
      commitSha,
      wallMs: agg.wallMs,
      costUsd,
      tokenUsage: agg.tokenUsage,
      ...(judgeMetadata ? { judgeMetadata } : {}),
      outcome,
      ...(agg.firstErrorMessage ? { failureMode: agg.firstErrorMessage } : {}),
      splitTag,
      scenarioId: traceId
    });

    const entry = { record };
    if (promptText !== void 0) entry.promptText = promptText;
    if (completionText !== void 0) entry.completionText = completionText;
    results.push(entry);
  }
  return results;
}
495
/**
 * Parse newline-delimited JSON and group the projected spans by `trace_id`.
 *
 * Blank lines, lines that are not valid JSON, parsed values that are not
 * objects, and lines that `projectOtlpFlatLine` rejects are skipped silently.
 *
 * @param {string} otlpJsonl - Newline-delimited flat span JSON.
 * @returns {Map<string, object[]>} Spans per trace id, in input order.
 */
function groupSpansByTrace(otlpJsonl) {
  // Returns undefined for unparseable text so the caller can skip the line.
  const tryParse = (text) => {
    try {
      return JSON.parse(text);
    } catch {
      return void 0;
    }
  };

  const grouped = /* @__PURE__ */ new Map();
  for (const rawLine of otlpJsonl.split("\n")) {
    const text = rawLine.trim();
    if (text.length === 0) continue;

    const parsed = tryParse(text);
    if (!parsed || typeof parsed !== "object") continue;

    const span = projectOtlpFlatLine(parsed);
    if (!span) continue;

    const bucket = grouped.get(span.trace_id);
    if (bucket) {
      bucket.push(span);
    } else {
      grouped.set(span.trace_id, [span]);
    }
  }
  return grouped;
}
515
/**
 * Aggregate the spans of one trace into counts, token usage, model, time
 * window and the first error message.
 *
 * Spans are visited in `start_time` order (ties broken by `span_id`). Token
 * counts and model votes are taken from spans whose `kind` is "LLM". The model
 * is the most-voted name, else the first model attribute found on any span,
 * else `fallbackModel`.
 *
 * @param {string} traceId - Trace identifier, echoed in the result.
 * @param {object[]} spans - Projected flat spans of the trace.
 * @param {string} fallbackModel - Model name used when none is found.
 * @returns {object} Aggregate with `spanCount`, per-kind counts,
 *   `errorSpanCount`, `tokenUsage`, `firstErrorMessage`, `model`, `startTime`,
 *   `endTime` and `wallMs`.
 */
function aggregateTrace(traceId, spans, fallbackModel) {
  const ordered = [...spans].sort(
    (a, b) => a.start_time.localeCompare(b.start_time) || a.span_id.localeCompare(b.span_id)
  );

  const totals = { input: 0, output: 0, cached: 0 };
  let sawCached = false;
  let llmSpanCount = 0;
  let toolSpanCount = 0;
  let agentSpanCount = 0;
  let errorSpanCount = 0;
  let firstErrorMessage;
  const modelVotes = /* @__PURE__ */ new Map();
  let earliest = ordered[0]?.start_time ?? "";
  let latest = ordered[0]?.end_time ?? "";

  for (const span of ordered) {
    if (span.start_time && (!earliest || span.start_time < earliest)) earliest = span.start_time;
    if (span.end_time && span.end_time > latest) latest = span.end_time;

    switch (span.kind) {
      case "LLM": {
        llmSpanCount += 1;
        const inputTokens = firstNumberAttr(span.attributes, INPUT_TOKEN_KEYS);
        const outputTokens = firstNumberAttr(span.attributes, OUTPUT_TOKEN_KEYS);
        const cachedTokens = firstNumberAttr(span.attributes, CACHED_TOKEN_KEYS);
        if (inputTokens !== null) totals.input += inputTokens;
        if (outputTokens !== null) totals.output += outputTokens;
        if (cachedTokens !== null) {
          totals.cached += cachedTokens;
          sawCached = true;
        }
        const modelName = firstStringAttr(span.attributes, MODEL_KEYS) ?? span.model_name;
        if (modelName) modelVotes.set(modelName, (modelVotes.get(modelName) ?? 0) + 1);
        break;
      }
      case "TOOL":
        toolSpanCount += 1;
        break;
      case "AGENT":
        agentSpanCount += 1;
        break;
      default:
        break;
    }

    if (span.status === "ERROR") {
      errorSpanCount += 1;
      if (firstErrorMessage === void 0) {
        // Error text is capped at 500 characters.
        const message = span.status_message ?? `${span.name} \u2014 STATUS_CODE_ERROR`;
        firstErrorMessage = message.slice(0, 500);
      }
    }
  }

  const model = topVote(modelVotes) ?? firstModelAttr(ordered) ?? fallbackModel;

  let wallMs = 0;
  if (earliest && latest) {
    const startMs = Date.parse(earliest);
    const endMs = Date.parse(latest);
    const bothValid = !Number.isNaN(startMs) && !Number.isNaN(endMs);
    if (bothValid) wallMs = Math.max(0, endMs - startMs);
  }

  // `cached` is only reported when at least one LLM span carried a cached count.
  const tokenUsage = { input: totals.input, output: totals.output };
  if (sawCached) tokenUsage.cached = totals.cached;

  return {
    traceId,
    spanCount: spans.length,
    llmSpanCount,
    toolSpanCount,
    agentSpanCount,
    errorSpanCount,
    tokenUsage,
    firstErrorMessage,
    model,
    startTime: earliest,
    endTime: latest,
    wallMs
  };
}
582
/**
 * Resolve the score for a trace.
 *
 * Uses `opts.scoreForTrace(traceId, agg)` when it returns something other than
 * `undefined`; otherwise the score is 0 when the trace has any error span and
 * 1 when it has none.
 *
 * @param {object} opts - Options; may provide `scoreForTrace`.
 * @param {string} traceId - Trace identifier.
 * @param {object} agg - Trace aggregate (reads `errorSpanCount`).
 * @returns {number} The score.
 * @throws {Error} When the supplied score is not a finite number.
 */
function resolveScore(opts, traceId, agg) {
  const supplied = opts.scoreForTrace?.(traceId, agg);
  if (supplied === void 0) {
    return agg.errorSpanCount > 0 ? 0 : 1;
  }
  if (Number.isFinite(supplied)) return supplied;
  throw new Error(
    `otlpToRunRecords: scoreForTrace('${traceId}') returned non-finite ${supplied}`
  );
}
594
/**
 * Resolve the USD cost of a trace.
 *
 * Precedence: the sum of costs reported on LLM spans; else
 * `(input + output tokens) * opts.priceUsdPerToken` when a price is given; else
 * zero, flagged as unpriced.
 *
 * @param {object} opts - Options; may provide `priceUsdPerToken`.
 * @param {object} agg - Trace aggregate (reads `tokenUsage.input`/`output`).
 * @param {object[]} spans - Spans of the trace.
 * @returns {{costUsd: number, costUnpriced: boolean}} Cost and unpriced flag.
 */
function resolveCost(opts, agg, spans) {
  const reportedCosts = [];
  for (const span of spans) {
    if (span.kind !== "LLM") continue;
    const cost = firstNumberAttr(span.attributes, COST_KEYS);
    if (cost !== null) reportedCosts.push(cost);
  }

  if (reportedCosts.length > 0) {
    const costUsd = reportedCosts.reduce((sum, cost) => sum + cost, 0);
    return { costUsd, costUnpriced: false };
  }

  if (opts.priceUsdPerToken === void 0) {
    return { costUsd: 0, costUnpriced: true };
  }
  const { input, output } = agg.tokenUsage;
  return { costUsd: (input + output) * opts.priceUsdPerToken, costUnpriced: false };
}
612
/**
 * Extract prompt and completion text from a trace: the prompt from the
 * earliest LLM span and the completion from the latest one (ordered by
 * `start_time`, ties broken by `span_id`).
 *
 * @param {object[]} spans - Spans of the trace.
 * @returns {{promptText?: string, completionText?: string}} Only the fields
 *   that were found; `{}` when the trace has no LLM span.
 */
function extractPromptCompletion(spans) {
  // filter() returns a fresh array, so the in-place sort leaves `spans` untouched.
  const llmSpans = spans.filter((span) => span.kind === "LLM");
  llmSpans.sort(
    (a, b) => a.start_time.localeCompare(b.start_time) || a.span_id.localeCompare(b.span_id)
  );
  if (llmSpans.length === 0) return {};

  const firstSpan = llmSpans[0];
  const lastSpan = llmSpans[llmSpans.length - 1];
  const promptText = firstStringAttr(firstSpan.attributes, ["input.value", "llm.input_messages", "gen_ai.prompt"]) ?? void 0;
  const completionText = firstStringAttr(lastSpan.attributes, ["output.value", "llm.output_messages", "gen_ai.completion"]) ?? void 0;

  const extracted = {};
  if (promptText !== void 0) extracted.promptText = promptText;
  if (completionText !== void 0) extracted.completionText = completionText;
  return extracted;
}
627
/**
 * Pick the key with the highest vote count. Ties are broken in favour of the
 * lexicographically smaller key, so the result does not depend on insertion
 * order.
 *
 * @param {Map<string, number>} votes - Vote counts per key.
 * @returns {string|null} The winning key, or `null` when no key has a count
 *   above zero.
 */
function topVote(votes) {
  let winner = null;
  let winnerCount = 0;
  for (const [candidate, count] of votes) {
    const outrightLead = count > winnerCount;
    const winsTie = count === winnerCount && winner !== null && candidate < winner;
    if (!outrightLead && !winsTie) continue;
    winner = candidate;
    winnerCount = count;
  }
  return winner;
}
638
/**
 * Return the first truthy model name found on any span, reading the model
 * attribute keys first and the span's `model_name` field second.
 *
 * @param {Iterable<object>} spans - Spans to scan, in order.
 * @returns {string|null} The model name, or `null` when no span has one.
 */
function firstModelAttr(spans) {
  for (const { attributes, model_name: declaredModel } of spans) {
    const candidate = firstStringAttr(attributes, MODEL_KEYS) ?? declaredModel;
    if (!candidate) continue;
    return candidate;
  }
  return null;
}
645
/**
 * Ensure a model identifier carries a snapshot marker.
 *
 * Identifiers that `modelHasSnapshot` accepts are returned unchanged. Otherwise
 * the `@...` suffix of `fallbackModel` is appended, or "@otlp" when
 * `fallbackModel` contains no "@".
 *
 * @param {string} model - Model identifier.
 * @param {string} fallbackModel - Fallback identifier supplying the suffix.
 * @returns {string} Identifier with a snapshot marker.
 */
function ensureSnapshot(model, fallbackModel) {
  if (modelHasSnapshot(model)) return model;
  const atIndex = fallbackModel.indexOf("@");
  const suffix = atIndex === -1 ? "@otlp" : fallbackModel.slice(atIndex);
  return `${model}${suffix}`;
}
650
/**
 * Tell whether a model identifier already carries a snapshot marker: an "@"
 * anywhere, a trailing `-YYYYMMDD` or `-YYYY-MM-DD`, or a `:date-` segment.
 *
 * @param {string} model - Model identifier.
 * @returns {boolean} True when any marker is present.
 */
function modelHasSnapshot(model) {
  const snapshotPatterns = [/-\d{8}$/, /-\d{4}-\d{2}-\d{2}$/, /:date-/];
  return model.includes("@") || snapshotPatterns.some((pattern) => pattern.test(model));
}
657
+
658
+ // src/trace/store.ts
659
+ var InMemoryTraceStore = class {
660
+ runs = /* @__PURE__ */ new Map();
661
+ allSpans = [];
662
+ allEvents = [];
663
+ allArtifacts = [];
664
+ allBudget = [];
665
+ async appendRun(run) {
666
+ if (this.runs.has(run.runId)) throw new Error(`run ${run.runId} already exists`);
667
+ this.runs.set(run.runId, { ...run });
668
+ }
669
+ async updateRun(runId, patch) {
670
+ const existing = this.runs.get(runId);
671
+ if (!existing) throw new Error(`run ${runId} not found`);
672
+ this.runs.set(runId, { ...existing, ...patch });
673
+ }
674
+ async appendSpan(span) {
675
+ this.allSpans.push({ ...span });
676
+ }
677
+ async updateSpan(spanId, patch) {
678
+ const idx = this.allSpans.findIndex((s) => s.spanId === spanId);
679
+ if (idx < 0) throw new Error(`span ${spanId} not found`);
680
+ this.allSpans[idx] = { ...this.allSpans[idx], ...patch };
681
+ }
682
+ async appendEvent(event) {
683
+ this.allEvents.push({ ...event });
684
+ }
685
+ async appendArtifact(artifact) {
686
+ this.allArtifacts.push({ ...artifact });
687
+ }
688
+ async appendBudgetEntry(entry) {
689
+ this.allBudget.push({ ...entry });
690
+ }
691
+ async getRun(runId) {
692
+ const r = this.runs.get(runId);
693
+ return r ? { ...r } : void 0;
694
+ }
695
+ async listRuns(filter = {}) {
696
+ return [...this.runs.values()].filter((r) => matchesRun(r, filter));
697
+ }
698
+ async spans(filter = {}) {
699
+ return this.allSpans.filter((s) => matchesSpan(s, filter)).map((s) => ({ ...s }));
700
+ }
701
+ async events(filter = {}) {
702
+ return this.allEvents.filter((e) => matchesEvent(e, filter)).map((e) => ({ ...e }));
703
+ }
704
+ async budget(runId) {
705
+ return this.allBudget.filter((b) => b.runId === runId).map((b) => ({ ...b }));
706
+ }
707
+ async artifacts(runId) {
708
+ return this.allArtifacts.filter((a) => a.runId === runId).map((a) => ({ ...a }));
709
+ }
710
+ };
711
/**
 * Decide whether run `r` satisfies every criterion set on filter `f`.
 *
 * Equality criteria are skipped when the filter value is falsy; `since` /
 * `until` are skipped only when `undefined`, so `0` is a valid bound. Both
 * bounds are inclusive and compare against `r.startedAt`.
 *
 * @returns `true` when the run passes all active criteria.
 */
function matchesRun(r, f) {
  // An unset (falsy) criterion accepts anything; a set one requires strict equality.
  const sameIfSet = (wanted, actual) => !wanted || actual === wanted;
  return (
    sameIfSet(f.scenarioId, r.scenarioId) &&
    sameIfSet(f.variantId, r.variantId) &&
    sameIfSet(f.status, r.status) &&
    (f.since === undefined || !(r.startedAt < f.since)) &&
    (f.until === undefined || !(r.startedAt > f.until)) &&
    (!f.tag || r.tags?.[f.tag.key] === f.tag.value) &&
    sameIfSet(f.parentRunId, r.parentRunId) &&
    sameIfSet(f.projectId, r.projectId) &&
    sameIfSet(f.chatId, r.chatId) &&
    sameIfSet(f.layer, r.layer)
  );
}
724
/**
 * Decide whether span `s` satisfies every criterion set on filter `f`.
 *
 * `toolName` only matches spans of kind `"tool"` and `judgeId` only matches
 * spans of kind `"judge"`. `since` / `until` are inclusive bounds on
 * `s.startedAt` and are ignored only when `undefined`.
 */
function matchesSpan(s, f) {
  const sameIfSet = (wanted, actual) => !wanted || actual === wanted;
  return (
    sameIfSet(f.runId, s.runId) &&
    sameIfSet(f.parentSpanId, s.parentSpanId) &&
    sameIfSet(f.kind, s.kind) &&
    sameIfSet(f.name, s.name) &&
    (!f.toolName || (s.kind === "tool" && s.toolName === f.toolName)) &&
    (!f.judgeId || (s.kind === "judge" && s.judgeId === f.judgeId)) &&
    (f.since === undefined || !(s.startedAt < f.since)) &&
    (f.until === undefined || !(s.startedAt > f.until))
  );
}
735
/**
 * Decide whether event `e` satisfies every criterion set on filter `f`.
 * `since` / `until` are inclusive bounds on `e.timestamp`, ignored only when `undefined`.
 */
function matchesEvent(e, f) {
  const sameIfSet = (wanted, actual) => !wanted || actual === wanted;
  return (
    sameIfSet(f.runId, e.runId) &&
    sameIfSet(f.spanId, e.spanId) &&
    sameIfSet(f.kind, e.kind) &&
    (f.since === undefined || !(e.timestamp < f.since)) &&
    (f.until === undefined || !(e.timestamp > f.until))
  );
}
743
/**
 * Trace store that persists every record as one JSON line (NDJSON) under `dir`.
 *
 * Writes append to `<dir>/<name>.ndjson`; once that file reaches `maxBytes`
 * it is renamed to `<name>.<Date.now()>.ndjson` and a fresh active file is
 * started. Updates are appended as records carrying `_update: true`.
 * Reads are served from an in-memory index that `load()` builds lazily by
 * replaying the files; later writes are mirrored into that index.
 */
var FileSystemTraceStore = class {
  dir;
  maxBytes;
  /** Lazy in-memory index for queries — populated on first read. */
  index;
  loaded = false;
  /**
   * @param options `dir` is the storage directory; `maxBytes` is the size at
   *   which the active file is rolled (defaults to 32 MiB).
   */
  constructor(options) {
    this.dir = options.dir;
    this.maxBytes = options.maxBytes ?? 32 * 1024 * 1024;
  }
  /** Create the storage directory (and parents) when missing. */
  async ensureDir() {
    const fs = await import("fs/promises");
    await fs.mkdir(this.dir, { recursive: true });
  }
  /**
   * Append `record` as one line to the active `<name>.ndjson` file, rolling
   * the file first when it has reached `maxBytes`, then mirror the record
   * into the in-memory index when one is loaded.
   */
  async append(name, record) {
    await this.ensureDir();
    const fs = await import("fs/promises");
    const path = await import("path");
    const active = path.join(this.dir, `${name}.ndjson`);
    try {
      const stat = await fs.stat(active);
      if (stat.size >= this.maxBytes) {
        const rolled = path.join(this.dir, `${name}.${Date.now()}.ndjson`);
        await fs.rename(active, rolled);
      }
    } catch {
      // No active file yet, or the roll failed: keep appending to the active path.
    }
    await fs.appendFile(active, `${JSON.stringify(record)}\n`, "utf8");
    // `_update` records are applied to the index by updateRun/updateSpan themselves.
    if (this.index && !record?._update) {
      try {
        await this.insertInto(name, record);
      } catch {
        // The record is already durable on disk; the index is only a cache,
        // so a failed mirror must neither fail the write nor surface as an
        // unhandled rejection.
      }
    }
  }
  /** Mirror a freshly appended record into the in-memory index, if loaded. */
  async insertInto(name, record) {
    if (!this.index) return;
    switch (name) {
      case "runs":
        // Same duplicate handling as load(), so the live index matches what
        // a reload from disk would produce.
        try {
          await this.index.appendRun(record);
        } catch {
          await this.index.updateRun(record.runId, record);
        }
        break;
      case "spans":
        await this.index.appendSpan(record);
        break;
      case "events":
        await this.index.appendEvent(record);
        break;
      case "artifacts":
        await this.index.appendArtifact(record);
        break;
      case "budget":
        await this.index.appendBudgetEntry(record);
        break;
    }
  }
  /**
   * Build (once) and return the in-memory index by replaying every
   * `*.ndjson` file in the directory.
   *
   * Rolled files are replayed oldest-first and active files last, so an
   * `_update` record is never applied before the record it patches.
   * Unreadable files, malformed lines and records the index rejects are
   * skipped individually instead of aborting the whole load. A missing
   * directory yields an empty index.
   */
  async load() {
    if (this.loaded && this.index) return this.index;
    const fs = await import("fs/promises");
    const path = await import("path");
    const store = new InMemoryTraceStore();
    let entries;
    try {
      entries = await fs.readdir(this.dir);
    } catch {
      entries = []; // directory not created yet → nothing to replay
    }
    // `<name>.<timestamp>.ndjson` sorts by timestamp; `<name>.ndjson` (active) sorts last.
    const rollOrder = (file) => {
      const parts = file.split(".");
      if (parts.length < 3) return Number.POSITIVE_INFINITY;
      const stamp = Number(parts[1]);
      return Number.isFinite(stamp) ? stamp : Number.POSITIVE_INFINITY;
    };
    const files = entries
      .filter((file) => file.endsWith(".ndjson"))
      .sort((a, b) => {
        const orderA = rollOrder(a);
        const orderB = rollOrder(b);
        if (orderA !== orderB) return orderA < orderB ? -1 : 1;
        return a < b ? -1 : a > b ? 1 : 0;
      });
    for (const file of files) {
      let content;
      try {
        content = await fs.readFile(path.join(this.dir, file), "utf8");
      } catch {
        continue; // file vanished or is unreadable; keep loading the rest
      }
      const base = file.split(".")[0];
      for (const line of content.split("\n")) {
        if (!line.trim()) continue;
        let record;
        try {
          record = JSON.parse(line);
        } catch {
          continue; // malformed line (e.g. torn write): skip only this line
        }
        try {
          if (base === "runs") {
            try {
              await store.appendRun(record);
            } catch {
              await store.updateRun(record.runId, record);
            }
          } else if (base === "spans") {
            if (record?._update) {
              try {
                await store.updateSpan(record.spanId, record);
              } catch {
                await store.appendSpan(record);
              }
            } else {
              await store.appendSpan(record);
            }
          } else if (base === "events") {
            await store.appendEvent(record);
          } else if (base === "artifacts") {
            await store.appendArtifact(record);
          } else if (base === "budget") {
            await store.appendBudgetEntry(record);
          }
        } catch {
          // A record the index cannot accept is skipped; the rest still load.
        }
      }
    }
    this.index = store;
    this.loaded = true;
    return store;
  }
  async appendRun(run) {
    await this.append("runs", run);
  }
  /** Persist the patch as an `_update` record, then apply it to the loaded index. */
  async updateRun(runId, patch) {
    await this.append("runs", { runId, ...patch, _update: true });
    if (this.index) await this.index.updateRun(runId, patch);
  }
  async appendSpan(span) {
    await this.append("spans", span);
  }
  /** Persist the patch as an `_update` record, then apply it to the loaded index. */
  async updateSpan(spanId, patch) {
    await this.append("spans", { spanId, ...patch, _update: true });
    if (this.index) await this.index.updateSpan(spanId, patch);
  }
  async appendEvent(event) {
    await this.append("events", event);
  }
  async appendArtifact(artifact) {
    await this.append("artifacts", artifact);
  }
  async appendBudgetEntry(entry) {
    await this.append("budget", entry);
  }
  async getRun(runId) {
    return (await this.load()).getRun(runId);
  }
  async listRuns(filter) {
    return (await this.load()).listRuns(filter);
  }
  async spans(filter) {
    return (await this.load()).spans(filter);
  }
  async events(filter) {
    return (await this.load()).events(filter);
  }
  async budget(runId) {
    return (await this.load()).budget(runId);
  }
  async artifacts(runId) {
    return (await this.load()).artifacts(runId);
  }
};
884
+
885
+ // src/trace/capture-fetch.ts
886
/** Default cap for captured response bodies: 2 * 1024 * 1024 (2 Mi), compared against the body text's `length`. */
var DEFAULT_BODY_CAP = 2 * 2 ** 20;
887
/**
 * Convert a `Headers`-like object (anything with `forEach((value, key) => …)`)
 * into a plain record with lower-cased keys.
 *
 * @returns The record, or `undefined` when `headers` is missing or has no entries.
 */
function headersToRecord(headers) {
  if (!headers) return undefined;
  const collected = {};
  headers.forEach((headerValue, headerName) => {
    collected[headerName.toLowerCase()] = headerValue;
  });
  const isEmpty = Object.keys(collected).length === 0;
  return isEmpty ? undefined : collected;
}
895
/**
 * Best-effort JSON decode of a body string.
 *
 * @returns `undefined` for the empty string, the parsed value when `text` is
 *   valid JSON, otherwise `text` unchanged.
 */
function parseMaybeJson(text) {
  if (text.length > 0) {
    try {
      return JSON.parse(text);
    } catch {
      // Not JSON: hand back the raw text.
      return text;
    }
  }
  return undefined;
}
903
/**
 * Extract the outgoing request body for capture.
 *
 * A string `init.body` is decoded with `parseMaybeJson`. Any other non-null
 * `init.body` yields `undefined`. With no `init.body`, a `Request` input is
 * cloned and its text decoded; read failures yield `undefined`.
 */
async function readRequestBody(input, init) {
  const explicitBody = init?.body;
  if (typeof explicitBody === "string") return parseMaybeJson(explicitBody);
  const readFromRequest = explicitBody == null && input instanceof Request;
  if (!readFromRequest) return undefined;
  try {
    // Clone so the caller's Request body stays unread.
    const text = await input.clone().text();
    return parseMaybeJson(text);
  } catch {
    return undefined;
  }
}
915
/**
 * Derive the endpoint (path relative to `baseUrl`) of a request URL.
 *
 * The base (minus trailing slashes) is stripped only when it ends on a real
 * boundary, i.e. the remainder is empty or starts with `/`, `?` or `#`. That
 * keeps base `…/v1` from matching `…/v10/x` and base `https://a.com` from
 * matching `https://a.com.evil/x`. Otherwise the URL's pathname is used, and
 * an unparseable URL is returned unchanged.
 *
 * @param url     Absolute request URL.
 * @param baseUrl Provider base URL.
 * @returns The endpoint, `"/"` when `url` equals the base.
 */
function endpointFromUrl(url, baseUrl) {
  const normalisedBase = baseUrl.replace(/\/+$/, "");
  if (normalisedBase.length > 0 && url.startsWith(normalisedBase)) {
    const rest = url.slice(normalisedBase.length);
    if (rest === "") return "/";
    const boundary = rest[0];
    if (boundary === "/" || boundary === "?" || boundary === "#") return rest;
    // Prefix matched mid-segment: not actually under the base, fall through.
  }
  try {
    return new URL(url).pathname;
  } catch {
    return url;
  }
}
924
/**
 * Wrap a fetch implementation so each call is mirrored into a raw-event sink.
 *
 * Every call records a `"request"` event before the network call, then either
 * a `"response"` event (status, headers, body) or an `"error"` event when the
 * underlying fetch rejects (the rejection is re-thrown). Events pass through
 * the redactor before reaching `sink.record`. Sink failures are swallowed
 * with a single console warning unless `opts.failClosed` is set, in which
 * case they propagate.
 *
 * @param fetch2 The fetch implementation to delegate to.
 * @param sink   Object exposing `record(event)`.
 * @param ctx    Call context: `baseUrl`, `runId`, `spanId`, `model`, optional `provider`.
 * @param opts   `redactor`, `responseBodyByteCap`, `failClosed`.
 * @returns A fetch-compatible function.
 */
function captureFetchToRawSink(fetch2, sink, ctx, opts = {}) {
  const provider = ctx.provider ?? providerFromBaseUrl(ctx.baseUrl);
  const redactor = opts.redactor ?? defaultProviderRedactor;
  const bodyCap = opts.responseBodyByteCap ?? DEFAULT_BODY_CAP;
  let warned = false;

  // Fields shared by every event of one call; id and timestamp are minted per event.
  function baseEvent(direction, endpoint) {
    return {
      eventId: crypto.randomUUID(),
      runId: ctx.runId,
      spanId: ctx.spanId,
      provider,
      model: ctx.model,
      endpoint,
      baseUrl: ctx.baseUrl,
      attemptIndex: 0,
      // retries are re-invocations one layer up; documented in 0.x
      direction,
      timestamp: Date.now(),
      redactedFields: []
    };
  }

  // Redact and hand the event to the sink; best-effort unless failClosed.
  async function record(event) {
    try {
      await sink.record(redactor(event));
    } catch (err) {
      if (opts.failClosed) throw err;
      if (warned) return;
      warned = true;
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(
        `captureFetchToRawSink: sink.record failed (capture is best-effort) \u2014 ${reason}`
      );
    }
  }

  return async (input, init) => {
    let url;
    if (typeof input === "string") url = input;
    else if (input instanceof URL) url = input.toString();
    else url = input.url;
    const rawMethod = init?.method ?? (input instanceof Request ? input.method : "GET");
    const method = rawMethod.toUpperCase();
    const endpoint = endpointFromUrl(url, ctx.baseUrl);
    const headerSource = init?.headers ?? (input instanceof Request ? input.headers : undefined);
    const reqHeaders = new Headers(headerSource);

    const requestEvent = baseEvent("request", endpoint);
    requestEvent.requestHeaders = { ...headersToRecord(reqHeaders), "x-http-method": method };
    requestEvent.requestBody = await readRequestBody(input, init);
    await record(requestEvent);

    const start = Date.now();
    let response;
    try {
      response = await fetch2(input, init);
    } catch (err) {
      const failure = baseEvent("error", endpoint);
      failure.durationMs = Date.now() - start;
      failure.errorMessage = err instanceof Error ? err.message : String(err);
      await record(failure);
      throw err;
    }

    const redactedFields = [];
    let responseBody;
    try {
      // Read from a clone so the caller still receives an unread body.
      const raw = await response.clone().text();
      const overCap = raw.length > bodyCap;
      responseBody = overCap ? raw.slice(0, bodyCap) : parseMaybeJson(raw);
      if (overCap) redactedFields.push("body_truncated");
    } catch {
      responseBody = undefined;
    }

    await record({
      ...baseEvent("response", endpoint),
      durationMs: Date.now() - start,
      statusCode: response.status,
      responseHeaders: headersToRecord(response.headers),
      responseBody,
      redactedFields
    });
    return response;
  };
}
1004
+
1005
+ // src/trace/otel.ts
1006
/**
 * Instrumentation-scope descriptor placed on every exported `scopeSpans` entry.
 * NOTE(review): `version` is a fixed literal and does not follow the package version; confirm this is intended.
 */
var OTEL_AGENT_EVAL_SCOPE = {
  name: "@tangle-network/agent-eval",
  version: "0.3.0"
};
1007
/**
 * Export one stored run as an OTLP-shaped `{ resourceSpans: [...] }` payload.
 *
 * Spans of the run become OTLP spans; events that carry a `spanId` are
 * attached to their span, events without one are ignored. Run metadata is
 * emitted as resource attributes, with `resourceAttrs` applied last so it
 * can override them.
 *
 * @param store         Trace store exposing `getRun`, `spans` and `events`.
 * @param runId         Run to export.
 * @param resourceAttrs Extra resource attributes.
 * @throws {Error} When the run does not exist.
 */
async function exportRunAsOtlp(store, runId, resourceAttrs = {}) {
  const run = await store.getRun(runId);
  if (!run) throw new Error(`run ${runId} not found`);
  const spans = await store.spans({ runId });
  const events = await store.events({ runId });

  // Bucket events by owning span.
  const eventsBySpan = new Map();
  events.forEach((event) => {
    const owner = event.spanId;
    if (!owner) return;
    if (!eventsBySpan.has(owner)) eventsBySpan.set(owner, []);
    eventsBySpan.get(owner).push(event);
  });

  const traceId = runToTraceId(run);
  const otlpSpans = [];
  for (const span of spans) {
    otlpSpans.push(spanToOtlp(span, traceId, eventsBySpan.get(span.spanId) ?? []));
  }

  const resource = {
    attributes: toAttributes({
      "service.name": "agent-eval",
      "run.id": run.runId,
      "run.scenario_id": run.scenarioId,
      "run.variant_id": run.variantId ?? "",
      "run.dataset_version": run.datasetVersion ?? "",
      "run.code_sha": run.codeSha ?? "",
      "run.model_fingerprint": run.modelFingerprint ?? "",
      ...resourceAttrs
    })
  };
  const scopeSpans = [{ scope: OTEL_AGENT_EVAL_SCOPE, spans: otlpSpans }];
  return { resourceSpans: [{ resource, scopeSpans }] };
}
1043
/**
 * Convert one stored span (plus its events) into an OTLP span object.
 *
 * A span without `endedAt` is exported with its end time equal to its start
 * time. Status code 2 (with `span.error` as message) is used for
 * `status === "error"`, code 1 for everything else.
 */
function spanToOtlp(span, traceId, events) {
  const endedAt = span.endedAt ?? span.startedAt;
  const failed = span.status === "error";
  const otlpEvents = events.map((event) => {
    return {
      timeUnixNano: msToNs(event.timestamp),
      name: event.kind,
      attributes: toAttributes(flattenPayload(event.payload))
    };
  });
  return {
    traceId,
    spanId: padSpanId(span.spanId),
    parentSpanId: span.parentSpanId ? padSpanId(span.parentSpanId) : undefined,
    name: span.name,
    kind: 1,
    // SPAN_KIND_INTERNAL
    startTimeUnixNano: msToNs(span.startedAt),
    endTimeUnixNano: msToNs(endedAt),
    attributes: toAttributes(flattenSpanAttributes(span)),
    events: otlpEvents,
    status: failed ? { code: 2, message: span.error } : { code: 1 }
  };
}
1063
/**
 * Build the flat attribute record for a span: `span.kind`, then the
 * kind-specific fields, then every primitive (string / number / boolean)
 * entry of `span.attributes`, which may override the earlier keys.
 *
 * A retrieval span with no `hits` reports `retrieval.hits = 0` instead of
 * throwing, so one incomplete span cannot abort an export.
 *
 * @param span Stored span; `span.kind` selects the kind-specific fields.
 * @returns Plain record of attribute name → value.
 */
function flattenSpanAttributes(span) {
  const base = {
    "span.kind": span.kind
  };
  if (span.kind === "llm") {
    base["llm.model"] = span.model;
    if (span.inputTokens !== void 0) base["llm.input_tokens"] = span.inputTokens;
    if (span.outputTokens !== void 0) base["llm.output_tokens"] = span.outputTokens;
    if (span.costUsd !== void 0) base["llm.cost_usd"] = span.costUsd;
    if (span.finishReason) base["llm.finish_reason"] = span.finishReason;
  } else if (span.kind === "tool") {
    base["tool.name"] = span.toolName;
    if (span.latencyMs !== void 0) base["tool.latency_ms"] = span.latencyMs;
  } else if (span.kind === "retrieval") {
    base["retrieval.query"] = span.query;
    // `hits` can be absent on partially written spans.
    base["retrieval.hits"] = span.hits?.length ?? 0;
  } else if (span.kind === "judge") {
    base["judge.id"] = span.judgeId;
    base["judge.dimension"] = span.dimension;
    base["judge.score"] = span.score;
    base["judge.target_span_id"] = span.targetSpanId;
  } else if (span.kind === "sandbox") {
    if (span.image) base["sandbox.image"] = span.image;
    if (span.exitCode !== void 0) base["sandbox.exit_code"] = span.exitCode;
    if (span.testsPassed !== void 0) base["sandbox.tests_passed"] = span.testsPassed;
    if (span.testsTotal !== void 0) base["sandbox.tests_total"] = span.testsTotal;
  }
  if (span.attributes) {
    for (const [k, v] of Object.entries(span.attributes)) {
      // Only primitives are copied; nested values are dropped.
      if (typeof v === "string" || typeof v === "number" || typeof v === "boolean") base[k] = v;
    }
  }
  return base;
}
1097
/**
 * Flatten an event payload into a record of attribute values: primitives
 * (string / number / boolean) are kept as-is, everything else is
 * JSON-stringified.
 *
 * A missing (`null` / `undefined`) payload yields an empty record instead of
 * throwing, so an event without a payload cannot abort an export.
 *
 * @param payload Event payload object, possibly absent.
 * @returns Plain record of attribute name → value.
 */
function flattenPayload(payload) {
  const out = {};
  for (const [k, v] of Object.entries(payload ?? {})) {
    if (typeof v === "string" || typeof v === "number" || typeof v === "boolean") out[k] = v;
    else out[k] = JSON.stringify(v);
  }
  return out;
}
1105
/**
 * Convert a flat record into the OTLP `[{ key, value }]` attribute list.
 *
 * Integers become `intValue` (as a decimal string), other numbers
 * `doubleValue`, booleans `boolValue`, and everything else `stringValue`.
 * Entries whose value is `null` or `undefined` are omitted: they would
 * otherwise serialise as an attribute with an empty value.
 *
 * @param record Attribute name → value.
 * @returns OTLP attribute list in the record's key order.
 */
function toAttributes(record) {
  const attributes = [];
  for (const [key, value] of Object.entries(record)) {
    if (value == null) continue;
    let encoded;
    if (typeof value === "number") {
      encoded = Number.isInteger(value) ? { intValue: value.toString() } : { doubleValue: value };
    } else if (typeof value === "boolean") {
      encoded = { boolValue: value };
    } else {
      encoded = { stringValue: value };
    }
    attributes.push({ key, value: encoded });
  }
  return attributes;
}
1111
/**
 * Convert epoch milliseconds to a nanosecond decimal string.
 * Fractional milliseconds are floored first; BigInt keeps the product exact.
 */
function msToNs(ms) {
  const wholeMillis = BigInt(Math.floor(ms));
  const nanosPerMilli = 1000000n;
  return String(wholeMillis * nanosPerMilli);
}
1114
/** Normalise a span id to 16 characters: strip dashes, truncate, right-pad with "0". */
function padSpanId(id) {
  const SPAN_ID_LENGTH = 16;
  const withoutDashes = id.split("-").join("");
  return withoutDashes.substring(0, SPAN_ID_LENGTH).padEnd(SPAN_ID_LENGTH, "0");
}
1118
/** Derive a 32-character trace id from `run.runId`: strip dashes, truncate, right-pad with "0". */
function runToTraceId(run) {
  const TRACE_ID_LENGTH = 32;
  const withoutDashes = run.runId.split("-").join("");
  return withoutDashes.substring(0, TRACE_ID_LENGTH).padEnd(TRACE_ID_LENGTH, "0");
}
1122
+
1123
+ // src/trace/otel-bridge.ts
1124
/**
 * Build a run-complete hook that exports every finished span of the run
 * (those with a truthy `endedAt`) through `exporter`, then flushes it.
 * The run id is used as the trace id.
 *
 * @param exporter Object exposing `exportSpan(span)` and `flush()`.
 * @returns Async hook taking `{ store, runId }`.
 */
function otelRunCompleteHook(exporter) {
  return async (ctx) => {
    const { runId } = ctx;
    const allSpans = await ctx.store.spans({ runId });
    allSpans
      .filter((span) => span.endedAt)
      .forEach((span) => {
        exporter.exportSpan(storeSpanToExportable(span, runId));
      });
    await exporter.flush();
  };
}
1135
/**
 * Decorate a trace store so spans are pushed to `exporter` as they finish.
 *
 * All writes and reads delegate to `inner`. A span is exported when it is
 * appended with a truthy `endedAt`, or when an `updateSpan` patch carries a
 * truthy `endedAt`; in the latter case the span is re-read from `inner`
 * (queried with `runId: traceId`) so the exported copy includes the patch.
 *
 * @param inner    Underlying trace store.
 * @param exporter Object exposing `exportSpan(span)`.
 * @param traceId  Trace id stamped on exported spans.
 */
function createOtelTracingStore(inner, exporter, traceId) {
  const emit = (span) => {
    exporter.exportSpan(storeSpanToExportable(span, traceId));
  };
  // Read-side methods are plain pass-throughs bound to the inner store.
  const readers = {};
  for (const method of ["getRun", "listRuns", "spans", "events", "budget", "artifacts"]) {
    readers[method] = inner[method].bind(inner);
  }
  return {
    appendRun: async (run) => inner.appendRun(run),
    updateRun: async (runId, patch) => inner.updateRun(runId, patch),
    appendSpan: async (span) => {
      if (span.endedAt) emit(span);
      return inner.appendSpan(span);
    },
    updateSpan: async (spanId, patch) => {
      await inner.updateSpan(spanId, patch);
      if (!patch.endedAt) return;
      const candidates = await inner.spans({ runId: traceId });
      const found = candidates.find((candidate) => candidate.spanId === spanId);
      if (found) emit(found);
    },
    appendEvent: async (event) => inner.appendEvent(event),
    appendBudgetEntry: async (entry) => inner.appendBudgetEntry(entry),
    appendArtifact: async (artifact) => inner.appendArtifact(artifact),
    ...readers
  };
}
1176
/**
 * Project a stored span onto the flat shape the OTLP exporter consumes.
 * The LLM-only fields (`model`, token counts, cost) are copied only for
 * spans of kind `"llm"` and are `undefined` otherwise.
 */
function storeSpanToExportable(span, traceId) {
  const isLlm = span.kind === "llm";
  const { spanId, parentSpanId, name, kind, startedAt, endedAt, status, error, attributes } = span;
  return {
    traceId,
    spanId,
    parentSpanId,
    name,
    kind,
    startedAt,
    endedAt,
    status,
    error,
    model: isLlm ? span.model : undefined,
    inputTokens: isLlm ? span.inputTokens : undefined,
    outputTokens: isLlm ? span.outputTokens : undefined,
    costUsd: isLlm ? span.costUsd : undefined,
    attributes
  };
}
1195
+
1196
+ // src/trace/otel-export.ts
1197
/**
 * Create a batching OTLP/HTTP JSON span exporter.
 *
 * The endpoint comes from `config.endpoint`, else from the
 * `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable; with neither set the
 * function returns `undefined` (export disabled). Spans are buffered and
 * POSTed to `<endpoint>/v1/traces` when `batchSize` is reached, every
 * `flushIntervalMs`, and on `flush()` / `shutdown()`.
 *
 * Delivery is best-effort: a failed POST is swallowed and its batch is dropped.
 *
 * @param config Optional `endpoint`, `headers`, `batchSize` (default 64),
 *   `flushIntervalMs` (default 5000), `serviceName` (default "agent-eval"),
 *   `resourceAttributes`.
 * @returns Exporter with `exportSpan`, `flush`, `shutdown`, or `undefined`.
 */
function createOtelExporter(config) {
  const resolvedEndpoint = config?.endpoint ?? (typeof process !== "undefined" ? process.env.OTEL_EXPORTER_OTLP_ENDPOINT : void 0);
  if (!resolvedEndpoint) return void 0;
  const endpoint = resolvedEndpoint;
  const headers = config?.headers ?? parseHeadersFromEnv();
  const batchSize = config?.batchSize ?? 64;
  const flushIntervalMs = config?.flushIntervalMs ?? 5e3;
  const serviceName = config?.serviceName ?? "agent-eval";
  const resourceAttrs = config?.resourceAttributes ?? {};
  const pending = [];
  let timer;
  let stopped = false;

  // POST everything buffered so far as one OTLP payload.
  async function doFlush() {
    if (pending.length === 0) return;
    // Take ownership of the batch up front so concurrent flushes never resend it.
    const batch = pending.splice(0);
    const body = {
      resourceSpans: [
        {
          resource: {
            attributes: toAttributes2({
              "service.name": serviceName,
              ...resourceAttrs
            })
          },
          scopeSpans: [{ scope: OTEL_AGENT_EVAL_SCOPE, spans: batch }]
        }
      ]
    };
    const url = `${endpoint.replace(/\/+$/, "")}/v1/traces`;
    try {
      const response = await fetch(url, {
        method: "POST",
        headers: {
          "content-type": "application/json",
          ...headers
        },
        body: JSON.stringify(body)
      });
      // The reply is not inspected, but its body must still be released:
      // an unconsumed fetch body can keep the underlying connection busy.
      await response?.body?.cancel();
    } catch {
      // Best-effort export: telemetry failures must never break the caller.
    }
  }

  const exporter = {
    /** Buffer one span; triggers a background flush once the batch is full. No-op after shutdown. */
    exportSpan(span) {
      if (stopped) return;
      pending.push(toOtlpSpan(span));
      if (pending.length >= batchSize) {
        void doFlush();
      }
    },
    /** Send everything currently buffered. */
    async flush() {
      await doFlush();
    },
    /** Stop accepting spans, stop the periodic timer, and send what is buffered. */
    async shutdown() {
      stopped = true;
      if (timer !== void 0) {
        clearInterval(timer);
        timer = void 0;
      }
      await doFlush();
    }
  };
  timer = setInterval(() => {
    if (pending.length > 0) void doFlush();
  }, flushIntervalMs);
  // Do not let the periodic flush keep the process alive (Node timers only).
  if (typeof timer === "object" && "unref" in timer) {
    timer.unref();
  }
  return exporter;
}
1267
/**
 * Parse `OTEL_EXPORTER_OTLP_HEADERS` (`key1=value1,key2=value2`) into a
 * header record.
 *
 * Pairs without `=` or with an empty key are skipped; keys and values are
 * trimmed, and a value may itself contain `=`. The variable follows the W3C
 * Baggage format, whose values are percent-encoded, so values are
 * percent-decoded. A value that is not valid percent-encoding is kept as
 * written.
 *
 * @returns Header name → value; empty when the variable is unset or `process` is unavailable.
 */
function parseHeadersFromEnv() {
  if (typeof process === "undefined") return {};
  const raw = process.env.OTEL_EXPORTER_OTLP_HEADERS;
  if (!raw) return {};
  const out = {};
  for (const pair of raw.split(",")) {
    const eq = pair.indexOf("=");
    if (eq < 0) continue;
    const key = pair.slice(0, eq).trim();
    if (!key) continue;
    let value = pair.slice(eq + 1).trim();
    try {
      value = decodeURIComponent(value);
    } catch {
      // Stray "%" that is not an escape sequence: keep the literal value.
    }
    out[key] = value;
  }
  return out;
}
1281
/**
 * Convert an exportable span into an OTLP span object.
 *
 * Attributes are `span.kind`, the LLM fields that are present, and every
 * primitive entry of `span.attributes`. A span without `endedAt` is exported
 * with its end time equal to its start time. Status code 2 (with
 * `span.error` as message) is used for `status === "error"`, code 1 otherwise.
 */
function toOtlpSpan(span) {
  const attrs = { "span.kind": span.kind };
  if (span.model) attrs["llm.model"] = span.model;
  // Numeric LLM fields are kept even when 0; only `undefined` is skipped.
  const numericFields = [
    ["llm.input_tokens", span.inputTokens],
    ["llm.output_tokens", span.outputTokens],
    ["llm.cost_usd", span.costUsd]
  ];
  for (const [attrName, attrValue] of numericFields) {
    if (attrValue !== undefined) attrs[attrName] = attrValue;
  }
  if (span.attributes) {
    const primitiveTypes = ["string", "number", "boolean"];
    Object.entries(span.attributes).forEach(([attrName, attrValue]) => {
      if (primitiveTypes.includes(typeof attrValue)) attrs[attrName] = attrValue;
    });
  }
  const endedAt = span.endedAt ?? span.startedAt;
  const failed = span.status === "error";
  return {
    traceId: padTraceId(span.traceId),
    spanId: padSpanId2(span.spanId),
    parentSpanId: span.parentSpanId ? padSpanId2(span.parentSpanId) : undefined,
    name: span.name,
    kind: 1,
    // SPAN_KIND_INTERNAL
    startTimeUnixNano: msToNs2(span.startedAt),
    endTimeUnixNano: msToNs2(endedAt),
    attributes: toAttributes2(attrs),
    status: failed ? { code: 2, message: span.error } : { code: 1 }
  };
}
1308
/**
 * Convert a flat record into the OTLP `[{ key, value }]` attribute list.
 *
 * Integers become `intValue` (as a decimal string), other numbers
 * `doubleValue`, booleans `boolValue`, and everything else `stringValue`.
 * Entries whose value is `null` or `undefined` are omitted: they would
 * otherwise serialise as an attribute with an empty value.
 *
 * @param record Attribute name → value.
 * @returns OTLP attribute list in the record's key order.
 */
function toAttributes2(record) {
  const attributes = [];
  for (const [key, value] of Object.entries(record)) {
    if (value == null) continue;
    let encoded;
    if (typeof value === "number") {
      encoded = Number.isInteger(value) ? { intValue: value.toString() } : { doubleValue: value };
    } else if (typeof value === "boolean") {
      encoded = { boolValue: value };
    } else {
      encoded = { stringValue: value };
    }
    attributes.push({ key, value: encoded });
  }
  return attributes;
}
1314
/**
 * Convert epoch milliseconds to a nanosecond decimal string.
 * Fractional milliseconds are floored first; BigInt keeps the product exact.
 */
function msToNs2(ms) {
  const wholeMillis = BigInt(Math.floor(ms));
  const nanosPerMilli = 1000000n;
  return String(wholeMillis * nanosPerMilli);
}
1317
/** Normalise a span id to 16 characters: strip dashes, truncate, right-pad with "0". */
function padSpanId2(id) {
  const SPAN_ID_LENGTH = 16;
  const withoutDashes = id.split("-").join("");
  return withoutDashes.substring(0, SPAN_ID_LENGTH).padEnd(SPAN_ID_LENGTH, "0");
}
1321
/** Normalise a trace id to 32 characters: strip dashes, truncate, right-pad with "0". */
function padTraceId(id) {
  const TRACE_ID_LENGTH = 32;
  const withoutDashes = id.split("-").join("");
  return withoutDashes.substring(0, TRACE_ID_LENGTH).padEnd(TRACE_ID_LENGTH, "0");
}
1325
+
1326
+ // src/replay.ts
1327
/**
 * Raised when replay finds no recorded response for a request.
 * Carries the request `url` and the computed `requestKey`; `message`
 * defaults to a description built from both.
 */
var ReplayCacheMissError = class extends ReplayError {
  url;
  requestKey;
  constructor(url, requestKey2, message) {
    const text = message ?? `replay cache miss for ${url} (key=${requestKey2})`;
    super(text);
    this.url = url;
    this.requestKey = requestKey2;
  }
};
1336
/**
 * In-memory cache of recorded `(request, response)` pairs, keyed by a hash
 * of the request body, used to replay provider calls without the network.
 */
var ReplayCache = class _ReplayCache {
  byKey = /* @__PURE__ */ new Map();
  orphans = 0;
  byProvider = {};
  byModel = {};
  /**
   * Build a cache from a sink's events. The sink must implement `list()`.
   * Filter by `runId` / `spanId` to scope to a specific replay.
   */
  static async fromSink(sink, filter = {}) {
    if (!sink.list) {
      throw new ReplayError("ReplayCache.fromSink: sink must implement list() to be replayable.");
    }
    const events = await sink.list(filter);
    return _ReplayCache.fromEvents(events);
  }
  /**
   * Build a cache from an in-memory event list.
   *
   * Events are bucketed by `runId::spanId::attemptIndex`. Within a bucket
   * the i-th request is paired with the i-th non-request event, so several
   * calls sharing one bucket (same span, or no span id at all) each yield
   * their own pair rather than overwriting one another. Pairing is
   * positional; NOTE(review): overlapping calls inside one bucket whose
   * responses arrive out of order cannot be told apart, since events carry
   * no correlation id.
   *
   * A request with no matching response counts as an orphan. When two pairs
   * hash to the same key the later pair wins.
   */
  static async fromEvents(events) {
    const cache = new _ReplayCache();
    const buckets = /* @__PURE__ */ new Map();
    for (const e of events) {
      const k = `${e.runId ?? ""}::${e.spanId ?? ""}::${e.attemptIndex}`;
      let bucket = buckets.get(k);
      if (!bucket) {
        bucket = { exchanges: [], requests: 0, responses: 0 };
        buckets.set(k, bucket);
      }
      const isRequest = e.direction === "request";
      // Each side advances its own cursor, so pairing does not depend on
      // whether a response was listed before or after its request.
      const position = isRequest ? bucket.requests++ : bucket.responses++;
      let exchange = bucket.exchanges[position];
      if (!exchange) {
        exchange = {};
        bucket.exchanges[position] = exchange;
      }
      if (isRequest) exchange.req = e;
      else exchange.res = e;
    }
    for (const bucket of buckets.values()) {
      for (const g of bucket.exchanges) {
        if (!g.req) continue;
        if (!g.res) {
          cache.orphans += 1;
          continue;
        }
        const key = await requestKey(g.req);
        cache.byKey.set(key, { request: g.req, response: g.res });
        cache.byProvider[g.req.provider] = (cache.byProvider[g.req.provider] ?? 0) + 1;
        cache.byModel[g.req.model] = (cache.byModel[g.req.model] ?? 0) + 1;
      }
    }
    return cache;
  }
  /** Number of cacheable (request, response) pairs in the cache. */
  size() {
    return this.byKey.size;
  }
  /** Snapshot of cache counters; the returned maps are copies. */
  stats() {
    return {
      total: this.byKey.size,
      byProvider: { ...this.byProvider },
      byModel: { ...this.byModel },
      orphanRequests: this.orphans
    };
  }
  /** Iterate every cached `(request, response)` pair in insertion order. */
  *entries() {
    for (const entry of this.byKey.values()) yield entry;
  }
  /**
   * Look up a cached response by hashing the (model, messages, temperature,
   * maxTokens, response_format) shape. Returns `undefined` on miss; the
   * caller decides whether to throw, fall back to the network, or skip.
   */
  async lookup(requestBody) {
    const key = await keyFromBody(requestBody);
    return this.byKey.get(key);
  }
};
1402
/**
 * Build a fetch-compatible function that serves `/chat/completions` calls
 * from a replay cache.
 *
 * - URLs that are not `/chat/completions` go straight to the fallback fetch.
 * - On a hit, the recorded response is rebuilt (status, headers, body). A
 *   recorded failure (`direction === "error"`) is replayed as a rejection,
 *   not as a fake 200, and a recorded 204 / 205 / 304 is rebuilt with no
 *   body because those statuses forbid one.
 * - On a miss, `opts.onMiss` decides: `"throw"` (default) raises
 *   `ReplayCacheMissError`, `"fail-closed"` returns a 599 response, any
 *   other value falls back to the network.
 *
 * Only string `init.body` values are parsed for lookup; anything else is a miss.
 *
 * @param cache Object exposing `lookup(requestBody)`.
 * @param opts  `onMiss`, `fallbackFetch`, `onHit`, `onMissNotify`.
 * @throws {ReplayError} When a fallback is needed but unavailable, or when
 *   replaying a recorded failure.
 * @throws {ReplayCacheMissError} On a miss with `onMiss === "throw"`.
 */
function createReplayFetch(cache, opts = {}) {
  const onMiss = opts.onMiss ?? "throw";
  const fallback = opts.fallbackFetch ?? globalThis.fetch?.bind(globalThis);
  // Statuses for which the Response constructor rejects a non-null body.
  const nullBodyStatuses = /* @__PURE__ */ new Set([204, 205, 304]);
  return (async (input, init) => {
    const url = typeof input === "string" ? input : input instanceof URL ? input.toString() : input.url;
    if (!/\/chat\/completions(?:[?#].*)?$/.test(url)) {
      if (!fallback)
        throw new ReplayError(
          `replay fetch: non-completions URL ${url} but no fallbackFetch configured`
        );
      return fallback(input, init);
    }
    let bodyParsed;
    if (init?.body && typeof init.body === "string") {
      try {
        bodyParsed = JSON.parse(init.body);
      } catch {
        // Unparseable body: leave undefined so the call is handled as a miss.
      }
    }
    const hit = bodyParsed === void 0 ? void 0 : await cache.lookup(bodyParsed);
    if (hit) {
      opts.onHit?.({ url, provider: hit.request.provider, model: hit.request.model });
      if (hit.response.direction === "error") {
        // The original call rejected; replay that outcome faithfully.
        throw new ReplayError(
          `replay fetch: recorded call to ${url} failed: ${hit.response.errorMessage ?? "unknown error"}`
        );
      }
      const status = hit.response.statusCode ?? 200;
      const headers = new Headers(
        Object.entries(hit.response.responseHeaders ?? { "Content-Type": "application/json" })
      );
      if (nullBodyStatuses.has(status)) {
        return new Response(null, { status, headers });
      }
      const bodyText = typeof hit.response.responseBody === "string" ? hit.response.responseBody : JSON.stringify(hit.response.responseBody ?? {});
      return new Response(bodyText, { status, headers });
    }
    opts.onMissNotify?.({ url, requestBody: bodyParsed });
    if (onMiss === "throw") {
      const key = bodyParsed === void 0 ? "<unparseable>" : await keyFromBody(bodyParsed);
      throw new ReplayCacheMissError(url, key);
    }
    if (onMiss === "fail-closed") {
      return new Response(JSON.stringify({ error: "replay_cache_miss" }), { status: 599 });
    }
    if (!fallback)
      throw new ReplayError("replay fetch: onMiss=fallback but no fallbackFetch configured");
    return fallback(input, init);
  });
}
1444
/**
 * Async-iterate the paired `(request, response)` calls recorded in a sink.
 *
 * @param sink   Event sink; must implement `list(filter)`.
 * @param filter Passed through to `sink.list`.
 * @throws {ReplayError} When the sink has no `list()`.
 */
async function* iterateRawCalls(sink, filter = {}) {
  const canList = Boolean(sink.list);
  if (!canList) {
    throw new ReplayError("iterateRawCalls: sink must implement list().");
  }
  const recorded = await sink.list(filter);
  const paired = await ReplayCache.fromEvents(recorded);
  for (const pair of paired.entries()) {
    yield pair;
  }
}
1452
/** Compute the replay-cache key of a recorded request event from its `requestBody`. */
async function requestKey(event) {
  const { requestBody } = event;
  return keyFromBody(requestBody);
}
1455
/**
 * Compute the replay-cache key for a request body.
 *
 * Object bodies are reduced to the fields that identify a completion
 * request (absent fields become `null`), canonicalised and hashed. Any
 * other body is hashed via its string form wrapped as `{ raw }`.
 */
async function keyFromBody(body) {
  const isObjectBody = body != null && typeof body === "object";
  if (!isObjectBody) return hashJson({ raw: String(body) });
  const identifyingFields = [
    "model",
    "messages",
    "temperature",
    "max_tokens",
    "max_completion_tokens",
    "response_format"
  ];
  const shape = {};
  for (const field of identifyingFields) {
    shape[field] = body[field] ?? null;
  }
  return hashJson(canonicalize(shape));
}
1468
+
1469
+ export {
1470
+ traceAnalystOnRunComplete,
1471
+ tokenizeDomainWords,
1472
+ inferDomainKeywords,
1473
+ domainEvidencePattern,
1474
+ describeTraceInsightScope,
1475
+ planTraceInsightQuestions,
1476
+ buildTraceInsightContext,
1477
+ scoreTraceInsightReadiness,
1478
+ defaultTraceInsightPanel,
1479
+ buildTraceInsightPrompt,
1480
+ flattenOtlpExportToNdjson,
1481
+ otlpToRunRecords,
1482
+ otlpToTraceRunRecords,
1483
+ InMemoryTraceStore,
1484
+ FileSystemTraceStore,
1485
+ captureFetchToRawSink,
1486
+ OTEL_AGENT_EVAL_SCOPE,
1487
+ exportRunAsOtlp,
1488
+ otelRunCompleteHook,
1489
+ createOtelTracingStore,
1490
+ createOtelExporter,
1491
+ ReplayCacheMissError,
1492
+ ReplayCache,
1493
+ createReplayFetch,
1494
+ iterateRawCalls
1495
+ };
1496
+ //# sourceMappingURL=chunk-JHA3ZGSO.js.map