@langwatch/scenario 0.4.3 → 0.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -944,9 +944,13 @@ var init_esm = __esm({
944
944
  // src/agents/index.ts
945
945
  var agents_exports = {};
946
946
  __export(agents_exports, {
947
+ DEFAULT_TOKEN_THRESHOLD: () => DEFAULT_TOKEN_THRESHOLD,
947
948
  JudgeSpanCollector: () => JudgeSpanCollector,
948
949
  JudgeSpanDigestFormatter: () => JudgeSpanDigestFormatter,
949
950
  RealtimeAgentAdapter: () => RealtimeAgentAdapter,
951
+ estimateTokens: () => estimateTokens,
952
+ expandTrace: () => expandTrace,
953
+ grepTrace: () => grepTrace,
950
954
  judgeAgent: () => judgeAgent,
951
955
  judgeSpanCollector: () => judgeSpanCollector,
952
956
  judgeSpanDigestFormatter: () => judgeSpanDigestFormatter,
@@ -954,7 +958,11 @@ __export(agents_exports, {
954
958
  });
955
959
 
956
960
  // src/agents/judge/judge-agent.ts
957
- import { tool } from "ai";
961
+ import {
962
+ tool,
963
+ stepCountIs,
964
+ hasToolCall
965
+ } from "ai";
958
966
  import { z as z4 } from "zod/v4";
959
967
 
960
968
  // src/agents/judge/judge-utils.ts
@@ -1026,6 +1034,275 @@ var JudgeUtils = {
1026
1034
  }
1027
1035
  };
1028
1036
 
1037
+ // src/agents/judge/estimate-tokens.ts
1038
+ var DEFAULT_TOKEN_THRESHOLD = 8192;
1039
/**
 * Roughly estimates how many tokens a piece of text occupies.
 *
 * Heuristic: one token per four UTF-8 bytes, rounded up.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count (0 for an empty string).
 */
function estimateTokens(text) {
  const { byteLength } = new TextEncoder().encode(text);
  return Math.ceil(byteLength / 4);
}
1043
+
1044
+ // src/agents/judge/span-utils.ts
1045
+ import { attributes } from "langwatch/observability";
1046
+
1047
+ // src/agents/judge/deep-transform.ts
1048
/**
 * Recursively applies `fn` to a value and to everything nested inside it.
 *
 * `fn` is called on the value first. If it returns something other than the
 * value it was given (compared with `!==`), that replacement is returned
 * as-is and is not descended into. Otherwise arrays and objects are copied
 * with every element / own enumerable property transformed the same way,
 * and any other value is returned unchanged.
 *
 * @param {*} value - Value to transform.
 * @param {(v: *) => *} fn - Transformer; return the argument itself to leave it untouched.
 * @returns {*} The transformed copy, or the original value for untouched primitives.
 */
function deepTransform(value, fn) {
  const replaced = fn(value);
  if (replaced !== value) return replaced;
  if (Array.isArray(value)) {
    return value.map((item) => deepTransform(item, fn));
  }
  if (value !== null && typeof value === "object") {
    // Object.fromEntries defines own data properties, so a "__proto__" key
    // (which JSON.parse can produce) is copied as plain data. Assigning
    // `out["__proto__"] = ...` on a `{}` literal would instead invoke the
    // prototype setter: the key would vanish and the copy's prototype change.
    return Object.fromEntries(
      Object.entries(value).map(([key, item]) => [key, deepTransform(item, fn)])
    );
  }
  return value;
}
1063
+
1064
+ // src/agents/judge/truncate-media.ts
1065
/**
 * Replaces a base64 image/audio/video data URL with a short placeholder.
 *
 * The placeholder has the form `[CATEGORY: mime/type, ~N bytes]`, where N is
 * the length of the base64 text (not the decoded size). Strings that are not
 * such a data URL are returned unchanged.
 *
 * @param {string} str - Candidate data URL.
 * @returns {string} Placeholder for media data URLs, otherwise `str` itself.
 */
function truncateMediaUrl(str) {
  const parts = str.match(
    /^data:((image|audio|video)\/[a-z0-9+.-]+);base64,(.+)$/i
  );
  if (!parts) return str;
  const mimeType = parts[1];
  const label = parts[2].toUpperCase();
  const payloadLength = parts[3].length;
  return `[${label}: ${mimeType}, ~${payloadLength} bytes]`;
}
1073
/**
 * Shrinks a message content part that embeds binary media.
 *
 * Handles two shapes:
 * - `{ type: "file", mediaType: string, data: string }`: `data` is replaced
 *   by `[CATEGORY: mediaType, ~N bytes]`.
 * - `{ type: "image", image: string }`: `image` is replaced when it is an
 *   image data URL, or a string longer than 1000 characters made up only of
 *   base64 characters.
 *
 * N is the length of the encoded string, not the decoded byte count.
 *
 * @param {*} v - Any value; only non-array objects are inspected.
 * @returns {object|null} A shallow copy with the payload replaced, or `null`
 *   when `v` is not a media part this function recognises.
 */
function truncateMediaPart(v) {
  if (v === null || typeof v !== "object" || Array.isArray(v)) return null;
  const part = v;
  if (part.type === "file" && typeof part.mediaType === "string" && typeof part.data === "string") {
    // split() always yields at least one string, so a nullish check can never
    // reach the fallback; `||` makes an empty category fall back to "FILE".
    const category = part.mediaType.split("/")[0].toUpperCase() || "FILE";
    return {
      ...part,
      data: `[${category}: ${part.mediaType}, ~${part.data.length} bytes]`
    };
  }
  if (part.type === "image" && typeof part.image === "string") {
    const imageData = part.image;
    const dataUrlMatch = imageData.match(
      /^data:((image)\/[a-z0-9+.-]+);base64,(.+)$/i
    );
    if (dataUrlMatch) {
      return {
        ...part,
        image: `[IMAGE: ${dataUrlMatch[1]}, ~${dataUrlMatch[3].length} bytes]`
      };
    }
    // Heuristic for raw base64 without a data-URL prefix: long and made up
    // solely of base64 alphabet characters.
    if (imageData.length > 1e3 && /^[A-Za-z0-9+/=]+$/.test(imageData)) {
      return {
        ...part,
        image: `[IMAGE: unknown, ~${imageData.length} bytes]`
      };
    }
  }
  return null;
}
1104
+
1105
+ // src/agents/judge/span-utils.ts
1106
/**
 * Converts a `[seconds, nanoseconds]` pair into milliseconds.
 *
 * @param {[number, number]} hrTime - Time as `[seconds, nanoseconds]`.
 * @returns {number} The same instant in (possibly fractional) milliseconds.
 */
function hrTimeToMs(hrTime) {
  const [seconds, nanoseconds] = hrTime;
  return seconds * 1e3 + nanoseconds / 1e6;
}
1109
/**
 * Formats a duration for display: whole milliseconds below one second,
 * seconds with two decimals from one second upwards.
 *
 * The threshold is checked on the rounded value so that inputs just under
 * one second (for example 999.6) print as "1.00s" rather than "1000ms".
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} For example "42ms" or "1.50s".
 */
function formatDuration(ms) {
  const roundedMs = Math.round(ms);
  if (roundedMs < 1e3) return `${roundedMs}ms`;
  return `${(ms / 1e3).toFixed(2)}s`;
}
1113
/**
 * Computes how long a span lasted.
 *
 * @param {{startTime: [number, number], endTime: [number, number]}} span -
 *   Span whose start and end times are accepted by `hrTimeToMs`.
 * @returns {number} End time minus start time, in milliseconds.
 */
function calculateSpanDuration(span) {
  const endedAtMs = hrTimeToMs(span.endTime);
  const startedAtMs = hrTimeToMs(span.startTime);
  return endedAtMs - startedAtMs;
}
1116
/**
 * Builds the error suffix appended to a span's header line.
 *
 * @param {{status: {code: number, message?: string}}} span - Span to inspect.
 * @returns {string} A warning-sign "ERROR: <message>" suffix when the status
 *   code is 2 (presumably OpenTelemetry's ERROR status - confirm), using
 *   "unknown" when no message is set; otherwise an empty string.
 */
function getStatusIndicator(span) {
  const { status } = span;
  const isError = status.code === 2;
  return isError ? ` \u26A0\uFE0F ERROR: ${status.message ?? "unknown"}` : "";
}
1122
/**
 * Summarises the token usage recorded on a span.
 *
 * Reads the `gen_ai.usage.input_tokens` and `gen_ai.usage.output_tokens`
 * attributes. Values that do not convert to a non-zero number count as 0.
 *
 * @param {{attributes: Object<string, *>}} span - Span to inspect.
 * @returns {string} `", <total> tokens"`, or an empty string when neither
 *   attribute is present.
 */
function getTokenUsage(span) {
  const inputTokens = span.attributes["gen_ai.usage.input_tokens"];
  const outputTokens = span.attributes["gen_ai.usage.output_tokens"];
  const hasUsage = inputTokens != null || outputTokens != null;
  if (!hasUsage) return "";
  const toCount = (raw) => Number(raw) || 0;
  return `, ${toCount(inputTokens) + toCount(outputTokens)} tokens`;
}
1129
/**
 * Produces a display-friendly copy of a span/event attribute bag.
 *
 * - Drops the thread-id attribute and the `langwatch.scenario.id` /
 *   `langwatch.scenario.name` attributes.
 * - Strips a leading `langwatch.` from the remaining keys.
 * - When two keys collapse to the same cleaned name, the first one
 *   encountered wins.
 *
 * @param {Object<string, *>} attrs - Raw attributes.
 * @returns {Object<string, *>} New object holding the cleaned attributes.
 */
function cleanAttributes(attrs) {
  const excludedKeys = [
    attributes.ATTR_LANGWATCH_THREAD_ID,
    "langwatch.scenario.id",
    "langwatch.scenario.name"
  ];
  const cleaned = {};
  const seen = /* @__PURE__ */ new Set();
  Object.entries(attrs)
    .filter(([key]) => !excludedKeys.includes(key))
    .forEach(([key, value]) => {
      const cleanKey = key.replace(/^(langwatch)\./, "");
      if (seen.has(cleanKey)) return;
      seen.add(cleanKey);
      cleaned[cleanKey] = value;
    });
  return cleaned;
}
1149
/**
 * Renders an attribute value as a single string for the trace output.
 *
 * The value is first passed through `transformValue`; strings are used
 * verbatim and everything else is serialised with `JSON.stringify`.
 *
 * @param {*} value - Raw attribute value.
 * @returns {string} Text representation of the transformed value.
 */
function formatValue(value) {
  const processed = transformValue(value);
  if (typeof processed === "string") return processed;
  return JSON.stringify(processed);
}
1153
/**
 * Deeply rewrites a value so embedded media payloads become short
 * placeholders.
 *
 * For every nested value (visited via `deepTransform`):
 * - media content parts are replaced by `truncateMediaPart`;
 * - media data-URL strings are replaced by `truncateMediaUrl`;
 * - strings that look like JSON are parsed, transformed recursively and
 *   re-serialised; if parsing fails the string is kept as it is.
 *
 * @param {*} value - Value to transform.
 * @returns {*} Transformed copy of `value`.
 */
function transformValue(value) {
  const transformOne = (v) => {
    const mediaPart = truncateMediaPart(v);
    if (mediaPart) return mediaPart;
    if (typeof v !== "string") return v;
    const shortened = truncateMediaUrl(v);
    if (shortened !== v) return shortened;
    if (!looksLikeJson(v)) return v;
    try {
      return JSON.stringify(transformValue(JSON.parse(v)));
    } catch {
      // Only looked like JSON; fall back to the raw string.
      return v;
    }
  };
  return deepTransform(value, transformOne);
}
1170
/**
 * Cheap check for whether a string might be a JSON object or array.
 *
 * Only the first and last non-whitespace characters are examined; the
 * string is not parsed.
 *
 * @param {string} str - Candidate text.
 * @returns {boolean} `true` when the trimmed text is wrapped in `{}` or `[]`.
 */
function looksLikeJson(str) {
  const trimmed = str.trim();
  const isObjectLike = trimmed.startsWith("{") && trimmed.endsWith("}");
  const isArrayLike = trimmed.startsWith("[") && trimmed.endsWith("]");
  return isObjectLike || isArrayLike;
}
1174
/**
 * Wraps spans in lookup nodes, ordered by start time.
 *
 * The input array is not modified. Each node carries the span, an empty
 * `children` array and `shortId`, the first 8 characters of the span ID.
 *
 * @param {Array<object>} spans - Spans exposing `startTime` and `spanContext()`.
 * @returns {Array<{span: object, children: Array, shortId: string}>} Nodes
 *   sorted by ascending start time.
 */
function indexSpans(spans) {
  const byStartTime = (a, b) => hrTimeToMs(a.startTime) - hrTimeToMs(b.startTime);
  const ordered = Array.from(spans).sort(byStartTime);
  return ordered.map((span) => {
    const { spanId } = span.spanContext();
    return { span, children: [], shortId: spanId.slice(0, 8) };
  });
}
1184
+
1185
+ // src/agents/judge/trace-tools.ts
1186
+ var TOOL_RESULT_TOKEN_BUDGET = 4096;
1187
+ var TOOL_RESULT_CHAR_BUDGET = TOOL_RESULT_TOKEN_BUDGET * 4;
1188
+ var MAX_GREP_MATCHES = 20;
1189
/**
 * Renders one span node in full detail as a list of text lines.
 *
 * Output order: a header line (`[shortId] ISO-timestamp name (duration)`
 * plus any error indicator), one line per cleaned attribute, then for each
 * event an `[event] name` line followed by that event's cleaned attributes.
 *
 * @param {{span: object, shortId: string}} node - Node produced by `indexSpans`.
 * @returns {string[]} Lines describing the span.
 */
function renderFullSpanNode(node) {
  const { span, shortId } = node;
  const durationMs = calculateSpanDuration(span);
  const startedAt = new Date(hrTimeToMs(span.startTime)).toISOString();
  const errorSuffix = getStatusIndicator(span);
  const output = [
    `[${shortId}] ${startedAt} ${span.name} (${formatDuration(durationMs)})${errorSuffix}`
  ];
  for (const [key, value] of Object.entries(cleanAttributes(span.attributes))) {
    output.push(` ${key}: ${formatValue(value)}`);
  }
  for (const event of span.events) {
    output.push(` [event] ${event.name}`);
    if (!event.attributes) continue;
    for (const [key, value] of Object.entries(cleanAttributes(event.attributes))) {
      output.push(` ${key}: ${formatValue(value)}`);
    }
  }
  return output;
}
1217
/**
 * Caps tool output at `TOOL_RESULT_CHAR_BUDGET` characters.
 *
 * Text within the budget is returned unchanged. Longer text is cut at the
 * budget and a "[TRUNCATED]" notice is appended (so the returned string is
 * longer than the budget by the length of that notice).
 *
 * @param {string} text - Tool output.
 * @returns {string} `text`, or its truncated form followed by the notice.
 */
function truncateToCharBudget(text) {
  if (text.length <= TOOL_RESULT_CHAR_BUDGET) return text;
  let cut = TOOL_RESULT_CHAR_BUDGET;
  // String indices count UTF-16 code units. If the cut lands right after a
  // high surrogate, drop it so the output never ends in half a character.
  const lastUnit = text.charCodeAt(cut - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    cut -= 1;
  }
  const truncated = text.slice(0, cut);
  return truncated + "\n\n[TRUNCATED] Output exceeded ~4000 token budget. Use grep_trace(pattern) to search for specific content, or expand_trace with fewer span IDs.";
}
1222
/**
 * Flattens a span into newline-separated text for searching.
 *
 * Contains, in order: the span name, one `key: value` line per cleaned span
 * attribute, then for each event its name followed by its cleaned
 * attributes in the same `key: value` form.
 *
 * @param {object} span - Span exposing `name`, `attributes` and `events`.
 * @returns {string} Searchable text, parts joined with "\n".
 */
function spanToSearchableText(span) {
  const parts = [span.name];
  const appendAttributes = (rawAttrs) => {
    for (const [key, value] of Object.entries(cleanAttributes(rawAttrs))) {
      parts.push(`${key}: ${formatValue(value)}`);
    }
  };
  appendAttributes(span.attributes);
  for (const event of span.events) {
    parts.push(event.name);
    if (event.attributes) appendAttributes(event.attributes);
  }
  return parts.join("\n");
}
1239
/**
 * Renders the full details of the spans selected by ID prefix.
 *
 * A span is selected when its full span ID starts with any of the given
 * prefixes. Prefixes are trimmed and blank ones are ignored, because an
 * empty prefix would match every span through `startsWith("")`.
 *
 * @param {Array<object>} spans - All spans available for the trace.
 * @param {string[]} spanIds - Span IDs or ID prefixes to expand.
 * @returns {string} The rendered spans (capped by `truncateToCharBudget`),
 *   or a message when there are no spans, no usable IDs, or no match.
 */
function expandTrace(spans, spanIds) {
  const nodes = indexSpans(spans);
  if (nodes.length === 0) {
    return "No spans recorded.";
  }
  const prefixes = spanIds
    .map((id) => String(id).trim())
    .filter((id) => id !== "");
  if (prefixes.length === 0) {
    return "Error: provide at least one span ID.";
  }
  const selected = nodes.filter((n) => {
    const fullId = n.span.spanContext().spanId;
    return prefixes.some((prefix) => fullId.startsWith(prefix));
  });
  if (selected.length === 0) {
    const available = nodes.map((n) => n.shortId).join(", ");
    return `Error: no spans matched the given ID(s). Available span IDs: ${available}`;
  }
  const lines = [];
  for (const node of selected) {
    lines.push(...renderFullSpanNode(node));
    // Blank separator between spans; the trailing one is trimmed below.
    lines.push("");
  }
  return truncateToCharBudget(lines.join("\n").trimEnd());
}
1263
/**
 * Searches every span's name, attributes and events for a literal,
 * case-insensitive substring.
 *
 * The pattern is regex-escaped before use, so it is matched literally. At
 * most `MAX_GREP_MATCHES` matching spans are listed; each is shown as a
 * header line followed by its matching lines. A blank or non-string pattern
 * is rejected, because an empty pattern would match every line of every
 * span.
 *
 * @param {Array<object>} spans - All spans available for the trace.
 * @param {string} pattern - Text to look for.
 * @returns {string} The matches (capped by `truncateToCharBudget`), or a
 *   message when there are no spans, no usable pattern, or no match.
 */
function grepTrace(spans, pattern) {
  const nodes = indexSpans(spans);
  if (nodes.length === 0) {
    return "No spans recorded.";
  }
  if (typeof pattern !== "string" || pattern.trim() === "") {
    return "Error: provide a non-empty search pattern.";
  }
  const regex = new RegExp(escapeRegex(pattern), "i");
  const matches = [];
  for (const node of nodes) {
    const matchingLines = spanToSearchableText(node.span)
      .split("\n")
      .filter((line) => regex.test(line));
    if (matchingLines.length > 0) {
      matches.push({ node, matchingLines });
    }
  }
  if (matches.length === 0) {
    const spanNames = Array.from(new Set(nodes.map((n) => n.span.name)));
    return `No matches found for "${pattern}". Available span names: ${spanNames.join(", ")}`;
  }
  // Counts matching spans, not individual matching lines.
  const totalMatches = matches.length;
  const limited = matches.slice(0, MAX_GREP_MATCHES);
  const lines = [];
  for (const { node, matchingLines } of limited) {
    const duration = calculateSpanDuration(node.span);
    lines.push(
      `--- [${node.shortId}] ${node.span.name} (${formatDuration(duration)}) ---`
    );
    for (const line of matchingLines) {
      lines.push(` ${line}`);
    }
    lines.push("");
  }
  if (totalMatches > MAX_GREP_MATCHES) {
    lines.push(
      `[${totalMatches - MAX_GREP_MATCHES} more matches omitted. Refine your search pattern for more specific results.]`
    );
  }
  return truncateToCharBudget(lines.join("\n").trimEnd());
}
1302
/**
 * Escapes regular-expression metacharacters so the text can be embedded in
 * a `RegExp` and matched literally.
 *
 * @param {string} str - Raw text.
 * @returns {string} Text with `. * + ? ^ $ { } ( ) | [ ] \` backslash-escaped.
 */
function escapeRegex(str) {
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  return str.replace(metaCharacters, (ch) => `\\${ch}`);
}
1305
+
1029
1306
  // src/config/env.ts
1030
1307
  import { z } from "zod/v4";
1031
1308
 
@@ -1381,7 +1658,7 @@ var criterionToParamName = (criterion) => {
1381
1658
  };
1382
1659
 
1383
1660
  // src/agents/judge/judge-span-collector.ts
1384
- import { attributes } from "langwatch/observability";
1661
+ import { attributes as attributes2 } from "langwatch/observability";
1385
1662
  var JudgeSpanCollector = class {
1386
1663
  spans = [];
1387
1664
  onStart() {
@@ -1424,7 +1701,7 @@ var JudgeSpanCollector = class {
1424
1701
  const spanId = span.spanContext().spanId;
1425
1702
  if (visited.has(spanId)) return false;
1426
1703
  visited.add(spanId);
1427
- if (span.attributes[attributes.ATTR_LANGWATCH_THREAD_ID] === threadId) {
1704
+ if (span.attributes[attributes2.ATTR_LANGWATCH_THREAD_ID] === threadId) {
1428
1705
  return true;
1429
1706
  }
1430
1707
  const parentId = getParentSpanId(span);
@@ -1443,26 +1720,6 @@ function getParentSpanId(span) {
1443
1720
  }
1444
1721
  var judgeSpanCollector = new JudgeSpanCollector();
1445
1722
 
1446
- // src/agents/judge/judge-span-digest-formatter.ts
1447
- import { attributes as attributes2 } from "langwatch/observability";
1448
-
1449
- // src/agents/judge/deep-transform.ts
1450
- function deepTransform(value, fn) {
1451
- const result = fn(value);
1452
- if (result !== value) return result;
1453
- if (Array.isArray(value)) {
1454
- return value.map((v) => deepTransform(v, fn));
1455
- }
1456
- if (value !== null && typeof value === "object") {
1457
- const out = {};
1458
- for (const [k, v] of Object.entries(value)) {
1459
- out[k] = deepTransform(v, fn);
1460
- }
1461
- return out;
1462
- }
1463
- return value;
1464
- }
1465
-
1466
1723
  // src/agents/judge/string-deduplicator.ts
1467
1724
  var StringDeduplicator = class {
1468
1725
  seen = /* @__PURE__ */ new Map();
@@ -1496,51 +1753,49 @@ var StringDeduplicator = class {
1496
1753
  }
1497
1754
  };
1498
1755
 
1499
- // src/agents/judge/truncate-media.ts
1500
- function truncateMediaUrl(str) {
1501
- const match = str.match(
1502
- /^data:((image|audio|video)\/[a-z0-9+.-]+);base64,(.+)$/i
1503
- );
1504
- if (!match) return str;
1505
- const [, mimeType, category, data] = match;
1506
- return `[${category.toUpperCase()}: ${mimeType}, ~${data.length} bytes]`;
1507
- }
1508
- function truncateMediaPart(v) {
1509
- var _a;
1510
- if (v === null || typeof v !== "object" || Array.isArray(v)) return null;
1511
- const obj = v;
1512
- if (obj.type === "file" && typeof obj.mediaType === "string" && typeof obj.data === "string") {
1513
- const category = ((_a = obj.mediaType.split("/")[0]) == null ? void 0 : _a.toUpperCase()) ?? "FILE";
1514
- return {
1515
- ...obj,
1516
- data: `[${category}: ${obj.mediaType}, ~${obj.data.length} bytes]`
1517
- };
1518
- }
1519
- if (obj.type === "image" && typeof obj.image === "string") {
1520
- const imageData = obj.image;
1521
- const dataUrlMatch = imageData.match(
1522
- /^data:((image)\/[a-z0-9+.-]+);base64,(.+)$/i
1523
- );
1524
- if (dataUrlMatch) {
1525
- return {
1526
- ...obj,
1527
- image: `[IMAGE: ${dataUrlMatch[1]}, ~${dataUrlMatch[3].length} bytes]`
1528
- };
1529
- }
1530
- if (imageData.length > 1e3 && /^[A-Za-z0-9+/=]+$/.test(imageData)) {
1531
- return {
1532
- ...obj,
1533
- image: `[IMAGE: unknown, ~${imageData.length} bytes]`
1534
- };
1535
- }
1536
- }
1537
- return null;
1538
- }
1539
-
1540
1756
  // src/agents/judge/judge-span-digest-formatter.ts
1541
1757
  var JudgeSpanDigestFormatter = class {
1542
1758
  logger = new Logger("JudgeSpanDigestFormatter");
1543
1759
  deduplicator = new StringDeduplicator({ threshold: 50 });
1760
+ /**
1761
+ * Formats spans into a structure-only digest showing span tree hierarchy
1762
+ * without attributes, events, or content. Used for large traces that
1763
+ * exceed the token threshold, paired with expand_trace/grep_trace tools.
1764
+ *
1765
+ * @param spans - All spans for a thread
1766
+ * @returns Plain text digest with only structural information
1767
+ */
1768
+ formatStructureOnly(spans) {
1769
+ this.logger.debug("formatStructureOnly() called", {
1770
+ spanCount: spans.length
1771
+ });
1772
+ if (spans.length === 0) {
1773
+ return "No spans recorded.";
1774
+ }
1775
+ const sortedSpans = this.sortByStartTime(spans);
1776
+ const tree = this.buildHierarchy(sortedSpans);
1777
+ const totalDuration = this.calculateTotalDuration(sortedSpans);
1778
+ const lines = [
1779
+ `Spans: ${spans.length} | Total Duration: ${formatDuration(totalDuration)}`,
1780
+ ""
1781
+ ];
1782
+ const rootCount = tree.length;
1783
+ tree.forEach((node, idx) => {
1784
+ this.renderStructureNode(
1785
+ node,
1786
+ lines,
1787
+ 0,
1788
+ idx === rootCount - 1
1789
+ );
1790
+ });
1791
+ const errors = this.collectErrors(spans);
1792
+ if (errors.length > 0) {
1793
+ lines.push("");
1794
+ lines.push("=== ERRORS ===");
1795
+ errors.forEach((e) => lines.push(e));
1796
+ }
1797
+ return lines.join("\n");
1798
+ }
1544
1799
  /**
1545
1800
  * Formats spans into a complete digest with full content and nesting.
1546
1801
  * @param spans - All spans for a thread
@@ -1564,19 +1819,17 @@ var JudgeSpanDigestFormatter = class {
1564
1819
  totalDuration
1565
1820
  });
1566
1821
  const lines = [
1567
- `Spans: ${spans.length} | Total Duration: ${this.formatDuration(
1822
+ `Spans: ${spans.length} | Total Duration: ${formatDuration(
1568
1823
  totalDuration
1569
1824
  )}`,
1570
1825
  ""
1571
1826
  ];
1572
- let sequence = 1;
1573
1827
  const rootCount = tree.length;
1574
1828
  tree.forEach((node, idx) => {
1575
- sequence = this.renderNode(
1829
+ this.renderNode(
1576
1830
  node,
1577
1831
  lines,
1578
1832
  0,
1579
- sequence,
1580
1833
  idx === rootCount - 1
1581
1834
  );
1582
1835
  });
@@ -1590,9 +1843,7 @@ var JudgeSpanDigestFormatter = class {
1590
1843
  }
1591
1844
  sortByStartTime(spans) {
1592
1845
  return [...spans].sort((a, b) => {
1593
- const aTime = this.hrTimeToMs(a.startTime);
1594
- const bTime = this.hrTimeToMs(b.startTime);
1595
- return aTime - bTime;
1846
+ return hrTimeToMs(a.startTime) - hrTimeToMs(b.startTime);
1596
1847
  });
1597
1848
  }
1598
1849
  buildHierarchy(spans) {
@@ -1612,46 +1863,66 @@ var JudgeSpanDigestFormatter = class {
1612
1863
  }
1613
1864
  return roots;
1614
1865
  }
1615
- renderNode(node, lines, depth, sequence, isLast = true) {
1866
+ renderStructureNode(node, lines, depth, isLast = true) {
1616
1867
  const span = node.span;
1617
- const duration = this.calculateSpanDuration(span);
1618
- const timestamp = this.formatTimestamp(span.startTime);
1619
- const status = this.getStatusIndicator(span);
1868
+ const shortId = span.spanContext().spanId.slice(0, 8);
1869
+ const duration = calculateSpanDuration(span);
1870
+ const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
1871
+ const status = getStatusIndicator(span);
1872
+ const tokens = getTokenUsage(span);
1620
1873
  const prefix = this.getTreePrefix(depth, isLast);
1621
1874
  lines.push(
1622
- `${prefix}[${sequence}] ${new Date(timestamp).toISOString()} ${span.name} (${this.formatDuration(duration)})${status}`
1875
+ `${prefix}[${shortId}] ${timestamp} ${span.name} (${formatDuration(duration)}${tokens})${status}`
1876
+ );
1877
+ lines.push("");
1878
+ const childCount = node.children.length;
1879
+ node.children.forEach((child, idx) => {
1880
+ this.renderStructureNode(
1881
+ child,
1882
+ lines,
1883
+ depth + 1,
1884
+ idx === childCount - 1
1885
+ );
1886
+ });
1887
+ }
1888
+ renderNode(node, lines, depth, isLast = true) {
1889
+ const span = node.span;
1890
+ const shortId = span.spanContext().spanId.slice(0, 8);
1891
+ const duration = calculateSpanDuration(span);
1892
+ const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
1893
+ const status = getStatusIndicator(span);
1894
+ const prefix = this.getTreePrefix(depth, isLast);
1895
+ lines.push(
1896
+ `${prefix}[${shortId}] ${timestamp} ${span.name} (${formatDuration(duration)})${status}`
1623
1897
  );
1624
1898
  const attrIndent = this.getAttrIndent(depth, isLast);
1625
- const attrs = this.cleanAttributes(span.attributes);
1899
+ const attrs = cleanAttributes(span.attributes);
1626
1900
  if (Object.keys(attrs).length > 0) {
1627
1901
  for (const [key, value] of Object.entries(attrs)) {
1628
- lines.push(`${attrIndent}${key}: ${this.formatValue(value)}`);
1902
+ lines.push(`${attrIndent}${key}: ${this.formatValueWithDedup(value)}`);
1629
1903
  }
1630
1904
  }
1631
1905
  if (span.events.length > 0) {
1632
1906
  for (const event of span.events) {
1633
1907
  lines.push(`${attrIndent}[event] ${event.name}`);
1634
1908
  if (event.attributes) {
1635
- const eventAttrs = this.cleanAttributes(event.attributes);
1909
+ const eventAttrs = cleanAttributes(event.attributes);
1636
1910
  for (const [key, value] of Object.entries(eventAttrs)) {
1637
- lines.push(`${attrIndent} ${key}: ${this.formatValue(value)}`);
1911
+ lines.push(`${attrIndent} ${key}: ${this.formatValueWithDedup(value)}`);
1638
1912
  }
1639
1913
  }
1640
1914
  }
1641
1915
  }
1642
1916
  lines.push("");
1643
- let nextSeq = sequence + 1;
1644
1917
  const childCount = node.children.length;
1645
1918
  node.children.forEach((child, idx) => {
1646
- nextSeq = this.renderNode(
1919
+ this.renderNode(
1647
1920
  child,
1648
1921
  lines,
1649
1922
  depth + 1,
1650
- nextSeq,
1651
1923
  idx === childCount - 1
1652
1924
  );
1653
1925
  });
1654
- return nextSeq;
1655
1926
  }
1656
1927
  getTreePrefix(depth, isLast) {
1657
1928
  if (depth === 0) return "";
@@ -1663,42 +1934,26 @@ var JudgeSpanDigestFormatter = class {
1663
1934
  const continuation = isLast ? " " : "\u2502 ";
1664
1935
  return "\u2502 ".repeat(depth - 1) + continuation + " ";
1665
1936
  }
1666
- cleanAttributes(attrs) {
1667
- const cleaned = {};
1668
- const seen = /* @__PURE__ */ new Set();
1669
- const excludedKeys = [
1670
- attributes2.ATTR_LANGWATCH_THREAD_ID,
1671
- "langwatch.scenario.id",
1672
- "langwatch.scenario.name"
1673
- ];
1674
- for (const [key, value] of Object.entries(attrs)) {
1675
- if (excludedKeys.includes(key)) {
1676
- continue;
1677
- }
1678
- const cleanKey = key.replace(/^(langwatch)\./, "");
1679
- if (!seen.has(cleanKey)) {
1680
- seen.add(cleanKey);
1681
- cleaned[cleanKey] = value;
1682
- }
1683
- }
1684
- return cleaned;
1685
- }
1686
- formatValue(value) {
1687
- const processed = this.transformValue(value);
1937
+ /**
1938
+ * Formats a value with deduplication applied. Used by the `format()` method
1939
+ * to reduce token usage by replacing repeated strings with markers.
1940
+ */
1941
+ formatValueWithDedup(value) {
1942
+ const processed = this.transformValueWithDedup(value);
1688
1943
  return typeof processed === "string" ? processed : JSON.stringify(processed);
1689
1944
  }
1690
- transformValue(value) {
1945
+ transformValueWithDedup(value) {
1691
1946
  return deepTransform(value, (v) => {
1692
1947
  const mediaPart = truncateMediaPart(v);
1693
1948
  if (mediaPart) return mediaPart;
1694
1949
  if (typeof v !== "string") return v;
1695
- return this.transformString(v);
1950
+ return this.transformStringWithDedup(v);
1696
1951
  });
1697
1952
  }
1698
- transformString(str) {
1699
- if (this.looksLikeJson(str)) {
1953
+ transformStringWithDedup(str) {
1954
+ if (looksLikeJson(str)) {
1700
1955
  try {
1701
- const processed = this.transformValue(JSON.parse(str));
1956
+ const processed = this.transformValueWithDedup(JSON.parse(str));
1702
1957
  return JSON.stringify(processed);
1703
1958
  } catch {
1704
1959
  }
@@ -1707,36 +1962,12 @@ var JudgeSpanDigestFormatter = class {
1707
1962
  if (truncated !== str) return truncated;
1708
1963
  return this.deduplicator.process(str);
1709
1964
  }
1710
- looksLikeJson(str) {
1711
- const t = str.trim();
1712
- return t.startsWith("{") && t.endsWith("}") || t.startsWith("[") && t.endsWith("]");
1713
- }
1714
- hrTimeToMs(hrTime) {
1715
- return hrTime[0] * 1e3 + hrTime[1] / 1e6;
1716
- }
1717
- calculateSpanDuration(span) {
1718
- return this.hrTimeToMs(span.endTime) - this.hrTimeToMs(span.startTime);
1719
- }
1720
1965
  calculateTotalDuration(spans) {
1721
1966
  if (spans.length === 0) return 0;
1722
- const first = this.hrTimeToMs(spans[0].startTime);
1723
- const last = Math.max(...spans.map((s) => this.hrTimeToMs(s.endTime)));
1967
+ const first = hrTimeToMs(spans[0].startTime);
1968
+ const last = Math.max(...spans.map((s) => hrTimeToMs(s.endTime)));
1724
1969
  return last - first;
1725
1970
  }
1726
- formatDuration(ms) {
1727
- if (ms < 1e3) return `${Math.round(ms)}ms`;
1728
- return `${(ms / 1e3).toFixed(2)}s`;
1729
- }
1730
- formatTimestamp(hrTime) {
1731
- const ms = this.hrTimeToMs(hrTime);
1732
- return new Date(ms).toISOString();
1733
- }
1734
- getStatusIndicator(span) {
1735
- if (span.status.code === 2) {
1736
- return ` \u26A0\uFE0F ERROR: ${span.status.message ?? "unknown"}`;
1737
- }
1738
- return "";
1739
- }
1740
1971
  collectErrors(spans) {
1741
1972
  return spans.filter((s) => s.status.code === 2).map((s) => `- ${s.name}: ${s.status.message ?? "unknown error"}`);
1742
1973
  }
@@ -1799,15 +2030,41 @@ function buildFinishTestTool(criteria) {
1799
2030
  })
1800
2031
  });
1801
2032
  }
2033
/**
 * Creates the two trace-exploration tools offered to the judge model.
 *
 * Both tools close over `spans`, so they always operate on the trace they
 * were built for.
 *
 * @param {Array<object>} spans - Spans of the trace being judged.
 * @returns {{expand_trace: object, grep_trace: object}} Tool definitions
 *   keyed by the tool name exposed to the model.
 */
function buildProgressiveDiscoveryTools(spans) {
  const expandTraceTool = tool({
    description: "Expand one or more spans to see their full details (attributes, events, content). Use the span ID shown in brackets in the trace skeleton.",
    inputSchema: z4.object({
      span_ids: z4.array(z4.string()).describe("Span IDs (or 8-char prefixes) to expand")
    }),
    execute: async ({ span_ids }) => expandTrace(spans, span_ids)
  });
  const grepTraceTool = tool({
    description: "Search across all span attributes, events, and content for a pattern (case-insensitive). Returns matching spans with context.",
    inputSchema: z4.object({
      pattern: z4.string().describe("Search pattern (case-insensitive)")
    }),
    execute: async ({ pattern }) => grepTrace(spans, pattern)
  });
  return {
    expand_trace: expandTraceTool,
    grep_trace: grepTraceTool
  };
}
1802
2055
  var JudgeAgent = class extends JudgeAgentAdapter {
1803
2056
  constructor(cfg) {
1804
2057
  super();
1805
2058
  this.cfg = cfg;
1806
2059
  this.criteria = cfg.criteria ?? [];
1807
2060
  this.spanCollector = cfg.spanCollector ?? judgeSpanCollector;
2061
+ this.tokenThreshold = cfg.tokenThreshold ?? DEFAULT_TOKEN_THRESHOLD;
2062
+ this.maxDiscoverySteps = cfg.maxDiscoverySteps ?? 10;
1808
2063
  }
1809
2064
  logger = new Logger("JudgeAgent");
1810
2065
  spanCollector;
2066
+ tokenThreshold;
2067
+ maxDiscoverySteps;
1811
2068
  role = "Judge" /* JUDGE */;
1812
2069
  criteria;
1813
2070
  /**
@@ -1815,7 +2072,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1815
2072
  */
1816
2073
  invokeLLM = createLLMInvoker(this.logger);
1817
2074
  async call(input) {
1818
- var _a, _b, _c, _d;
2075
+ var _a;
1819
2076
  const criteria = ((_a = input.judgmentRequest) == null ? void 0 : _a.criteria) ?? this.criteria;
1820
2077
  this.logger.debug("call() invoked", {
1821
2078
  threadId: input.threadId,
@@ -1823,8 +2080,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1823
2080
  maxTurns: input.scenarioConfig.maxTurns,
1824
2081
  judgmentRequest: input.judgmentRequest
1825
2082
  });
1826
- const digest = this.getOpenTelemetryTracesDigest(input.threadId);
1827
- this.logger.debug("OpenTelemetry traces built", { digest });
2083
+ const spans = this.spanCollector.getSpansForThread(input.threadId);
2084
+ const { digest, isLargeTrace } = this.buildTraceDigest(spans);
1828
2085
  const transcript = JudgeUtils.buildTranscriptFromMessages(input.messages);
1829
2086
  const contentForJudge = `
1830
2087
  <transcript>
@@ -1847,6 +2104,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1847
2104
  ...cfg
1848
2105
  });
1849
2106
  const tools = {
2107
+ ...isLargeTrace ? buildProgressiveDiscoveryTools(spans) : {},
1850
2108
  continue_test: buildContinueTestTool(),
1851
2109
  finish_test: buildFinishTestTool(criteria)
1852
2110
  };
@@ -1865,26 +2123,75 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1865
2123
  model: mergedConfig.model,
1866
2124
  toolChoice,
1867
2125
  isLastMessage,
1868
- enforceJudgement
2126
+ enforceJudgement,
2127
+ isLargeTrace
1869
2128
  });
1870
- const completion = await this.invokeLLM({
2129
+ const completion = await this.invokeLLMWithDiscovery({
1871
2130
  model: mergedConfig.model,
1872
2131
  messages,
1873
2132
  temperature: mergedConfig.temperature ?? 0,
1874
2133
  maxOutputTokens: mergedConfig.maxTokens,
1875
2134
  tools,
1876
- toolChoice
2135
+ toolChoice,
2136
+ isLargeTrace
2137
+ });
2138
+ return this.parseToolCalls(completion, criteria);
2139
+ }
2140
+ /**
2141
+ * Builds the trace digest, choosing between full inline rendering
2142
+ * and structure-only mode based on estimated token count.
2143
+ */
2144
+ buildTraceDigest(spans) {
2145
+ const fullDigest = judgeSpanDigestFormatter.format(spans);
2146
+ const isLargeTrace = spans.length > 0 && estimateTokens(fullDigest) > this.tokenThreshold;
2147
+ const digest = isLargeTrace ? judgeSpanDigestFormatter.formatStructureOnly(spans) + "\n\nUse expand_trace(span_id) to see span details or grep_trace(pattern) to search across spans. Reference spans by the ID shown in brackets." : fullDigest;
2148
+ this.logger.debug("Trace digest built", {
2149
+ isLargeTrace,
2150
+ estimatedTokens: estimateTokens(fullDigest)
1877
2151
  });
2152
+ return { digest, isLargeTrace };
2153
+ }
2154
+ /**
2155
+ * Invokes the LLM, enabling multi-step tool execution for large traces.
2156
+ * In multi-step mode, the AI SDK loops automatically: the judge can call
2157
+ * expand_trace/grep_trace tools multiple times before reaching a terminal
2158
+ * tool (finish_test/continue_test) or hitting the step limit.
2159
+ *
2160
+ * When the trace is large, toolChoice is relaxed to "required" so the
2161
+ * judge can freely pick discovery tools (expand_trace/grep_trace) before
2162
+ * being forced to a terminal decision.
2163
+ */
2164
+ async invokeLLMWithDiscovery({
2165
+ isLargeTrace,
2166
+ ...params
2167
+ }) {
2168
+ var _a, _b;
2169
+ if (isLargeTrace) {
2170
+ params.toolChoice = "required";
2171
+ params.stopWhen = [
2172
+ stepCountIs(this.maxDiscoverySteps),
2173
+ hasToolCall("finish_test"),
2174
+ hasToolCall("continue_test")
2175
+ ];
2176
+ }
2177
+ const completion = await this.invokeLLM(params);
1878
2178
  this.logger.debug("LLM response received", {
1879
- toolCallCount: ((_b = completion.toolCalls) == null ? void 0 : _b.length) ?? 0,
1880
- toolCalls: (_c = completion.toolCalls) == null ? void 0 : _c.map((tc) => ({
2179
+ toolCallCount: ((_a = completion.toolCalls) == null ? void 0 : _a.length) ?? 0,
2180
+ toolCalls: (_b = completion.toolCalls) == null ? void 0 : _b.map((tc) => ({
1881
2181
  toolName: tc.toolName,
1882
2182
  args: tc.input
1883
2183
  }))
1884
2184
  });
2185
+ return completion;
2186
+ }
2187
+ parseToolCalls(completion, criteria) {
2188
+ var _a;
1885
2189
  let args;
1886
- if ((_d = completion.toolCalls) == null ? void 0 : _d.length) {
1887
- const toolCall = completion.toolCalls[0];
2190
+ if ((_a = completion.toolCalls) == null ? void 0 : _a.length) {
2191
+ const terminalCall = completion.toolCalls.find(
2192
+ (tc) => tc.toolName === "finish_test" || tc.toolName === "continue_test"
2193
+ );
2194
+ const toolCall = terminalCall ?? completion.toolCalls[0];
1888
2195
  switch (toolCall.toolName) {
1889
2196
  case "finish_test": {
1890
2197
  args = toolCall.input;
@@ -1926,11 +2233,6 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1926
2233
  unmetCriteria: criteria
1927
2234
  };
1928
2235
  }
1929
- getOpenTelemetryTracesDigest(threadId) {
1930
- const spans = this.spanCollector.getSpansForThread(threadId);
1931
- const digest = judgeSpanDigestFormatter.format(spans);
1932
- return digest;
1933
- }
1934
2236
  };
1935
2237
  var judgeAgent = (cfg) => {
1936
2238
  return new JudgeAgent(cfg ?? {});
@@ -4677,6 +4979,7 @@ export {
4677
4979
  AgentAdapter,
4678
4980
  AgentRole,
4679
4981
  DEFAULT_MAX_TURNS,
4982
+ DEFAULT_TOKEN_THRESHOLD,
4680
4983
  DEFAULT_VERBOSE,
4681
4984
  JudgeAgentAdapter,
4682
4985
  JudgeSpanCollector,
@@ -4690,7 +4993,10 @@ export {
4690
4993
  allAgentRoles,
4691
4994
  index_default as default,
4692
4995
  defineConfig,
4996
+ estimateTokens,
4997
+ expandTrace,
4693
4998
  fail,
4999
+ grepTrace,
4694
5000
  judge,
4695
5001
  judgeAgent,
4696
5002
  judgeSpanCollector,