@langwatch/scenario 0.4.3 → 0.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -969,6 +969,7 @@ __export(index_exports, {
969
969
  AgentAdapter: () => AgentAdapter,
970
970
  AgentRole: () => AgentRole,
971
971
  DEFAULT_MAX_TURNS: () => DEFAULT_MAX_TURNS,
972
+ DEFAULT_TOKEN_THRESHOLD: () => DEFAULT_TOKEN_THRESHOLD,
972
973
  DEFAULT_VERBOSE: () => DEFAULT_VERBOSE,
973
974
  JudgeAgentAdapter: () => JudgeAgentAdapter,
974
975
  JudgeSpanCollector: () => JudgeSpanCollector,
@@ -982,7 +983,10 @@ __export(index_exports, {
982
983
  allAgentRoles: () => allAgentRoles,
983
984
  default: () => index_default,
984
985
  defineConfig: () => defineConfig,
986
+ estimateTokens: () => estimateTokens,
987
+ expandTrace: () => expandTrace,
985
988
  fail: () => fail,
989
+ grepTrace: () => grepTrace,
986
990
  judge: () => judge,
987
991
  judgeAgent: () => judgeAgent,
988
992
  judgeSpanCollector: () => judgeSpanCollector,
@@ -1004,9 +1008,13 @@ module.exports = __toCommonJS(index_exports);
1004
1008
  // src/agents/index.ts
1005
1009
  var agents_exports = {};
1006
1010
  __export(agents_exports, {
1011
+ DEFAULT_TOKEN_THRESHOLD: () => DEFAULT_TOKEN_THRESHOLD,
1007
1012
  JudgeSpanCollector: () => JudgeSpanCollector,
1008
1013
  JudgeSpanDigestFormatter: () => JudgeSpanDigestFormatter,
1009
1014
  RealtimeAgentAdapter: () => RealtimeAgentAdapter,
1015
+ estimateTokens: () => estimateTokens,
1016
+ expandTrace: () => expandTrace,
1017
+ grepTrace: () => grepTrace,
1010
1018
  judgeAgent: () => judgeAgent,
1011
1019
  judgeSpanCollector: () => judgeSpanCollector,
1012
1020
  judgeSpanDigestFormatter: () => judgeSpanDigestFormatter,
@@ -1086,6 +1094,275 @@ var JudgeUtils = {
1086
1094
  }
1087
1095
  };
1088
1096
 
1097
+ // src/agents/judge/estimate-tokens.ts
1098
/**
 * Default token budget for a trace digest. JudgeAgent falls back to this
 * value when no `tokenThreshold` is configured.
 */
var DEFAULT_TOKEN_THRESHOLD = 8192;

/**
 * Cheap token estimate for a piece of text: the UTF-8 byte length divided
 * by four, rounded up.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count (0 for an empty string).
 */
function estimateTokens(text) {
  const BYTES_PER_TOKEN = 4;
  const utf8 = new TextEncoder().encode(text);
  return Math.ceil(utf8.length / BYTES_PER_TOKEN);
}
1103
+
1104
+ // src/agents/judge/span-utils.ts
1105
+ var import_observability = require("langwatch/observability");
1106
+
1107
+ // src/agents/judge/deep-transform.ts
1108
/**
 * Recursively rewrites a value with `fn`.
 *
 * `fn` is called on the value first. If it returns something other than the
 * value it was given, that result is used as-is and its children are not
 * visited. Otherwise arrays and objects are copied with every child
 * transformed; anything else is returned unchanged.
 *
 * @param {*} value - Value to transform.
 * @param {(v: *) => *} fn - Returns a replacement, or the same value to
 *   signal "no change, keep descending".
 * @returns {*} The transformed copy.
 */
function deepTransform(value, fn) {
  const replaced = fn(value);
  if (replaced !== value) return replaced;
  if (Array.isArray(value)) {
    return value.map((item) => deepTransform(item, fn));
  }
  if (value !== null && typeof value === "object") {
    // Object.fromEntries defines own data properties, so a "__proto__" key
    // (JSON.parse produces it as an own property) is copied as data instead
    // of replacing the prototype of the result.
    return Object.fromEntries(
      Object.entries(value).map(([key, child]) => [key, deepTransform(child, fn)])
    );
  }
  return value;
}
1123
+
1124
+ // src/agents/judge/truncate-media.ts
1125
/**
 * Replaces a base64 image/audio/video data URL with a short placeholder of
 * the form `[CATEGORY: mime/type, ~N bytes]`, where N is the length of the
 * base64 payload. Any other string is returned untouched.
 *
 * @param {string} str - Candidate string.
 * @returns {string} Placeholder or the original string.
 */
function truncateMediaUrl(str) {
  const MEDIA_DATA_URL = /^data:((image|audio|video)\/[a-z0-9+.-]+);base64,(.+)$/i;
  const parts = str.match(MEDIA_DATA_URL);
  if (parts === null) return str;
  const mimeType = parts[1];
  const category = parts[2];
  const payload = parts[3];
  return `[${category.toUpperCase()}: ${mimeType}, ~${payload.length} bytes]`;
}
1133
/**
 * Shrinks a message content part that carries inline media.
 *
 * Two shapes are recognised:
 *  - `{ type: "file", mediaType: string, data: string }` - `data` is replaced
 *    by a placeholder naming the media type and payload length.
 *  - `{ type: "image", image: string }` - `image` is replaced when it is a
 *    base64 image data URL, or a string longer than 1000 characters made up
 *    only of base64 characters.
 *
 * @param {*} v - Candidate value.
 * @returns {object|null} A shallow copy with the payload replaced, or null
 *   when the value is not a recognised media part.
 */
function truncateMediaPart(v) {
  const isObjectLike = v !== null && typeof v === "object" && !Array.isArray(v);
  if (!isObjectLike) return null;
  const part = v;

  if (part.type === "file" && typeof part.mediaType === "string" && typeof part.data === "string") {
    const majorType = part.mediaType.split("/")[0];
    const label = majorType == null ? "FILE" : majorType.toUpperCase();
    return {
      ...part,
      data: `[${label}: ${part.mediaType}, ~${part.data.length} bytes]`
    };
  }

  if (part.type !== "image" || typeof part.image !== "string") return null;

  const raw = part.image;
  const dataUrl = raw.match(/^data:((image)\/[a-z0-9+.-]+);base64,(.+)$/i);
  if (dataUrl) {
    return {
      ...part,
      image: `[IMAGE: ${dataUrl[1]}, ~${dataUrl[3].length} bytes]`
    };
  }

  const looksLikeRawBase64 = raw.length > 1e3 && /^[A-Za-z0-9+/=]+$/.test(raw);
  if (looksLikeRawBase64) {
    return {
      ...part,
      image: `[IMAGE: unknown, ~${raw.length} bytes]`
    };
  }
  return null;
}
1164
+
1165
+ // src/agents/judge/span-utils.ts
1166
/**
 * Converts a `[seconds, nanoseconds]` pair into milliseconds.
 *
 * @param {[number, number]} hrTime - Seconds at index 0, nanoseconds at index 1.
 * @returns {number} Milliseconds.
 */
function hrTimeToMs(hrTime) {
  const seconds = hrTime[0];
  const nanoseconds = hrTime[1];
  return seconds * 1e3 + nanoseconds / 1e6;
}
1169
/**
 * Formats a duration for display: whole milliseconds below one second
 * (`"250ms"`), seconds with two decimals otherwise (`"1.50s"`).
 *
 * The unit is chosen from the rounded millisecond value, so a duration such
 * as 999.6ms is shown as `"1.00s"` rather than `"1000ms"`.
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} Human-readable duration.
 */
function formatDuration(ms) {
  const wholeMs = Math.round(ms);
  if (wholeMs < 1e3) return `${wholeMs}ms`;
  return `${(ms / 1e3).toFixed(2)}s`;
}
1173
/**
 * Returns how long a span lasted, in milliseconds, from its `startTime`
 * and `endTime` pairs.
 *
 * @param {{startTime: [number, number], endTime: [number, number]}} span
 * @returns {number} `endTime - startTime` in milliseconds.
 */
function calculateSpanDuration(span) {
  const { startTime, endTime } = span;
  const endedAtMs = hrTimeToMs(endTime);
  const startedAtMs = hrTimeToMs(startTime);
  return endedAtMs - startedAtMs;
}
1176
/**
 * Builds the error suffix appended to a span's header line.
 *
 * @param {{status: {code: number, message?: string}}} span
 * @returns {string} A warning marker with the status message (or "unknown")
 *   when the status code is 2, otherwise an empty string.
 */
function getStatusIndicator(span) {
  // Status code 2 is the value this module reports as an error.
  const ERROR_STATUS_CODE = 2;
  const { code, message } = span.status;
  return code === ERROR_STATUS_CODE ? ` \u26A0\uFE0F ERROR: ${message ?? "unknown"}` : "";
}
1182
/**
 * Builds the ", N tokens" suffix for a span from its
 * `gen_ai.usage.input_tokens` and `gen_ai.usage.output_tokens` attributes.
 *
 * @param {{attributes: Object}} span
 * @returns {string} Empty string when neither attribute is set; otherwise
 *   the summed count, with non-numeric values counted as 0.
 */
function getTokenUsage(span) {
  const { attributes } = span;
  const inputTokens = attributes["gen_ai.usage.input_tokens"];
  const outputTokens = attributes["gen_ai.usage.output_tokens"];
  if (inputTokens == null && outputTokens == null) return "";
  const asCount = (raw) => Number(raw) || 0;
  return `, ${asCount(inputTokens) + asCount(outputTokens)} tokens`;
}
1189
/**
 * Prepares span or event attributes for display.
 *
 * The thread-id, scenario-id and scenario-name attributes are dropped, and
 * a leading "langwatch." is stripped from the remaining keys. If two keys
 * collapse to the same name, the first one encountered is kept.
 *
 * @param {Object} attrs - Raw attribute map.
 * @returns {Object} New object with cleaned keys.
 */
function cleanAttributes(attrs) {
  const hiddenKeys = /* @__PURE__ */ new Set([
    import_observability.attributes.ATTR_LANGWATCH_THREAD_ID,
    "langwatch.scenario.id",
    "langwatch.scenario.name"
  ]);
  const result = {};
  const usedKeys = /* @__PURE__ */ new Set();
  for (const rawKey of Object.keys(attrs)) {
    if (hiddenKeys.has(rawKey)) continue;
    const shortKey = rawKey.replace(/^(langwatch)\./, "");
    if (usedKeys.has(shortKey)) continue;
    usedKeys.add(shortKey);
    result[shortKey] = attrs[rawKey];
  }
  return result;
}
1209
/**
 * Renders an attribute value as text: the value is passed through
 * `transformValue`, then returned directly if it is a string or serialised
 * with `JSON.stringify` otherwise.
 *
 * @param {*} value - Attribute value.
 * @returns {string} Display text.
 */
function formatValue(value) {
  const safeValue = transformValue(value);
  if (typeof safeValue === "string") return safeValue;
  return JSON.stringify(safeValue);
}
1213
/**
 * Deeply rewrites an attribute value so that inline media is replaced by
 * short placeholders.
 *
 * Media content parts and media data URLs are truncated. Strings that look
 * like JSON are parsed, transformed recursively and re-serialised; if
 * parsing fails the string is kept as it is.
 *
 * @param {*} value - Attribute value (possibly nested).
 * @returns {*} Transformed copy.
 */
function transformValue(value) {
  const rewriteString = (text) => {
    const shortened = truncateMediaUrl(text);
    if (shortened !== text) return shortened;
    if (!looksLikeJson(text)) return text;
    try {
      return JSON.stringify(transformValue(JSON.parse(text)));
    } catch {
      // Not valid JSON after all: keep the raw string.
      return text;
    }
  };
  return deepTransform(value, (node) => {
    const mediaPart = truncateMediaPart(node);
    if (mediaPart) return mediaPart;
    return typeof node === "string" ? rewriteString(node) : node;
  });
}
1230
/**
 * Cheap check for whether a string might hold a JSON object or array:
 * after trimming, it must be wrapped in `{...}` or `[...]`.
 *
 * @param {string} str - Candidate string.
 * @returns {boolean} True when the string has matching outer brackets.
 */
function looksLikeJson(str) {
  const trimmed = str.trim();
  const isWrappedIn = (open, close) => trimmed.startsWith(open) && trimmed.endsWith(close);
  return isWrappedIn("{", "}") || isWrappedIn("[", "]");
}
1234
/**
 * Wraps spans in lookup nodes ordered by start time.
 *
 * @param {Array} spans - Spans to index; the input is not reordered.
 * @returns {Array<{span: object, children: Array, shortId: string}>} One node
 *   per span, where `shortId` is the first 8 characters of the span ID and
 *   `children` starts out empty.
 */
function indexSpans(spans) {
  const byStartTime = Array.from(spans);
  byStartTime.sort((left, right) => hrTimeToMs(left.startTime) - hrTimeToMs(right.startTime));
  const nodes = [];
  for (const span of byStartTime) {
    const shortId = span.spanContext().spanId.slice(0, 8);
    nodes.push({ span, children: [], shortId });
  }
  return nodes;
}
1244
+
1245
+ // src/agents/judge/trace-tools.ts
1246
+ var TOOL_RESULT_TOKEN_BUDGET = 4096;
1247
+ var TOOL_RESULT_CHAR_BUDGET = TOOL_RESULT_TOKEN_BUDGET * 4;
1248
+ var MAX_GREP_MATCHES = 20;
1249
/**
 * Renders one span in full detail as a list of text lines: a header with
 * short ID, ISO start time, name, duration and error status, followed by
 * the cleaned attributes and then each event with its cleaned attributes.
 *
 * @param {{span: object, shortId: string}} node - Indexed span node.
 * @returns {string[]} Lines describing the span.
 */
function renderFullSpanNode(node) {
  const { span, shortId } = node;
  const elapsed = formatDuration(calculateSpanDuration(span));
  const startedAt = new Date(hrTimeToMs(span.startTime)).toISOString();
  const output = [
    `[${shortId}] ${startedAt} ${span.name} (${elapsed})${getStatusIndicator(span)}`
  ];

  for (const [key, value] of Object.entries(cleanAttributes(span.attributes))) {
    output.push(` ${key}: ${formatValue(value)}`);
  }

  for (const event of span.events) {
    output.push(` [event] ${event.name}`);
    if (!event.attributes) continue;
    for (const [key, value] of Object.entries(cleanAttributes(event.attributes))) {
      output.push(` ${key}: ${formatValue(value)}`);
    }
  }
  return output;
}
1277
/**
 * Caps tool output at `TOOL_RESULT_CHAR_BUDGET` UTF-16 code units and
 * appends a notice telling the caller how to narrow the request.
 *
 * The cut never lands in the middle of a surrogate pair: if the last kept
 * code unit is a high surrogate it is dropped too, so the result does not
 * contain half of an astral character (e.g. an emoji).
 *
 * @param {string} text - Full tool output.
 * @returns {string} The text unchanged when it fits, otherwise the
 *   truncated text followed by the notice.
 */
function truncateToCharBudget(text) {
  if (text.length <= TOOL_RESULT_CHAR_BUDGET) return text;
  let cutAt = TOOL_RESULT_CHAR_BUDGET;
  const lastKept = text.charCodeAt(cutAt - 1);
  const isHighSurrogate = lastKept >= 0xd800 && lastKept <= 0xdbff;
  if (isHighSurrogate) cutAt -= 1;
  return text.slice(0, cutAt) + "\n\n[TRUNCATED] Output exceeded ~4000 token budget. Use grep_trace(pattern) to search for specific content, or expand_trace with fewer span IDs.";
}
1282
/**
 * Flattens a span into newline-separated text for searching: the span
 * name, one `key: value` line per cleaned attribute, then for every event
 * its name followed by its cleaned attributes.
 *
 * @param {object} span - Span with `name`, `attributes` and `events`.
 * @returns {string} Searchable text.
 */
function spanToSearchableText(span) {
  const describe = (rawAttributes) =>
    Object.entries(cleanAttributes(rawAttributes)).map(
      ([key, value]) => `${key}: ${formatValue(value)}`
    );

  const segments = [span.name];
  for (const line of describe(span.attributes)) segments.push(line);
  for (const event of span.events) {
    segments.push(event.name);
    if (!event.attributes) continue;
    for (const line of describe(event.attributes)) segments.push(line);
  }
  return segments.join("\n");
}
1299
/**
 * Renders the full details of the spans whose ID starts with one of the
 * given prefixes.
 *
 * Blank prefixes (empty or whitespace-only) are ignored, because an empty
 * string is a prefix of every span ID and would expand the whole trace.
 * Surrounding whitespace on a prefix is trimmed before matching.
 *
 * @param {Array} spans - All spans of the trace.
 * @param {string[]} spanIds - Full span IDs or ID prefixes to expand.
 * @returns {string} The rendered spans (capped to the tool-result budget),
 *   or a message when there are no spans, no usable IDs, or no match.
 */
function expandTrace(spans, spanIds) {
  const nodes = indexSpans(spans);
  if (nodes.length === 0) {
    return "No spans recorded.";
  }
  const prefixes = spanIds
    .map((id) => String(id ?? "").trim())
    .filter((id) => id !== "");
  if (prefixes.length === 0) {
    return "Error: provide at least one span ID.";
  }
  const selected = nodes.filter((n) => {
    const fullId = n.span.spanContext().spanId;
    return prefixes.some((prefix) => fullId.startsWith(prefix));
  });
  if (selected.length === 0) {
    const available = nodes.map((n) => n.shortId).join(", ");
    return `Error: no spans matched the given ID(s). Available span IDs: ${available}`;
  }
  const lines = [];
  for (const node of selected) {
    for (const line of renderFullSpanNode(node)) lines.push(line);
    // Blank separator between spans; the trailing one is trimmed below.
    lines.push("");
  }
  return truncateToCharBudget(lines.join("\n").trimEnd());
}
1323
/**
 * Case-insensitive literal search over every span's name, attributes and
 * events.
 *
 * @param {Array} spans - All spans of the trace.
 * @param {string} pattern - Text to look for; regex metacharacters are
 *   escaped, so it is matched literally.
 * @returns {string} For each matching span (up to `MAX_GREP_MATCHES`) a
 *   header and the matching lines, plus a note when further spans were
 *   omitted. Returns a message listing the span names when nothing matches.
 */
function grepTrace(spans, pattern) {
  const nodes = indexSpans(spans);
  if (nodes.length === 0) {
    return "No spans recorded.";
  }

  const matcher = new RegExp(escapeRegex(pattern), "i");
  const hits = nodes
    .map((node) => ({
      node,
      matchingLines: spanToSearchableText(node.span)
        .split("\n")
        .filter((line) => matcher.test(line))
    }))
    .filter((hit) => hit.matchingLines.length > 0);

  if (hits.length === 0) {
    const uniqueNames = [...new Set(nodes.map((n) => n.span.name))];
    return `No matches found for "${pattern}". Available span names: ${uniqueNames.join(", ")}`;
  }

  const output = [];
  for (const { node, matchingLines } of hits.slice(0, MAX_GREP_MATCHES)) {
    const elapsed = formatDuration(calculateSpanDuration(node.span));
    output.push(`--- [${node.shortId}] ${node.span.name} (${elapsed}) ---`);
    for (const line of matchingLines) {
      output.push(` ${line}`);
    }
    output.push("");
  }

  const omitted = hits.length - MAX_GREP_MATCHES;
  if (omitted > 0) {
    output.push(
      `[${omitted} more matches omitted. Refine your search pattern for more specific results.]`
    );
  }
  return truncateToCharBudget(output.join("\n").trimEnd());
}
1362
/**
 * Escapes regular-expression metacharacters so the string can be embedded
 * in a `RegExp` and matched literally.
 *
 * @param {string} str - Raw text.
 * @returns {string} Text with each metacharacter prefixed by a backslash.
 */
function escapeRegex(str) {
  const REGEX_METACHARACTERS = /[.*+?^${}()|[\]\\]/g;
  return str.replace(REGEX_METACHARACTERS, (metachar) => `\\${metachar}`);
}
1365
+
1089
1366
  // src/config/env.ts
1090
1367
  var import_v4 = require("zod/v4");
1091
1368
 
@@ -1441,7 +1718,7 @@ var criterionToParamName = (criterion) => {
1441
1718
  };
1442
1719
 
1443
1720
  // src/agents/judge/judge-span-collector.ts
1444
- var import_observability = require("langwatch/observability");
1721
+ var import_observability2 = require("langwatch/observability");
1445
1722
  var JudgeSpanCollector = class {
1446
1723
  spans = [];
1447
1724
  onStart() {
@@ -1484,7 +1761,7 @@ var JudgeSpanCollector = class {
1484
1761
  const spanId = span.spanContext().spanId;
1485
1762
  if (visited.has(spanId)) return false;
1486
1763
  visited.add(spanId);
1487
- if (span.attributes[import_observability.attributes.ATTR_LANGWATCH_THREAD_ID] === threadId) {
1764
+ if (span.attributes[import_observability2.attributes.ATTR_LANGWATCH_THREAD_ID] === threadId) {
1488
1765
  return true;
1489
1766
  }
1490
1767
  const parentId = getParentSpanId(span);
@@ -1503,26 +1780,6 @@ function getParentSpanId(span) {
1503
1780
  }
1504
1781
  var judgeSpanCollector = new JudgeSpanCollector();
1505
1782
 
1506
- // src/agents/judge/judge-span-digest-formatter.ts
1507
- var import_observability2 = require("langwatch/observability");
1508
-
1509
- // src/agents/judge/deep-transform.ts
1510
- function deepTransform(value, fn) {
1511
- const result = fn(value);
1512
- if (result !== value) return result;
1513
- if (Array.isArray(value)) {
1514
- return value.map((v) => deepTransform(v, fn));
1515
- }
1516
- if (value !== null && typeof value === "object") {
1517
- const out = {};
1518
- for (const [k, v] of Object.entries(value)) {
1519
- out[k] = deepTransform(v, fn);
1520
- }
1521
- return out;
1522
- }
1523
- return value;
1524
- }
1525
-
1526
1783
  // src/agents/judge/string-deduplicator.ts
1527
1784
  var StringDeduplicator = class {
1528
1785
  seen = /* @__PURE__ */ new Map();
@@ -1556,51 +1813,49 @@ var StringDeduplicator = class {
1556
1813
  }
1557
1814
  };
1558
1815
 
1559
- // src/agents/judge/truncate-media.ts
1560
- function truncateMediaUrl(str) {
1561
- const match = str.match(
1562
- /^data:((image|audio|video)\/[a-z0-9+.-]+);base64,(.+)$/i
1563
- );
1564
- if (!match) return str;
1565
- const [, mimeType, category, data] = match;
1566
- return `[${category.toUpperCase()}: ${mimeType}, ~${data.length} bytes]`;
1567
- }
1568
- function truncateMediaPart(v) {
1569
- var _a;
1570
- if (v === null || typeof v !== "object" || Array.isArray(v)) return null;
1571
- const obj = v;
1572
- if (obj.type === "file" && typeof obj.mediaType === "string" && typeof obj.data === "string") {
1573
- const category = ((_a = obj.mediaType.split("/")[0]) == null ? void 0 : _a.toUpperCase()) ?? "FILE";
1574
- return {
1575
- ...obj,
1576
- data: `[${category}: ${obj.mediaType}, ~${obj.data.length} bytes]`
1577
- };
1578
- }
1579
- if (obj.type === "image" && typeof obj.image === "string") {
1580
- const imageData = obj.image;
1581
- const dataUrlMatch = imageData.match(
1582
- /^data:((image)\/[a-z0-9+.-]+);base64,(.+)$/i
1583
- );
1584
- if (dataUrlMatch) {
1585
- return {
1586
- ...obj,
1587
- image: `[IMAGE: ${dataUrlMatch[1]}, ~${dataUrlMatch[3].length} bytes]`
1588
- };
1589
- }
1590
- if (imageData.length > 1e3 && /^[A-Za-z0-9+/=]+$/.test(imageData)) {
1591
- return {
1592
- ...obj,
1593
- image: `[IMAGE: unknown, ~${imageData.length} bytes]`
1594
- };
1595
- }
1596
- }
1597
- return null;
1598
- }
1599
-
1600
1816
  // src/agents/judge/judge-span-digest-formatter.ts
1601
1817
  var JudgeSpanDigestFormatter = class {
1602
1818
  logger = new Logger("JudgeSpanDigestFormatter");
1603
1819
  deduplicator = new StringDeduplicator({ threshold: 50 });
1820
+ /**
1821
+ * Formats spans into a structure-only digest showing span tree hierarchy
1822
+ * without attributes, events, or content. Used for large traces that
1823
+ * exceed the token threshold, paired with expand_trace/grep_trace tools.
1824
+ *
1825
+ * @param spans - All spans for a thread
1826
+ * @returns Plain text digest with only structural information
1827
+ */
1828
+ formatStructureOnly(spans) {
1829
+ this.logger.debug("formatStructureOnly() called", {
1830
+ spanCount: spans.length
1831
+ });
1832
+ if (spans.length === 0) {
1833
+ return "No spans recorded.";
1834
+ }
1835
+ const sortedSpans = this.sortByStartTime(spans);
1836
+ const tree = this.buildHierarchy(sortedSpans);
1837
+ const totalDuration = this.calculateTotalDuration(sortedSpans);
1838
+ const lines = [
1839
+ `Spans: ${spans.length} | Total Duration: ${formatDuration(totalDuration)}`,
1840
+ ""
1841
+ ];
1842
+ const rootCount = tree.length;
1843
+ tree.forEach((node, idx) => {
1844
+ this.renderStructureNode(
1845
+ node,
1846
+ lines,
1847
+ 0,
1848
+ idx === rootCount - 1
1849
+ );
1850
+ });
1851
+ const errors = this.collectErrors(spans);
1852
+ if (errors.length > 0) {
1853
+ lines.push("");
1854
+ lines.push("=== ERRORS ===");
1855
+ errors.forEach((e) => lines.push(e));
1856
+ }
1857
+ return lines.join("\n");
1858
+ }
1604
1859
  /**
1605
1860
  * Formats spans into a complete digest with full content and nesting.
1606
1861
  * @param spans - All spans for a thread
@@ -1624,19 +1879,17 @@ var JudgeSpanDigestFormatter = class {
1624
1879
  totalDuration
1625
1880
  });
1626
1881
  const lines = [
1627
- `Spans: ${spans.length} | Total Duration: ${this.formatDuration(
1882
+ `Spans: ${spans.length} | Total Duration: ${formatDuration(
1628
1883
  totalDuration
1629
1884
  )}`,
1630
1885
  ""
1631
1886
  ];
1632
- let sequence = 1;
1633
1887
  const rootCount = tree.length;
1634
1888
  tree.forEach((node, idx) => {
1635
- sequence = this.renderNode(
1889
+ this.renderNode(
1636
1890
  node,
1637
1891
  lines,
1638
1892
  0,
1639
- sequence,
1640
1893
  idx === rootCount - 1
1641
1894
  );
1642
1895
  });
@@ -1650,9 +1903,7 @@ var JudgeSpanDigestFormatter = class {
1650
1903
  }
1651
1904
  sortByStartTime(spans) {
1652
1905
  return [...spans].sort((a, b) => {
1653
- const aTime = this.hrTimeToMs(a.startTime);
1654
- const bTime = this.hrTimeToMs(b.startTime);
1655
- return aTime - bTime;
1906
+ return hrTimeToMs(a.startTime) - hrTimeToMs(b.startTime);
1656
1907
  });
1657
1908
  }
1658
1909
  buildHierarchy(spans) {
@@ -1672,46 +1923,66 @@ var JudgeSpanDigestFormatter = class {
1672
1923
  }
1673
1924
  return roots;
1674
1925
  }
1675
- renderNode(node, lines, depth, sequence, isLast = true) {
1926
+ renderStructureNode(node, lines, depth, isLast = true) {
1676
1927
  const span = node.span;
1677
- const duration = this.calculateSpanDuration(span);
1678
- const timestamp = this.formatTimestamp(span.startTime);
1679
- const status = this.getStatusIndicator(span);
1928
+ const shortId = span.spanContext().spanId.slice(0, 8);
1929
+ const duration = calculateSpanDuration(span);
1930
+ const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
1931
+ const status = getStatusIndicator(span);
1932
+ const tokens = getTokenUsage(span);
1680
1933
  const prefix = this.getTreePrefix(depth, isLast);
1681
1934
  lines.push(
1682
- `${prefix}[${sequence}] ${new Date(timestamp).toISOString()} ${span.name} (${this.formatDuration(duration)})${status}`
1935
+ `${prefix}[${shortId}] ${timestamp} ${span.name} (${formatDuration(duration)}${tokens})${status}`
1936
+ );
1937
+ lines.push("");
1938
+ const childCount = node.children.length;
1939
+ node.children.forEach((child, idx) => {
1940
+ this.renderStructureNode(
1941
+ child,
1942
+ lines,
1943
+ depth + 1,
1944
+ idx === childCount - 1
1945
+ );
1946
+ });
1947
+ }
1948
+ renderNode(node, lines, depth, isLast = true) {
1949
+ const span = node.span;
1950
+ const shortId = span.spanContext().spanId.slice(0, 8);
1951
+ const duration = calculateSpanDuration(span);
1952
+ const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
1953
+ const status = getStatusIndicator(span);
1954
+ const prefix = this.getTreePrefix(depth, isLast);
1955
+ lines.push(
1956
+ `${prefix}[${shortId}] ${timestamp} ${span.name} (${formatDuration(duration)})${status}`
1683
1957
  );
1684
1958
  const attrIndent = this.getAttrIndent(depth, isLast);
1685
- const attrs = this.cleanAttributes(span.attributes);
1959
+ const attrs = cleanAttributes(span.attributes);
1686
1960
  if (Object.keys(attrs).length > 0) {
1687
1961
  for (const [key, value] of Object.entries(attrs)) {
1688
- lines.push(`${attrIndent}${key}: ${this.formatValue(value)}`);
1962
+ lines.push(`${attrIndent}${key}: ${this.formatValueWithDedup(value)}`);
1689
1963
  }
1690
1964
  }
1691
1965
  if (span.events.length > 0) {
1692
1966
  for (const event of span.events) {
1693
1967
  lines.push(`${attrIndent}[event] ${event.name}`);
1694
1968
  if (event.attributes) {
1695
- const eventAttrs = this.cleanAttributes(event.attributes);
1969
+ const eventAttrs = cleanAttributes(event.attributes);
1696
1970
  for (const [key, value] of Object.entries(eventAttrs)) {
1697
- lines.push(`${attrIndent} ${key}: ${this.formatValue(value)}`);
1971
+ lines.push(`${attrIndent} ${key}: ${this.formatValueWithDedup(value)}`);
1698
1972
  }
1699
1973
  }
1700
1974
  }
1701
1975
  }
1702
1976
  lines.push("");
1703
- let nextSeq = sequence + 1;
1704
1977
  const childCount = node.children.length;
1705
1978
  node.children.forEach((child, idx) => {
1706
- nextSeq = this.renderNode(
1979
+ this.renderNode(
1707
1980
  child,
1708
1981
  lines,
1709
1982
  depth + 1,
1710
- nextSeq,
1711
1983
  idx === childCount - 1
1712
1984
  );
1713
1985
  });
1714
- return nextSeq;
1715
1986
  }
1716
1987
  getTreePrefix(depth, isLast) {
1717
1988
  if (depth === 0) return "";
@@ -1723,42 +1994,26 @@ var JudgeSpanDigestFormatter = class {
1723
1994
  const continuation = isLast ? " " : "\u2502 ";
1724
1995
  return "\u2502 ".repeat(depth - 1) + continuation + " ";
1725
1996
  }
1726
- cleanAttributes(attrs) {
1727
- const cleaned = {};
1728
- const seen = /* @__PURE__ */ new Set();
1729
- const excludedKeys = [
1730
- import_observability2.attributes.ATTR_LANGWATCH_THREAD_ID,
1731
- "langwatch.scenario.id",
1732
- "langwatch.scenario.name"
1733
- ];
1734
- for (const [key, value] of Object.entries(attrs)) {
1735
- if (excludedKeys.includes(key)) {
1736
- continue;
1737
- }
1738
- const cleanKey = key.replace(/^(langwatch)\./, "");
1739
- if (!seen.has(cleanKey)) {
1740
- seen.add(cleanKey);
1741
- cleaned[cleanKey] = value;
1742
- }
1743
- }
1744
- return cleaned;
1745
- }
1746
- formatValue(value) {
1747
- const processed = this.transformValue(value);
1997
+ /**
1998
+ * Formats a value with deduplication applied. Used by the `format()` method
1999
+ * to reduce token usage by replacing repeated strings with markers.
2000
+ */
2001
+ formatValueWithDedup(value) {
2002
+ const processed = this.transformValueWithDedup(value);
1748
2003
  return typeof processed === "string" ? processed : JSON.stringify(processed);
1749
2004
  }
1750
- transformValue(value) {
2005
+ transformValueWithDedup(value) {
1751
2006
  return deepTransform(value, (v) => {
1752
2007
  const mediaPart = truncateMediaPart(v);
1753
2008
  if (mediaPart) return mediaPart;
1754
2009
  if (typeof v !== "string") return v;
1755
- return this.transformString(v);
2010
+ return this.transformStringWithDedup(v);
1756
2011
  });
1757
2012
  }
1758
- transformString(str) {
1759
- if (this.looksLikeJson(str)) {
2013
+ transformStringWithDedup(str) {
2014
+ if (looksLikeJson(str)) {
1760
2015
  try {
1761
- const processed = this.transformValue(JSON.parse(str));
2016
+ const processed = this.transformValueWithDedup(JSON.parse(str));
1762
2017
  return JSON.stringify(processed);
1763
2018
  } catch {
1764
2019
  }
@@ -1767,36 +2022,12 @@ var JudgeSpanDigestFormatter = class {
1767
2022
  if (truncated !== str) return truncated;
1768
2023
  return this.deduplicator.process(str);
1769
2024
  }
1770
- looksLikeJson(str) {
1771
- const t = str.trim();
1772
- return t.startsWith("{") && t.endsWith("}") || t.startsWith("[") && t.endsWith("]");
1773
- }
1774
- hrTimeToMs(hrTime) {
1775
- return hrTime[0] * 1e3 + hrTime[1] / 1e6;
1776
- }
1777
- calculateSpanDuration(span) {
1778
- return this.hrTimeToMs(span.endTime) - this.hrTimeToMs(span.startTime);
1779
- }
1780
2025
  calculateTotalDuration(spans) {
1781
2026
  if (spans.length === 0) return 0;
1782
- const first = this.hrTimeToMs(spans[0].startTime);
1783
- const last = Math.max(...spans.map((s) => this.hrTimeToMs(s.endTime)));
2027
+ const first = hrTimeToMs(spans[0].startTime);
2028
+ const last = Math.max(...spans.map((s) => hrTimeToMs(s.endTime)));
1784
2029
  return last - first;
1785
2030
  }
1786
- formatDuration(ms) {
1787
- if (ms < 1e3) return `${Math.round(ms)}ms`;
1788
- return `${(ms / 1e3).toFixed(2)}s`;
1789
- }
1790
- formatTimestamp(hrTime) {
1791
- const ms = this.hrTimeToMs(hrTime);
1792
- return new Date(ms).toISOString();
1793
- }
1794
- getStatusIndicator(span) {
1795
- if (span.status.code === 2) {
1796
- return ` \u26A0\uFE0F ERROR: ${span.status.message ?? "unknown"}`;
1797
- }
1798
- return "";
1799
- }
1800
2031
  collectErrors(spans) {
1801
2032
  return spans.filter((s) => s.status.code === 2).map((s) => `- ${s.name}: ${s.status.message ?? "unknown error"}`);
1802
2033
  }
@@ -1859,15 +2090,41 @@ function buildFinishTestTool(criteria) {
1859
2090
  })
1860
2091
  });
1861
2092
  }
2093
/**
 * Builds the two trace-exploration tools offered to the judge:
 * `expand_trace`, which shows full details for selected spans, and
 * `grep_trace`, which searches all spans for a pattern. Both operate on
 * the spans passed in here.
 *
 * @param {Array} spans - Spans the tools will read from.
 * @returns {{expand_trace: object, grep_trace: object}} Tool definitions
 *   keyed by tool name.
 */
function buildProgressiveDiscoveryTools(spans) {
  const { z } = import_v44;
  const defineTool = import_ai2.tool;

  const expandTool = defineTool({
    description: "Expand one or more spans to see their full details (attributes, events, content). Use the span ID shown in brackets in the trace skeleton.",
    inputSchema: z.object({
      span_ids: z.array(z.string()).describe("Span IDs (or 8-char prefixes) to expand")
    }),
    execute: async ({ span_ids }) => expandTrace(spans, span_ids)
  });

  const grepTool = defineTool({
    description: "Search across all span attributes, events, and content for a pattern (case-insensitive). Returns matching spans with context.",
    inputSchema: z.object({
      pattern: z.string().describe("Search pattern (case-insensitive)")
    }),
    execute: async ({ pattern }) => grepTrace(spans, pattern)
  });

  return { expand_trace: expandTool, grep_trace: grepTool };
}
1862
2115
  var JudgeAgent = class extends JudgeAgentAdapter {
1863
2116
  constructor(cfg) {
1864
2117
  super();
1865
2118
  this.cfg = cfg;
1866
2119
  this.criteria = cfg.criteria ?? [];
1867
2120
  this.spanCollector = cfg.spanCollector ?? judgeSpanCollector;
2121
+ this.tokenThreshold = cfg.tokenThreshold ?? DEFAULT_TOKEN_THRESHOLD;
2122
+ this.maxDiscoverySteps = cfg.maxDiscoverySteps ?? 10;
1868
2123
  }
1869
2124
  logger = new Logger("JudgeAgent");
1870
2125
  spanCollector;
2126
+ tokenThreshold;
2127
+ maxDiscoverySteps;
1871
2128
  role = "Judge" /* JUDGE */;
1872
2129
  criteria;
1873
2130
  /**
@@ -1875,7 +2132,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1875
2132
  */
1876
2133
  invokeLLM = createLLMInvoker(this.logger);
1877
2134
  async call(input) {
1878
- var _a, _b, _c, _d;
2135
+ var _a;
1879
2136
  const criteria = ((_a = input.judgmentRequest) == null ? void 0 : _a.criteria) ?? this.criteria;
1880
2137
  this.logger.debug("call() invoked", {
1881
2138
  threadId: input.threadId,
@@ -1883,8 +2140,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1883
2140
  maxTurns: input.scenarioConfig.maxTurns,
1884
2141
  judgmentRequest: input.judgmentRequest
1885
2142
  });
1886
- const digest = this.getOpenTelemetryTracesDigest(input.threadId);
1887
- this.logger.debug("OpenTelemetry traces built", { digest });
2143
+ const spans = this.spanCollector.getSpansForThread(input.threadId);
2144
+ const { digest, isLargeTrace } = this.buildTraceDigest(spans);
1888
2145
  const transcript = JudgeUtils.buildTranscriptFromMessages(input.messages);
1889
2146
  const contentForJudge = `
1890
2147
  <transcript>
@@ -1907,6 +2164,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1907
2164
  ...cfg
1908
2165
  });
1909
2166
  const tools = {
2167
+ ...isLargeTrace ? buildProgressiveDiscoveryTools(spans) : {},
1910
2168
  continue_test: buildContinueTestTool(),
1911
2169
  finish_test: buildFinishTestTool(criteria)
1912
2170
  };
@@ -1925,26 +2183,75 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1925
2183
  model: mergedConfig.model,
1926
2184
  toolChoice,
1927
2185
  isLastMessage,
1928
- enforceJudgement
2186
+ enforceJudgement,
2187
+ isLargeTrace
1929
2188
  });
1930
- const completion = await this.invokeLLM({
2189
+ const completion = await this.invokeLLMWithDiscovery({
1931
2190
  model: mergedConfig.model,
1932
2191
  messages,
1933
2192
  temperature: mergedConfig.temperature ?? 0,
1934
2193
  maxOutputTokens: mergedConfig.maxTokens,
1935
2194
  tools,
1936
- toolChoice
2195
+ toolChoice,
2196
+ isLargeTrace
2197
+ });
2198
+ return this.parseToolCalls(completion, criteria);
2199
+ }
2200
+ /**
2201
+ * Builds the trace digest, choosing between full inline rendering
2202
+ * and structure-only mode based on estimated token count.
2203
+ */
2204
+ buildTraceDigest(spans) {
2205
+ const fullDigest = judgeSpanDigestFormatter.format(spans);
2206
+ const isLargeTrace = spans.length > 0 && estimateTokens(fullDigest) > this.tokenThreshold;
2207
+ const digest = isLargeTrace ? judgeSpanDigestFormatter.formatStructureOnly(spans) + "\n\nUse expand_trace(span_id) to see span details or grep_trace(pattern) to search across spans. Reference spans by the ID shown in brackets." : fullDigest;
2208
+ this.logger.debug("Trace digest built", {
2209
+ isLargeTrace,
2210
+ estimatedTokens: estimateTokens(fullDigest)
1937
2211
  });
2212
+ return { digest, isLargeTrace };
2213
+ }
2214
+ /**
2215
+ * Invokes the LLM, enabling multi-step tool execution for large traces.
2216
+ * In multi-step mode, the AI SDK loops automatically: the judge can call
2217
+ * expand_trace/grep_trace tools multiple times before reaching a terminal
2218
+ * tool (finish_test/continue_test) or hitting the step limit.
2219
+ *
2220
+ * When the trace is large, toolChoice is relaxed to "required" so the
2221
+ * judge can freely pick discovery tools (expand_trace/grep_trace) before
2222
+ * being forced to a terminal decision.
2223
+ */
2224
+ async invokeLLMWithDiscovery({
2225
+ isLargeTrace,
2226
+ ...params
2227
+ }) {
2228
+ var _a, _b;
2229
+ if (isLargeTrace) {
2230
+ params.toolChoice = "required";
2231
+ params.stopWhen = [
2232
+ (0, import_ai2.stepCountIs)(this.maxDiscoverySteps),
2233
+ (0, import_ai2.hasToolCall)("finish_test"),
2234
+ (0, import_ai2.hasToolCall)("continue_test")
2235
+ ];
2236
+ }
2237
+ const completion = await this.invokeLLM(params);
1938
2238
  this.logger.debug("LLM response received", {
1939
- toolCallCount: ((_b = completion.toolCalls) == null ? void 0 : _b.length) ?? 0,
1940
- toolCalls: (_c = completion.toolCalls) == null ? void 0 : _c.map((tc) => ({
2239
+ toolCallCount: ((_a = completion.toolCalls) == null ? void 0 : _a.length) ?? 0,
2240
+ toolCalls: (_b = completion.toolCalls) == null ? void 0 : _b.map((tc) => ({
1941
2241
  toolName: tc.toolName,
1942
2242
  args: tc.input
1943
2243
  }))
1944
2244
  });
2245
+ return completion;
2246
+ }
2247
+ parseToolCalls(completion, criteria) {
2248
+ var _a;
1945
2249
  let args;
1946
- if ((_d = completion.toolCalls) == null ? void 0 : _d.length) {
1947
- const toolCall = completion.toolCalls[0];
2250
+ if ((_a = completion.toolCalls) == null ? void 0 : _a.length) {
2251
+ const terminalCall = completion.toolCalls.find(
2252
+ (tc) => tc.toolName === "finish_test" || tc.toolName === "continue_test"
2253
+ );
2254
+ const toolCall = terminalCall ?? completion.toolCalls[0];
1948
2255
  switch (toolCall.toolName) {
1949
2256
  case "finish_test": {
1950
2257
  args = toolCall.input;
@@ -1986,11 +2293,6 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1986
2293
  unmetCriteria: criteria
1987
2294
  };
1988
2295
  }
1989
- getOpenTelemetryTracesDigest(threadId) {
1990
- const spans = this.spanCollector.getSpansForThread(threadId);
1991
- const digest = judgeSpanDigestFormatter.format(spans);
1992
- return digest;
1993
- }
1994
2296
  };
1995
2297
  var judgeAgent = (cfg) => {
1996
2298
  return new JudgeAgent(cfg ?? {});
@@ -4731,6 +5033,7 @@ var index_default = scenario;
4731
5033
  AgentAdapter,
4732
5034
  AgentRole,
4733
5035
  DEFAULT_MAX_TURNS,
5036
+ DEFAULT_TOKEN_THRESHOLD,
4734
5037
  DEFAULT_VERBOSE,
4735
5038
  JudgeAgentAdapter,
4736
5039
  JudgeSpanCollector,
@@ -4743,7 +5046,10 @@ var index_default = scenario;
4743
5046
  agent,
4744
5047
  allAgentRoles,
4745
5048
  defineConfig,
5049
+ estimateTokens,
5050
+ expandTrace,
4746
5051
  fail,
5052
+ grepTrace,
4747
5053
  judge,
4748
5054
  judgeAgent,
4749
5055
  judgeSpanCollector,