@langwatch/scenario 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -944,9 +944,13 @@ var init_esm = __esm({
944
944
  // src/agents/index.ts
945
945
  var agents_exports = {};
946
946
  __export(agents_exports, {
947
+ DEFAULT_TOKEN_THRESHOLD: () => DEFAULT_TOKEN_THRESHOLD,
947
948
  JudgeSpanCollector: () => JudgeSpanCollector,
948
949
  JudgeSpanDigestFormatter: () => JudgeSpanDigestFormatter,
949
950
  RealtimeAgentAdapter: () => RealtimeAgentAdapter,
951
+ estimateTokens: () => estimateTokens,
952
+ expandTrace: () => expandTrace,
953
+ grepTrace: () => grepTrace,
950
954
  judgeAgent: () => judgeAgent,
951
955
  judgeSpanCollector: () => judgeSpanCollector,
952
956
  judgeSpanDigestFormatter: () => judgeSpanDigestFormatter,
@@ -954,7 +958,11 @@ __export(agents_exports, {
954
958
  });
955
959
 
956
960
  // src/agents/judge/judge-agent.ts
957
- import { tool } from "ai";
961
+ import {
962
+ tool,
963
+ stepCountIs,
964
+ hasToolCall
965
+ } from "ai";
958
966
  import { z as z4 } from "zod/v4";
959
967
 
960
968
  // src/agents/judge/judge-utils.ts
@@ -1026,6 +1034,283 @@ var JudgeUtils = {
1026
1034
  }
1027
1035
  };
1028
1036
 
1037
+ // src/agents/judge/estimate-tokens.ts
1038
/**
 * Default estimated-token ceiling. JudgeAgent falls back to this value when
 * `cfg.tokenThreshold` is not supplied.
 */
var DEFAULT_TOKEN_THRESHOLD = 8192;

/**
 * Cheap token estimate: UTF-8 byte length divided by 4, rounded up.
 *
 * @param {string | null | undefined} text - Text to measure. Nullish input
 *   counts as empty; other non-string values are stringified first.
 * @returns {number} Estimated token count (0 for empty or nullish input).
 */
function estimateTokens(text) {
  // Without this guard `null` would be encoded as the 4-byte text "null".
  if (text == null) return 0;
  const byteLength = new TextEncoder().encode(String(text)).byteLength;
  return Math.ceil(byteLength / 4);
}
1043
+
1044
+ // src/agents/judge/span-utils.ts
1045
+ import { attributes } from "langwatch/observability";
1046
+
1047
+ // src/agents/judge/deep-transform.ts
1048
/**
 * Recursively applies `fn` to a value and everything nested inside it.
 *
 * `fn` is called on the value first. If it returns something other than the
 * value itself (`!==`), that replacement is returned as-is and is NOT
 * descended into. Otherwise arrays and objects are rebuilt with each
 * element/property transformed; primitives are returned unchanged.
 *
 * Cyclic structures are not supported (the recursion would not terminate).
 *
 * @param {unknown} value - Value to transform.
 * @param {(v: unknown) => unknown} fn - Visitor; return the input to keep it.
 * @returns {unknown} The transformed copy (input objects are not mutated).
 */
function deepTransform(value, fn) {
  const result = fn(value);
  if (result !== value) return result;
  if (Array.isArray(value)) {
    return value.map((v) => deepTransform(v, fn));
  }
  if (value !== null && typeof value === "object") {
    // Object.fromEntries defines own data properties, so a "__proto__" key
    // (e.g. from JSON.parse) is copied as data instead of hitting the
    // prototype setter and silently disappearing from the output.
    return Object.fromEntries(
      Object.entries(value).map(([k, v]) => [k, deepTransform(v, fn)])
    );
  }
  return value;
}
1063
+
1064
+ // src/agents/judge/truncate-media.ts
1065
/**
 * Replaces a base64 `data:` URL for image/audio/video content with a short
 * placeholder such as `[IMAGE: image/png, ~1234 bytes]`.
 *
 * The reported size is the length of the base64 payload text, not the
 * decoded byte count.
 *
 * @param {string} str - Candidate string.
 * @returns {string} The placeholder, or `str` unchanged when it is not a
 *   matching media data URL.
 */
function truncateMediaUrl(str) {
  // The `s` (dotAll) flag lets the payload span line breaks; without it a
  // line-wrapped base64 body would not match and would be kept in full.
  const match = str.match(
    /^data:((image|audio|video)\/[a-z0-9+.-]+);base64,(.+)$/is
  );
  if (!match) return str;
  const [, mimeType, category, data] = match;
  return `[${category.toUpperCase()}: ${mimeType}, ~${data.length} bytes]`;
}
1073
/**
 * Shortens inline media carried by a message-part object.
 *
 * Handles two shapes:
 *  - `{ type: "file", mediaType: string, data: string }` - `data` is replaced
 *    by a `[CATEGORY: mediaType, ~N bytes]` placeholder.
 *  - `{ type: "image", image: string }` - `image` is replaced when it is an
 *    image data URL, or a string longer than 1000 characters made up only of
 *    base64 alphabet characters.
 *
 * Sizes are the length of the encoded text, not decoded bytes.
 *
 * @param {unknown} v - Candidate value.
 * @returns {object | null} A shallow copy with the payload replaced, or
 *   `null` when `v` is not a recognised media part.
 */
function truncateMediaPart(v) {
  if (v === null || typeof v !== "object" || Array.isArray(v)) return null;
  const obj = v;
  if (obj.type === "file" && typeof obj.mediaType === "string" && typeof obj.data === "string") {
    // split() always yields a string, so the fallback must test for emptiness
    // (`||`) rather than nullishness to ever take effect.
    const category = obj.mediaType.split("/")[0].toUpperCase() || "FILE";
    return {
      ...obj,
      data: `[${category}: ${obj.mediaType}, ~${obj.data.length} bytes]`
    };
  }
  if (obj.type === "image" && typeof obj.image === "string") {
    const imageData = obj.image;
    // dotAll so a line-wrapped base64 payload is still recognised.
    const dataUrlMatch = imageData.match(
      /^data:((image)\/[a-z0-9+.-]+);base64,(.+)$/is
    );
    if (dataUrlMatch) {
      return {
        ...obj,
        image: `[IMAGE: ${dataUrlMatch[1]}, ~${dataUrlMatch[3].length} bytes]`
      };
    }
    // Long string made only of base64 characters: treated as raw image data.
    if (imageData.length > 1e3 && /^[A-Za-z0-9+/=]+$/.test(imageData)) {
      return {
        ...obj,
        image: `[IMAGE: unknown, ~${imageData.length} bytes]`
      };
    }
  }
  return null;
}
1104
+
1105
+ // src/agents/judge/span-utils.ts
1106
/**
 * Converts a two-element high-resolution time tuple to milliseconds.
 * Element 0 is scaled as seconds and element 1 as nanoseconds.
 *
 * @param {[number, number]} hrTime - Time tuple.
 * @returns {number} Milliseconds (may be fractional).
 */
function hrTimeToMs(hrTime) {
  const seconds = hrTime[0];
  const nanoseconds = hrTime[1];
  return seconds * 1e3 + nanoseconds / 1e6;
}
1109
/**
 * Formats a millisecond duration for display: whole milliseconds below one
 * second (`"12ms"`), otherwise seconds with two decimals (`"1.23s"`).
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} Human-readable duration.
 */
function formatDuration(ms) {
  // Decide on the rounded value so 999.5-999.99 becomes "1.00s", not "1000ms".
  const roundedMs = Math.round(ms);
  if (roundedMs < 1e3) return `${roundedMs}ms`;
  return `${(ms / 1e3).toFixed(2)}s`;
}
1113
/**
 * Elapsed time of a span in milliseconds (`endTime` minus `startTime`).
 *
 * @param {{ startTime: [number, number], endTime: [number, number] }} span
 * @returns {number} Duration in milliseconds.
 */
function calculateSpanDuration(span) {
  const endedAtMs = hrTimeToMs(span.endTime);
  const startedAtMs = hrTimeToMs(span.startTime);
  return endedAtMs - startedAtMs;
}
1116
/**
 * Builds the error suffix appended to a span's header line.
 *
 * @param {{ status: { code: number, message?: string } }} span
 * @returns {string} A warning-sign ERROR suffix when `status.code` is 2
 *   (presumably the OpenTelemetry ERROR status code - confirm), otherwise an
 *   empty string. A missing message is shown as "unknown".
 */
function getStatusIndicator(span) {
  const isError = span.status.code === 2;
  return isError ? ` \u26A0\uFE0F ERROR: ${span.status.message ?? "unknown"}` : "";
}
1122
/**
 * Builds the `, N tokens` suffix for a span from its
 * `gen_ai.usage.input_tokens` / `gen_ai.usage.output_tokens` attributes.
 *
 * @param {{ attributes: Record<string, unknown> }} span
 * @returns {string} `", <total> tokens"`, or `""` when neither attribute is
 *   present. Values that are not numeric count as 0.
 */
function getTokenUsage(span) {
  const inputTokens = span.attributes["gen_ai.usage.input_tokens"];
  const outputTokens = span.attributes["gen_ai.usage.output_tokens"];
  const hasUsage = inputTokens != null || outputTokens != null;
  if (!hasUsage) return "";
  const toCount = (raw) => Number(raw) || 0;
  return `, ${toCount(inputTokens) + toCount(outputTokens)} tokens`;
}
1129
/**
 * Prepares span/event attributes for display.
 *
 * Drops the thread-id and scenario id/name attributes, strips a leading
 * `langwatch.` from the remaining keys, and keeps the first value seen when
 * two keys collapse to the same cleaned name.
 *
 * @param {Record<string, unknown>} attrs - Raw attributes.
 * @returns {Record<string, unknown>} New object with cleaned keys.
 */
function cleanAttributes(attrs) {
  const hidden = new Set([
    attributes.ATTR_LANGWATCH_THREAD_ID,
    "langwatch.scenario.id",
    "langwatch.scenario.name"
  ]);
  const cleaned = {};
  const claimed = /* @__PURE__ */ new Set();
  Object.entries(attrs).forEach(([key, value]) => {
    if (hidden.has(key)) return;
    const cleanKey = key.replace(/^(langwatch)\./, "");
    // First writer wins when a prefixed and unprefixed key collide.
    if (claimed.has(cleanKey)) return;
    claimed.add(cleanKey);
    cleaned[cleanKey] = value;
  });
  return cleaned;
}
1149
/**
 * Renders an attribute value as display text: the value is passed through
 * `transformValue`, then returned directly if it is a string or serialised
 * with `JSON.stringify` otherwise.
 *
 * @param {unknown} value - Attribute value.
 * @returns {string} Display text.
 */
function formatValue(value) {
  const processed = transformValue(value);
  if (typeof processed === "string") return processed;
  return JSON.stringify(processed);
}
1153
/**
 * Walks a value with `deepTransform` and shortens embedded media.
 *
 * For each visited value: media part objects are replaced by their truncated
 * form; media data-URL strings are replaced by a placeholder; strings that
 * look like JSON are parsed, transformed recursively and re-serialised.
 * Everything else is left untouched.
 *
 * @param {unknown} value - Value to transform.
 * @returns {unknown} Transformed copy.
 */
function transformValue(value) {
  const visit = (v) => {
    const mediaPart = truncateMediaPart(v);
    if (mediaPart) return mediaPart;
    if (typeof v !== "string") return v;
    const shortened = truncateMediaUrl(v);
    if (shortened !== v) return shortened;
    if (!looksLikeJson(v)) return v;
    try {
      return JSON.stringify(transformValue(JSON.parse(v)));
    } catch {
      // Only looked like JSON; keep the original text (deliberate best-effort).
      return v;
    }
  };
  return deepTransform(value, visit);
}
1170
/**
 * Cheap check for JSON-shaped text: after trimming, the string is wrapped in
 * `{...}` or `[...]`. Does not validate the content.
 *
 * @param {string} str - Candidate text.
 * @returns {boolean} Whether the text is worth handing to `JSON.parse`.
 */
function looksLikeJson(str) {
  const trimmed = str.trim();
  const wrappedBy = (open, close) => trimmed.startsWith(open) && trimmed.endsWith(close);
  return wrappedBy("{", "}") || wrappedBy("[", "]");
}
1174
/**
 * Sorts spans by start time and wraps each one in a node carrying a 1-based
 * `index` and an empty `children` list. The input array is not mutated.
 *
 * @param {Array<object>} spans - Spans to index.
 * @returns {Array<{ span: object, children: Array, index: number }>}
 */
function indexSpans(spans) {
  const byStartTime = (a, b) => hrTimeToMs(a.startTime) - hrTimeToMs(b.startTime);
  const ordered = [...spans];
  ordered.sort(byStartTime);
  const nodes = [];
  for (const [position, span] of ordered.entries()) {
    nodes.push({ span, children: [], index: position + 1 });
  }
  return nodes;
}
1184
+
1185
+ // src/agents/judge/trace-tools.ts
1186
+ var TOOL_RESULT_TOKEN_BUDGET = 4096;
1187
+ var TOOL_RESULT_CHAR_BUDGET = TOOL_RESULT_TOKEN_BUDGET * 4;
1188
+ var MAX_GREP_MATCHES = 20;
1189
/**
 * Renders one indexed span in full: a header line (index, ISO start time,
 * name, duration, error status) followed by one line per cleaned attribute,
 * then each event with its cleaned attributes.
 *
 * @param {{ span: object, index: number }} node - Indexed span node.
 * @returns {string[]} Output lines for this span.
 */
function renderFullSpanNode(node) {
  const { span } = node;
  const elapsedMs = calculateSpanDuration(span);
  const startedAt = new Date(hrTimeToMs(span.startTime)).toISOString();
  const statusSuffix = getStatusIndicator(span);
  const output = [
    `[${node.index}] ${startedAt} ${span.name} (${formatDuration(elapsedMs)})${statusSuffix}`
  ];
  const appendAttributes = (rawAttrs) => {
    for (const [key, value] of Object.entries(cleanAttributes(rawAttrs))) {
      output.push(` ${key}: ${formatValue(value)}`);
    }
  };
  appendAttributes(span.attributes);
  for (const event of span.events) {
    output.push(` [event] ${event.name}`);
    if (event.attributes) {
      appendAttributes(event.attributes);
    }
  }
  return output;
}
1217
/**
 * Caps tool output at `TOOL_RESULT_CHAR_BUDGET` characters, appending a
 * `[TRUNCATED]` notice that points the caller at grep_trace / a narrower
 * expand_trace range.
 *
 * @param {string} text - Tool output.
 * @returns {string} `text` unchanged when within budget, otherwise the
 *   truncated text followed by the notice.
 */
function truncateToCharBudget(text) {
  if (text.length <= TOOL_RESULT_CHAR_BUDGET) return text;
  let end = TOOL_RESULT_CHAR_BUDGET;
  // Don't cut a surrogate pair in half: a lone high surrogate is invalid
  // UTF-16 and is sent on as a replacement character.
  const lastUnit = text.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
  const truncated = text.slice(0, end);
  return truncated + "\n\n[TRUNCATED] Output exceeded ~4000 token budget. Use grep_trace(pattern) to search for specific content, or expand_trace with a narrower range.";
}
1222
/**
 * Flattens a span into newline-separated text for searching: the span name,
 * one `key: value` line per cleaned attribute, then each event name followed
 * by its cleaned attributes.
 *
 * @param {object} span - Span to flatten.
 * @returns {string} Searchable text, one item per line.
 */
function spanToSearchableText(span) {
  const describe = (rawAttrs) =>
    Object.entries(cleanAttributes(rawAttrs)).map(
      ([key, value]) => `${key}: ${formatValue(value)}`
    );
  const parts = [span.name, ...describe(span.attributes)];
  for (const event of span.events) {
    parts.push(event.name);
    if (event.attributes) {
      parts.push(...describe(event.attributes));
    }
  }
  return parts.join("\n");
}
1239
/**
 * Renders the full details of one span, or an inclusive range of spans,
 * addressed by the 1-based index assigned by `indexSpans` (start-time order).
 *
 * `range` takes precedence over `index` when both are given. Problems are
 * reported as `"Error: ..."` strings rather than thrown.
 *
 * @param {Array<object>} spans - All spans for the thread.
 * @param {{ index?: number, range?: string }} [selection] - A single index,
 *   or a range such as `"10-15"` (a lone number like `"7"` is also accepted).
 * @returns {string} Rendered spans (capped by `truncateToCharBudget`), or a
 *   message when there are no spans or the selection is invalid.
 */
function expandTrace(spans, { index, range } = {}) {
  const nodes = indexSpans(spans);
  if (nodes.length === 0) {
    return "No spans recorded.";
  }
  let startIdx;
  let endIdx;
  if (range != null) {
    const parts = String(range).split("-").map(Number);
    startIdx = parts[0];
    endIdx = parts[1] ?? startIdx;
  } else if (index != null) {
    startIdx = index;
    endIdx = index;
  } else {
    return "Error: provide either index or range parameter.";
  }
  const maxIndex = nodes.length;
  // NaN and fractional values slip through the numeric comparisons below and
  // would otherwise produce an empty result instead of an error.
  if (!Number.isInteger(startIdx) || !Number.isInteger(endIdx)) {
    return `Error: invalid span index or range. Use a single index like 5 or a range like "10-15". Valid range is 1-${maxIndex}.`;
  }
  if (startIdx < 1 || endIdx > maxIndex || startIdx > endIdx) {
    return `Error: span index out of range. Valid range is 1-${maxIndex}.`;
  }
  const selected = nodes.filter(
    (n) => n.index >= startIdx && n.index <= endIdx
  );
  const lines = [];
  for (const node of selected) {
    lines.push(...renderFullSpanNode(node));
    lines.push(""); // blank separator between spans
  }
  return truncateToCharBudget(lines.join("\n").trimEnd());
}
1271
/**
 * Case-insensitive literal search across every span's name, attributes and
 * events. The pattern is regex-escaped, so it is matched as plain text.
 *
 * At most `MAX_GREP_MATCHES` matching spans are listed, each with its
 * matching lines; any remainder is summarised in a trailing note. When
 * nothing matches, the distinct span names are returned as a hint.
 *
 * @param {Array<object>} spans - All spans for the thread.
 * @param {string} pattern - Text to search for.
 * @returns {string} Search report, capped by `truncateToCharBudget`.
 */
function grepTrace(spans, pattern) {
  const nodes = indexSpans(spans);
  if (nodes.length === 0) {
    return "No spans recorded.";
  }
  const matcher = new RegExp(escapeRegex(pattern), "i");
  const hits = [];
  for (const node of nodes) {
    const matchingLines = spanToSearchableText(node.span)
      .split("\n")
      .filter((line) => matcher.test(line));
    if (matchingLines.length > 0) {
      hits.push({ node, matchingLines });
    }
  }
  if (hits.length === 0) {
    const spanNames = Array.from(new Set(nodes.map((n) => n.span.name)));
    return `No matches found for "${pattern}". Available span names: ${spanNames.join(", ")}`;
  }
  const output = [];
  for (const { node, matchingLines } of hits.slice(0, MAX_GREP_MATCHES)) {
    const elapsedMs = calculateSpanDuration(node.span);
    output.push(
      `--- [${node.index}] ${node.span.name} (${formatDuration(elapsedMs)}) ---`
    );
    for (const line of matchingLines) {
      output.push(` ${line}`);
    }
    output.push("");
  }
  const omitted = hits.length - MAX_GREP_MATCHES;
  if (omitted > 0) {
    output.push(
      `[${omitted} more matches omitted. Refine your search pattern for more specific results.]`
    );
  }
  return truncateToCharBudget(output.join("\n").trimEnd());
}
1310
/**
 * Escapes regular-expression metacharacters so the text can be embedded in a
 * `RegExp` and matched literally.
 *
 * @param {string} str - Raw text.
 * @returns {string} Text with each metacharacter prefixed by a backslash.
 */
function escapeRegex(str) {
  const prefixWithBackslash = (metaChar) => `\\${metaChar}`;
  return str.replace(/[.*+?^${}()|[\]\\]/g, prefixWithBackslash);
}
1313
+
1029
1314
  // src/config/env.ts
1030
1315
  import { z } from "zod/v4";
1031
1316
 
@@ -1381,7 +1666,7 @@ var criterionToParamName = (criterion) => {
1381
1666
  };
1382
1667
 
1383
1668
  // src/agents/judge/judge-span-collector.ts
1384
- import { attributes } from "langwatch/observability";
1669
+ import { attributes as attributes2 } from "langwatch/observability";
1385
1670
  var JudgeSpanCollector = class {
1386
1671
  spans = [];
1387
1672
  onStart() {
@@ -1424,7 +1709,7 @@ var JudgeSpanCollector = class {
1424
1709
  const spanId = span.spanContext().spanId;
1425
1710
  if (visited.has(spanId)) return false;
1426
1711
  visited.add(spanId);
1427
- if (span.attributes[attributes.ATTR_LANGWATCH_THREAD_ID] === threadId) {
1712
+ if (span.attributes[attributes2.ATTR_LANGWATCH_THREAD_ID] === threadId) {
1428
1713
  return true;
1429
1714
  }
1430
1715
  const parentId = getParentSpanId(span);
@@ -1443,26 +1728,6 @@ function getParentSpanId(span) {
1443
1728
  }
1444
1729
  var judgeSpanCollector = new JudgeSpanCollector();
1445
1730
 
1446
- // src/agents/judge/judge-span-digest-formatter.ts
1447
- import { attributes as attributes2 } from "langwatch/observability";
1448
-
1449
- // src/agents/judge/deep-transform.ts
1450
- function deepTransform(value, fn) {
1451
- const result = fn(value);
1452
- if (result !== value) return result;
1453
- if (Array.isArray(value)) {
1454
- return value.map((v) => deepTransform(v, fn));
1455
- }
1456
- if (value !== null && typeof value === "object") {
1457
- const out = {};
1458
- for (const [k, v] of Object.entries(value)) {
1459
- out[k] = deepTransform(v, fn);
1460
- }
1461
- return out;
1462
- }
1463
- return value;
1464
- }
1465
-
1466
1731
  // src/agents/judge/string-deduplicator.ts
1467
1732
  var StringDeduplicator = class {
1468
1733
  seen = /* @__PURE__ */ new Map();
@@ -1496,51 +1761,51 @@ var StringDeduplicator = class {
1496
1761
  }
1497
1762
  };
1498
1763
 
1499
- // src/agents/judge/truncate-media.ts
1500
- function truncateMediaUrl(str) {
1501
- const match = str.match(
1502
- /^data:((image|audio|video)\/[a-z0-9+.-]+);base64,(.+)$/i
1503
- );
1504
- if (!match) return str;
1505
- const [, mimeType, category, data] = match;
1506
- return `[${category.toUpperCase()}: ${mimeType}, ~${data.length} bytes]`;
1507
- }
1508
- function truncateMediaPart(v) {
1509
- var _a;
1510
- if (v === null || typeof v !== "object" || Array.isArray(v)) return null;
1511
- const obj = v;
1512
- if (obj.type === "file" && typeof obj.mediaType === "string" && typeof obj.data === "string") {
1513
- const category = ((_a = obj.mediaType.split("/")[0]) == null ? void 0 : _a.toUpperCase()) ?? "FILE";
1514
- return {
1515
- ...obj,
1516
- data: `[${category}: ${obj.mediaType}, ~${obj.data.length} bytes]`
1517
- };
1518
- }
1519
- if (obj.type === "image" && typeof obj.image === "string") {
1520
- const imageData = obj.image;
1521
- const dataUrlMatch = imageData.match(
1522
- /^data:((image)\/[a-z0-9+.-]+);base64,(.+)$/i
1523
- );
1524
- if (dataUrlMatch) {
1525
- return {
1526
- ...obj,
1527
- image: `[IMAGE: ${dataUrlMatch[1]}, ~${dataUrlMatch[3].length} bytes]`
1528
- };
1529
- }
1530
- if (imageData.length > 1e3 && /^[A-Za-z0-9+/=]+$/.test(imageData)) {
1531
- return {
1532
- ...obj,
1533
- image: `[IMAGE: unknown, ~${imageData.length} bytes]`
1534
- };
1535
- }
1536
- }
1537
- return null;
1538
- }
1539
-
1540
1764
  // src/agents/judge/judge-span-digest-formatter.ts
1541
1765
  var JudgeSpanDigestFormatter = class {
1542
1766
  logger = new Logger("JudgeSpanDigestFormatter");
1543
1767
  deduplicator = new StringDeduplicator({ threshold: 50 });
1768
+ /**
1769
+ * Formats spans into a structure-only digest showing span tree hierarchy
1770
+ * without attributes, events, or content. Used for large traces that
1771
+ * exceed the token threshold, paired with expand_trace/grep_trace tools.
1772
+ *
1773
+ * @param spans - All spans for a thread
1774
+ * @returns Plain text digest with only structural information
1775
+ */
1776
+ formatStructureOnly(spans) {
1777
+ this.logger.debug("formatStructureOnly() called", {
1778
+ spanCount: spans.length
1779
+ });
1780
+ if (spans.length === 0) {
1781
+ return "No spans recorded.";
1782
+ }
1783
+ const sortedSpans = this.sortByStartTime(spans);
1784
+ const tree = this.buildHierarchy(sortedSpans);
1785
+ const totalDuration = this.calculateTotalDuration(sortedSpans);
1786
+ const lines = [
1787
+ `Spans: ${spans.length} | Total Duration: ${formatDuration(totalDuration)}`,
1788
+ ""
1789
+ ];
1790
+ let sequence = 1;
1791
+ const rootCount = tree.length;
1792
+ tree.forEach((node, idx) => {
1793
+ sequence = this.renderStructureNode(
1794
+ node,
1795
+ lines,
1796
+ 0,
1797
+ sequence,
1798
+ idx === rootCount - 1
1799
+ );
1800
+ });
1801
+ const errors = this.collectErrors(spans);
1802
+ if (errors.length > 0) {
1803
+ lines.push("");
1804
+ lines.push("=== ERRORS ===");
1805
+ errors.forEach((e) => lines.push(e));
1806
+ }
1807
+ return lines.join("\n");
1808
+ }
1544
1809
  /**
1545
1810
  * Formats spans into a complete digest with full content and nesting.
1546
1811
  * @param spans - All spans for a thread
@@ -1564,7 +1829,7 @@ var JudgeSpanDigestFormatter = class {
1564
1829
  totalDuration
1565
1830
  });
1566
1831
  const lines = [
1567
- `Spans: ${spans.length} | Total Duration: ${this.formatDuration(
1832
+ `Spans: ${spans.length} | Total Duration: ${formatDuration(
1568
1833
  totalDuration
1569
1834
  )}`,
1570
1835
  ""
@@ -1590,9 +1855,7 @@ var JudgeSpanDigestFormatter = class {
1590
1855
  }
1591
1856
  sortByStartTime(spans) {
1592
1857
  return [...spans].sort((a, b) => {
1593
- const aTime = this.hrTimeToMs(a.startTime);
1594
- const bTime = this.hrTimeToMs(b.startTime);
1595
- return aTime - bTime;
1858
+ return hrTimeToMs(a.startTime) - hrTimeToMs(b.startTime);
1596
1859
  });
1597
1860
  }
1598
1861
  buildHierarchy(spans) {
@@ -1612,29 +1875,53 @@ var JudgeSpanDigestFormatter = class {
1612
1875
  }
1613
1876
  return roots;
1614
1877
  }
1878
+ renderStructureNode(node, lines, depth, sequence, isLast = true) {
1879
+ const span = node.span;
1880
+ const duration = calculateSpanDuration(span);
1881
+ const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
1882
+ const status = getStatusIndicator(span);
1883
+ const tokens = getTokenUsage(span);
1884
+ const prefix = this.getTreePrefix(depth, isLast);
1885
+ lines.push(
1886
+ `${prefix}[${sequence}] ${timestamp} ${span.name} (${formatDuration(duration)}${tokens})${status}`
1887
+ );
1888
+ lines.push("");
1889
+ let nextSeq = sequence + 1;
1890
+ const childCount = node.children.length;
1891
+ node.children.forEach((child, idx) => {
1892
+ nextSeq = this.renderStructureNode(
1893
+ child,
1894
+ lines,
1895
+ depth + 1,
1896
+ nextSeq,
1897
+ idx === childCount - 1
1898
+ );
1899
+ });
1900
+ return nextSeq;
1901
+ }
1615
1902
  renderNode(node, lines, depth, sequence, isLast = true) {
1616
1903
  const span = node.span;
1617
- const duration = this.calculateSpanDuration(span);
1618
- const timestamp = this.formatTimestamp(span.startTime);
1619
- const status = this.getStatusIndicator(span);
1904
+ const duration = calculateSpanDuration(span);
1905
+ const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
1906
+ const status = getStatusIndicator(span);
1620
1907
  const prefix = this.getTreePrefix(depth, isLast);
1621
1908
  lines.push(
1622
- `${prefix}[${sequence}] ${new Date(timestamp).toISOString()} ${span.name} (${this.formatDuration(duration)})${status}`
1909
+ `${prefix}[${sequence}] ${timestamp} ${span.name} (${formatDuration(duration)})${status}`
1623
1910
  );
1624
1911
  const attrIndent = this.getAttrIndent(depth, isLast);
1625
- const attrs = this.cleanAttributes(span.attributes);
1912
+ const attrs = cleanAttributes(span.attributes);
1626
1913
  if (Object.keys(attrs).length > 0) {
1627
1914
  for (const [key, value] of Object.entries(attrs)) {
1628
- lines.push(`${attrIndent}${key}: ${this.formatValue(value)}`);
1915
+ lines.push(`${attrIndent}${key}: ${this.formatValueWithDedup(value)}`);
1629
1916
  }
1630
1917
  }
1631
1918
  if (span.events.length > 0) {
1632
1919
  for (const event of span.events) {
1633
1920
  lines.push(`${attrIndent}[event] ${event.name}`);
1634
1921
  if (event.attributes) {
1635
- const eventAttrs = this.cleanAttributes(event.attributes);
1922
+ const eventAttrs = cleanAttributes(event.attributes);
1636
1923
  for (const [key, value] of Object.entries(eventAttrs)) {
1637
- lines.push(`${attrIndent} ${key}: ${this.formatValue(value)}`);
1924
+ lines.push(`${attrIndent} ${key}: ${this.formatValueWithDedup(value)}`);
1638
1925
  }
1639
1926
  }
1640
1927
  }
@@ -1663,42 +1950,26 @@ var JudgeSpanDigestFormatter = class {
1663
1950
  const continuation = isLast ? " " : "\u2502 ";
1664
1951
  return "\u2502 ".repeat(depth - 1) + continuation + " ";
1665
1952
  }
1666
- cleanAttributes(attrs) {
1667
- const cleaned = {};
1668
- const seen = /* @__PURE__ */ new Set();
1669
- const excludedKeys = [
1670
- attributes2.ATTR_LANGWATCH_THREAD_ID,
1671
- "langwatch.scenario.id",
1672
- "langwatch.scenario.name"
1673
- ];
1674
- for (const [key, value] of Object.entries(attrs)) {
1675
- if (excludedKeys.includes(key)) {
1676
- continue;
1677
- }
1678
- const cleanKey = key.replace(/^(langwatch)\./, "");
1679
- if (!seen.has(cleanKey)) {
1680
- seen.add(cleanKey);
1681
- cleaned[cleanKey] = value;
1682
- }
1683
- }
1684
- return cleaned;
1685
- }
1686
- formatValue(value) {
1687
- const processed = this.transformValue(value);
1953
+ /**
1954
+ * Formats a value with deduplication applied. Used by the `format()` method
1955
+ * to reduce token usage by replacing repeated strings with markers.
1956
+ */
1957
+ formatValueWithDedup(value) {
1958
+ const processed = this.transformValueWithDedup(value);
1688
1959
  return typeof processed === "string" ? processed : JSON.stringify(processed);
1689
1960
  }
1690
- transformValue(value) {
1961
+ transformValueWithDedup(value) {
1691
1962
  return deepTransform(value, (v) => {
1692
1963
  const mediaPart = truncateMediaPart(v);
1693
1964
  if (mediaPart) return mediaPart;
1694
1965
  if (typeof v !== "string") return v;
1695
- return this.transformString(v);
1966
+ return this.transformStringWithDedup(v);
1696
1967
  });
1697
1968
  }
1698
- transformString(str) {
1699
- if (this.looksLikeJson(str)) {
1969
+ transformStringWithDedup(str) {
1970
+ if (looksLikeJson(str)) {
1700
1971
  try {
1701
- const processed = this.transformValue(JSON.parse(str));
1972
+ const processed = this.transformValueWithDedup(JSON.parse(str));
1702
1973
  return JSON.stringify(processed);
1703
1974
  } catch {
1704
1975
  }
@@ -1707,36 +1978,12 @@ var JudgeSpanDigestFormatter = class {
1707
1978
  if (truncated !== str) return truncated;
1708
1979
  return this.deduplicator.process(str);
1709
1980
  }
1710
- looksLikeJson(str) {
1711
- const t = str.trim();
1712
- return t.startsWith("{") && t.endsWith("}") || t.startsWith("[") && t.endsWith("]");
1713
- }
1714
- hrTimeToMs(hrTime) {
1715
- return hrTime[0] * 1e3 + hrTime[1] / 1e6;
1716
- }
1717
- calculateSpanDuration(span) {
1718
- return this.hrTimeToMs(span.endTime) - this.hrTimeToMs(span.startTime);
1719
- }
1720
1981
  calculateTotalDuration(spans) {
1721
1982
  if (spans.length === 0) return 0;
1722
- const first = this.hrTimeToMs(spans[0].startTime);
1723
- const last = Math.max(...spans.map((s) => this.hrTimeToMs(s.endTime)));
1983
+ const first = hrTimeToMs(spans[0].startTime);
1984
+ const last = Math.max(...spans.map((s) => hrTimeToMs(s.endTime)));
1724
1985
  return last - first;
1725
1986
  }
1726
- formatDuration(ms) {
1727
- if (ms < 1e3) return `${Math.round(ms)}ms`;
1728
- return `${(ms / 1e3).toFixed(2)}s`;
1729
- }
1730
- formatTimestamp(hrTime) {
1731
- const ms = this.hrTimeToMs(hrTime);
1732
- return new Date(ms).toISOString();
1733
- }
1734
- getStatusIndicator(span) {
1735
- if (span.status.code === 2) {
1736
- return ` \u26A0\uFE0F ERROR: ${span.status.message ?? "unknown"}`;
1737
- }
1738
- return "";
1739
- }
1740
1987
  collectErrors(spans) {
1741
1988
  return spans.filter((s) => s.status.code === 2).map((s) => `- ${s.name}: ${s.status.message ?? "unknown error"}`);
1742
1989
  }
@@ -1799,15 +2046,42 @@ function buildFinishTestTool(criteria) {
1799
2046
  })
1800
2047
  });
1801
2048
  }
2049
/**
 * Builds the two on-demand trace tools offered to the judge for large
 * traces, both bound to the given spans:
 *  - `expand_trace` - full details for one span index or a range.
 *  - `grep_trace` - case-insensitive search across all spans.
 *
 * @param {Array<object>} spans - Spans the tools operate on.
 * @returns {{ expand_trace: object, grep_trace: object }} Tool definitions.
 */
function buildProgressiveDiscoveryTools(spans) {
  const expandInput = z4.object({
    index: z4.number().optional().describe("Single span index to expand"),
    range: z4.string().optional().describe('Range of span indices to expand, e.g. "10-15"')
  });
  const grepInput = z4.object({
    pattern: z4.string().describe("Search pattern (case-insensitive)")
  });
  const expand_trace = tool({
    description: "Expand one or more spans to see their full details (attributes, events, content). Use a single index like 5 or a range like '10-15'.",
    inputSchema: expandInput,
    execute: async ({ index, range }) => expandTrace(spans, { index, range })
  });
  const grep_trace = tool({
    description: "Search across all span attributes, events, and content for a pattern (case-insensitive). Returns matching spans with context.",
    inputSchema: grepInput,
    execute: async ({ pattern }) => grepTrace(spans, pattern)
  });
  return { expand_trace, grep_trace };
}
1802
2072
  var JudgeAgent = class extends JudgeAgentAdapter {
1803
2073
  constructor(cfg) {
1804
2074
  super();
1805
2075
  this.cfg = cfg;
1806
2076
  this.criteria = cfg.criteria ?? [];
1807
2077
  this.spanCollector = cfg.spanCollector ?? judgeSpanCollector;
2078
+ this.tokenThreshold = cfg.tokenThreshold ?? DEFAULT_TOKEN_THRESHOLD;
2079
+ this.maxDiscoverySteps = cfg.maxDiscoverySteps ?? 10;
1808
2080
  }
1809
2081
  logger = new Logger("JudgeAgent");
1810
2082
  spanCollector;
2083
+ tokenThreshold;
2084
+ maxDiscoverySteps;
1811
2085
  role = "Judge" /* JUDGE */;
1812
2086
  criteria;
1813
2087
  /**
@@ -1815,7 +2089,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1815
2089
  */
1816
2090
  invokeLLM = createLLMInvoker(this.logger);
1817
2091
  async call(input) {
1818
- var _a, _b, _c, _d;
2092
+ var _a;
1819
2093
  const criteria = ((_a = input.judgmentRequest) == null ? void 0 : _a.criteria) ?? this.criteria;
1820
2094
  this.logger.debug("call() invoked", {
1821
2095
  threadId: input.threadId,
@@ -1823,8 +2097,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1823
2097
  maxTurns: input.scenarioConfig.maxTurns,
1824
2098
  judgmentRequest: input.judgmentRequest
1825
2099
  });
1826
- const digest = this.getOpenTelemetryTracesDigest(input.threadId);
1827
- this.logger.debug("OpenTelemetry traces built", { digest });
2100
+ const spans = this.spanCollector.getSpansForThread(input.threadId);
2101
+ const { digest, isLargeTrace } = this.buildTraceDigest(spans);
1828
2102
  const transcript = JudgeUtils.buildTranscriptFromMessages(input.messages);
1829
2103
  const contentForJudge = `
1830
2104
  <transcript>
@@ -1848,7 +2122,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1848
2122
  });
1849
2123
  const tools = {
1850
2124
  continue_test: buildContinueTestTool(),
1851
- finish_test: buildFinishTestTool(criteria)
2125
+ finish_test: buildFinishTestTool(criteria),
2126
+ ...isLargeTrace ? buildProgressiveDiscoveryTools(spans) : {}
1852
2127
  };
1853
2128
  const enforceJudgement = input.judgmentRequest != null;
1854
2129
  const hasCriteria = criteria.length && criteria.length > 0;
@@ -1865,26 +2140,70 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1865
2140
  model: mergedConfig.model,
1866
2141
  toolChoice,
1867
2142
  isLastMessage,
1868
- enforceJudgement
2143
+ enforceJudgement,
2144
+ isLargeTrace
1869
2145
  });
1870
- const completion = await this.invokeLLM({
2146
+ const completion = await this.invokeLLMWithDiscovery({
1871
2147
  model: mergedConfig.model,
1872
2148
  messages,
1873
2149
  temperature: mergedConfig.temperature ?? 0,
1874
2150
  maxOutputTokens: mergedConfig.maxTokens,
1875
2151
  tools,
1876
- toolChoice
2152
+ toolChoice,
2153
+ isLargeTrace
2154
+ });
2155
+ return this.parseToolCalls(completion, criteria);
2156
+ }
2157
+ /**
2158
+ * Builds the trace digest, choosing between full inline rendering
2159
+ * and structure-only mode based on estimated token count.
2160
+ */
2161
+ buildTraceDigest(spans) {
2162
+ const fullDigest = judgeSpanDigestFormatter.format(spans);
2163
+ const isLargeTrace = spans.length > 0 && estimateTokens(fullDigest) > this.tokenThreshold;
2164
+ const digest = isLargeTrace ? judgeSpanDigestFormatter.formatStructureOnly(spans) + "\n\nUse expand_trace(spanIndex) to see span details or grep_trace(pattern) to search across spans." : fullDigest;
2165
+ this.logger.debug("Trace digest built", {
2166
+ isLargeTrace,
2167
+ estimatedTokens: estimateTokens(fullDigest)
1877
2168
  });
2169
+ return { digest, isLargeTrace };
2170
+ }
2171
+ /**
2172
+ * Invokes the LLM, enabling multi-step tool execution for large traces.
2173
+ * In multi-step mode, the AI SDK loops automatically: the judge can call
2174
+ * expand_trace/grep_trace tools multiple times before reaching a terminal
2175
+ * tool (finish_test/continue_test) or hitting the step limit.
2176
+ */
2177
+ async invokeLLMWithDiscovery({
2178
+ isLargeTrace,
2179
+ ...params
2180
+ }) {
2181
+ var _a, _b;
2182
+ if (isLargeTrace) {
2183
+ params.stopWhen = [
2184
+ stepCountIs(this.maxDiscoverySteps),
2185
+ hasToolCall("finish_test"),
2186
+ hasToolCall("continue_test")
2187
+ ];
2188
+ }
2189
+ const completion = await this.invokeLLM(params);
1878
2190
  this.logger.debug("LLM response received", {
1879
- toolCallCount: ((_b = completion.toolCalls) == null ? void 0 : _b.length) ?? 0,
1880
- toolCalls: (_c = completion.toolCalls) == null ? void 0 : _c.map((tc) => ({
2191
+ toolCallCount: ((_a = completion.toolCalls) == null ? void 0 : _a.length) ?? 0,
2192
+ toolCalls: (_b = completion.toolCalls) == null ? void 0 : _b.map((tc) => ({
1881
2193
  toolName: tc.toolName,
1882
2194
  args: tc.input
1883
2195
  }))
1884
2196
  });
2197
+ return completion;
2198
+ }
2199
+ parseToolCalls(completion, criteria) {
2200
+ var _a;
1885
2201
  let args;
1886
- if ((_d = completion.toolCalls) == null ? void 0 : _d.length) {
1887
- const toolCall = completion.toolCalls[0];
2202
+ if ((_a = completion.toolCalls) == null ? void 0 : _a.length) {
2203
+ const terminalCall = completion.toolCalls.find(
2204
+ (tc) => tc.toolName === "finish_test" || tc.toolName === "continue_test"
2205
+ );
2206
+ const toolCall = terminalCall ?? completion.toolCalls[0];
1888
2207
  switch (toolCall.toolName) {
1889
2208
  case "finish_test": {
1890
2209
  args = toolCall.input;
@@ -1926,11 +2245,6 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1926
2245
  unmetCriteria: criteria
1927
2246
  };
1928
2247
  }
1929
- getOpenTelemetryTracesDigest(threadId) {
1930
- const spans = this.spanCollector.getSpansForThread(threadId);
1931
- const digest = judgeSpanDigestFormatter.format(spans);
1932
- return digest;
1933
- }
1934
2248
  };
1935
2249
  var judgeAgent = (cfg) => {
1936
2250
  return new JudgeAgent(cfg ?? {});
@@ -4677,6 +4991,7 @@ export {
4677
4991
  AgentAdapter,
4678
4992
  AgentRole,
4679
4993
  DEFAULT_MAX_TURNS,
4994
+ DEFAULT_TOKEN_THRESHOLD,
4680
4995
  DEFAULT_VERBOSE,
4681
4996
  JudgeAgentAdapter,
4682
4997
  JudgeSpanCollector,
@@ -4690,7 +5005,10 @@ export {
4690
5005
  allAgentRoles,
4691
5006
  index_default as default,
4692
5007
  defineConfig,
5008
+ estimateTokens,
5009
+ expandTrace,
4693
5010
  fail,
5011
+ grepTrace,
4694
5012
  judge,
4695
5013
  judgeAgent,
4696
5014
  judgeSpanCollector,