@langwatch/scenario 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -969,6 +969,7 @@ __export(index_exports, {
969
969
  AgentAdapter: () => AgentAdapter,
970
970
  AgentRole: () => AgentRole,
971
971
  DEFAULT_MAX_TURNS: () => DEFAULT_MAX_TURNS,
972
+ DEFAULT_TOKEN_THRESHOLD: () => DEFAULT_TOKEN_THRESHOLD,
972
973
  DEFAULT_VERBOSE: () => DEFAULT_VERBOSE,
973
974
  JudgeAgentAdapter: () => JudgeAgentAdapter,
974
975
  JudgeSpanCollector: () => JudgeSpanCollector,
@@ -982,7 +983,10 @@ __export(index_exports, {
982
983
  allAgentRoles: () => allAgentRoles,
983
984
  default: () => index_default,
984
985
  defineConfig: () => defineConfig,
986
+ estimateTokens: () => estimateTokens,
987
+ expandTrace: () => expandTrace,
985
988
  fail: () => fail,
989
+ grepTrace: () => grepTrace,
986
990
  judge: () => judge,
987
991
  judgeAgent: () => judgeAgent,
988
992
  judgeSpanCollector: () => judgeSpanCollector,
@@ -1004,9 +1008,13 @@ module.exports = __toCommonJS(index_exports);
1004
1008
  // src/agents/index.ts
1005
1009
  var agents_exports = {};
1006
1010
  __export(agents_exports, {
1011
+ DEFAULT_TOKEN_THRESHOLD: () => DEFAULT_TOKEN_THRESHOLD,
1007
1012
  JudgeSpanCollector: () => JudgeSpanCollector,
1008
1013
  JudgeSpanDigestFormatter: () => JudgeSpanDigestFormatter,
1009
1014
  RealtimeAgentAdapter: () => RealtimeAgentAdapter,
1015
+ estimateTokens: () => estimateTokens,
1016
+ expandTrace: () => expandTrace,
1017
+ grepTrace: () => grepTrace,
1010
1018
  judgeAgent: () => judgeAgent,
1011
1019
  judgeSpanCollector: () => judgeSpanCollector,
1012
1020
  judgeSpanDigestFormatter: () => judgeSpanDigestFormatter,
@@ -1086,6 +1094,283 @@ var JudgeUtils = {
1086
1094
  }
1087
1095
  };
1088
1096
 
1097
+ // src/agents/judge/estimate-tokens.ts
1098
+ var DEFAULT_TOKEN_THRESHOLD = 8192;
1099
/**
 * Roughly estimates how many LLM tokens a piece of text occupies.
 *
 * Heuristic: UTF-8 byte length divided by four, rounded up.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count (0 for an empty string).
 */
function estimateTokens(text) {
  const { byteLength: utf8Bytes } = new TextEncoder().encode(text);
  return Math.ceil(utf8Bytes / 4);
}
1103
+
1104
+ // src/agents/judge/span-utils.ts
1105
+ var import_observability = require("langwatch/observability");
1106
+
1107
+ // src/agents/judge/deep-transform.ts
1108
/**
 * Recursively applies `fn` to a value and everything nested inside it.
 *
 * `fn` is called on the value first; if it returns something other than the
 * value itself, that replacement is returned as-is and is not descended into.
 * Otherwise arrays are mapped element by element and objects are rebuilt from
 * their own enumerable entries. Primitives are returned unchanged.
 *
 * @param {*} value - Value to transform.
 * @param {(v: *) => *} fn - Returns a replacement, or `v` itself to keep it.
 * @returns {*} The transformed copy (inputs are never mutated).
 */
function deepTransform(value, fn) {
  const replaced = fn(value);
  if (replaced !== value) return replaced;
  if (Array.isArray(value)) {
    return value.map((item) => deepTransform(item, fn));
  }
  if (value !== null && typeof value === "object") {
    // Object.fromEntries defines own data properties, so a key such as
    // "__proto__" (which JSON.parse can produce) is copied like any other key
    // instead of silently replacing the prototype of the copy.
    return Object.fromEntries(
      Object.entries(value).map(([key, child]) => [
        key,
        deepTransform(child, fn)
      ])
    );
  }
  return value;
}
1123
+
1124
+ // src/agents/judge/truncate-media.ts
1125
/**
 * Replaces a base64 image/audio/video data URL with a short placeholder.
 *
 * @param {string} str - Candidate string.
 * @returns {string} `[CATEGORY: mime/type, ~N bytes]` when `str` is a base64
 *   media data URL (N is the length of the base64 payload), otherwise `str`
 *   unchanged.
 */
function truncateMediaUrl(str) {
  const parsed = str.match(
    /^data:((image|audio|video)\/[a-z0-9+.-]+);base64,(.+)$/i
  );
  if (parsed === null) return str;
  const mimeType = parsed[1];
  const label = parsed[2].toUpperCase();
  const payloadLength = parsed[3].length;
  return `[${label}: ${mimeType}, ~${payloadLength} bytes]`;
}
1133
/**
 * Replaces the binary payload of a message content part with a placeholder.
 *
 * Handles two shapes:
 *  - `{ type: "file", mediaType, data }` with string `mediaType` and `data`
 *  - `{ type: "image", image }` where `image` is a base64 image data URL or a
 *    long (> 1000 chars) string made only of base64 characters
 *
 * @param {*} v - Candidate value.
 * @returns {object|null} A shallow copy with the payload replaced, or `null`
 *   when `v` is not a recognised media part.
 */
function truncateMediaPart(v) {
  const isPlainObject = v !== null && typeof v === "object" && !Array.isArray(v);
  if (!isPlainObject) return null;
  const part = v;

  const isFilePart =
    part.type === "file" &&
    typeof part.mediaType === "string" &&
    typeof part.data === "string";
  if (isFilePart) {
    const head = part.mediaType.split("/")[0];
    const category = head == null ? "FILE" : head.toUpperCase();
    return {
      ...part,
      data: `[${category}: ${part.mediaType}, ~${part.data.length} bytes]`
    };
  }

  if (part.type !== "image" || typeof part.image !== "string") return null;

  const raw = part.image;
  const dataUrl = raw.match(/^data:((image)\/[a-z0-9+.-]+);base64,(.+)$/i);
  if (dataUrl) {
    return {
      ...part,
      image: `[IMAGE: ${dataUrl[1]}, ~${dataUrl[3].length} bytes]`
    };
  }

  // Long strings consisting solely of base64 characters are treated as raw
  // (non data-URL) image payloads.
  const looksLikeRawBase64 = raw.length > 1e3 && /^[A-Za-z0-9+/=]+$/.test(raw);
  if (!looksLikeRawBase64) return null;
  return {
    ...part,
    image: `[IMAGE: unknown, ~${raw.length} bytes]`
  };
}
1164
+
1165
+ // src/agents/judge/span-utils.ts
1166
/**
 * Converts a two-element high-resolution time value to milliseconds.
 *
 * @param {[number, number]} hrTime - Element 0 is multiplied by 1000 and
 *   element 1 is divided by 1e6 (i.e. treated as seconds and nanoseconds).
 * @returns {number} Milliseconds.
 */
function hrTimeToMs(hrTime) {
  const wholeSecondsMs = hrTime[0] * 1e3;
  const fractionMs = hrTime[1] / 1e6;
  return wholeSecondsMs + fractionMs;
}
1169
/**
 * Formats a millisecond duration for display.
 *
 * Durations that round to less than one second are shown as whole
 * milliseconds (`"42ms"`); everything else is shown in seconds with two
 * decimals (`"1.50s"`).
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} Human-readable duration.
 */
function formatDuration(ms) {
  // Compare the rounded value so that e.g. 999.6 renders as "1.00s" rather
  // than the out-of-range "1000ms".
  const wholeMs = Math.round(ms);
  if (wholeMs < 1e3) return `${wholeMs}ms`;
  return `${(ms / 1e3).toFixed(2)}s`;
}
1173
/**
 * Computes how long a span lasted.
 *
 * @param {{startTime: [number, number], endTime: [number, number]}} span
 * @returns {number} `endTime - startTime` in milliseconds.
 */
function calculateSpanDuration(span) {
  const endedAtMs = hrTimeToMs(span.endTime);
  const startedAtMs = hrTimeToMs(span.startTime);
  return endedAtMs - startedAtMs;
}
1176
/**
 * Builds the error suffix appended to a span's header line.
 *
 * @param {{status: {code: number, message?: string}}} span
 * @returns {string} A warning-sign ERROR marker with the status message
 *   (or "unknown") when `status.code` is 2, otherwise an empty string.
 */
function getStatusIndicator(span) {
  const { status } = span;
  return status.code === 2
    ? ` \u26A0\uFE0F ERROR: ${status.message ?? "unknown"}`
    : "";
}
1182
/**
 * Builds the token-usage suffix shown next to a span's duration.
 *
 * Reads `gen_ai.usage.input_tokens` and `gen_ai.usage.output_tokens` from the
 * span attributes; values that are missing or not numeric count as 0.
 *
 * @param {{attributes: Object}} span
 * @returns {string} `", <total> tokens"`, or `""` when neither attribute is set.
 */
function getTokenUsage(span) {
  const inputTokens = span.attributes["gen_ai.usage.input_tokens"];
  const outputTokens = span.attributes["gen_ai.usage.output_tokens"];
  const hasUsage = inputTokens != null || outputTokens != null;
  if (!hasUsage) return "";
  const asCount = (raw) => Number(raw) || 0;
  return `, ${asCount(inputTokens) + asCount(outputTokens)} tokens`;
}
1189
/**
 * Prepares span/event attributes for display.
 *
 * Drops the thread-id and scenario id/name attributes, strips a leading
 * `langwatch.` prefix from the remaining keys and, when two keys collapse to
 * the same cleaned name, keeps the first one encountered.
 *
 * @param {Object} attrs - Raw attribute map.
 * @returns {Object} New object keyed by the cleaned attribute names.
 */
function cleanAttributes(attrs) {
  const hiddenKeys = new Set([
    import_observability.attributes.ATTR_LANGWATCH_THREAD_ID,
    "langwatch.scenario.id",
    "langwatch.scenario.name"
  ]);
  const claimedNames = /* @__PURE__ */ new Set();
  const cleaned = {};
  for (const [rawKey, value] of Object.entries(attrs)) {
    if (hiddenKeys.has(rawKey)) continue;
    const shortKey = rawKey.replace(/^(langwatch)\./, "");
    // First occurrence wins when a prefixed and an unprefixed key collide.
    if (claimedNames.has(shortKey)) continue;
    claimedNames.add(shortKey);
    cleaned[shortKey] = value;
  }
  return cleaned;
}
1209
/**
 * Renders an attribute value as a single display string.
 *
 * The value is first passed through `transformValue`; string results are
 * returned verbatim, anything else is JSON-serialised.
 *
 * @param {*} value - Raw attribute value.
 * @returns {string} Display text for the value.
 */
function formatValue(value) {
  const processed = transformValue(value);
  if (typeof processed === "string") return processed;
  return JSON.stringify(processed);
}
1213
/**
 * Deeply rewrites a value so that bulky media payloads are replaced by short
 * placeholders, including inside strings that themselves contain JSON.
 *
 * @param {*} value - Raw value (primitive, array or object).
 * @returns {*} Transformed copy of the value.
 */
function transformValue(value) {
  const visit = (v) => {
    const mediaPart = truncateMediaPart(v);
    if (mediaPart) return mediaPart;
    if (typeof v !== "string") return v;

    const shortened = truncateMediaUrl(v);
    if (shortened !== v) return shortened;
    if (!looksLikeJson(v)) return v;

    try {
      // JSON embedded in a string: transform its contents and re-serialise.
      return JSON.stringify(transformValue(JSON.parse(v)));
    } catch {
      // Only looked like JSON; keep the original string untouched.
      return v;
    }
  };
  return deepTransform(value, visit);
}
1230
/**
 * Cheap check for whether a string might be a JSON object or array.
 *
 * Only inspects the first and last non-whitespace characters; it does not
 * validate the content in between.
 *
 * @param {string} str - Candidate string.
 * @returns {boolean} True when the trimmed text is wrapped in `{}` or `[]`.
 */
function looksLikeJson(str) {
  const trimmed = str.trim();
  const isWrappedBy = (open, close) =>
    trimmed.startsWith(open) && trimmed.endsWith(close);
  return isWrappedBy("{", "}") || isWrappedBy("[", "]");
}
1234
/**
 * Orders spans by start time and assigns each a 1-based index.
 *
 * @param {Array} spans - Spans to index (the input array is not mutated).
 * @returns {Array<{span: Object, children: Array, index: number}>} One node
 *   per span in ascending start-time order, each with an empty `children`
 *   list.
 */
function indexSpans(spans) {
  const chronological = [...spans];
  chronological.sort(
    (left, right) => hrTimeToMs(left.startTime) - hrTimeToMs(right.startTime)
  );
  const nodes = [];
  for (const [position, span] of chronological.entries()) {
    nodes.push({ span, children: [], index: position + 1 });
  }
  return nodes;
}
1244
+
1245
+ // src/agents/judge/trace-tools.ts
1246
+ var TOOL_RESULT_TOKEN_BUDGET = 4096;
1247
+ var TOOL_RESULT_CHAR_BUDGET = TOOL_RESULT_TOKEN_BUDGET * 4;
1248
+ var MAX_GREP_MATCHES = 20;
1249
/**
 * Renders one indexed span with all of its attributes and events.
 *
 * @param {{span: Object, index: number}} node - Node produced by `indexSpans`.
 * @returns {string[]} Output lines: a header (`[index] timestamp name
 *   (duration)` plus status), then one line per cleaned attribute, then each
 *   event followed by its cleaned attributes.
 */
function renderFullSpanNode(node) {
  const span = node.span;
  const durationLabel = formatDuration(calculateSpanDuration(span));
  const startedAt = new Date(hrTimeToMs(span.startTime)).toISOString();
  const statusSuffix = getStatusIndicator(span);

  const lines = [
    `[${node.index}] ${startedAt} ${span.name} (${durationLabel})${statusSuffix}`
  ];

  for (const [key, value] of Object.entries(cleanAttributes(span.attributes))) {
    lines.push(` ${key}: ${formatValue(value)}`);
  }

  for (const event of span.events) {
    lines.push(` [event] ${event.name}`);
    if (!event.attributes) continue;
    for (const [key, value] of Object.entries(cleanAttributes(event.attributes))) {
      lines.push(` ${key}: ${formatValue(value)}`);
    }
  }
  return lines;
}
1277
/**
 * Caps tool output at `TOOL_RESULT_CHAR_BUDGET` characters.
 *
 * @param {string} text - Full tool output.
 * @returns {string} `text` unchanged when it fits; otherwise its first
 *   `TOOL_RESULT_CHAR_BUDGET` characters followed by a `[TRUNCATED]` notice
 *   suggesting narrower queries.
 */
function truncateToCharBudget(text) {
  const fitsBudget = text.length <= TOOL_RESULT_CHAR_BUDGET;
  if (fitsBudget) return text;
  const notice = "\n\n[TRUNCATED] Output exceeded ~4000 token budget. Use grep_trace(pattern) to search for specific content, or expand_trace with a narrower range.";
  return text.slice(0, TOOL_RESULT_CHAR_BUDGET) + notice;
}
1282
/**
 * Flattens a span into newline-separated text for searching.
 *
 * The text contains the span name, one `key: value` line per cleaned span
 * attribute, and for every event its name followed by its cleaned attributes.
 *
 * @param {Object} span - Span to flatten.
 * @returns {string} Searchable text, one entry per line.
 */
function spanToSearchableText(span) {
  const toLines = (attributes) =>
    Object.entries(cleanAttributes(attributes)).map(
      ([key, value]) => `${key}: ${formatValue(value)}`
    );

  const parts = [span.name];
  for (const line of toLines(span.attributes)) {
    parts.push(line);
  }
  for (const event of span.events) {
    parts.push(event.name);
    if (!event.attributes) continue;
    for (const line of toLines(event.attributes)) {
      parts.push(line);
    }
  }
  return parts.join("\n");
}
1299
/**
 * Renders the full details of one span, or of a contiguous range of spans,
 * addressed by their 1-based start-time index (see `indexSpans`).
 *
 * @param {Array} spans - All spans of the trace.
 * @param {{index?: number, range?: string}} selection - Either a single
 *   `index` or a `range` such as `"10-15"`. `range` takes precedence when
 *   both are given.
 * @returns {string} The rendered spans (capped by `truncateToCharBudget`), or
 *   a message starting with `"Error:"` / `"No spans recorded."` when nothing
 *   can be rendered.
 */
function expandTrace(spans, { index, range }) {
  // One node is produced per span, so the span count is the highest index.
  const maxIndex = spans.length;
  if (maxIndex === 0) {
    return "No spans recorded.";
  }

  let startIdx;
  let endIdx;
  if (range != null) {
    const parts = range.split("-").map(Number);
    startIdx = parts[0];
    endIdx = parts[1] ?? startIdx;
  } else if (index != null) {
    startIdx = index;
    endIdx = index;
  } else {
    return "Error: provide either index or range parameter.";
  }

  // NaN (from input such as "abc") and fractional values slip through every
  // </> comparison below and would produce an empty result, so reject them
  // explicitly with an actionable message.
  if (!Number.isInteger(startIdx) || !Number.isInteger(endIdx)) {
    return `Error: span index must be a whole number. Use a single index like 5 or a range like "10-15". Valid range is 1-${maxIndex}.`;
  }
  if (startIdx < 1 || endIdx > maxIndex || startIdx > endIdx) {
    return `Error: span index out of range. Valid range is 1-${maxIndex}.`;
  }

  const selected = indexSpans(spans).filter(
    (n) => n.index >= startIdx && n.index <= endIdx
  );
  const lines = [];
  for (const node of selected) {
    lines.push(...renderFullSpanNode(node));
    lines.push(""); // blank separator between spans
  }
  return truncateToCharBudget(lines.join("\n").trimEnd());
}
1331
/**
 * Case-insensitively searches every span's name, attributes and events for a
 * literal pattern.
 *
 * @param {Array} spans - All spans of the trace.
 * @param {string} pattern - Text to look for; it is regex-escaped, so it is
 *   matched literally.
 * @returns {string} For up to `MAX_GREP_MATCHES` matching spans, a header
 *   line followed by the matching lines, capped by `truncateToCharBudget`.
 *   When nothing matches, a message listing the distinct span names.
 */
function grepTrace(spans, pattern) {
  const nodes = indexSpans(spans);
  if (nodes.length === 0) {
    return "No spans recorded.";
  }

  const matcher = new RegExp(escapeRegex(pattern), "i");
  const hits = [];
  for (const node of nodes) {
    const matchingLines = spanToSearchableText(node.span)
      .split("\n")
      .filter((candidate) => matcher.test(candidate));
    if (matchingLines.length > 0) {
      hits.push({ node, matchingLines });
    }
  }

  if (hits.length === 0) {
    const uniqueNames = [...new Set(nodes.map((n) => n.span.name))];
    return `No matches found for "${pattern}". Available span names: ${uniqueNames.join(", ")}`;
  }

  const totalMatches = hits.length;
  const output = [];
  for (const { node, matchingLines } of hits.slice(0, MAX_GREP_MATCHES)) {
    const durationLabel = formatDuration(calculateSpanDuration(node.span));
    output.push(`--- [${node.index}] ${node.span.name} (${durationLabel}) ---`);
    for (const line of matchingLines) {
      output.push(` ${line}`);
    }
    output.push("");
  }

  // Tell the caller how many matching spans were left out.
  if (totalMatches > MAX_GREP_MATCHES) {
    output.push(
      `[${totalMatches - MAX_GREP_MATCHES} more matches omitted. Refine your search pattern for more specific results.]`
    );
  }
  return truncateToCharBudget(output.join("\n").trimEnd());
}
1370
/**
 * Escapes regular-expression metacharacters so the text matches literally.
 *
 * @param {string} str - Raw text.
 * @returns {string} Text with each of `. * + ? ^ $ { } ( ) | [ ] \` prefixed
 *   by a backslash.
 */
function escapeRegex(str) {
  return str.replace(/[.*+?^${}()|[\]\\]/g, (special) => `\\${special}`);
}
1373
+
1089
1374
  // src/config/env.ts
1090
1375
  var import_v4 = require("zod/v4");
1091
1376
 
@@ -1441,7 +1726,7 @@ var criterionToParamName = (criterion) => {
1441
1726
  };
1442
1727
 
1443
1728
  // src/agents/judge/judge-span-collector.ts
1444
- var import_observability = require("langwatch/observability");
1729
+ var import_observability2 = require("langwatch/observability");
1445
1730
  var JudgeSpanCollector = class {
1446
1731
  spans = [];
1447
1732
  onStart() {
@@ -1484,7 +1769,7 @@ var JudgeSpanCollector = class {
1484
1769
  const spanId = span.spanContext().spanId;
1485
1770
  if (visited.has(spanId)) return false;
1486
1771
  visited.add(spanId);
1487
- if (span.attributes[import_observability.attributes.ATTR_LANGWATCH_THREAD_ID] === threadId) {
1772
+ if (span.attributes[import_observability2.attributes.ATTR_LANGWATCH_THREAD_ID] === threadId) {
1488
1773
  return true;
1489
1774
  }
1490
1775
  const parentId = getParentSpanId(span);
@@ -1503,26 +1788,6 @@ function getParentSpanId(span) {
1503
1788
  }
1504
1789
  var judgeSpanCollector = new JudgeSpanCollector();
1505
1790
 
1506
- // src/agents/judge/judge-span-digest-formatter.ts
1507
- var import_observability2 = require("langwatch/observability");
1508
-
1509
- // src/agents/judge/deep-transform.ts
1510
- function deepTransform(value, fn) {
1511
- const result = fn(value);
1512
- if (result !== value) return result;
1513
- if (Array.isArray(value)) {
1514
- return value.map((v) => deepTransform(v, fn));
1515
- }
1516
- if (value !== null && typeof value === "object") {
1517
- const out = {};
1518
- for (const [k, v] of Object.entries(value)) {
1519
- out[k] = deepTransform(v, fn);
1520
- }
1521
- return out;
1522
- }
1523
- return value;
1524
- }
1525
-
1526
1791
  // src/agents/judge/string-deduplicator.ts
1527
1792
  var StringDeduplicator = class {
1528
1793
  seen = /* @__PURE__ */ new Map();
@@ -1556,51 +1821,51 @@ var StringDeduplicator = class {
1556
1821
  }
1557
1822
  };
1558
1823
 
1559
- // src/agents/judge/truncate-media.ts
1560
- function truncateMediaUrl(str) {
1561
- const match = str.match(
1562
- /^data:((image|audio|video)\/[a-z0-9+.-]+);base64,(.+)$/i
1563
- );
1564
- if (!match) return str;
1565
- const [, mimeType, category, data] = match;
1566
- return `[${category.toUpperCase()}: ${mimeType}, ~${data.length} bytes]`;
1567
- }
1568
- function truncateMediaPart(v) {
1569
- var _a;
1570
- if (v === null || typeof v !== "object" || Array.isArray(v)) return null;
1571
- const obj = v;
1572
- if (obj.type === "file" && typeof obj.mediaType === "string" && typeof obj.data === "string") {
1573
- const category = ((_a = obj.mediaType.split("/")[0]) == null ? void 0 : _a.toUpperCase()) ?? "FILE";
1574
- return {
1575
- ...obj,
1576
- data: `[${category}: ${obj.mediaType}, ~${obj.data.length} bytes]`
1577
- };
1578
- }
1579
- if (obj.type === "image" && typeof obj.image === "string") {
1580
- const imageData = obj.image;
1581
- const dataUrlMatch = imageData.match(
1582
- /^data:((image)\/[a-z0-9+.-]+);base64,(.+)$/i
1583
- );
1584
- if (dataUrlMatch) {
1585
- return {
1586
- ...obj,
1587
- image: `[IMAGE: ${dataUrlMatch[1]}, ~${dataUrlMatch[3].length} bytes]`
1588
- };
1589
- }
1590
- if (imageData.length > 1e3 && /^[A-Za-z0-9+/=]+$/.test(imageData)) {
1591
- return {
1592
- ...obj,
1593
- image: `[IMAGE: unknown, ~${imageData.length} bytes]`
1594
- };
1595
- }
1596
- }
1597
- return null;
1598
- }
1599
-
1600
1824
  // src/agents/judge/judge-span-digest-formatter.ts
1601
1825
  var JudgeSpanDigestFormatter = class {
1602
1826
  logger = new Logger("JudgeSpanDigestFormatter");
1603
1827
  deduplicator = new StringDeduplicator({ threshold: 50 });
1828
+ /**
1829
+ * Formats spans into a structure-only digest showing span tree hierarchy
1830
+ * without attributes, events, or content. Used for large traces that
1831
+ * exceed the token threshold, paired with expand_trace/grep_trace tools.
1832
+ *
1833
+ * @param spans - All spans for a thread
1834
+ * @returns Plain text digest with only structural information
1835
+ */
1836
+ formatStructureOnly(spans) {
1837
+ this.logger.debug("formatStructureOnly() called", {
1838
+ spanCount: spans.length
1839
+ });
1840
+ if (spans.length === 0) {
1841
+ return "No spans recorded.";
1842
+ }
1843
+ const sortedSpans = this.sortByStartTime(spans);
1844
+ const tree = this.buildHierarchy(sortedSpans);
1845
+ const totalDuration = this.calculateTotalDuration(sortedSpans);
1846
+ const lines = [
1847
+ `Spans: ${spans.length} | Total Duration: ${formatDuration(totalDuration)}`,
1848
+ ""
1849
+ ];
1850
+ let sequence = 1;
1851
+ const rootCount = tree.length;
1852
+ tree.forEach((node, idx) => {
1853
+ sequence = this.renderStructureNode(
1854
+ node,
1855
+ lines,
1856
+ 0,
1857
+ sequence,
1858
+ idx === rootCount - 1
1859
+ );
1860
+ });
1861
+ const errors = this.collectErrors(spans);
1862
+ if (errors.length > 0) {
1863
+ lines.push("");
1864
+ lines.push("=== ERRORS ===");
1865
+ errors.forEach((e) => lines.push(e));
1866
+ }
1867
+ return lines.join("\n");
1868
+ }
1604
1869
  /**
1605
1870
  * Formats spans into a complete digest with full content and nesting.
1606
1871
  * @param spans - All spans for a thread
@@ -1624,7 +1889,7 @@ var JudgeSpanDigestFormatter = class {
1624
1889
  totalDuration
1625
1890
  });
1626
1891
  const lines = [
1627
- `Spans: ${spans.length} | Total Duration: ${this.formatDuration(
1892
+ `Spans: ${spans.length} | Total Duration: ${formatDuration(
1628
1893
  totalDuration
1629
1894
  )}`,
1630
1895
  ""
@@ -1650,9 +1915,7 @@ var JudgeSpanDigestFormatter = class {
1650
1915
  }
1651
1916
  sortByStartTime(spans) {
1652
1917
  return [...spans].sort((a, b) => {
1653
- const aTime = this.hrTimeToMs(a.startTime);
1654
- const bTime = this.hrTimeToMs(b.startTime);
1655
- return aTime - bTime;
1918
+ return hrTimeToMs(a.startTime) - hrTimeToMs(b.startTime);
1656
1919
  });
1657
1920
  }
1658
1921
  buildHierarchy(spans) {
@@ -1672,29 +1935,53 @@ var JudgeSpanDigestFormatter = class {
1672
1935
  }
1673
1936
  return roots;
1674
1937
  }
1938
+ renderStructureNode(node, lines, depth, sequence, isLast = true) {
1939
+ const span = node.span;
1940
+ const duration = calculateSpanDuration(span);
1941
+ const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
1942
+ const status = getStatusIndicator(span);
1943
+ const tokens = getTokenUsage(span);
1944
+ const prefix = this.getTreePrefix(depth, isLast);
1945
+ lines.push(
1946
+ `${prefix}[${sequence}] ${timestamp} ${span.name} (${formatDuration(duration)}${tokens})${status}`
1947
+ );
1948
+ lines.push("");
1949
+ let nextSeq = sequence + 1;
1950
+ const childCount = node.children.length;
1951
+ node.children.forEach((child, idx) => {
1952
+ nextSeq = this.renderStructureNode(
1953
+ child,
1954
+ lines,
1955
+ depth + 1,
1956
+ nextSeq,
1957
+ idx === childCount - 1
1958
+ );
1959
+ });
1960
+ return nextSeq;
1961
+ }
1675
1962
  renderNode(node, lines, depth, sequence, isLast = true) {
1676
1963
  const span = node.span;
1677
- const duration = this.calculateSpanDuration(span);
1678
- const timestamp = this.formatTimestamp(span.startTime);
1679
- const status = this.getStatusIndicator(span);
1964
+ const duration = calculateSpanDuration(span);
1965
+ const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
1966
+ const status = getStatusIndicator(span);
1680
1967
  const prefix = this.getTreePrefix(depth, isLast);
1681
1968
  lines.push(
1682
- `${prefix}[${sequence}] ${new Date(timestamp).toISOString()} ${span.name} (${this.formatDuration(duration)})${status}`
1969
+ `${prefix}[${sequence}] ${timestamp} ${span.name} (${formatDuration(duration)})${status}`
1683
1970
  );
1684
1971
  const attrIndent = this.getAttrIndent(depth, isLast);
1685
- const attrs = this.cleanAttributes(span.attributes);
1972
+ const attrs = cleanAttributes(span.attributes);
1686
1973
  if (Object.keys(attrs).length > 0) {
1687
1974
  for (const [key, value] of Object.entries(attrs)) {
1688
- lines.push(`${attrIndent}${key}: ${this.formatValue(value)}`);
1975
+ lines.push(`${attrIndent}${key}: ${this.formatValueWithDedup(value)}`);
1689
1976
  }
1690
1977
  }
1691
1978
  if (span.events.length > 0) {
1692
1979
  for (const event of span.events) {
1693
1980
  lines.push(`${attrIndent}[event] ${event.name}`);
1694
1981
  if (event.attributes) {
1695
- const eventAttrs = this.cleanAttributes(event.attributes);
1982
+ const eventAttrs = cleanAttributes(event.attributes);
1696
1983
  for (const [key, value] of Object.entries(eventAttrs)) {
1697
- lines.push(`${attrIndent} ${key}: ${this.formatValue(value)}`);
1984
+ lines.push(`${attrIndent} ${key}: ${this.formatValueWithDedup(value)}`);
1698
1985
  }
1699
1986
  }
1700
1987
  }
@@ -1723,42 +2010,26 @@ var JudgeSpanDigestFormatter = class {
1723
2010
  const continuation = isLast ? " " : "\u2502 ";
1724
2011
  return "\u2502 ".repeat(depth - 1) + continuation + " ";
1725
2012
  }
1726
- cleanAttributes(attrs) {
1727
- const cleaned = {};
1728
- const seen = /* @__PURE__ */ new Set();
1729
- const excludedKeys = [
1730
- import_observability2.attributes.ATTR_LANGWATCH_THREAD_ID,
1731
- "langwatch.scenario.id",
1732
- "langwatch.scenario.name"
1733
- ];
1734
- for (const [key, value] of Object.entries(attrs)) {
1735
- if (excludedKeys.includes(key)) {
1736
- continue;
1737
- }
1738
- const cleanKey = key.replace(/^(langwatch)\./, "");
1739
- if (!seen.has(cleanKey)) {
1740
- seen.add(cleanKey);
1741
- cleaned[cleanKey] = value;
1742
- }
1743
- }
1744
- return cleaned;
1745
- }
1746
- formatValue(value) {
1747
- const processed = this.transformValue(value);
2013
+ /**
2014
+ * Formats a value with deduplication applied. Used by the `format()` method
2015
+ * to reduce token usage by replacing repeated strings with markers.
2016
+ */
2017
+ formatValueWithDedup(value) {
2018
+ const processed = this.transformValueWithDedup(value);
1748
2019
  return typeof processed === "string" ? processed : JSON.stringify(processed);
1749
2020
  }
1750
- transformValue(value) {
2021
+ transformValueWithDedup(value) {
1751
2022
  return deepTransform(value, (v) => {
1752
2023
  const mediaPart = truncateMediaPart(v);
1753
2024
  if (mediaPart) return mediaPart;
1754
2025
  if (typeof v !== "string") return v;
1755
- return this.transformString(v);
2026
+ return this.transformStringWithDedup(v);
1756
2027
  });
1757
2028
  }
1758
- transformString(str) {
1759
- if (this.looksLikeJson(str)) {
2029
+ transformStringWithDedup(str) {
2030
+ if (looksLikeJson(str)) {
1760
2031
  try {
1761
- const processed = this.transformValue(JSON.parse(str));
2032
+ const processed = this.transformValueWithDedup(JSON.parse(str));
1762
2033
  return JSON.stringify(processed);
1763
2034
  } catch {
1764
2035
  }
@@ -1767,36 +2038,12 @@ var JudgeSpanDigestFormatter = class {
1767
2038
  if (truncated !== str) return truncated;
1768
2039
  return this.deduplicator.process(str);
1769
2040
  }
1770
- looksLikeJson(str) {
1771
- const t = str.trim();
1772
- return t.startsWith("{") && t.endsWith("}") || t.startsWith("[") && t.endsWith("]");
1773
- }
1774
- hrTimeToMs(hrTime) {
1775
- return hrTime[0] * 1e3 + hrTime[1] / 1e6;
1776
- }
1777
- calculateSpanDuration(span) {
1778
- return this.hrTimeToMs(span.endTime) - this.hrTimeToMs(span.startTime);
1779
- }
1780
2041
  calculateTotalDuration(spans) {
1781
2042
  if (spans.length === 0) return 0;
1782
- const first = this.hrTimeToMs(spans[0].startTime);
1783
- const last = Math.max(...spans.map((s) => this.hrTimeToMs(s.endTime)));
2043
+ const first = hrTimeToMs(spans[0].startTime);
2044
+ const last = Math.max(...spans.map((s) => hrTimeToMs(s.endTime)));
1784
2045
  return last - first;
1785
2046
  }
1786
- formatDuration(ms) {
1787
- if (ms < 1e3) return `${Math.round(ms)}ms`;
1788
- return `${(ms / 1e3).toFixed(2)}s`;
1789
- }
1790
- formatTimestamp(hrTime) {
1791
- const ms = this.hrTimeToMs(hrTime);
1792
- return new Date(ms).toISOString();
1793
- }
1794
- getStatusIndicator(span) {
1795
- if (span.status.code === 2) {
1796
- return ` \u26A0\uFE0F ERROR: ${span.status.message ?? "unknown"}`;
1797
- }
1798
- return "";
1799
- }
1800
2047
  collectErrors(spans) {
1801
2048
  return spans.filter((s) => s.status.code === 2).map((s) => `- ${s.name}: ${s.status.message ?? "unknown error"}`);
1802
2049
  }
@@ -1859,15 +2106,42 @@ function buildFinishTestTool(criteria) {
1859
2106
  })
1860
2107
  });
1861
2108
  }
2109
/**
 * Builds the two trace-exploration tools offered to the judge for a trace.
 *
 * @param spans - Spans the tools operate on; captured by both tools.
 * @returns Object with `expand_trace` (delegates to `expandTrace`) and
 *   `grep_trace` (delegates to `grepTrace`) tool definitions.
 */
function buildProgressiveDiscoveryTools(spans) {
  const expandTraceTool = (0, import_ai2.tool)({
    description: "Expand one or more spans to see their full details (attributes, events, content). Use a single index like 5 or a range like '10-15'.",
    inputSchema: import_v44.z.object({
      index: import_v44.z.number().optional().describe("Single span index to expand"),
      range: import_v44.z.string().optional().describe('Range of span indices to expand, e.g. "10-15"')
    }),
    execute: async ({ index, range }) => expandTrace(spans, { index, range })
  });

  const grepTraceTool = (0, import_ai2.tool)({
    description: "Search across all span attributes, events, and content for a pattern (case-insensitive). Returns matching spans with context.",
    inputSchema: import_v44.z.object({
      pattern: import_v44.z.string().describe("Search pattern (case-insensitive)")
    }),
    execute: async ({ pattern }) => grepTrace(spans, pattern)
  });

  return {
    expand_trace: expandTraceTool,
    grep_trace: grepTraceTool
  };
}
1862
2132
  var JudgeAgent = class extends JudgeAgentAdapter {
1863
2133
  constructor(cfg) {
1864
2134
  super();
1865
2135
  this.cfg = cfg;
1866
2136
  this.criteria = cfg.criteria ?? [];
1867
2137
  this.spanCollector = cfg.spanCollector ?? judgeSpanCollector;
2138
+ this.tokenThreshold = cfg.tokenThreshold ?? DEFAULT_TOKEN_THRESHOLD;
2139
+ this.maxDiscoverySteps = cfg.maxDiscoverySteps ?? 10;
1868
2140
  }
1869
2141
  logger = new Logger("JudgeAgent");
1870
2142
  spanCollector;
2143
+ tokenThreshold;
2144
+ maxDiscoverySteps;
1871
2145
  role = "Judge" /* JUDGE */;
1872
2146
  criteria;
1873
2147
  /**
@@ -1875,7 +2149,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1875
2149
  */
1876
2150
  invokeLLM = createLLMInvoker(this.logger);
1877
2151
  async call(input) {
1878
- var _a, _b, _c, _d;
2152
+ var _a;
1879
2153
  const criteria = ((_a = input.judgmentRequest) == null ? void 0 : _a.criteria) ?? this.criteria;
1880
2154
  this.logger.debug("call() invoked", {
1881
2155
  threadId: input.threadId,
@@ -1883,8 +2157,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1883
2157
  maxTurns: input.scenarioConfig.maxTurns,
1884
2158
  judgmentRequest: input.judgmentRequest
1885
2159
  });
1886
- const digest = this.getOpenTelemetryTracesDigest(input.threadId);
1887
- this.logger.debug("OpenTelemetry traces built", { digest });
2160
+ const spans = this.spanCollector.getSpansForThread(input.threadId);
2161
+ const { digest, isLargeTrace } = this.buildTraceDigest(spans);
1888
2162
  const transcript = JudgeUtils.buildTranscriptFromMessages(input.messages);
1889
2163
  const contentForJudge = `
1890
2164
  <transcript>
@@ -1908,7 +2182,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1908
2182
  });
1909
2183
  const tools = {
1910
2184
  continue_test: buildContinueTestTool(),
1911
- finish_test: buildFinishTestTool(criteria)
2185
+ finish_test: buildFinishTestTool(criteria),
2186
+ ...isLargeTrace ? buildProgressiveDiscoveryTools(spans) : {}
1912
2187
  };
1913
2188
  const enforceJudgement = input.judgmentRequest != null;
1914
2189
  const hasCriteria = criteria.length && criteria.length > 0;
@@ -1925,26 +2200,70 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1925
2200
  model: mergedConfig.model,
1926
2201
  toolChoice,
1927
2202
  isLastMessage,
1928
- enforceJudgement
2203
+ enforceJudgement,
2204
+ isLargeTrace
1929
2205
  });
1930
- const completion = await this.invokeLLM({
2206
+ const completion = await this.invokeLLMWithDiscovery({
1931
2207
  model: mergedConfig.model,
1932
2208
  messages,
1933
2209
  temperature: mergedConfig.temperature ?? 0,
1934
2210
  maxOutputTokens: mergedConfig.maxTokens,
1935
2211
  tools,
1936
- toolChoice
2212
+ toolChoice,
2213
+ isLargeTrace
2214
+ });
2215
+ return this.parseToolCalls(completion, criteria);
2216
+ }
2217
+ /**
2218
+ * Builds the trace digest, choosing between full inline rendering
2219
+ * and structure-only mode based on estimated token count.
2220
+ */
2221
+ buildTraceDigest(spans) {
2222
+ const fullDigest = judgeSpanDigestFormatter.format(spans);
2223
+ const isLargeTrace = spans.length > 0 && estimateTokens(fullDigest) > this.tokenThreshold;
2224
+ const digest = isLargeTrace ? judgeSpanDigestFormatter.formatStructureOnly(spans) + "\n\nUse expand_trace(spanIndex) to see span details or grep_trace(pattern) to search across spans." : fullDigest;
2225
+ this.logger.debug("Trace digest built", {
2226
+ isLargeTrace,
2227
+ estimatedTokens: estimateTokens(fullDigest)
1937
2228
  });
2229
+ return { digest, isLargeTrace };
2230
+ }
2231
+ /**
2232
+ * Invokes the LLM, enabling multi-step tool execution for large traces.
2233
+ * In multi-step mode, the AI SDK loops automatically: the judge can call
2234
+ * expand_trace/grep_trace tools multiple times before reaching a terminal
2235
+ * tool (finish_test/continue_test) or hitting the step limit.
2236
+ */
2237
+ async invokeLLMWithDiscovery({
2238
+ isLargeTrace,
2239
+ ...params
2240
+ }) {
2241
+ var _a, _b;
2242
+ if (isLargeTrace) {
2243
+ params.stopWhen = [
2244
+ (0, import_ai2.stepCountIs)(this.maxDiscoverySteps),
2245
+ (0, import_ai2.hasToolCall)("finish_test"),
2246
+ (0, import_ai2.hasToolCall)("continue_test")
2247
+ ];
2248
+ }
2249
+ const completion = await this.invokeLLM(params);
1938
2250
  this.logger.debug("LLM response received", {
1939
- toolCallCount: ((_b = completion.toolCalls) == null ? void 0 : _b.length) ?? 0,
1940
- toolCalls: (_c = completion.toolCalls) == null ? void 0 : _c.map((tc) => ({
2251
+ toolCallCount: ((_a = completion.toolCalls) == null ? void 0 : _a.length) ?? 0,
2252
+ toolCalls: (_b = completion.toolCalls) == null ? void 0 : _b.map((tc) => ({
1941
2253
  toolName: tc.toolName,
1942
2254
  args: tc.input
1943
2255
  }))
1944
2256
  });
2257
+ return completion;
2258
+ }
2259
+ parseToolCalls(completion, criteria) {
2260
+ var _a;
1945
2261
  let args;
1946
- if ((_d = completion.toolCalls) == null ? void 0 : _d.length) {
1947
- const toolCall = completion.toolCalls[0];
2262
+ if ((_a = completion.toolCalls) == null ? void 0 : _a.length) {
2263
+ const terminalCall = completion.toolCalls.find(
2264
+ (tc) => tc.toolName === "finish_test" || tc.toolName === "continue_test"
2265
+ );
2266
+ const toolCall = terminalCall ?? completion.toolCalls[0];
1948
2267
  switch (toolCall.toolName) {
1949
2268
  case "finish_test": {
1950
2269
  args = toolCall.input;
@@ -1986,11 +2305,6 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1986
2305
  unmetCriteria: criteria
1987
2306
  };
1988
2307
  }
1989
- getOpenTelemetryTracesDigest(threadId) {
1990
- const spans = this.spanCollector.getSpansForThread(threadId);
1991
- const digest = judgeSpanDigestFormatter.format(spans);
1992
- return digest;
1993
- }
1994
2308
  };
1995
2309
  var judgeAgent = (cfg) => {
1996
2310
  return new JudgeAgent(cfg ?? {});
@@ -4731,6 +5045,7 @@ var index_default = scenario;
4731
5045
  AgentAdapter,
4732
5046
  AgentRole,
4733
5047
  DEFAULT_MAX_TURNS,
5048
+ DEFAULT_TOKEN_THRESHOLD,
4734
5049
  DEFAULT_VERBOSE,
4735
5050
  JudgeAgentAdapter,
4736
5051
  JudgeSpanCollector,
@@ -4743,7 +5058,10 @@ var index_default = scenario;
4743
5058
  agent,
4744
5059
  allAgentRoles,
4745
5060
  defineConfig,
5061
+ estimateTokens,
5062
+ expandTrace,
4746
5063
  fail,
5064
+ grepTrace,
4747
5065
  judge,
4748
5066
  judgeAgent,
4749
5067
  judgeSpanCollector,