@langwatch/scenario 0.4.3 → 0.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +98 -13
- package/dist/index.d.ts +98 -13
- package/dist/index.js +457 -151
- package/dist/index.mjs +458 -152
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -944,9 +944,13 @@ var init_esm = __esm({
|
|
|
944
944
|
// src/agents/index.ts
|
|
945
945
|
var agents_exports = {};
|
|
946
946
|
__export(agents_exports, {
|
|
947
|
+
DEFAULT_TOKEN_THRESHOLD: () => DEFAULT_TOKEN_THRESHOLD,
|
|
947
948
|
JudgeSpanCollector: () => JudgeSpanCollector,
|
|
948
949
|
JudgeSpanDigestFormatter: () => JudgeSpanDigestFormatter,
|
|
949
950
|
RealtimeAgentAdapter: () => RealtimeAgentAdapter,
|
|
951
|
+
estimateTokens: () => estimateTokens,
|
|
952
|
+
expandTrace: () => expandTrace,
|
|
953
|
+
grepTrace: () => grepTrace,
|
|
950
954
|
judgeAgent: () => judgeAgent,
|
|
951
955
|
judgeSpanCollector: () => judgeSpanCollector,
|
|
952
956
|
judgeSpanDigestFormatter: () => judgeSpanDigestFormatter,
|
|
@@ -954,7 +958,11 @@ __export(agents_exports, {
|
|
|
954
958
|
});
|
|
955
959
|
|
|
956
960
|
// src/agents/judge/judge-agent.ts
|
|
957
|
-
import {
|
|
961
|
+
import {
|
|
962
|
+
tool,
|
|
963
|
+
stepCountIs,
|
|
964
|
+
hasToolCall
|
|
965
|
+
} from "ai";
|
|
958
966
|
import { z as z4 } from "zod/v4";
|
|
959
967
|
|
|
960
968
|
// src/agents/judge/judge-utils.ts
|
|
@@ -1026,6 +1034,275 @@ var JudgeUtils = {
|
|
|
1026
1034
|
}
|
|
1027
1035
|
};
|
|
1028
1036
|
|
|
1037
|
+
// src/agents/judge/estimate-tokens.ts
|
|
1038
|
+
var DEFAULT_TOKEN_THRESHOLD = 8192;
|
|
1039
|
+
/**
 * Gives a rough estimate of how many tokens a piece of text occupies.
 *
 * The text is UTF-8 encoded and every 4 bytes (rounded up) count as one token.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count; 0 for an empty string.
 */
function estimateTokens(text) {
  const BYTES_PER_TOKEN = 4;
  const utf8 = new TextEncoder().encode(text);
  return Math.ceil(utf8.byteLength / BYTES_PER_TOKEN);
}
|
|
1043
|
+
|
|
1044
|
+
// src/agents/judge/span-utils.ts
|
|
1045
|
+
import { attributes } from "langwatch/observability";
|
|
1046
|
+
|
|
1047
|
+
// src/agents/judge/deep-transform.ts
|
|
1048
|
+
/**
 * Recursively rewrites a value with `fn`.
 *
 * `fn` is called on the value first. If it returns something that is not
 * strictly equal to its input, that result is used as-is and is NOT descended
 * into. Otherwise arrays are mapped element by element, objects are copied
 * property by property (own enumerable string keys only), and anything else
 * is returned unchanged.
 *
 * @param {*} value - Value to transform.
 * @param {(value: *) => *} fn - Returns a replacement, or its input to keep descending.
 * @returns {*} The transformed copy.
 */
function deepTransform(value, fn) {
  const replaced = fn(value);
  if (replaced !== value) return replaced;
  if (Array.isArray(value)) {
    return value.map((item) => deepTransform(item, fn));
  }
  if (value !== null && typeof value === "object") {
    // Object.fromEntries defines own data properties, so an own "__proto__"
    // key (possible after JSON.parse) is copied as data instead of silently
    // replacing the prototype of the copy, as `out[k] = v` would.
    return Object.fromEntries(
      Object.entries(value).map(([key, child]) => [key, deepTransform(child, fn)])
    );
  }
  return value;
}
|
|
1063
|
+
|
|
1064
|
+
// src/agents/judge/truncate-media.ts
|
|
1065
|
+
/**
 * Replaces a base64 image/audio/video data URL with a short placeholder.
 *
 * @param {string} str - Candidate string.
 * @returns {string} `[CATEGORY: mime/type, ~N bytes]` when `str` is such a
 *   data URL (N is the length of the base64 payload text); otherwise `str`.
 */
function truncateMediaUrl(str) {
  const MEDIA_DATA_URL = /^data:((image|audio|video)\/[a-z0-9+.-]+);base64,(.+)$/i;
  const parts = str.match(MEDIA_DATA_URL);
  if (parts === null) {
    return str;
  }
  const mimeType = parts[1];
  const label = parts[2].toUpperCase();
  const payloadLength = parts[3].length;
  return `[${label}: ${mimeType}, ~${payloadLength} bytes]`;
}
|
|
1073
|
+
/**
 * Replaces the binary payload of a media message part with a short placeholder.
 *
 * Handles two shapes:
 *  - `{ type: "file", mediaType: string, data: string }` — `data` is replaced.
 *  - `{ type: "image", image: string }` — `image` is replaced when it is a
 *    base64 image data URL, or a string longer than 1000 characters made up
 *    only of base64 alphabet characters.
 *
 * @param {*} v - Candidate value.
 * @returns {object|null} A shallow copy with the payload replaced, or `null`
 *   when `v` is not a recognised media part.
 */
function truncateMediaPart(v) {
  if (v === null || typeof v !== "object" || Array.isArray(v)) return null;
  const part = v;
  if (part.type === "file" && typeof part.mediaType === "string" && typeof part.data === "string") {
    // `split` always yields a string at index 0, so a nullish fallback can
    // never fire; use `||` so an empty category (e.g. mediaType "" or "/x")
    // is labelled FILE instead of rendering as "[: ...]".
    const category = part.mediaType.split("/")[0].toUpperCase() || "FILE";
    return {
      ...part,
      data: `[${category}: ${part.mediaType}, ~${part.data.length} bytes]`
    };
  }
  if (part.type === "image" && typeof part.image === "string") {
    const image = part.image;
    const dataUrl = image.match(/^data:((image)\/[a-z0-9+.-]+);base64,(.+)$/i);
    if (dataUrl) {
      return {
        ...part,
        image: `[IMAGE: ${dataUrl[1]}, ~${dataUrl[3].length} bytes]`
      };
    }
    // Heuristic for raw base64 without a data-URL header.
    if (image.length > 1e3 && /^[A-Za-z0-9+/=]+$/.test(image)) {
      return {
        ...part,
        image: `[IMAGE: unknown, ~${image.length} bytes]`
      };
    }
  }
  return null;
}
|
|
1104
|
+
|
|
1105
|
+
// src/agents/judge/span-utils.ts
|
|
1106
|
+
/**
 * Converts a two-element high-resolution time pair to milliseconds.
 *
 * The first element is scaled by 1e3 and the second divided by 1e6, i.e. the
 * pair is treated as [seconds, nanoseconds].
 *
 * @param {[number, number]} hrTime - Time pair.
 * @returns {number} Milliseconds (may be fractional).
 */
function hrTimeToMs(hrTime) {
  const MS_PER_SECOND = 1e3;
  const NANOS_PER_MS = 1e6;
  const wholeSecondsMs = hrTime[0] * MS_PER_SECOND;
  const fractionMs = hrTime[1] / NANOS_PER_MS;
  return wholeSecondsMs + fractionMs;
}
|
|
1109
|
+
/**
 * Formats a duration for display.
 *
 * Durations that round to less than one second are shown as whole
 * milliseconds ("42ms"); everything else as seconds with two decimals
 * ("1.50s").
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} Human-readable duration.
 */
function formatDuration(ms) {
  // Round before comparing: values in [999.5, 1000) would otherwise pass the
  // `< 1000` check and then be rounded up, printing "1000ms".
  const wholeMs = Math.round(ms);
  if (wholeMs < 1e3) return `${wholeMs}ms`;
  return `${(ms / 1e3).toFixed(2)}s`;
}
|
|
1113
|
+
/**
 * Computes how long a span lasted.
 *
 * @param {{startTime: [number, number], endTime: [number, number]}} span - Span with hr-time bounds.
 * @returns {number} `endTime - startTime` in milliseconds.
 */
function calculateSpanDuration(span) {
  const endedAtMs = hrTimeToMs(span.endTime);
  const startedAtMs = hrTimeToMs(span.startTime);
  return endedAtMs - startedAtMs;
}
|
|
1116
|
+
/**
 * Builds the error suffix appended to a span's header line.
 *
 * @param {{status: {code: number, message?: string}}} span - Span to inspect.
 * @returns {string} A warning marker with the status message ("unknown" when
 *   absent) if the status code is 2, otherwise an empty string.
 */
function getStatusIndicator(span) {
  // NOTE(review): 2 is presumably OpenTelemetry's SpanStatusCode.ERROR — confirm.
  const ERROR_STATUS_CODE = 2;
  const { code, message } = span.status;
  return code === ERROR_STATUS_CODE ? ` \u26A0\uFE0F ERROR: ${message ?? "unknown"}` : "";
}
|
|
1122
|
+
/**
 * Builds the ", N tokens" suffix for a span from its gen_ai usage attributes.
 *
 * @param {{attributes: Object<string, *>}} span - Span to inspect.
 * @returns {string} Empty string when neither input nor output token counts
 *   are present; otherwise the summed total (non-numeric values count as 0).
 */
function getTokenUsage(span) {
  const inputTokens = span.attributes["gen_ai.usage.input_tokens"];
  const outputTokens = span.attributes["gen_ai.usage.output_tokens"];
  const nothingReported = inputTokens == null && outputTokens == null;
  if (nothingReported) {
    return "";
  }
  const toCount = (raw) => Number(raw) || 0;
  return `, ${toCount(inputTokens) + toCount(outputTokens)} tokens`;
}
|
|
1129
|
+
/**
 * Produces a display-friendly copy of a span/event attribute bag.
 *
 * Drops the thread-id and scenario id/name attributes, strips a leading
 * "langwatch." from the remaining keys, and keeps only the first value seen
 * for each resulting key.
 *
 * @param {Object<string, *>} attrs - Raw attributes.
 * @returns {Object<string, *>} Cleaned attributes.
 */
function cleanAttributes(attrs) {
  const hidden = new Set([
    attributes.ATTR_LANGWATCH_THREAD_ID,
    "langwatch.scenario.id",
    "langwatch.scenario.name"
  ]);
  const emitted = /* @__PURE__ */ new Set();
  const result = {};
  Object.entries(attrs).forEach(([rawKey, value]) => {
    if (hidden.has(rawKey)) return;
    const shortKey = rawKey.replace(/^(langwatch)\./, "");
    // First occurrence wins when a prefixed and unprefixed key collide.
    if (emitted.has(shortKey)) return;
    emitted.add(shortKey);
    result[shortKey] = value;
  });
  return result;
}
|
|
1149
|
+
/**
 * Renders an attribute value as text after media truncation.
 *
 * @param {*} value - Raw attribute value.
 * @returns {string} The transformed value itself when it is a string,
 *   otherwise its JSON serialization.
 */
function formatValue(value) {
  const cleaned = transformValue(value);
  if (typeof cleaned === "string") {
    return cleaned;
  }
  return JSON.stringify(cleaned);
}
|
|
1153
|
+
/**
 * Deeply rewrites a value so that embedded media payloads become placeholders.
 *
 * For each visited node: media parts are replaced via `truncateMediaPart`,
 * media data-URL strings via `truncateMediaUrl`, and strings that look like
 * JSON are parsed, transformed recursively and re-serialized.
 *
 * @param {*} value - Value to transform.
 * @returns {*} Transformed copy.
 */
function transformValue(value) {
  const visit = (node) => {
    const mediaPart = truncateMediaPart(node);
    if (mediaPart) return mediaPart;
    if (typeof node !== "string") return node;

    const shortened = truncateMediaUrl(node);
    if (shortened !== node) return shortened;
    if (!looksLikeJson(node)) return node;

    try {
      return JSON.stringify(transformValue(JSON.parse(node)));
    } catch {
      // Only looked like JSON; keep the original string untouched.
      return node;
    }
  };
  return deepTransform(value, visit);
}
|
|
1170
|
+
/**
 * Cheap check for whether a string might be a JSON object or array.
 *
 * Only the first and last non-whitespace characters are inspected; the
 * content in between is not validated.
 *
 * @param {string} str - Candidate string.
 * @returns {boolean} True for `{...}` or `[...]` (ignoring surrounding whitespace).
 */
function looksLikeJson(str) {
  const trimmed = str.trim();
  const first = trimmed[0];
  const last = trimmed[trimmed.length - 1];
  const isObjectLike = first === "{" && last === "}";
  const isArrayLike = first === "[" && last === "]";
  return isObjectLike || isArrayLike;
}
|
|
1174
|
+
/**
 * Sorts spans by start time and wraps each one in a lookup node.
 *
 * @param {Array<object>} spans - Spans to index (not mutated).
 * @returns {Array<{span: object, children: Array, shortId: string}>} Nodes in
 *   ascending start-time order; `shortId` is the first 8 characters of the
 *   span ID and `children` starts empty.
 */
function indexSpans(spans) {
  const byStartTime = (left, right) => hrTimeToMs(left.startTime) - hrTimeToMs(right.startTime);
  const toNode = (span) => ({
    span,
    children: [],
    shortId: span.spanContext().spanId.slice(0, 8)
  });
  const ordered = [...spans].sort(byStartTime);
  return ordered.map((span) => toNode(span));
}
|
|
1184
|
+
|
|
1185
|
+
// src/agents/judge/trace-tools.ts
|
|
1186
|
+
var TOOL_RESULT_TOKEN_BUDGET = 4096;
|
|
1187
|
+
var TOOL_RESULT_CHAR_BUDGET = TOOL_RESULT_TOKEN_BUDGET * 4;
|
|
1188
|
+
var MAX_GREP_MATCHES = 20;
|
|
1189
|
+
/**
 * Renders one span with all of its details as text lines.
 *
 * Output is a header line (short ID, ISO start timestamp, name, duration,
 * error marker), followed by one line per cleaned attribute and, for each
 * event, an `[event]` line plus one line per cleaned event attribute.
 *
 * NOTE(review): whitespace inside the template strings is reproduced exactly
 * as it appears in this diff rendering — confirm against the package source.
 *
 * @param {{span: object, shortId: string}} node - Indexed span node.
 * @returns {string[]} Rendered lines.
 */
function renderFullSpanNode(node) {
  const { span, shortId } = node;
  const elapsed = formatDuration(calculateSpanDuration(span));
  const startedAt = new Date(hrTimeToMs(span.startTime)).toISOString();
  const output = [
    `[${shortId}] ${startedAt} ${span.name} (${elapsed})${getStatusIndicator(span)}`
  ];

  Object.entries(cleanAttributes(span.attributes)).forEach(([key, value]) => {
    output.push(` ${key}: ${formatValue(value)}`);
  });

  span.events.forEach((event) => {
    output.push(` [event] ${event.name}`);
    if (!event.attributes) return;
    Object.entries(cleanAttributes(event.attributes)).forEach(([key, value]) => {
      output.push(` ${key}: ${formatValue(value)}`);
    });
  });

  return output;
}
|
|
1217
|
+
/**
 * Caps tool output at `TOOL_RESULT_CHAR_BUDGET` characters.
 *
 * @param {string} text - Tool output.
 * @returns {string} `text` unchanged when within budget; otherwise the leading
 *   portion followed by a "[TRUNCATED]" notice telling the caller how to
 *   narrow the request.
 */
function truncateToCharBudget(text) {
  if (text.length <= TOOL_RESULT_CHAR_BUDGET) return text;
  let cut = TOOL_RESULT_CHAR_BUDGET;
  // `slice` counts UTF-16 code units; if the cut would land between the two
  // halves of a surrogate pair, back off one unit so no lone high surrogate
  // is left at the end of the output.
  const lastUnit = text.charCodeAt(cut - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    cut -= 1;
  }
  return text.slice(0, cut) + "\n\n[TRUNCATED] Output exceeded ~4000 token budget. Use grep_trace(pattern) to search for specific content, or expand_trace with fewer span IDs.";
}
|
|
1222
|
+
/**
 * Flattens a span into newline-separated text for searching.
 *
 * Lines are, in order: the span name, one `key: value` line per cleaned
 * attribute, then for each event its name followed by its cleaned attributes.
 *
 * @param {object} span - Span to flatten.
 * @returns {string} Searchable text.
 */
function spanToSearchableText(span) {
  const describe = (attrs) =>
    Object.entries(cleanAttributes(attrs)).map(
      ([key, value]) => `${key}: ${formatValue(value)}`
    );

  const fragments = [span.name];
  describe(span.attributes).forEach((line) => fragments.push(line));
  span.events.forEach((event) => {
    fragments.push(event.name);
    if (event.attributes) {
      describe(event.attributes).forEach((line) => fragments.push(line));
    }
  });
  return fragments.join("\n");
}
|
|
1239
|
+
/**
 * Renders the full details of the spans whose IDs start with any given prefix.
 *
 * @param {Array<object>} spans - All spans of the trace.
 * @param {string[]} spanIds - Full span IDs or ID prefixes (e.g. the 8-char
 *   short IDs). Surrounding whitespace is ignored and blank entries are dropped.
 * @returns {string} The rendered spans (capped by the tool-result budget), or
 *   an explanatory message when there are no spans, no usable IDs, or no match.
 */
function expandTrace(spans, spanIds) {
  const nodes = indexSpans(spans);
  if (nodes.length === 0) {
    return "No spans recorded.";
  }
  // An empty prefix would match every span via startsWith(""), and padded IDs
  // would never match; normalise before comparing.
  const prefixes = spanIds
    .map((id) => String(id).trim())
    .filter((id) => id.length > 0);
  if (prefixes.length === 0) {
    return "Error: provide at least one span ID.";
  }
  const selected = nodes.filter((n) => {
    const fullId = n.span.spanContext().spanId;
    return prefixes.some((prefix) => fullId.startsWith(prefix));
  });
  if (selected.length === 0) {
    const available = nodes.map((n) => n.shortId).join(", ");
    return `Error: no spans matched the given ID(s). Available span IDs: ${available}`;
  }
  const lines = [];
  for (const node of selected) {
    for (const line of renderFullSpanNode(node)) {
      lines.push(line);
    }
    // Blank separator between spans; the trailing one is trimmed below.
    lines.push("");
  }
  return truncateToCharBudget(lines.join("\n").trimEnd());
}
|
|
1263
|
+
/**
 * Case-insensitively searches every span's name, attributes and events for a
 * literal pattern.
 *
 * @param {Array<object>} spans - All spans of the trace.
 * @param {string} pattern - Text to look for (regex metacharacters are escaped).
 * @returns {string} Up to `MAX_GREP_MATCHES` matching spans, each with a header
 *   and its matching lines, plus a note when more spans matched; a message
 *   listing the distinct span names when nothing matched; or "No spans
 *   recorded." for an empty trace. Output is capped by the tool-result budget.
 *
 * NOTE(review): whitespace inside the template strings is reproduced exactly
 * as it appears in this diff rendering — confirm against the package source.
 */
function grepTrace(spans, pattern) {
  const nodes = indexSpans(spans);
  if (nodes.length === 0) {
    return "No spans recorded.";
  }

  const matcher = new RegExp(escapeRegex(pattern), "i");
  const hits = nodes
    .map((node) => ({
      node,
      matchingLines: spanToSearchableText(node.span)
        .split("\n")
        .filter((line) => matcher.test(line))
    }))
    .filter((hit) => hit.matchingLines.length > 0);

  if (hits.length === 0) {
    const uniqueNames = Array.from(new Set(nodes.map((n) => n.span.name)));
    return `No matches found for "${pattern}". Available span names: ${uniqueNames.join(", ")}`;
  }

  const output = [];
  hits.slice(0, MAX_GREP_MATCHES).forEach(({ node, matchingLines }) => {
    const elapsed = formatDuration(calculateSpanDuration(node.span));
    output.push(`--- [${node.shortId}] ${node.span.name} (${elapsed}) ---`);
    matchingLines.forEach((line) => output.push(` ${line}`));
    output.push("");
  });

  const omitted = hits.length - MAX_GREP_MATCHES;
  if (omitted > 0) {
    output.push(
      `[${omitted} more matches omitted. Refine your search pattern for more specific results.]`
    );
  }
  return truncateToCharBudget(output.join("\n").trimEnd());
}
|
|
1302
|
+
/**
 * Escapes regular-expression metacharacters so a string matches literally.
 *
 * @param {string} str - Raw text.
 * @returns {string} Text with each of `. * + ? ^ $ { } ( ) | [ ] \` prefixed by a backslash.
 */
function escapeRegex(str) {
  const REGEX_SPECIALS = /[.*+?^${}()|[\]\\]/g;
  return str.replace(REGEX_SPECIALS, (special) => `\\${special}`);
}
|
|
1305
|
+
|
|
1029
1306
|
// src/config/env.ts
|
|
1030
1307
|
import { z } from "zod/v4";
|
|
1031
1308
|
|
|
@@ -1381,7 +1658,7 @@ var criterionToParamName = (criterion) => {
|
|
|
1381
1658
|
};
|
|
1382
1659
|
|
|
1383
1660
|
// src/agents/judge/judge-span-collector.ts
|
|
1384
|
-
import { attributes } from "langwatch/observability";
|
|
1661
|
+
import { attributes as attributes2 } from "langwatch/observability";
|
|
1385
1662
|
var JudgeSpanCollector = class {
|
|
1386
1663
|
spans = [];
|
|
1387
1664
|
onStart() {
|
|
@@ -1424,7 +1701,7 @@ var JudgeSpanCollector = class {
|
|
|
1424
1701
|
const spanId = span.spanContext().spanId;
|
|
1425
1702
|
if (visited.has(spanId)) return false;
|
|
1426
1703
|
visited.add(spanId);
|
|
1427
|
-
if (span.attributes[
|
|
1704
|
+
if (span.attributes[attributes2.ATTR_LANGWATCH_THREAD_ID] === threadId) {
|
|
1428
1705
|
return true;
|
|
1429
1706
|
}
|
|
1430
1707
|
const parentId = getParentSpanId(span);
|
|
@@ -1443,26 +1720,6 @@ function getParentSpanId(span) {
|
|
|
1443
1720
|
}
|
|
1444
1721
|
var judgeSpanCollector = new JudgeSpanCollector();
|
|
1445
1722
|
|
|
1446
|
-
// src/agents/judge/judge-span-digest-formatter.ts
|
|
1447
|
-
import { attributes as attributes2 } from "langwatch/observability";
|
|
1448
|
-
|
|
1449
|
-
// src/agents/judge/deep-transform.ts
|
|
1450
|
-
function deepTransform(value, fn) {
|
|
1451
|
-
const result = fn(value);
|
|
1452
|
-
if (result !== value) return result;
|
|
1453
|
-
if (Array.isArray(value)) {
|
|
1454
|
-
return value.map((v) => deepTransform(v, fn));
|
|
1455
|
-
}
|
|
1456
|
-
if (value !== null && typeof value === "object") {
|
|
1457
|
-
const out = {};
|
|
1458
|
-
for (const [k, v] of Object.entries(value)) {
|
|
1459
|
-
out[k] = deepTransform(v, fn);
|
|
1460
|
-
}
|
|
1461
|
-
return out;
|
|
1462
|
-
}
|
|
1463
|
-
return value;
|
|
1464
|
-
}
|
|
1465
|
-
|
|
1466
1723
|
// src/agents/judge/string-deduplicator.ts
|
|
1467
1724
|
var StringDeduplicator = class {
|
|
1468
1725
|
seen = /* @__PURE__ */ new Map();
|
|
@@ -1496,51 +1753,49 @@ var StringDeduplicator = class {
|
|
|
1496
1753
|
}
|
|
1497
1754
|
};
|
|
1498
1755
|
|
|
1499
|
-
// src/agents/judge/truncate-media.ts
|
|
1500
|
-
function truncateMediaUrl(str) {
|
|
1501
|
-
const match = str.match(
|
|
1502
|
-
/^data:((image|audio|video)\/[a-z0-9+.-]+);base64,(.+)$/i
|
|
1503
|
-
);
|
|
1504
|
-
if (!match) return str;
|
|
1505
|
-
const [, mimeType, category, data] = match;
|
|
1506
|
-
return `[${category.toUpperCase()}: ${mimeType}, ~${data.length} bytes]`;
|
|
1507
|
-
}
|
|
1508
|
-
function truncateMediaPart(v) {
|
|
1509
|
-
var _a;
|
|
1510
|
-
if (v === null || typeof v !== "object" || Array.isArray(v)) return null;
|
|
1511
|
-
const obj = v;
|
|
1512
|
-
if (obj.type === "file" && typeof obj.mediaType === "string" && typeof obj.data === "string") {
|
|
1513
|
-
const category = ((_a = obj.mediaType.split("/")[0]) == null ? void 0 : _a.toUpperCase()) ?? "FILE";
|
|
1514
|
-
return {
|
|
1515
|
-
...obj,
|
|
1516
|
-
data: `[${category}: ${obj.mediaType}, ~${obj.data.length} bytes]`
|
|
1517
|
-
};
|
|
1518
|
-
}
|
|
1519
|
-
if (obj.type === "image" && typeof obj.image === "string") {
|
|
1520
|
-
const imageData = obj.image;
|
|
1521
|
-
const dataUrlMatch = imageData.match(
|
|
1522
|
-
/^data:((image)\/[a-z0-9+.-]+);base64,(.+)$/i
|
|
1523
|
-
);
|
|
1524
|
-
if (dataUrlMatch) {
|
|
1525
|
-
return {
|
|
1526
|
-
...obj,
|
|
1527
|
-
image: `[IMAGE: ${dataUrlMatch[1]}, ~${dataUrlMatch[3].length} bytes]`
|
|
1528
|
-
};
|
|
1529
|
-
}
|
|
1530
|
-
if (imageData.length > 1e3 && /^[A-Za-z0-9+/=]+$/.test(imageData)) {
|
|
1531
|
-
return {
|
|
1532
|
-
...obj,
|
|
1533
|
-
image: `[IMAGE: unknown, ~${imageData.length} bytes]`
|
|
1534
|
-
};
|
|
1535
|
-
}
|
|
1536
|
-
}
|
|
1537
|
-
return null;
|
|
1538
|
-
}
|
|
1539
|
-
|
|
1540
1756
|
// src/agents/judge/judge-span-digest-formatter.ts
|
|
1541
1757
|
var JudgeSpanDigestFormatter = class {
|
|
1542
1758
|
logger = new Logger("JudgeSpanDigestFormatter");
|
|
1543
1759
|
deduplicator = new StringDeduplicator({ threshold: 50 });
|
|
1760
|
+
/**
|
|
1761
|
+
* Formats spans into a structure-only digest showing span tree hierarchy
|
|
1762
|
+
* without attributes, events, or content. Used for large traces that
|
|
1763
|
+
* exceed the token threshold, paired with expand_trace/grep_trace tools.
|
|
1764
|
+
*
|
|
1765
|
+
* @param spans - All spans for a thread
|
|
1766
|
+
* @returns Plain text digest with only structural information
|
|
1767
|
+
*/
|
|
1768
|
+
formatStructureOnly(spans) {
|
|
1769
|
+
this.logger.debug("formatStructureOnly() called", {
|
|
1770
|
+
spanCount: spans.length
|
|
1771
|
+
});
|
|
1772
|
+
if (spans.length === 0) {
|
|
1773
|
+
return "No spans recorded.";
|
|
1774
|
+
}
|
|
1775
|
+
const sortedSpans = this.sortByStartTime(spans);
|
|
1776
|
+
const tree = this.buildHierarchy(sortedSpans);
|
|
1777
|
+
const totalDuration = this.calculateTotalDuration(sortedSpans);
|
|
1778
|
+
const lines = [
|
|
1779
|
+
`Spans: ${spans.length} | Total Duration: ${formatDuration(totalDuration)}`,
|
|
1780
|
+
""
|
|
1781
|
+
];
|
|
1782
|
+
const rootCount = tree.length;
|
|
1783
|
+
tree.forEach((node, idx) => {
|
|
1784
|
+
this.renderStructureNode(
|
|
1785
|
+
node,
|
|
1786
|
+
lines,
|
|
1787
|
+
0,
|
|
1788
|
+
idx === rootCount - 1
|
|
1789
|
+
);
|
|
1790
|
+
});
|
|
1791
|
+
const errors = this.collectErrors(spans);
|
|
1792
|
+
if (errors.length > 0) {
|
|
1793
|
+
lines.push("");
|
|
1794
|
+
lines.push("=== ERRORS ===");
|
|
1795
|
+
errors.forEach((e) => lines.push(e));
|
|
1796
|
+
}
|
|
1797
|
+
return lines.join("\n");
|
|
1798
|
+
}
|
|
1544
1799
|
/**
|
|
1545
1800
|
* Formats spans into a complete digest with full content and nesting.
|
|
1546
1801
|
* @param spans - All spans for a thread
|
|
@@ -1564,19 +1819,17 @@ var JudgeSpanDigestFormatter = class {
|
|
|
1564
1819
|
totalDuration
|
|
1565
1820
|
});
|
|
1566
1821
|
const lines = [
|
|
1567
|
-
`Spans: ${spans.length} | Total Duration: ${
|
|
1822
|
+
`Spans: ${spans.length} | Total Duration: ${formatDuration(
|
|
1568
1823
|
totalDuration
|
|
1569
1824
|
)}`,
|
|
1570
1825
|
""
|
|
1571
1826
|
];
|
|
1572
|
-
let sequence = 1;
|
|
1573
1827
|
const rootCount = tree.length;
|
|
1574
1828
|
tree.forEach((node, idx) => {
|
|
1575
|
-
|
|
1829
|
+
this.renderNode(
|
|
1576
1830
|
node,
|
|
1577
1831
|
lines,
|
|
1578
1832
|
0,
|
|
1579
|
-
sequence,
|
|
1580
1833
|
idx === rootCount - 1
|
|
1581
1834
|
);
|
|
1582
1835
|
});
|
|
@@ -1590,9 +1843,7 @@ var JudgeSpanDigestFormatter = class {
|
|
|
1590
1843
|
}
|
|
1591
1844
|
sortByStartTime(spans) {
|
|
1592
1845
|
return [...spans].sort((a, b) => {
|
|
1593
|
-
|
|
1594
|
-
const bTime = this.hrTimeToMs(b.startTime);
|
|
1595
|
-
return aTime - bTime;
|
|
1846
|
+
return hrTimeToMs(a.startTime) - hrTimeToMs(b.startTime);
|
|
1596
1847
|
});
|
|
1597
1848
|
}
|
|
1598
1849
|
buildHierarchy(spans) {
|
|
@@ -1612,46 +1863,66 @@ var JudgeSpanDigestFormatter = class {
|
|
|
1612
1863
|
}
|
|
1613
1864
|
return roots;
|
|
1614
1865
|
}
|
|
1615
|
-
|
|
1866
|
+
renderStructureNode(node, lines, depth, isLast = true) {
|
|
1616
1867
|
const span = node.span;
|
|
1617
|
-
const
|
|
1618
|
-
const
|
|
1619
|
-
const
|
|
1868
|
+
const shortId = span.spanContext().spanId.slice(0, 8);
|
|
1869
|
+
const duration = calculateSpanDuration(span);
|
|
1870
|
+
const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
|
|
1871
|
+
const status = getStatusIndicator(span);
|
|
1872
|
+
const tokens = getTokenUsage(span);
|
|
1620
1873
|
const prefix = this.getTreePrefix(depth, isLast);
|
|
1621
1874
|
lines.push(
|
|
1622
|
-
`${prefix}[${
|
|
1875
|
+
`${prefix}[${shortId}] ${timestamp} ${span.name} (${formatDuration(duration)}${tokens})${status}`
|
|
1876
|
+
);
|
|
1877
|
+
lines.push("");
|
|
1878
|
+
const childCount = node.children.length;
|
|
1879
|
+
node.children.forEach((child, idx) => {
|
|
1880
|
+
this.renderStructureNode(
|
|
1881
|
+
child,
|
|
1882
|
+
lines,
|
|
1883
|
+
depth + 1,
|
|
1884
|
+
idx === childCount - 1
|
|
1885
|
+
);
|
|
1886
|
+
});
|
|
1887
|
+
}
|
|
1888
|
+
renderNode(node, lines, depth, isLast = true) {
|
|
1889
|
+
const span = node.span;
|
|
1890
|
+
const shortId = span.spanContext().spanId.slice(0, 8);
|
|
1891
|
+
const duration = calculateSpanDuration(span);
|
|
1892
|
+
const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
|
|
1893
|
+
const status = getStatusIndicator(span);
|
|
1894
|
+
const prefix = this.getTreePrefix(depth, isLast);
|
|
1895
|
+
lines.push(
|
|
1896
|
+
`${prefix}[${shortId}] ${timestamp} ${span.name} (${formatDuration(duration)})${status}`
|
|
1623
1897
|
);
|
|
1624
1898
|
const attrIndent = this.getAttrIndent(depth, isLast);
|
|
1625
|
-
const attrs =
|
|
1899
|
+
const attrs = cleanAttributes(span.attributes);
|
|
1626
1900
|
if (Object.keys(attrs).length > 0) {
|
|
1627
1901
|
for (const [key, value] of Object.entries(attrs)) {
|
|
1628
|
-
lines.push(`${attrIndent}${key}: ${this.
|
|
1902
|
+
lines.push(`${attrIndent}${key}: ${this.formatValueWithDedup(value)}`);
|
|
1629
1903
|
}
|
|
1630
1904
|
}
|
|
1631
1905
|
if (span.events.length > 0) {
|
|
1632
1906
|
for (const event of span.events) {
|
|
1633
1907
|
lines.push(`${attrIndent}[event] ${event.name}`);
|
|
1634
1908
|
if (event.attributes) {
|
|
1635
|
-
const eventAttrs =
|
|
1909
|
+
const eventAttrs = cleanAttributes(event.attributes);
|
|
1636
1910
|
for (const [key, value] of Object.entries(eventAttrs)) {
|
|
1637
|
-
lines.push(`${attrIndent} ${key}: ${this.
|
|
1911
|
+
lines.push(`${attrIndent} ${key}: ${this.formatValueWithDedup(value)}`);
|
|
1638
1912
|
}
|
|
1639
1913
|
}
|
|
1640
1914
|
}
|
|
1641
1915
|
}
|
|
1642
1916
|
lines.push("");
|
|
1643
|
-
let nextSeq = sequence + 1;
|
|
1644
1917
|
const childCount = node.children.length;
|
|
1645
1918
|
node.children.forEach((child, idx) => {
|
|
1646
|
-
|
|
1919
|
+
this.renderNode(
|
|
1647
1920
|
child,
|
|
1648
1921
|
lines,
|
|
1649
1922
|
depth + 1,
|
|
1650
|
-
nextSeq,
|
|
1651
1923
|
idx === childCount - 1
|
|
1652
1924
|
);
|
|
1653
1925
|
});
|
|
1654
|
-
return nextSeq;
|
|
1655
1926
|
}
|
|
1656
1927
|
getTreePrefix(depth, isLast) {
|
|
1657
1928
|
if (depth === 0) return "";
|
|
@@ -1663,42 +1934,26 @@ var JudgeSpanDigestFormatter = class {
|
|
|
1663
1934
|
const continuation = isLast ? " " : "\u2502 ";
|
|
1664
1935
|
return "\u2502 ".repeat(depth - 1) + continuation + " ";
|
|
1665
1936
|
}
|
|
1666
|
-
|
|
1667
|
-
|
|
1668
|
-
|
|
1669
|
-
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
"langwatch.scenario.name"
|
|
1673
|
-
];
|
|
1674
|
-
for (const [key, value] of Object.entries(attrs)) {
|
|
1675
|
-
if (excludedKeys.includes(key)) {
|
|
1676
|
-
continue;
|
|
1677
|
-
}
|
|
1678
|
-
const cleanKey = key.replace(/^(langwatch)\./, "");
|
|
1679
|
-
if (!seen.has(cleanKey)) {
|
|
1680
|
-
seen.add(cleanKey);
|
|
1681
|
-
cleaned[cleanKey] = value;
|
|
1682
|
-
}
|
|
1683
|
-
}
|
|
1684
|
-
return cleaned;
|
|
1685
|
-
}
|
|
1686
|
-
formatValue(value) {
|
|
1687
|
-
const processed = this.transformValue(value);
|
|
1937
|
+
/**
|
|
1938
|
+
* Formats a value with deduplication applied. Used by the `format()` method
|
|
1939
|
+
* to reduce token usage by replacing repeated strings with markers.
|
|
1940
|
+
*/
|
|
1941
|
+
formatValueWithDedup(value) {
|
|
1942
|
+
const processed = this.transformValueWithDedup(value);
|
|
1688
1943
|
return typeof processed === "string" ? processed : JSON.stringify(processed);
|
|
1689
1944
|
}
|
|
1690
|
-
|
|
1945
|
+
transformValueWithDedup(value) {
|
|
1691
1946
|
return deepTransform(value, (v) => {
|
|
1692
1947
|
const mediaPart = truncateMediaPart(v);
|
|
1693
1948
|
if (mediaPart) return mediaPart;
|
|
1694
1949
|
if (typeof v !== "string") return v;
|
|
1695
|
-
return this.
|
|
1950
|
+
return this.transformStringWithDedup(v);
|
|
1696
1951
|
});
|
|
1697
1952
|
}
|
|
1698
|
-
|
|
1699
|
-
if (
|
|
1953
|
+
transformStringWithDedup(str) {
|
|
1954
|
+
if (looksLikeJson(str)) {
|
|
1700
1955
|
try {
|
|
1701
|
-
const processed = this.
|
|
1956
|
+
const processed = this.transformValueWithDedup(JSON.parse(str));
|
|
1702
1957
|
return JSON.stringify(processed);
|
|
1703
1958
|
} catch {
|
|
1704
1959
|
}
|
|
@@ -1707,36 +1962,12 @@ var JudgeSpanDigestFormatter = class {
|
|
|
1707
1962
|
if (truncated !== str) return truncated;
|
|
1708
1963
|
return this.deduplicator.process(str);
|
|
1709
1964
|
}
|
|
1710
|
-
looksLikeJson(str) {
|
|
1711
|
-
const t = str.trim();
|
|
1712
|
-
return t.startsWith("{") && t.endsWith("}") || t.startsWith("[") && t.endsWith("]");
|
|
1713
|
-
}
|
|
1714
|
-
hrTimeToMs(hrTime) {
|
|
1715
|
-
return hrTime[0] * 1e3 + hrTime[1] / 1e6;
|
|
1716
|
-
}
|
|
1717
|
-
calculateSpanDuration(span) {
|
|
1718
|
-
return this.hrTimeToMs(span.endTime) - this.hrTimeToMs(span.startTime);
|
|
1719
|
-
}
|
|
1720
1965
|
calculateTotalDuration(spans) {
|
|
1721
1966
|
if (spans.length === 0) return 0;
|
|
1722
|
-
const first =
|
|
1723
|
-
const last = Math.max(...spans.map((s) =>
|
|
1967
|
+
const first = hrTimeToMs(spans[0].startTime);
|
|
1968
|
+
const last = Math.max(...spans.map((s) => hrTimeToMs(s.endTime)));
|
|
1724
1969
|
return last - first;
|
|
1725
1970
|
}
|
|
1726
|
-
formatDuration(ms) {
|
|
1727
|
-
if (ms < 1e3) return `${Math.round(ms)}ms`;
|
|
1728
|
-
return `${(ms / 1e3).toFixed(2)}s`;
|
|
1729
|
-
}
|
|
1730
|
-
formatTimestamp(hrTime) {
|
|
1731
|
-
const ms = this.hrTimeToMs(hrTime);
|
|
1732
|
-
return new Date(ms).toISOString();
|
|
1733
|
-
}
|
|
1734
|
-
getStatusIndicator(span) {
|
|
1735
|
-
if (span.status.code === 2) {
|
|
1736
|
-
return ` \u26A0\uFE0F ERROR: ${span.status.message ?? "unknown"}`;
|
|
1737
|
-
}
|
|
1738
|
-
return "";
|
|
1739
|
-
}
|
|
1740
1971
|
collectErrors(spans) {
|
|
1741
1972
|
return spans.filter((s) => s.status.code === 2).map((s) => `- ${s.name}: ${s.status.message ?? "unknown error"}`);
|
|
1742
1973
|
}
|
|
@@ -1799,15 +2030,41 @@ function buildFinishTestTool(criteria) {
|
|
|
1799
2030
|
})
|
|
1800
2031
|
});
|
|
1801
2032
|
}
|
|
2033
|
+
/**
 * Builds the trace-exploration tools (`expand_trace`, `grep_trace`) bound to a
 * fixed set of spans.
 *
 * @param {Array<object>} spans - Spans the tools operate on.
 * @returns {{expand_trace: object, grep_trace: object}} Tool definitions keyed by tool name.
 */
function buildProgressiveDiscoveryTools(spans) {
  const expandTool = tool({
    description: "Expand one or more spans to see their full details (attributes, events, content). Use the span ID shown in brackets in the trace skeleton.",
    inputSchema: z4.object({
      span_ids: z4.array(z4.string()).describe("Span IDs (or 8-char prefixes) to expand")
    }),
    execute: async ({ span_ids }) => expandTrace(spans, span_ids)
  });

  const grepTool = tool({
    description: "Search across all span attributes, events, and content for a pattern (case-insensitive). Returns matching spans with context.",
    inputSchema: z4.object({
      pattern: z4.string().describe("Search pattern (case-insensitive)")
    }),
    execute: async ({ pattern }) => grepTrace(spans, pattern)
  });

  return {
    expand_trace: expandTool,
    grep_trace: grepTool
  };
}
|
|
1802
2055
|
var JudgeAgent = class extends JudgeAgentAdapter {
|
|
1803
2056
|
constructor(cfg) {
|
|
1804
2057
|
super();
|
|
1805
2058
|
this.cfg = cfg;
|
|
1806
2059
|
this.criteria = cfg.criteria ?? [];
|
|
1807
2060
|
this.spanCollector = cfg.spanCollector ?? judgeSpanCollector;
|
|
2061
|
+
this.tokenThreshold = cfg.tokenThreshold ?? DEFAULT_TOKEN_THRESHOLD;
|
|
2062
|
+
this.maxDiscoverySteps = cfg.maxDiscoverySteps ?? 10;
|
|
1808
2063
|
}
|
|
1809
2064
|
logger = new Logger("JudgeAgent");
|
|
1810
2065
|
spanCollector;
|
|
2066
|
+
tokenThreshold;
|
|
2067
|
+
maxDiscoverySteps;
|
|
1811
2068
|
role = "Judge" /* JUDGE */;
|
|
1812
2069
|
criteria;
|
|
1813
2070
|
/**
|
|
@@ -1815,7 +2072,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1815
2072
|
*/
|
|
1816
2073
|
invokeLLM = createLLMInvoker(this.logger);
|
|
1817
2074
|
async call(input) {
|
|
1818
|
-
var _a
|
|
2075
|
+
var _a;
|
|
1819
2076
|
const criteria = ((_a = input.judgmentRequest) == null ? void 0 : _a.criteria) ?? this.criteria;
|
|
1820
2077
|
this.logger.debug("call() invoked", {
|
|
1821
2078
|
threadId: input.threadId,
|
|
@@ -1823,8 +2080,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1823
2080
|
maxTurns: input.scenarioConfig.maxTurns,
|
|
1824
2081
|
judgmentRequest: input.judgmentRequest
|
|
1825
2082
|
});
|
|
1826
|
-
const
|
|
1827
|
-
|
|
2083
|
+
const spans = this.spanCollector.getSpansForThread(input.threadId);
|
|
2084
|
+
const { digest, isLargeTrace } = this.buildTraceDigest(spans);
|
|
1828
2085
|
const transcript = JudgeUtils.buildTranscriptFromMessages(input.messages);
|
|
1829
2086
|
const contentForJudge = `
|
|
1830
2087
|
<transcript>
|
|
@@ -1847,6 +2104,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1847
2104
|
...cfg
|
|
1848
2105
|
});
|
|
1849
2106
|
const tools = {
|
|
2107
|
+
...isLargeTrace ? buildProgressiveDiscoveryTools(spans) : {},
|
|
1850
2108
|
continue_test: buildContinueTestTool(),
|
|
1851
2109
|
finish_test: buildFinishTestTool(criteria)
|
|
1852
2110
|
};
|
|
@@ -1865,26 +2123,75 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1865
2123
|
model: mergedConfig.model,
|
|
1866
2124
|
toolChoice,
|
|
1867
2125
|
isLastMessage,
|
|
1868
|
-
enforceJudgement
|
|
2126
|
+
enforceJudgement,
|
|
2127
|
+
isLargeTrace
|
|
1869
2128
|
});
|
|
1870
|
-
const completion = await this.
|
|
2129
|
+
const completion = await this.invokeLLMWithDiscovery({
|
|
1871
2130
|
model: mergedConfig.model,
|
|
1872
2131
|
messages,
|
|
1873
2132
|
temperature: mergedConfig.temperature ?? 0,
|
|
1874
2133
|
maxOutputTokens: mergedConfig.maxTokens,
|
|
1875
2134
|
tools,
|
|
1876
|
-
toolChoice
|
|
2135
|
+
toolChoice,
|
|
2136
|
+
isLargeTrace
|
|
2137
|
+
});
|
|
2138
|
+
return this.parseToolCalls(completion, criteria);
|
|
2139
|
+
}
|
|
2140
|
+
/**
|
|
2141
|
+
* Builds the trace digest, choosing between full inline rendering
|
|
2142
|
+
* and structure-only mode based on estimated token count.
|
|
2143
|
+
*/
|
|
2144
|
+
buildTraceDigest(spans) {
|
|
2145
|
+
const fullDigest = judgeSpanDigestFormatter.format(spans);
|
|
2146
|
+
const isLargeTrace = spans.length > 0 && estimateTokens(fullDigest) > this.tokenThreshold;
|
|
2147
|
+
const digest = isLargeTrace ? judgeSpanDigestFormatter.formatStructureOnly(spans) + "\n\nUse expand_trace(span_id) to see span details or grep_trace(pattern) to search across spans. Reference spans by the ID shown in brackets." : fullDigest;
|
|
2148
|
+
this.logger.debug("Trace digest built", {
|
|
2149
|
+
isLargeTrace,
|
|
2150
|
+
estimatedTokens: estimateTokens(fullDigest)
|
|
1877
2151
|
});
|
|
2152
|
+
return { digest, isLargeTrace };
|
|
2153
|
+
}
|
|
2154
|
+
/**
|
|
2155
|
+
* Invokes the LLM, enabling multi-step tool execution for large traces.
|
|
2156
|
+
* In multi-step mode, the AI SDK loops automatically: the judge can call
|
|
2157
|
+
* expand_trace/grep_trace tools multiple times before reaching a terminal
|
|
2158
|
+
* tool (finish_test/continue_test) or hitting the step limit.
|
|
2159
|
+
*
|
|
2160
|
+
* When the trace is large, toolChoice is relaxed to "required" so the
|
|
2161
|
+
* judge can freely pick discovery tools (expand_trace/grep_trace) before
|
|
2162
|
+
* being forced to a terminal decision.
|
|
2163
|
+
*/
|
|
2164
|
+
async invokeLLMWithDiscovery({
|
|
2165
|
+
isLargeTrace,
|
|
2166
|
+
...params
|
|
2167
|
+
}) {
|
|
2168
|
+
var _a, _b;
|
|
2169
|
+
if (isLargeTrace) {
|
|
2170
|
+
params.toolChoice = "required";
|
|
2171
|
+
params.stopWhen = [
|
|
2172
|
+
stepCountIs(this.maxDiscoverySteps),
|
|
2173
|
+
hasToolCall("finish_test"),
|
|
2174
|
+
hasToolCall("continue_test")
|
|
2175
|
+
];
|
|
2176
|
+
}
|
|
2177
|
+
const completion = await this.invokeLLM(params);
|
|
1878
2178
|
this.logger.debug("LLM response received", {
|
|
1879
|
-
toolCallCount: ((
|
|
1880
|
-
toolCalls: (
|
|
2179
|
+
toolCallCount: ((_a = completion.toolCalls) == null ? void 0 : _a.length) ?? 0,
|
|
2180
|
+
toolCalls: (_b = completion.toolCalls) == null ? void 0 : _b.map((tc) => ({
|
|
1881
2181
|
toolName: tc.toolName,
|
|
1882
2182
|
args: tc.input
|
|
1883
2183
|
}))
|
|
1884
2184
|
});
|
|
2185
|
+
return completion;
|
|
2186
|
+
}
|
|
2187
|
+
parseToolCalls(completion, criteria) {
|
|
2188
|
+
var _a;
|
|
1885
2189
|
let args;
|
|
1886
|
-
if ((
|
|
1887
|
-
const
|
|
2190
|
+
if ((_a = completion.toolCalls) == null ? void 0 : _a.length) {
|
|
2191
|
+
const terminalCall = completion.toolCalls.find(
|
|
2192
|
+
(tc) => tc.toolName === "finish_test" || tc.toolName === "continue_test"
|
|
2193
|
+
);
|
|
2194
|
+
const toolCall = terminalCall ?? completion.toolCalls[0];
|
|
1888
2195
|
switch (toolCall.toolName) {
|
|
1889
2196
|
case "finish_test": {
|
|
1890
2197
|
args = toolCall.input;
|
|
@@ -1926,11 +2233,6 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1926
2233
|
unmetCriteria: criteria
|
|
1927
2234
|
};
|
|
1928
2235
|
}
|
|
1929
|
-
getOpenTelemetryTracesDigest(threadId) {
|
|
1930
|
-
const spans = this.spanCollector.getSpansForThread(threadId);
|
|
1931
|
-
const digest = judgeSpanDigestFormatter.format(spans);
|
|
1932
|
-
return digest;
|
|
1933
|
-
}
|
|
1934
2236
|
};
|
|
1935
2237
|
var judgeAgent = (cfg) => {
|
|
1936
2238
|
return new JudgeAgent(cfg ?? {});
|
|
@@ -4677,6 +4979,7 @@ export {
|
|
|
4677
4979
|
AgentAdapter,
|
|
4678
4980
|
AgentRole,
|
|
4679
4981
|
DEFAULT_MAX_TURNS,
|
|
4982
|
+
DEFAULT_TOKEN_THRESHOLD,
|
|
4680
4983
|
DEFAULT_VERBOSE,
|
|
4681
4984
|
JudgeAgentAdapter,
|
|
4682
4985
|
JudgeSpanCollector,
|
|
@@ -4690,7 +4993,10 @@ export {
|
|
|
4690
4993
|
allAgentRoles,
|
|
4691
4994
|
index_default as default,
|
|
4692
4995
|
defineConfig,
|
|
4996
|
+
estimateTokens,
|
|
4997
|
+
expandTrace,
|
|
4693
4998
|
fail,
|
|
4999
|
+
grepTrace,
|
|
4694
5000
|
judge,
|
|
4695
5001
|
judgeAgent,
|
|
4696
5002
|
judgeSpanCollector,
|