@langwatch/scenario 0.4.3 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +93 -13
- package/dist/index.d.ts +93 -13
- package/dist/index.js +462 -144
- package/dist/index.mjs +463 -145
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -969,6 +969,7 @@ __export(index_exports, {
|
|
|
969
969
|
AgentAdapter: () => AgentAdapter,
|
|
970
970
|
AgentRole: () => AgentRole,
|
|
971
971
|
DEFAULT_MAX_TURNS: () => DEFAULT_MAX_TURNS,
|
|
972
|
+
DEFAULT_TOKEN_THRESHOLD: () => DEFAULT_TOKEN_THRESHOLD,
|
|
972
973
|
DEFAULT_VERBOSE: () => DEFAULT_VERBOSE,
|
|
973
974
|
JudgeAgentAdapter: () => JudgeAgentAdapter,
|
|
974
975
|
JudgeSpanCollector: () => JudgeSpanCollector,
|
|
@@ -982,7 +983,10 @@ __export(index_exports, {
|
|
|
982
983
|
allAgentRoles: () => allAgentRoles,
|
|
983
984
|
default: () => index_default,
|
|
984
985
|
defineConfig: () => defineConfig,
|
|
986
|
+
estimateTokens: () => estimateTokens,
|
|
987
|
+
expandTrace: () => expandTrace,
|
|
985
988
|
fail: () => fail,
|
|
989
|
+
grepTrace: () => grepTrace,
|
|
986
990
|
judge: () => judge,
|
|
987
991
|
judgeAgent: () => judgeAgent,
|
|
988
992
|
judgeSpanCollector: () => judgeSpanCollector,
|
|
@@ -1004,9 +1008,13 @@ module.exports = __toCommonJS(index_exports);
|
|
|
1004
1008
|
// src/agents/index.ts
|
|
1005
1009
|
var agents_exports = {};
|
|
1006
1010
|
__export(agents_exports, {
|
|
1011
|
+
DEFAULT_TOKEN_THRESHOLD: () => DEFAULT_TOKEN_THRESHOLD,
|
|
1007
1012
|
JudgeSpanCollector: () => JudgeSpanCollector,
|
|
1008
1013
|
JudgeSpanDigestFormatter: () => JudgeSpanDigestFormatter,
|
|
1009
1014
|
RealtimeAgentAdapter: () => RealtimeAgentAdapter,
|
|
1015
|
+
estimateTokens: () => estimateTokens,
|
|
1016
|
+
expandTrace: () => expandTrace,
|
|
1017
|
+
grepTrace: () => grepTrace,
|
|
1010
1018
|
judgeAgent: () => judgeAgent,
|
|
1011
1019
|
judgeSpanCollector: () => judgeSpanCollector,
|
|
1012
1020
|
judgeSpanDigestFormatter: () => judgeSpanDigestFormatter,
|
|
@@ -1086,6 +1094,283 @@ var JudgeUtils = {
|
|
|
1086
1094
|
}
|
|
1087
1095
|
};
|
|
1088
1096
|
|
|
1097
|
+
// src/agents/judge/estimate-tokens.ts
|
|
1098
|
+
var DEFAULT_TOKEN_THRESHOLD = 8192;
|
|
1099
|
+
function estimateTokens(text) {
|
|
1100
|
+
const byteLength = new TextEncoder().encode(text).byteLength;
|
|
1101
|
+
return Math.ceil(byteLength / 4);
|
|
1102
|
+
}
|
|
1103
|
+
|
|
1104
|
+
// src/agents/judge/span-utils.ts
|
|
1105
|
+
var import_observability = require("langwatch/observability");
|
|
1106
|
+
|
|
1107
|
+
// src/agents/judge/deep-transform.ts
|
|
1108
|
+
function deepTransform(value, fn) {
|
|
1109
|
+
const result = fn(value);
|
|
1110
|
+
if (result !== value) return result;
|
|
1111
|
+
if (Array.isArray(value)) {
|
|
1112
|
+
return value.map((v) => deepTransform(v, fn));
|
|
1113
|
+
}
|
|
1114
|
+
if (value !== null && typeof value === "object") {
|
|
1115
|
+
const out = {};
|
|
1116
|
+
for (const [k, v] of Object.entries(value)) {
|
|
1117
|
+
out[k] = deepTransform(v, fn);
|
|
1118
|
+
}
|
|
1119
|
+
return out;
|
|
1120
|
+
}
|
|
1121
|
+
return value;
|
|
1122
|
+
}
|
|
1123
|
+
|
|
1124
|
+
// src/agents/judge/truncate-media.ts
|
|
1125
|
+
function truncateMediaUrl(str) {
|
|
1126
|
+
const match = str.match(
|
|
1127
|
+
/^data:((image|audio|video)\/[a-z0-9+.-]+);base64,(.+)$/i
|
|
1128
|
+
);
|
|
1129
|
+
if (!match) return str;
|
|
1130
|
+
const [, mimeType, category, data] = match;
|
|
1131
|
+
return `[${category.toUpperCase()}: ${mimeType}, ~${data.length} bytes]`;
|
|
1132
|
+
}
|
|
1133
|
+
function truncateMediaPart(v) {
|
|
1134
|
+
var _a;
|
|
1135
|
+
if (v === null || typeof v !== "object" || Array.isArray(v)) return null;
|
|
1136
|
+
const obj = v;
|
|
1137
|
+
if (obj.type === "file" && typeof obj.mediaType === "string" && typeof obj.data === "string") {
|
|
1138
|
+
const category = ((_a = obj.mediaType.split("/")[0]) == null ? void 0 : _a.toUpperCase()) ?? "FILE";
|
|
1139
|
+
return {
|
|
1140
|
+
...obj,
|
|
1141
|
+
data: `[${category}: ${obj.mediaType}, ~${obj.data.length} bytes]`
|
|
1142
|
+
};
|
|
1143
|
+
}
|
|
1144
|
+
if (obj.type === "image" && typeof obj.image === "string") {
|
|
1145
|
+
const imageData = obj.image;
|
|
1146
|
+
const dataUrlMatch = imageData.match(
|
|
1147
|
+
/^data:((image)\/[a-z0-9+.-]+);base64,(.+)$/i
|
|
1148
|
+
);
|
|
1149
|
+
if (dataUrlMatch) {
|
|
1150
|
+
return {
|
|
1151
|
+
...obj,
|
|
1152
|
+
image: `[IMAGE: ${dataUrlMatch[1]}, ~${dataUrlMatch[3].length} bytes]`
|
|
1153
|
+
};
|
|
1154
|
+
}
|
|
1155
|
+
if (imageData.length > 1e3 && /^[A-Za-z0-9+/=]+$/.test(imageData)) {
|
|
1156
|
+
return {
|
|
1157
|
+
...obj,
|
|
1158
|
+
image: `[IMAGE: unknown, ~${imageData.length} bytes]`
|
|
1159
|
+
};
|
|
1160
|
+
}
|
|
1161
|
+
}
|
|
1162
|
+
return null;
|
|
1163
|
+
}
|
|
1164
|
+
|
|
1165
|
+
// src/agents/judge/span-utils.ts
|
|
1166
|
+
function hrTimeToMs(hrTime) {
|
|
1167
|
+
return hrTime[0] * 1e3 + hrTime[1] / 1e6;
|
|
1168
|
+
}
|
|
1169
|
+
function formatDuration(ms) {
|
|
1170
|
+
if (ms < 1e3) return `${Math.round(ms)}ms`;
|
|
1171
|
+
return `${(ms / 1e3).toFixed(2)}s`;
|
|
1172
|
+
}
|
|
1173
|
+
function calculateSpanDuration(span) {
|
|
1174
|
+
return hrTimeToMs(span.endTime) - hrTimeToMs(span.startTime);
|
|
1175
|
+
}
|
|
1176
|
+
function getStatusIndicator(span) {
|
|
1177
|
+
if (span.status.code === 2) {
|
|
1178
|
+
return ` \u26A0\uFE0F ERROR: ${span.status.message ?? "unknown"}`;
|
|
1179
|
+
}
|
|
1180
|
+
return "";
|
|
1181
|
+
}
|
|
1182
|
+
function getTokenUsage(span) {
|
|
1183
|
+
const input = span.attributes["gen_ai.usage.input_tokens"];
|
|
1184
|
+
const output = span.attributes["gen_ai.usage.output_tokens"];
|
|
1185
|
+
if (input == null && output == null) return "";
|
|
1186
|
+
const total = (Number(input) || 0) + (Number(output) || 0);
|
|
1187
|
+
return `, ${total} tokens`;
|
|
1188
|
+
}
|
|
1189
|
+
function cleanAttributes(attrs) {
|
|
1190
|
+
const cleaned = {};
|
|
1191
|
+
const seen = /* @__PURE__ */ new Set();
|
|
1192
|
+
const excludedKeys = [
|
|
1193
|
+
import_observability.attributes.ATTR_LANGWATCH_THREAD_ID,
|
|
1194
|
+
"langwatch.scenario.id",
|
|
1195
|
+
"langwatch.scenario.name"
|
|
1196
|
+
];
|
|
1197
|
+
for (const [key, value] of Object.entries(attrs)) {
|
|
1198
|
+
if (excludedKeys.includes(key)) {
|
|
1199
|
+
continue;
|
|
1200
|
+
}
|
|
1201
|
+
const cleanKey = key.replace(/^(langwatch)\./, "");
|
|
1202
|
+
if (!seen.has(cleanKey)) {
|
|
1203
|
+
seen.add(cleanKey);
|
|
1204
|
+
cleaned[cleanKey] = value;
|
|
1205
|
+
}
|
|
1206
|
+
}
|
|
1207
|
+
return cleaned;
|
|
1208
|
+
}
|
|
1209
|
+
function formatValue(value) {
|
|
1210
|
+
const processed = transformValue(value);
|
|
1211
|
+
return typeof processed === "string" ? processed : JSON.stringify(processed);
|
|
1212
|
+
}
|
|
1213
|
+
function transformValue(value) {
|
|
1214
|
+
return deepTransform(value, (v) => {
|
|
1215
|
+
const mediaPart = truncateMediaPart(v);
|
|
1216
|
+
if (mediaPart) return mediaPart;
|
|
1217
|
+
if (typeof v !== "string") return v;
|
|
1218
|
+
const truncated = truncateMediaUrl(v);
|
|
1219
|
+
if (truncated !== v) return truncated;
|
|
1220
|
+
if (looksLikeJson(v)) {
|
|
1221
|
+
try {
|
|
1222
|
+
const parsed = transformValue(JSON.parse(v));
|
|
1223
|
+
return JSON.stringify(parsed);
|
|
1224
|
+
} catch {
|
|
1225
|
+
}
|
|
1226
|
+
}
|
|
1227
|
+
return v;
|
|
1228
|
+
});
|
|
1229
|
+
}
|
|
1230
|
+
function looksLikeJson(str) {
|
|
1231
|
+
const t = str.trim();
|
|
1232
|
+
return t.startsWith("{") && t.endsWith("}") || t.startsWith("[") && t.endsWith("]");
|
|
1233
|
+
}
|
|
1234
|
+
function indexSpans(spans) {
|
|
1235
|
+
const sorted = [...spans].sort((a, b) => {
|
|
1236
|
+
return hrTimeToMs(a.startTime) - hrTimeToMs(b.startTime);
|
|
1237
|
+
});
|
|
1238
|
+
return sorted.map((span, i) => ({
|
|
1239
|
+
span,
|
|
1240
|
+
children: [],
|
|
1241
|
+
index: i + 1
|
|
1242
|
+
}));
|
|
1243
|
+
}
|
|
1244
|
+
|
|
1245
|
+
// src/agents/judge/trace-tools.ts
|
|
1246
|
+
var TOOL_RESULT_TOKEN_BUDGET = 4096;
|
|
1247
|
+
var TOOL_RESULT_CHAR_BUDGET = TOOL_RESULT_TOKEN_BUDGET * 4;
|
|
1248
|
+
var MAX_GREP_MATCHES = 20;
|
|
1249
|
+
function renderFullSpanNode(node) {
|
|
1250
|
+
const span = node.span;
|
|
1251
|
+
const duration = calculateSpanDuration(span);
|
|
1252
|
+
const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
|
|
1253
|
+
const status = getStatusIndicator(span);
|
|
1254
|
+
const lines = [];
|
|
1255
|
+
lines.push(
|
|
1256
|
+
`[${node.index}] ${timestamp} ${span.name} (${formatDuration(duration)})${status}`
|
|
1257
|
+
);
|
|
1258
|
+
const attrs = cleanAttributes(span.attributes);
|
|
1259
|
+
if (Object.keys(attrs).length > 0) {
|
|
1260
|
+
for (const [key, value] of Object.entries(attrs)) {
|
|
1261
|
+
lines.push(` ${key}: ${formatValue(value)}`);
|
|
1262
|
+
}
|
|
1263
|
+
}
|
|
1264
|
+
if (span.events.length > 0) {
|
|
1265
|
+
for (const event of span.events) {
|
|
1266
|
+
lines.push(` [event] ${event.name}`);
|
|
1267
|
+
if (event.attributes) {
|
|
1268
|
+
const eventAttrs = cleanAttributes(event.attributes);
|
|
1269
|
+
for (const [key, value] of Object.entries(eventAttrs)) {
|
|
1270
|
+
lines.push(` ${key}: ${formatValue(value)}`);
|
|
1271
|
+
}
|
|
1272
|
+
}
|
|
1273
|
+
}
|
|
1274
|
+
}
|
|
1275
|
+
return lines;
|
|
1276
|
+
}
|
|
1277
|
+
function truncateToCharBudget(text) {
|
|
1278
|
+
if (text.length <= TOOL_RESULT_CHAR_BUDGET) return text;
|
|
1279
|
+
const truncated = text.slice(0, TOOL_RESULT_CHAR_BUDGET);
|
|
1280
|
+
return truncated + "\n\n[TRUNCATED] Output exceeded ~4000 token budget. Use grep_trace(pattern) to search for specific content, or expand_trace with a narrower range.";
|
|
1281
|
+
}
|
|
1282
|
+
function spanToSearchableText(span) {
|
|
1283
|
+
const parts = [span.name];
|
|
1284
|
+
const attrs = cleanAttributes(span.attributes);
|
|
1285
|
+
for (const [key, value] of Object.entries(attrs)) {
|
|
1286
|
+
parts.push(`${key}: ${formatValue(value)}`);
|
|
1287
|
+
}
|
|
1288
|
+
for (const event of span.events) {
|
|
1289
|
+
parts.push(event.name);
|
|
1290
|
+
if (event.attributes) {
|
|
1291
|
+
const eventAttrs = cleanAttributes(event.attributes);
|
|
1292
|
+
for (const [key, value] of Object.entries(eventAttrs)) {
|
|
1293
|
+
parts.push(`${key}: ${formatValue(value)}`);
|
|
1294
|
+
}
|
|
1295
|
+
}
|
|
1296
|
+
}
|
|
1297
|
+
return parts.join("\n");
|
|
1298
|
+
}
|
|
1299
|
+
function expandTrace(spans, { index, range }) {
|
|
1300
|
+
const nodes = indexSpans(spans);
|
|
1301
|
+
if (nodes.length === 0) {
|
|
1302
|
+
return "No spans recorded.";
|
|
1303
|
+
}
|
|
1304
|
+
let startIdx;
|
|
1305
|
+
let endIdx;
|
|
1306
|
+
if (range != null) {
|
|
1307
|
+
const parts = range.split("-").map(Number);
|
|
1308
|
+
startIdx = parts[0];
|
|
1309
|
+
endIdx = parts[1] ?? startIdx;
|
|
1310
|
+
} else if (index != null) {
|
|
1311
|
+
startIdx = index;
|
|
1312
|
+
endIdx = index;
|
|
1313
|
+
} else {
|
|
1314
|
+
return "Error: provide either index or range parameter.";
|
|
1315
|
+
}
|
|
1316
|
+
const maxIndex = nodes.length;
|
|
1317
|
+
if (startIdx < 1 || endIdx > maxIndex || startIdx > endIdx) {
|
|
1318
|
+
return `Error: span index out of range. Valid range is 1-${maxIndex}.`;
|
|
1319
|
+
}
|
|
1320
|
+
const selected = nodes.filter(
|
|
1321
|
+
(n) => n.index >= startIdx && n.index <= endIdx
|
|
1322
|
+
);
|
|
1323
|
+
const lines = [];
|
|
1324
|
+
for (const node of selected) {
|
|
1325
|
+
const spanLines = renderFullSpanNode(node);
|
|
1326
|
+
lines.push(...spanLines);
|
|
1327
|
+
lines.push("");
|
|
1328
|
+
}
|
|
1329
|
+
return truncateToCharBudget(lines.join("\n").trimEnd());
|
|
1330
|
+
}
|
|
1331
|
+
function grepTrace(spans, pattern) {
|
|
1332
|
+
const nodes = indexSpans(spans);
|
|
1333
|
+
if (nodes.length === 0) {
|
|
1334
|
+
return "No spans recorded.";
|
|
1335
|
+
}
|
|
1336
|
+
const regex = new RegExp(escapeRegex(pattern), "i");
|
|
1337
|
+
const matches = [];
|
|
1338
|
+
for (const node of nodes) {
|
|
1339
|
+
const searchText = spanToSearchableText(node.span);
|
|
1340
|
+
const lines2 = searchText.split("\n");
|
|
1341
|
+
const matchingLines = lines2.filter((line) => regex.test(line));
|
|
1342
|
+
if (matchingLines.length > 0) {
|
|
1343
|
+
matches.push({ node, matchingLines });
|
|
1344
|
+
}
|
|
1345
|
+
}
|
|
1346
|
+
if (matches.length === 0) {
|
|
1347
|
+
const spanNames = Array.from(new Set(nodes.map((n) => n.span.name)));
|
|
1348
|
+
return `No matches found for "${pattern}". Available span names: ${spanNames.join(", ")}`;
|
|
1349
|
+
}
|
|
1350
|
+
const totalMatches = matches.length;
|
|
1351
|
+
const limited = matches.slice(0, MAX_GREP_MATCHES);
|
|
1352
|
+
const lines = [];
|
|
1353
|
+
for (const { node, matchingLines } of limited) {
|
|
1354
|
+
const duration = calculateSpanDuration(node.span);
|
|
1355
|
+
lines.push(
|
|
1356
|
+
`--- [${node.index}] ${node.span.name} (${formatDuration(duration)}) ---`
|
|
1357
|
+
);
|
|
1358
|
+
for (const line of matchingLines) {
|
|
1359
|
+
lines.push(` ${line}`);
|
|
1360
|
+
}
|
|
1361
|
+
lines.push("");
|
|
1362
|
+
}
|
|
1363
|
+
if (totalMatches > MAX_GREP_MATCHES) {
|
|
1364
|
+
lines.push(
|
|
1365
|
+
`[${totalMatches - MAX_GREP_MATCHES} more matches omitted. Refine your search pattern for more specific results.]`
|
|
1366
|
+
);
|
|
1367
|
+
}
|
|
1368
|
+
return truncateToCharBudget(lines.join("\n").trimEnd());
|
|
1369
|
+
}
|
|
1370
|
+
function escapeRegex(str) {
|
|
1371
|
+
return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
1372
|
+
}
|
|
1373
|
+
|
|
1089
1374
|
// src/config/env.ts
|
|
1090
1375
|
var import_v4 = require("zod/v4");
|
|
1091
1376
|
|
|
@@ -1441,7 +1726,7 @@ var criterionToParamName = (criterion) => {
|
|
|
1441
1726
|
};
|
|
1442
1727
|
|
|
1443
1728
|
// src/agents/judge/judge-span-collector.ts
|
|
1444
|
-
var
|
|
1729
|
+
var import_observability2 = require("langwatch/observability");
|
|
1445
1730
|
var JudgeSpanCollector = class {
|
|
1446
1731
|
spans = [];
|
|
1447
1732
|
onStart() {
|
|
@@ -1484,7 +1769,7 @@ var JudgeSpanCollector = class {
|
|
|
1484
1769
|
const spanId = span.spanContext().spanId;
|
|
1485
1770
|
if (visited.has(spanId)) return false;
|
|
1486
1771
|
visited.add(spanId);
|
|
1487
|
-
if (span.attributes[
|
|
1772
|
+
if (span.attributes[import_observability2.attributes.ATTR_LANGWATCH_THREAD_ID] === threadId) {
|
|
1488
1773
|
return true;
|
|
1489
1774
|
}
|
|
1490
1775
|
const parentId = getParentSpanId(span);
|
|
@@ -1503,26 +1788,6 @@ function getParentSpanId(span) {
|
|
|
1503
1788
|
}
|
|
1504
1789
|
var judgeSpanCollector = new JudgeSpanCollector();
|
|
1505
1790
|
|
|
1506
|
-
// src/agents/judge/judge-span-digest-formatter.ts
|
|
1507
|
-
var import_observability2 = require("langwatch/observability");
|
|
1508
|
-
|
|
1509
|
-
// src/agents/judge/deep-transform.ts
|
|
1510
|
-
function deepTransform(value, fn) {
|
|
1511
|
-
const result = fn(value);
|
|
1512
|
-
if (result !== value) return result;
|
|
1513
|
-
if (Array.isArray(value)) {
|
|
1514
|
-
return value.map((v) => deepTransform(v, fn));
|
|
1515
|
-
}
|
|
1516
|
-
if (value !== null && typeof value === "object") {
|
|
1517
|
-
const out = {};
|
|
1518
|
-
for (const [k, v] of Object.entries(value)) {
|
|
1519
|
-
out[k] = deepTransform(v, fn);
|
|
1520
|
-
}
|
|
1521
|
-
return out;
|
|
1522
|
-
}
|
|
1523
|
-
return value;
|
|
1524
|
-
}
|
|
1525
|
-
|
|
1526
1791
|
// src/agents/judge/string-deduplicator.ts
|
|
1527
1792
|
var StringDeduplicator = class {
|
|
1528
1793
|
seen = /* @__PURE__ */ new Map();
|
|
@@ -1556,51 +1821,51 @@ var StringDeduplicator = class {
|
|
|
1556
1821
|
}
|
|
1557
1822
|
};
|
|
1558
1823
|
|
|
1559
|
-
// src/agents/judge/truncate-media.ts
|
|
1560
|
-
function truncateMediaUrl(str) {
|
|
1561
|
-
const match = str.match(
|
|
1562
|
-
/^data:((image|audio|video)\/[a-z0-9+.-]+);base64,(.+)$/i
|
|
1563
|
-
);
|
|
1564
|
-
if (!match) return str;
|
|
1565
|
-
const [, mimeType, category, data] = match;
|
|
1566
|
-
return `[${category.toUpperCase()}: ${mimeType}, ~${data.length} bytes]`;
|
|
1567
|
-
}
|
|
1568
|
-
function truncateMediaPart(v) {
|
|
1569
|
-
var _a;
|
|
1570
|
-
if (v === null || typeof v !== "object" || Array.isArray(v)) return null;
|
|
1571
|
-
const obj = v;
|
|
1572
|
-
if (obj.type === "file" && typeof obj.mediaType === "string" && typeof obj.data === "string") {
|
|
1573
|
-
const category = ((_a = obj.mediaType.split("/")[0]) == null ? void 0 : _a.toUpperCase()) ?? "FILE";
|
|
1574
|
-
return {
|
|
1575
|
-
...obj,
|
|
1576
|
-
data: `[${category}: ${obj.mediaType}, ~${obj.data.length} bytes]`
|
|
1577
|
-
};
|
|
1578
|
-
}
|
|
1579
|
-
if (obj.type === "image" && typeof obj.image === "string") {
|
|
1580
|
-
const imageData = obj.image;
|
|
1581
|
-
const dataUrlMatch = imageData.match(
|
|
1582
|
-
/^data:((image)\/[a-z0-9+.-]+);base64,(.+)$/i
|
|
1583
|
-
);
|
|
1584
|
-
if (dataUrlMatch) {
|
|
1585
|
-
return {
|
|
1586
|
-
...obj,
|
|
1587
|
-
image: `[IMAGE: ${dataUrlMatch[1]}, ~${dataUrlMatch[3].length} bytes]`
|
|
1588
|
-
};
|
|
1589
|
-
}
|
|
1590
|
-
if (imageData.length > 1e3 && /^[A-Za-z0-9+/=]+$/.test(imageData)) {
|
|
1591
|
-
return {
|
|
1592
|
-
...obj,
|
|
1593
|
-
image: `[IMAGE: unknown, ~${imageData.length} bytes]`
|
|
1594
|
-
};
|
|
1595
|
-
}
|
|
1596
|
-
}
|
|
1597
|
-
return null;
|
|
1598
|
-
}
|
|
1599
|
-
|
|
1600
1824
|
// src/agents/judge/judge-span-digest-formatter.ts
|
|
1601
1825
|
var JudgeSpanDigestFormatter = class {
|
|
1602
1826
|
logger = new Logger("JudgeSpanDigestFormatter");
|
|
1603
1827
|
deduplicator = new StringDeduplicator({ threshold: 50 });
|
|
1828
|
+
/**
|
|
1829
|
+
* Formats spans into a structure-only digest showing span tree hierarchy
|
|
1830
|
+
* without attributes, events, or content. Used for large traces that
|
|
1831
|
+
* exceed the token threshold, paired with expand_trace/grep_trace tools.
|
|
1832
|
+
*
|
|
1833
|
+
* @param spans - All spans for a thread
|
|
1834
|
+
* @returns Plain text digest with only structural information
|
|
1835
|
+
*/
|
|
1836
|
+
formatStructureOnly(spans) {
|
|
1837
|
+
this.logger.debug("formatStructureOnly() called", {
|
|
1838
|
+
spanCount: spans.length
|
|
1839
|
+
});
|
|
1840
|
+
if (spans.length === 0) {
|
|
1841
|
+
return "No spans recorded.";
|
|
1842
|
+
}
|
|
1843
|
+
const sortedSpans = this.sortByStartTime(spans);
|
|
1844
|
+
const tree = this.buildHierarchy(sortedSpans);
|
|
1845
|
+
const totalDuration = this.calculateTotalDuration(sortedSpans);
|
|
1846
|
+
const lines = [
|
|
1847
|
+
`Spans: ${spans.length} | Total Duration: ${formatDuration(totalDuration)}`,
|
|
1848
|
+
""
|
|
1849
|
+
];
|
|
1850
|
+
let sequence = 1;
|
|
1851
|
+
const rootCount = tree.length;
|
|
1852
|
+
tree.forEach((node, idx) => {
|
|
1853
|
+
sequence = this.renderStructureNode(
|
|
1854
|
+
node,
|
|
1855
|
+
lines,
|
|
1856
|
+
0,
|
|
1857
|
+
sequence,
|
|
1858
|
+
idx === rootCount - 1
|
|
1859
|
+
);
|
|
1860
|
+
});
|
|
1861
|
+
const errors = this.collectErrors(spans);
|
|
1862
|
+
if (errors.length > 0) {
|
|
1863
|
+
lines.push("");
|
|
1864
|
+
lines.push("=== ERRORS ===");
|
|
1865
|
+
errors.forEach((e) => lines.push(e));
|
|
1866
|
+
}
|
|
1867
|
+
return lines.join("\n");
|
|
1868
|
+
}
|
|
1604
1869
|
/**
|
|
1605
1870
|
* Formats spans into a complete digest with full content and nesting.
|
|
1606
1871
|
* @param spans - All spans for a thread
|
|
@@ -1624,7 +1889,7 @@ var JudgeSpanDigestFormatter = class {
|
|
|
1624
1889
|
totalDuration
|
|
1625
1890
|
});
|
|
1626
1891
|
const lines = [
|
|
1627
|
-
`Spans: ${spans.length} | Total Duration: ${
|
|
1892
|
+
`Spans: ${spans.length} | Total Duration: ${formatDuration(
|
|
1628
1893
|
totalDuration
|
|
1629
1894
|
)}`,
|
|
1630
1895
|
""
|
|
@@ -1650,9 +1915,7 @@ var JudgeSpanDigestFormatter = class {
|
|
|
1650
1915
|
}
|
|
1651
1916
|
sortByStartTime(spans) {
|
|
1652
1917
|
return [...spans].sort((a, b) => {
|
|
1653
|
-
|
|
1654
|
-
const bTime = this.hrTimeToMs(b.startTime);
|
|
1655
|
-
return aTime - bTime;
|
|
1918
|
+
return hrTimeToMs(a.startTime) - hrTimeToMs(b.startTime);
|
|
1656
1919
|
});
|
|
1657
1920
|
}
|
|
1658
1921
|
buildHierarchy(spans) {
|
|
@@ -1672,29 +1935,53 @@ var JudgeSpanDigestFormatter = class {
|
|
|
1672
1935
|
}
|
|
1673
1936
|
return roots;
|
|
1674
1937
|
}
|
|
1938
|
+
renderStructureNode(node, lines, depth, sequence, isLast = true) {
|
|
1939
|
+
const span = node.span;
|
|
1940
|
+
const duration = calculateSpanDuration(span);
|
|
1941
|
+
const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
|
|
1942
|
+
const status = getStatusIndicator(span);
|
|
1943
|
+
const tokens = getTokenUsage(span);
|
|
1944
|
+
const prefix = this.getTreePrefix(depth, isLast);
|
|
1945
|
+
lines.push(
|
|
1946
|
+
`${prefix}[${sequence}] ${timestamp} ${span.name} (${formatDuration(duration)}${tokens})${status}`
|
|
1947
|
+
);
|
|
1948
|
+
lines.push("");
|
|
1949
|
+
let nextSeq = sequence + 1;
|
|
1950
|
+
const childCount = node.children.length;
|
|
1951
|
+
node.children.forEach((child, idx) => {
|
|
1952
|
+
nextSeq = this.renderStructureNode(
|
|
1953
|
+
child,
|
|
1954
|
+
lines,
|
|
1955
|
+
depth + 1,
|
|
1956
|
+
nextSeq,
|
|
1957
|
+
idx === childCount - 1
|
|
1958
|
+
);
|
|
1959
|
+
});
|
|
1960
|
+
return nextSeq;
|
|
1961
|
+
}
|
|
1675
1962
|
renderNode(node, lines, depth, sequence, isLast = true) {
|
|
1676
1963
|
const span = node.span;
|
|
1677
|
-
const duration =
|
|
1678
|
-
const timestamp =
|
|
1679
|
-
const status =
|
|
1964
|
+
const duration = calculateSpanDuration(span);
|
|
1965
|
+
const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
|
|
1966
|
+
const status = getStatusIndicator(span);
|
|
1680
1967
|
const prefix = this.getTreePrefix(depth, isLast);
|
|
1681
1968
|
lines.push(
|
|
1682
|
-
`${prefix}[${sequence}] ${
|
|
1969
|
+
`${prefix}[${sequence}] ${timestamp} ${span.name} (${formatDuration(duration)})${status}`
|
|
1683
1970
|
);
|
|
1684
1971
|
const attrIndent = this.getAttrIndent(depth, isLast);
|
|
1685
|
-
const attrs =
|
|
1972
|
+
const attrs = cleanAttributes(span.attributes);
|
|
1686
1973
|
if (Object.keys(attrs).length > 0) {
|
|
1687
1974
|
for (const [key, value] of Object.entries(attrs)) {
|
|
1688
|
-
lines.push(`${attrIndent}${key}: ${this.
|
|
1975
|
+
lines.push(`${attrIndent}${key}: ${this.formatValueWithDedup(value)}`);
|
|
1689
1976
|
}
|
|
1690
1977
|
}
|
|
1691
1978
|
if (span.events.length > 0) {
|
|
1692
1979
|
for (const event of span.events) {
|
|
1693
1980
|
lines.push(`${attrIndent}[event] ${event.name}`);
|
|
1694
1981
|
if (event.attributes) {
|
|
1695
|
-
const eventAttrs =
|
|
1982
|
+
const eventAttrs = cleanAttributes(event.attributes);
|
|
1696
1983
|
for (const [key, value] of Object.entries(eventAttrs)) {
|
|
1697
|
-
lines.push(`${attrIndent} ${key}: ${this.
|
|
1984
|
+
lines.push(`${attrIndent} ${key}: ${this.formatValueWithDedup(value)}`);
|
|
1698
1985
|
}
|
|
1699
1986
|
}
|
|
1700
1987
|
}
|
|
@@ -1723,42 +2010,26 @@ var JudgeSpanDigestFormatter = class {
|
|
|
1723
2010
|
const continuation = isLast ? " " : "\u2502 ";
|
|
1724
2011
|
return "\u2502 ".repeat(depth - 1) + continuation + " ";
|
|
1725
2012
|
}
|
|
1726
|
-
|
|
1727
|
-
|
|
1728
|
-
|
|
1729
|
-
|
|
1730
|
-
|
|
1731
|
-
|
|
1732
|
-
"langwatch.scenario.name"
|
|
1733
|
-
];
|
|
1734
|
-
for (const [key, value] of Object.entries(attrs)) {
|
|
1735
|
-
if (excludedKeys.includes(key)) {
|
|
1736
|
-
continue;
|
|
1737
|
-
}
|
|
1738
|
-
const cleanKey = key.replace(/^(langwatch)\./, "");
|
|
1739
|
-
if (!seen.has(cleanKey)) {
|
|
1740
|
-
seen.add(cleanKey);
|
|
1741
|
-
cleaned[cleanKey] = value;
|
|
1742
|
-
}
|
|
1743
|
-
}
|
|
1744
|
-
return cleaned;
|
|
1745
|
-
}
|
|
1746
|
-
formatValue(value) {
|
|
1747
|
-
const processed = this.transformValue(value);
|
|
2013
|
+
/**
|
|
2014
|
+
* Formats a value with deduplication applied. Used by the `format()` method
|
|
2015
|
+
* to reduce token usage by replacing repeated strings with markers.
|
|
2016
|
+
*/
|
|
2017
|
+
formatValueWithDedup(value) {
|
|
2018
|
+
const processed = this.transformValueWithDedup(value);
|
|
1748
2019
|
return typeof processed === "string" ? processed : JSON.stringify(processed);
|
|
1749
2020
|
}
|
|
1750
|
-
|
|
2021
|
+
transformValueWithDedup(value) {
|
|
1751
2022
|
return deepTransform(value, (v) => {
|
|
1752
2023
|
const mediaPart = truncateMediaPart(v);
|
|
1753
2024
|
if (mediaPart) return mediaPart;
|
|
1754
2025
|
if (typeof v !== "string") return v;
|
|
1755
|
-
return this.
|
|
2026
|
+
return this.transformStringWithDedup(v);
|
|
1756
2027
|
});
|
|
1757
2028
|
}
|
|
1758
|
-
|
|
1759
|
-
if (
|
|
2029
|
+
transformStringWithDedup(str) {
|
|
2030
|
+
if (looksLikeJson(str)) {
|
|
1760
2031
|
try {
|
|
1761
|
-
const processed = this.
|
|
2032
|
+
const processed = this.transformValueWithDedup(JSON.parse(str));
|
|
1762
2033
|
return JSON.stringify(processed);
|
|
1763
2034
|
} catch {
|
|
1764
2035
|
}
|
|
@@ -1767,36 +2038,12 @@ var JudgeSpanDigestFormatter = class {
|
|
|
1767
2038
|
if (truncated !== str) return truncated;
|
|
1768
2039
|
return this.deduplicator.process(str);
|
|
1769
2040
|
}
|
|
1770
|
-
looksLikeJson(str) {
|
|
1771
|
-
const t = str.trim();
|
|
1772
|
-
return t.startsWith("{") && t.endsWith("}") || t.startsWith("[") && t.endsWith("]");
|
|
1773
|
-
}
|
|
1774
|
-
hrTimeToMs(hrTime) {
|
|
1775
|
-
return hrTime[0] * 1e3 + hrTime[1] / 1e6;
|
|
1776
|
-
}
|
|
1777
|
-
calculateSpanDuration(span) {
|
|
1778
|
-
return this.hrTimeToMs(span.endTime) - this.hrTimeToMs(span.startTime);
|
|
1779
|
-
}
|
|
1780
2041
|
calculateTotalDuration(spans) {
|
|
1781
2042
|
if (spans.length === 0) return 0;
|
|
1782
|
-
const first =
|
|
1783
|
-
const last = Math.max(...spans.map((s) =>
|
|
2043
|
+
const first = hrTimeToMs(spans[0].startTime);
|
|
2044
|
+
const last = Math.max(...spans.map((s) => hrTimeToMs(s.endTime)));
|
|
1784
2045
|
return last - first;
|
|
1785
2046
|
}
|
|
1786
|
-
formatDuration(ms) {
|
|
1787
|
-
if (ms < 1e3) return `${Math.round(ms)}ms`;
|
|
1788
|
-
return `${(ms / 1e3).toFixed(2)}s`;
|
|
1789
|
-
}
|
|
1790
|
-
formatTimestamp(hrTime) {
|
|
1791
|
-
const ms = this.hrTimeToMs(hrTime);
|
|
1792
|
-
return new Date(ms).toISOString();
|
|
1793
|
-
}
|
|
1794
|
-
getStatusIndicator(span) {
|
|
1795
|
-
if (span.status.code === 2) {
|
|
1796
|
-
return ` \u26A0\uFE0F ERROR: ${span.status.message ?? "unknown"}`;
|
|
1797
|
-
}
|
|
1798
|
-
return "";
|
|
1799
|
-
}
|
|
1800
2047
|
collectErrors(spans) {
|
|
1801
2048
|
return spans.filter((s) => s.status.code === 2).map((s) => `- ${s.name}: ${s.status.message ?? "unknown error"}`);
|
|
1802
2049
|
}
|
|
@@ -1859,15 +2106,42 @@ function buildFinishTestTool(criteria) {
|
|
|
1859
2106
|
})
|
|
1860
2107
|
});
|
|
1861
2108
|
}
|
|
2109
|
+
function buildProgressiveDiscoveryTools(spans) {
|
|
2110
|
+
return {
|
|
2111
|
+
expand_trace: (0, import_ai2.tool)({
|
|
2112
|
+
description: "Expand one or more spans to see their full details (attributes, events, content). Use a single index like 5 or a range like '10-15'.",
|
|
2113
|
+
inputSchema: import_v44.z.object({
|
|
2114
|
+
index: import_v44.z.number().optional().describe("Single span index to expand"),
|
|
2115
|
+
range: import_v44.z.string().optional().describe('Range of span indices to expand, e.g. "10-15"')
|
|
2116
|
+
}),
|
|
2117
|
+
execute: async ({ index, range }) => {
|
|
2118
|
+
return expandTrace(spans, { index, range });
|
|
2119
|
+
}
|
|
2120
|
+
}),
|
|
2121
|
+
grep_trace: (0, import_ai2.tool)({
|
|
2122
|
+
description: "Search across all span attributes, events, and content for a pattern (case-insensitive). Returns matching spans with context.",
|
|
2123
|
+
inputSchema: import_v44.z.object({
|
|
2124
|
+
pattern: import_v44.z.string().describe("Search pattern (case-insensitive)")
|
|
2125
|
+
}),
|
|
2126
|
+
execute: async ({ pattern }) => {
|
|
2127
|
+
return grepTrace(spans, pattern);
|
|
2128
|
+
}
|
|
2129
|
+
})
|
|
2130
|
+
};
|
|
2131
|
+
}
|
|
1862
2132
|
var JudgeAgent = class extends JudgeAgentAdapter {
|
|
1863
2133
|
constructor(cfg) {
|
|
1864
2134
|
super();
|
|
1865
2135
|
this.cfg = cfg;
|
|
1866
2136
|
this.criteria = cfg.criteria ?? [];
|
|
1867
2137
|
this.spanCollector = cfg.spanCollector ?? judgeSpanCollector;
|
|
2138
|
+
this.tokenThreshold = cfg.tokenThreshold ?? DEFAULT_TOKEN_THRESHOLD;
|
|
2139
|
+
this.maxDiscoverySteps = cfg.maxDiscoverySteps ?? 10;
|
|
1868
2140
|
}
|
|
1869
2141
|
logger = new Logger("JudgeAgent");
|
|
1870
2142
|
spanCollector;
|
|
2143
|
+
tokenThreshold;
|
|
2144
|
+
maxDiscoverySteps;
|
|
1871
2145
|
role = "Judge" /* JUDGE */;
|
|
1872
2146
|
criteria;
|
|
1873
2147
|
/**
|
|
@@ -1875,7 +2149,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1875
2149
|
*/
|
|
1876
2150
|
invokeLLM = createLLMInvoker(this.logger);
|
|
1877
2151
|
async call(input) {
|
|
1878
|
-
var _a
|
|
2152
|
+
var _a;
|
|
1879
2153
|
const criteria = ((_a = input.judgmentRequest) == null ? void 0 : _a.criteria) ?? this.criteria;
|
|
1880
2154
|
this.logger.debug("call() invoked", {
|
|
1881
2155
|
threadId: input.threadId,
|
|
@@ -1883,8 +2157,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1883
2157
|
maxTurns: input.scenarioConfig.maxTurns,
|
|
1884
2158
|
judgmentRequest: input.judgmentRequest
|
|
1885
2159
|
});
|
|
1886
|
-
const
|
|
1887
|
-
|
|
2160
|
+
const spans = this.spanCollector.getSpansForThread(input.threadId);
|
|
2161
|
+
const { digest, isLargeTrace } = this.buildTraceDigest(spans);
|
|
1888
2162
|
const transcript = JudgeUtils.buildTranscriptFromMessages(input.messages);
|
|
1889
2163
|
const contentForJudge = `
|
|
1890
2164
|
<transcript>
|
|
@@ -1908,7 +2182,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1908
2182
|
});
|
|
1909
2183
|
const tools = {
|
|
1910
2184
|
continue_test: buildContinueTestTool(),
|
|
1911
|
-
finish_test: buildFinishTestTool(criteria)
|
|
2185
|
+
finish_test: buildFinishTestTool(criteria),
|
|
2186
|
+
...isLargeTrace ? buildProgressiveDiscoveryTools(spans) : {}
|
|
1912
2187
|
};
|
|
1913
2188
|
const enforceJudgement = input.judgmentRequest != null;
|
|
1914
2189
|
const hasCriteria = criteria.length && criteria.length > 0;
|
|
@@ -1925,26 +2200,70 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1925
2200
|
model: mergedConfig.model,
|
|
1926
2201
|
toolChoice,
|
|
1927
2202
|
isLastMessage,
|
|
1928
|
-
enforceJudgement
|
|
2203
|
+
enforceJudgement,
|
|
2204
|
+
isLargeTrace
|
|
1929
2205
|
});
|
|
1930
|
-
const completion = await this.
|
|
2206
|
+
const completion = await this.invokeLLMWithDiscovery({
|
|
1931
2207
|
model: mergedConfig.model,
|
|
1932
2208
|
messages,
|
|
1933
2209
|
temperature: mergedConfig.temperature ?? 0,
|
|
1934
2210
|
maxOutputTokens: mergedConfig.maxTokens,
|
|
1935
2211
|
tools,
|
|
1936
|
-
toolChoice
|
|
2212
|
+
toolChoice,
|
|
2213
|
+
isLargeTrace
|
|
2214
|
+
});
|
|
2215
|
+
return this.parseToolCalls(completion, criteria);
|
|
2216
|
+
}
|
|
2217
|
+
/**
|
|
2218
|
+
* Builds the trace digest, choosing between full inline rendering
|
|
2219
|
+
* and structure-only mode based on estimated token count.
|
|
2220
|
+
*/
|
|
2221
|
+
buildTraceDigest(spans) {
|
|
2222
|
+
const fullDigest = judgeSpanDigestFormatter.format(spans);
|
|
2223
|
+
const isLargeTrace = spans.length > 0 && estimateTokens(fullDigest) > this.tokenThreshold;
|
|
2224
|
+
const digest = isLargeTrace ? judgeSpanDigestFormatter.formatStructureOnly(spans) + "\n\nUse expand_trace(spanIndex) to see span details or grep_trace(pattern) to search across spans." : fullDigest;
|
|
2225
|
+
this.logger.debug("Trace digest built", {
|
|
2226
|
+
isLargeTrace,
|
|
2227
|
+
estimatedTokens: estimateTokens(fullDigest)
|
|
1937
2228
|
});
|
|
2229
|
+
return { digest, isLargeTrace };
|
|
2230
|
+
}
|
|
2231
|
+
/**
|
|
2232
|
+
* Invokes the LLM, enabling multi-step tool execution for large traces.
|
|
2233
|
+
* In multi-step mode, the AI SDK loops automatically: the judge can call
|
|
2234
|
+
* expand_trace/grep_trace tools multiple times before reaching a terminal
|
|
2235
|
+
* tool (finish_test/continue_test) or hitting the step limit.
|
|
2236
|
+
*/
|
|
2237
|
+
async invokeLLMWithDiscovery({
|
|
2238
|
+
isLargeTrace,
|
|
2239
|
+
...params
|
|
2240
|
+
}) {
|
|
2241
|
+
var _a, _b;
|
|
2242
|
+
if (isLargeTrace) {
|
|
2243
|
+
params.stopWhen = [
|
|
2244
|
+
(0, import_ai2.stepCountIs)(this.maxDiscoverySteps),
|
|
2245
|
+
(0, import_ai2.hasToolCall)("finish_test"),
|
|
2246
|
+
(0, import_ai2.hasToolCall)("continue_test")
|
|
2247
|
+
];
|
|
2248
|
+
}
|
|
2249
|
+
const completion = await this.invokeLLM(params);
|
|
1938
2250
|
this.logger.debug("LLM response received", {
|
|
1939
|
-
toolCallCount: ((
|
|
1940
|
-
toolCalls: (
|
|
2251
|
+
toolCallCount: ((_a = completion.toolCalls) == null ? void 0 : _a.length) ?? 0,
|
|
2252
|
+
toolCalls: (_b = completion.toolCalls) == null ? void 0 : _b.map((tc) => ({
|
|
1941
2253
|
toolName: tc.toolName,
|
|
1942
2254
|
args: tc.input
|
|
1943
2255
|
}))
|
|
1944
2256
|
});
|
|
2257
|
+
return completion;
|
|
2258
|
+
}
|
|
2259
|
+
parseToolCalls(completion, criteria) {
|
|
2260
|
+
var _a;
|
|
1945
2261
|
let args;
|
|
1946
|
-
if ((
|
|
1947
|
-
const
|
|
2262
|
+
if ((_a = completion.toolCalls) == null ? void 0 : _a.length) {
|
|
2263
|
+
const terminalCall = completion.toolCalls.find(
|
|
2264
|
+
(tc) => tc.toolName === "finish_test" || tc.toolName === "continue_test"
|
|
2265
|
+
);
|
|
2266
|
+
const toolCall = terminalCall ?? completion.toolCalls[0];
|
|
1948
2267
|
switch (toolCall.toolName) {
|
|
1949
2268
|
case "finish_test": {
|
|
1950
2269
|
args = toolCall.input;
|
|
@@ -1986,11 +2305,6 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1986
2305
|
unmetCriteria: criteria
|
|
1987
2306
|
};
|
|
1988
2307
|
}
|
|
1989
|
-
getOpenTelemetryTracesDigest(threadId) {
|
|
1990
|
-
const spans = this.spanCollector.getSpansForThread(threadId);
|
|
1991
|
-
const digest = judgeSpanDigestFormatter.format(spans);
|
|
1992
|
-
return digest;
|
|
1993
|
-
}
|
|
1994
2308
|
};
|
|
1995
2309
|
var judgeAgent = (cfg) => {
|
|
1996
2310
|
return new JudgeAgent(cfg ?? {});
|
|
@@ -4731,6 +5045,7 @@ var index_default = scenario;
|
|
|
4731
5045
|
AgentAdapter,
|
|
4732
5046
|
AgentRole,
|
|
4733
5047
|
DEFAULT_MAX_TURNS,
|
|
5048
|
+
DEFAULT_TOKEN_THRESHOLD,
|
|
4734
5049
|
DEFAULT_VERBOSE,
|
|
4735
5050
|
JudgeAgentAdapter,
|
|
4736
5051
|
JudgeSpanCollector,
|
|
@@ -4743,7 +5058,10 @@ var index_default = scenario;
|
|
|
4743
5058
|
agent,
|
|
4744
5059
|
allAgentRoles,
|
|
4745
5060
|
defineConfig,
|
|
5061
|
+
estimateTokens,
|
|
5062
|
+
expandTrace,
|
|
4746
5063
|
fail,
|
|
5064
|
+
grepTrace,
|
|
4747
5065
|
judge,
|
|
4748
5066
|
judgeAgent,
|
|
4749
5067
|
judgeSpanCollector,
|