@langwatch/scenario 0.4.3 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +93 -13
- package/dist/index.d.ts +93 -13
- package/dist/index.js +462 -144
- package/dist/index.mjs +463 -145
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -944,9 +944,13 @@ var init_esm = __esm({
|
|
|
944
944
|
// src/agents/index.ts
|
|
945
945
|
var agents_exports = {};
|
|
946
946
|
__export(agents_exports, {
|
|
947
|
+
DEFAULT_TOKEN_THRESHOLD: () => DEFAULT_TOKEN_THRESHOLD,
|
|
947
948
|
JudgeSpanCollector: () => JudgeSpanCollector,
|
|
948
949
|
JudgeSpanDigestFormatter: () => JudgeSpanDigestFormatter,
|
|
949
950
|
RealtimeAgentAdapter: () => RealtimeAgentAdapter,
|
|
951
|
+
estimateTokens: () => estimateTokens,
|
|
952
|
+
expandTrace: () => expandTrace,
|
|
953
|
+
grepTrace: () => grepTrace,
|
|
950
954
|
judgeAgent: () => judgeAgent,
|
|
951
955
|
judgeSpanCollector: () => judgeSpanCollector,
|
|
952
956
|
judgeSpanDigestFormatter: () => judgeSpanDigestFormatter,
|
|
@@ -954,7 +958,11 @@ __export(agents_exports, {
|
|
|
954
958
|
});
|
|
955
959
|
|
|
956
960
|
// src/agents/judge/judge-agent.ts
|
|
957
|
-
import {
|
|
961
|
+
import {
|
|
962
|
+
tool,
|
|
963
|
+
stepCountIs,
|
|
964
|
+
hasToolCall
|
|
965
|
+
} from "ai";
|
|
958
966
|
import { z as z4 } from "zod/v4";
|
|
959
967
|
|
|
960
968
|
// src/agents/judge/judge-utils.ts
|
|
@@ -1026,6 +1034,283 @@ var JudgeUtils = {
|
|
|
1026
1034
|
}
|
|
1027
1035
|
};
|
|
1028
1036
|
|
|
1037
|
+
// src/agents/judge/estimate-tokens.ts
|
|
1038
|
+
var DEFAULT_TOKEN_THRESHOLD = 8192;
|
|
1039
|
+
/**
 * Produces a rough estimate of how many tokens a piece of text occupies.
 *
 * The estimate is the UTF-8 byte length of the text divided by four,
 * rounded up to the next whole number.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count (0 for an empty string).
 */
function estimateTokens(text) {
  const BYTES_PER_TOKEN = 4;
  const utf8Bytes = new TextEncoder().encode(text);
  return Math.ceil(utf8Bytes.byteLength / BYTES_PER_TOKEN);
}
|
|
1043
|
+
|
|
1044
|
+
// src/agents/judge/span-utils.ts
|
|
1045
|
+
import { attributes } from "langwatch/observability";
|
|
1046
|
+
|
|
1047
|
+
// src/agents/judge/deep-transform.ts
|
|
1048
|
+
/**
 * Recursively applies `fn` to a value and everything nested inside it.
 *
 * `fn` is called on the value first. If it returns something other than the
 * value it was given (compared with `!==`), that replacement is returned
 * as-is and is not descended into. Otherwise arrays are mapped element by
 * element and objects are copied key by key, each child going through the
 * same process. Any other value is returned unchanged.
 *
 * @param {*} value - Value to transform.
 * @param {(v: *) => *} fn - Transformer; return the input itself to leave it alone.
 * @returns {*} The transformed value; arrays and objects are fresh copies.
 */
function deepTransform(value, fn) {
  const replacement = fn(value);
  if (replacement !== value) {
    return replacement;
  }
  if (Array.isArray(value)) {
    return value.map((item) => deepTransform(item, fn));
  }
  if (value === null || typeof value !== "object") {
    return value;
  }
  const copy = {};
  Object.entries(value).forEach(([key, child]) => {
    copy[key] = deepTransform(child, fn);
  });
  return copy;
}
|
|
1063
|
+
|
|
1064
|
+
// src/agents/judge/truncate-media.ts
|
|
1065
|
+
/**
 * Replaces a base64 `data:` URL for image, audio or video content with a
 * short placeholder.
 *
 * The placeholder names the upper-cased media category, the MIME type and
 * the length of the base64 payload in characters (labelled "~bytes").
 * Strings that are not such a data URL are returned untouched.
 *
 * @param {string} str - Candidate string.
 * @returns {string} Placeholder text, or `str` itself when it does not match.
 */
function truncateMediaUrl(str) {
  const mediaDataUrl = /^data:((image|audio|video)\/[a-z0-9+.-]+);base64,(.+)$/i;
  const parsed = str.match(mediaDataUrl);
  if (!parsed) {
    return str;
  }
  const mimeType = parsed[1];
  const category = parsed[2];
  const payload = parsed[3];
  return `[${category.toUpperCase()}: ${mimeType}, ~${payload.length} bytes]`;
}
|
|
1073
|
+
/**
 * Shrinks a message content part that carries inline media.
 *
 * Two shapes are recognised:
 * - `{ type: "file", mediaType: string, data: string }`: `data` is replaced
 *   by a placeholder built from the media type and the data length.
 * - `{ type: "image", image: string }`: `image` is replaced when it is a
 *   base64 image data URL, or when it is longer than 1000 characters and
 *   consists only of base64 alphabet characters.
 *
 * All other properties of the part are copied over unchanged.
 *
 * @param {*} v - Candidate value.
 * @returns {object|null} A shallow copy with the payload replaced, or `null`
 *   when `v` is not a recognised media part.
 */
function truncateMediaPart(v) {
  const isObjectPart = v !== null && typeof v === "object" && !Array.isArray(v);
  if (!isObjectPart) {
    return null;
  }
  const part = v;

  const isFilePart =
    part.type === "file" &&
    typeof part.mediaType === "string" &&
    typeof part.data === "string";
  if (isFilePart) {
    const topLevelType = part.mediaType.split("/")[0];
    const category = topLevelType?.toUpperCase() ?? "FILE";
    return {
      ...part,
      data: `[${category}: ${part.mediaType}, ~${part.data.length} bytes]`
    };
  }

  if (part.type !== "image" || typeof part.image !== "string") {
    return null;
  }
  const rawImage = part.image;
  const dataUrl = rawImage.match(/^data:((image)\/[a-z0-9+.-]+);base64,(.+)$/i);
  if (dataUrl) {
    return {
      ...part,
      image: `[IMAGE: ${dataUrl[1]}, ~${dataUrl[3].length} bytes]`
    };
  }
  // Long strings made up solely of base64 characters are treated as raw image payloads.
  const looksLikeRawBase64 = rawImage.length > 1e3 && /^[A-Za-z0-9+/=]+$/.test(rawImage);
  return looksLikeRawBase64
    ? { ...part, image: `[IMAGE: unknown, ~${rawImage.length} bytes]` }
    : null;
}
|
|
1104
|
+
|
|
1105
|
+
// src/agents/judge/span-utils.ts
|
|
1106
|
+
/**
 * Converts a two-element high-resolution time value to milliseconds.
 *
 * The first element is multiplied by 1000 and the second divided by
 * 1,000,000 before the two are added.
 *
 * @param {[number, number]} hrTime - Time pair to convert.
 * @returns {number} The time in milliseconds.
 */
function hrTimeToMs(hrTime) {
  const wholePart = hrTime[0];
  const fractionalPart = hrTime[1];
  return wholePart * 1e3 + fractionalPart / 1e6;
}
|
|
1109
|
+
/**
 * Renders a duration in milliseconds as a short human-readable string.
 *
 * Values below 1000 are rounded to whole milliseconds ("42ms"); everything
 * else is shown in seconds with two decimals ("1.50s").
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} Formatted duration.
 */
function formatDuration(ms) {
  return ms < 1e3 ? `${Math.round(ms)}ms` : `${(ms / 1e3).toFixed(2)}s`;
}
|
|
1113
|
+
/**
 * Computes how long a span lasted.
 *
 * @param {{startTime: [number, number], endTime: [number, number]}} span - Span to measure.
 * @returns {number} `endTime` minus `startTime`, in milliseconds.
 */
function calculateSpanDuration(span) {
  const endedAtMs = hrTimeToMs(span.endTime);
  const startedAtMs = hrTimeToMs(span.startTime);
  return endedAtMs - startedAtMs;
}
|
|
1116
|
+
/**
 * Builds the error suffix shown after a span's header line.
 *
 * @param {{status: {code: number, message?: string}}} span - Span to inspect.
 * @returns {string} A warning marker followed by the status message (or
 *   "unknown") when the status code is 2; an empty string otherwise.
 */
function getStatusIndicator(span) {
  // Status code 2 is the only code rendered as an error.
  if (span.status.code !== 2) {
    return "";
  }
  const reason = span.status.message ?? "unknown";
  return ` \u26A0\uFE0F ERROR: ${reason}`;
}
|
|
1122
|
+
/**
 * Builds the token-usage suffix for a span header.
 *
 * Reads the `gen_ai.usage.input_tokens` and `gen_ai.usage.output_tokens`
 * attributes. Values that do not convert to a number count as 0.
 *
 * @param {{attributes: Record<string, *>}} span - Span to inspect.
 * @returns {string} `", <total> tokens"`, or an empty string when neither
 *   attribute is present.
 */
function getTokenUsage(span) {
  const inputTokens = span.attributes["gen_ai.usage.input_tokens"];
  const outputTokens = span.attributes["gen_ai.usage.output_tokens"];
  const hasUsage = inputTokens != null || outputTokens != null;
  if (!hasUsage) {
    return "";
  }
  const toCount = (raw) => Number(raw) || 0;
  return `, ${toCount(inputTokens) + toCount(outputTokens)} tokens`;
}
|
|
1129
|
+
/**
 * Prepares span or event attributes for display.
 *
 * Drops the thread-id, scenario-id and scenario-name attributes, and strips
 * a leading `langwatch.` from the remaining keys. When two keys collapse to
 * the same stripped name, the first one encountered wins.
 *
 * @param {Record<string, *>} attrs - Raw attributes.
 * @returns {Record<string, *>} New object holding the cleaned attributes.
 */
function cleanAttributes(attrs) {
  const hiddenKeys = [
    attributes.ATTR_LANGWATCH_THREAD_ID,
    "langwatch.scenario.id",
    "langwatch.scenario.name"
  ];
  const claimedKeys = /* @__PURE__ */ new Set();
  const result = {};
  Object.entries(attrs)
    .filter(([rawKey]) => !hiddenKeys.includes(rawKey))
    .forEach(([rawKey, rawValue]) => {
      const shortKey = rawKey.replace(/^(langwatch)\./, "");
      if (claimedKeys.has(shortKey)) {
        return;
      }
      claimedKeys.add(shortKey);
      result[shortKey] = rawValue;
    });
  return result;
}
|
|
1149
|
+
/**
 * Converts an attribute value to display text.
 *
 * The value is first passed through `transformValue`; a string result is
 * used directly, anything else is serialised with `JSON.stringify`.
 *
 * @param {*} value - Attribute value.
 * @returns {string} Text representation of the transformed value.
 */
function formatValue(value) {
  const processed = transformValue(value);
  if (typeof processed === "string") {
    return processed;
  }
  return JSON.stringify(processed);
}
|
|
1153
|
+
/**
 * Deeply rewrites a value so inline media payloads are replaced by
 * placeholders.
 *
 * For every nested value, in order:
 * 1. media content parts are replaced via `truncateMediaPart`;
 * 2. non-strings are left alone;
 * 3. media data URLs are replaced via `truncateMediaUrl`;
 * 4. strings that look like JSON are parsed, transformed recursively and
 *    re-serialised. If that fails the string is kept as-is.
 *
 * @param {*} value - Value to transform.
 * @returns {*} Transformed copy of the value.
 */
function transformValue(value) {
  const shrinkMedia = (candidate) => {
    const mediaPart = truncateMediaPart(candidate);
    if (mediaPart) {
      return mediaPart;
    }
    if (typeof candidate !== "string") {
      return candidate;
    }
    const shortened = truncateMediaUrl(candidate);
    if (shortened !== candidate) {
      return shortened;
    }
    if (!looksLikeJson(candidate)) {
      return candidate;
    }
    try {
      return JSON.stringify(transformValue(JSON.parse(candidate)));
    } catch {
      // Best effort: text that merely resembles JSON stays untouched.
      return candidate;
    }
  };
  return deepTransform(value, shrinkMedia);
}
|
|
1170
|
+
/**
 * Cheap check for whether a string might hold a JSON object or array.
 *
 * Only the first and last non-whitespace characters are examined; the
 * content in between is not validated.
 *
 * @param {string} str - Text to inspect.
 * @returns {boolean} True when the trimmed text is wrapped in `{}` or `[]`.
 */
function looksLikeJson(str) {
  const trimmed = str.trim();
  const isWrappedIn = (open, close) => trimmed.startsWith(open) && trimmed.endsWith(close);
  return isWrappedIn("{", "}") || isWrappedIn("[", "]");
}
|
|
1174
|
+
/**
 * Orders spans by start time and assigns each a 1-based index.
 *
 * The input array is not modified. Every returned node has an empty
 * `children` array.
 *
 * NOTE(review): indices here follow plain start-time order, whereas
 * `renderStructureNode` numbers spans while walking the span hierarchy
 * depth-first. Confirm that both numberings agree for traces whose sibling
 * subtrees overlap in time.
 *
 * @param {Array<{startTime: [number, number]}>} spans - Spans to index.
 * @returns {Array<{span: object, children: Array, index: number}>} Indexed nodes.
 */
function indexSpans(spans) {
  const byStartTime = [...spans].sort(
    (left, right) => hrTimeToMs(left.startTime) - hrTimeToMs(right.startTime)
  );
  return byStartTime.map((span, position) => {
    const node = { span, children: [], index: position + 1 };
    return node;
  });
}
|
|
1184
|
+
|
|
1185
|
+
// src/agents/judge/trace-tools.ts
|
|
1186
|
+
var TOOL_RESULT_TOKEN_BUDGET = 4096;
|
|
1187
|
+
var TOOL_RESULT_CHAR_BUDGET = TOOL_RESULT_TOKEN_BUDGET * 4;
|
|
1188
|
+
var MAX_GREP_MATCHES = 20;
|
|
1189
|
+
/**
 * Renders one indexed span with all of its details as text lines.
 *
 * The first line holds the index, ISO start timestamp, span name, duration
 * and error indicator. It is followed by one line per cleaned attribute,
 * then by each event name together with that event's cleaned attributes.
 *
 * @param {{span: object, index: number}} node - Indexed span node.
 * @returns {string[]} Lines describing the span.
 */
function renderFullSpanNode(node) {
  const span = node.span;
  const elapsedMs = calculateSpanDuration(span);
  const startedAt = new Date(hrTimeToMs(span.startTime)).toISOString();
  const statusSuffix = getStatusIndicator(span);
  const lines = [
    `[${node.index}] ${startedAt} ${span.name} (${formatDuration(elapsedMs)})${statusSuffix}`
  ];

  Object.entries(cleanAttributes(span.attributes)).forEach(([key, value]) => {
    lines.push(` ${key}: ${formatValue(value)}`);
  });

  const events = span.events.length > 0 ? span.events : [];
  for (const event of events) {
    lines.push(` [event] ${event.name}`);
    if (!event.attributes) {
      continue;
    }
    Object.entries(cleanAttributes(event.attributes)).forEach(([key, value]) => {
      lines.push(` ${key}: ${formatValue(value)}`);
    });
  }
  return lines;
}
|
|
1217
|
+
/**
 * Caps tool output at `TOOL_RESULT_CHAR_BUDGET` characters.
 *
 * Text within the budget is returned unchanged. Longer text is cut at the
 * budget and followed by a notice telling the caller how to narrow the
 * request.
 *
 * The cut is moved back by one UTF-16 code unit when it would otherwise land
 * between the two halves of a surrogate pair, so the kept text never ends in
 * a dangling high surrogate.
 *
 * @param {string} text - Tool output to cap.
 * @returns {string} The original text, or its truncated form plus the notice.
 */
function truncateToCharBudget(text) {
  if (text.length <= TOOL_RESULT_CHAR_BUDGET) return text;
  let cutAt = TOOL_RESULT_CHAR_BUDGET;
  const lastKeptUnit = text.charCodeAt(cutAt - 1);
  // 0xD800-0xDBFF is the high-surrogate range; keeping one here would split a character.
  if (lastKeptUnit >= 0xd800 && lastKeptUnit <= 0xdbff) {
    cutAt -= 1;
  }
  const truncated = text.slice(0, cutAt);
  return truncated + "\n\n[TRUNCATED] Output exceeded ~4000 token budget. Use grep_trace(pattern) to search for specific content, or expand_trace with a narrower range.";
}
|
|
1222
|
+
/**
 * Flattens a span into newline-separated text for searching.
 *
 * The text holds the span name, one `key: value` line per cleaned span
 * attribute, and for every event its name followed by its cleaned
 * attributes in the same `key: value` form.
 *
 * @param {{name: string, attributes: object, events: Array}} span - Span to flatten.
 * @returns {string} Searchable text, one entry per line.
 */
function spanToSearchableText(span) {
  const describe = (rawAttrs) =>
    Object.entries(cleanAttributes(rawAttrs)).map(
      ([key, value]) => `${key}: ${formatValue(value)}`
    );

  const parts = [span.name];
  describe(span.attributes).forEach((entry) => parts.push(entry));
  for (const event of span.events) {
    parts.push(event.name);
    if (event.attributes) {
      describe(event.attributes).forEach((entry) => parts.push(entry));
    }
  }
  return parts.join("\n");
}
|
|
1239
|
+
/**
 * Renders the full details of one span, or of a contiguous range of spans.
 *
 * Spans are addressed by the 1-based index assigned by `indexSpans`. When
 * both `range` and `index` are given, `range` takes precedence. Problems are
 * reported as strings starting with "Error:" rather than thrown.
 *
 * Non-integer input (for example `range: "abc"` or `index: 1.5`) is rejected
 * explicitly. Such values would otherwise slip through the bounds
 * comparisons and produce an empty result.
 *
 * @param {Array<object>} spans - Spans of the trace.
 * @param {{index?: number, range?: string}} [selection] - Either a single
 *   span index or a range written as "start-end".
 * @returns {string} Rendered span details (capped by `truncateToCharBudget`),
 *   "No spans recorded." for an empty trace, or an error message.
 */
function expandTrace(spans, { index, range } = {}) {
  const nodes = indexSpans(spans);
  if (nodes.length === 0) {
    return "No spans recorded.";
  }
  let startIdx;
  let endIdx;
  if (range != null) {
    const parts = String(range).split("-").map(Number);
    startIdx = parts[0];
    // A range without a dash ("7") addresses a single span.
    endIdx = parts[1] ?? startIdx;
  } else if (index != null) {
    startIdx = index;
    endIdx = index;
  } else {
    return "Error: provide either index or range parameter.";
  }
  const maxIndex = nodes.length;
  // NaN and fractional values make every comparison below false, so catch them first.
  if (!Number.isInteger(startIdx) || !Number.isInteger(endIdx)) {
    return `Error: invalid span index or range. Use a single index like 5 or a range like "10-15". Valid range is 1-${maxIndex}.`;
  }
  if (startIdx < 1 || endIdx > maxIndex || startIdx > endIdx) {
    return `Error: span index out of range. Valid range is 1-${maxIndex}.`;
  }
  const selected = nodes.filter(
    (n) => n.index >= startIdx && n.index <= endIdx
  );
  const lines = [];
  for (const node of selected) {
    lines.push(...renderFullSpanNode(node));
    // Blank line between consecutive spans.
    lines.push("");
  }
  return truncateToCharBudget(lines.join("\n").trimEnd());
}
|
|
1271
|
+
/**
 * Searches every span of a trace for a literal, case-insensitive pattern.
 *
 * Each span is flattened with `spanToSearchableText` and tested line by
 * line. For every span with at least one hit, a header and the matching
 * lines are emitted. At most `MAX_GREP_MATCHES` spans are shown, followed by
 * a note counting the rest. When nothing matches, the distinct span names
 * are listed instead.
 *
 * @param {Array<object>} spans - Spans of the trace.
 * @param {string} pattern - Text to look for; regex metacharacters are escaped.
 * @returns {string} Search report (capped by `truncateToCharBudget`), or
 *   "No spans recorded." for an empty trace.
 */
function grepTrace(spans, pattern) {
  const nodes = indexSpans(spans);
  if (nodes.length === 0) {
    return "No spans recorded.";
  }

  const matcher = new RegExp(escapeRegex(pattern), "i");
  const hits = nodes
    .map((node) => ({
      node,
      matchingLines: spanToSearchableText(node.span)
        .split("\n")
        .filter((line) => matcher.test(line))
    }))
    .filter((hit) => hit.matchingLines.length > 0);

  if (hits.length === 0) {
    const spanNames = Array.from(new Set(nodes.map((n) => n.span.name)));
    return `No matches found for "${pattern}". Available span names: ${spanNames.join(", ")}`;
  }

  const report = [];
  hits.slice(0, MAX_GREP_MATCHES).forEach(({ node, matchingLines }) => {
    const elapsedMs = calculateSpanDuration(node.span);
    report.push(`--- [${node.index}] ${node.span.name} (${formatDuration(elapsedMs)}) ---`);
    matchingLines.forEach((line) => report.push(` ${line}`));
    report.push("");
  });

  const totalMatches = hits.length;
  if (totalMatches > MAX_GREP_MATCHES) {
    report.push(
      `[${totalMatches - MAX_GREP_MATCHES} more matches omitted. Refine your search pattern for more specific results.]`
    );
  }
  return truncateToCharBudget(report.join("\n").trimEnd());
}
|
|
1310
|
+
/**
 * Escapes regular-expression metacharacters so a string can be matched
 * literally inside a `RegExp`.
 *
 * @param {string} str - Raw text.
 * @returns {string} Text with each metacharacter prefixed by a backslash.
 */
function escapeRegex(str) {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return str.replace(metacharacters, (ch) => `\\${ch}`);
}
|
|
1313
|
+
|
|
1029
1314
|
// src/config/env.ts
|
|
1030
1315
|
import { z } from "zod/v4";
|
|
1031
1316
|
|
|
@@ -1381,7 +1666,7 @@ var criterionToParamName = (criterion) => {
|
|
|
1381
1666
|
};
|
|
1382
1667
|
|
|
1383
1668
|
// src/agents/judge/judge-span-collector.ts
|
|
1384
|
-
import { attributes } from "langwatch/observability";
|
|
1669
|
+
import { attributes as attributes2 } from "langwatch/observability";
|
|
1385
1670
|
var JudgeSpanCollector = class {
|
|
1386
1671
|
spans = [];
|
|
1387
1672
|
onStart() {
|
|
@@ -1424,7 +1709,7 @@ var JudgeSpanCollector = class {
|
|
|
1424
1709
|
const spanId = span.spanContext().spanId;
|
|
1425
1710
|
if (visited.has(spanId)) return false;
|
|
1426
1711
|
visited.add(spanId);
|
|
1427
|
-
if (span.attributes[
|
|
1712
|
+
if (span.attributes[attributes2.ATTR_LANGWATCH_THREAD_ID] === threadId) {
|
|
1428
1713
|
return true;
|
|
1429
1714
|
}
|
|
1430
1715
|
const parentId = getParentSpanId(span);
|
|
@@ -1443,26 +1728,6 @@ function getParentSpanId(span) {
|
|
|
1443
1728
|
}
|
|
1444
1729
|
var judgeSpanCollector = new JudgeSpanCollector();
|
|
1445
1730
|
|
|
1446
|
-
// src/agents/judge/judge-span-digest-formatter.ts
|
|
1447
|
-
import { attributes as attributes2 } from "langwatch/observability";
|
|
1448
|
-
|
|
1449
|
-
// src/agents/judge/deep-transform.ts
|
|
1450
|
-
function deepTransform(value, fn) {
|
|
1451
|
-
const result = fn(value);
|
|
1452
|
-
if (result !== value) return result;
|
|
1453
|
-
if (Array.isArray(value)) {
|
|
1454
|
-
return value.map((v) => deepTransform(v, fn));
|
|
1455
|
-
}
|
|
1456
|
-
if (value !== null && typeof value === "object") {
|
|
1457
|
-
const out = {};
|
|
1458
|
-
for (const [k, v] of Object.entries(value)) {
|
|
1459
|
-
out[k] = deepTransform(v, fn);
|
|
1460
|
-
}
|
|
1461
|
-
return out;
|
|
1462
|
-
}
|
|
1463
|
-
return value;
|
|
1464
|
-
}
|
|
1465
|
-
|
|
1466
1731
|
// src/agents/judge/string-deduplicator.ts
|
|
1467
1732
|
var StringDeduplicator = class {
|
|
1468
1733
|
seen = /* @__PURE__ */ new Map();
|
|
@@ -1496,51 +1761,51 @@ var StringDeduplicator = class {
|
|
|
1496
1761
|
}
|
|
1497
1762
|
};
|
|
1498
1763
|
|
|
1499
|
-
// src/agents/judge/truncate-media.ts
|
|
1500
|
-
function truncateMediaUrl(str) {
|
|
1501
|
-
const match = str.match(
|
|
1502
|
-
/^data:((image|audio|video)\/[a-z0-9+.-]+);base64,(.+)$/i
|
|
1503
|
-
);
|
|
1504
|
-
if (!match) return str;
|
|
1505
|
-
const [, mimeType, category, data] = match;
|
|
1506
|
-
return `[${category.toUpperCase()}: ${mimeType}, ~${data.length} bytes]`;
|
|
1507
|
-
}
|
|
1508
|
-
function truncateMediaPart(v) {
|
|
1509
|
-
var _a;
|
|
1510
|
-
if (v === null || typeof v !== "object" || Array.isArray(v)) return null;
|
|
1511
|
-
const obj = v;
|
|
1512
|
-
if (obj.type === "file" && typeof obj.mediaType === "string" && typeof obj.data === "string") {
|
|
1513
|
-
const category = ((_a = obj.mediaType.split("/")[0]) == null ? void 0 : _a.toUpperCase()) ?? "FILE";
|
|
1514
|
-
return {
|
|
1515
|
-
...obj,
|
|
1516
|
-
data: `[${category}: ${obj.mediaType}, ~${obj.data.length} bytes]`
|
|
1517
|
-
};
|
|
1518
|
-
}
|
|
1519
|
-
if (obj.type === "image" && typeof obj.image === "string") {
|
|
1520
|
-
const imageData = obj.image;
|
|
1521
|
-
const dataUrlMatch = imageData.match(
|
|
1522
|
-
/^data:((image)\/[a-z0-9+.-]+);base64,(.+)$/i
|
|
1523
|
-
);
|
|
1524
|
-
if (dataUrlMatch) {
|
|
1525
|
-
return {
|
|
1526
|
-
...obj,
|
|
1527
|
-
image: `[IMAGE: ${dataUrlMatch[1]}, ~${dataUrlMatch[3].length} bytes]`
|
|
1528
|
-
};
|
|
1529
|
-
}
|
|
1530
|
-
if (imageData.length > 1e3 && /^[A-Za-z0-9+/=]+$/.test(imageData)) {
|
|
1531
|
-
return {
|
|
1532
|
-
...obj,
|
|
1533
|
-
image: `[IMAGE: unknown, ~${imageData.length} bytes]`
|
|
1534
|
-
};
|
|
1535
|
-
}
|
|
1536
|
-
}
|
|
1537
|
-
return null;
|
|
1538
|
-
}
|
|
1539
|
-
|
|
1540
1764
|
// src/agents/judge/judge-span-digest-formatter.ts
|
|
1541
1765
|
var JudgeSpanDigestFormatter = class {
|
|
1542
1766
|
logger = new Logger("JudgeSpanDigestFormatter");
|
|
1543
1767
|
deduplicator = new StringDeduplicator({ threshold: 50 });
|
|
1768
|
+
/**
|
|
1769
|
+
* Formats spans into a structure-only digest showing span tree hierarchy
|
|
1770
|
+
* without attributes, events, or content. Used for large traces that
|
|
1771
|
+
* exceed the token threshold, paired with expand_trace/grep_trace tools.
|
|
1772
|
+
*
|
|
1773
|
+
* @param spans - All spans for a thread
|
|
1774
|
+
* @returns Plain text digest with only structural information
|
|
1775
|
+
*/
|
|
1776
|
+
formatStructureOnly(spans) {
|
|
1777
|
+
this.logger.debug("formatStructureOnly() called", {
|
|
1778
|
+
spanCount: spans.length
|
|
1779
|
+
});
|
|
1780
|
+
if (spans.length === 0) {
|
|
1781
|
+
return "No spans recorded.";
|
|
1782
|
+
}
|
|
1783
|
+
const sortedSpans = this.sortByStartTime(spans);
|
|
1784
|
+
const tree = this.buildHierarchy(sortedSpans);
|
|
1785
|
+
const totalDuration = this.calculateTotalDuration(sortedSpans);
|
|
1786
|
+
const lines = [
|
|
1787
|
+
`Spans: ${spans.length} | Total Duration: ${formatDuration(totalDuration)}`,
|
|
1788
|
+
""
|
|
1789
|
+
];
|
|
1790
|
+
let sequence = 1;
|
|
1791
|
+
const rootCount = tree.length;
|
|
1792
|
+
tree.forEach((node, idx) => {
|
|
1793
|
+
sequence = this.renderStructureNode(
|
|
1794
|
+
node,
|
|
1795
|
+
lines,
|
|
1796
|
+
0,
|
|
1797
|
+
sequence,
|
|
1798
|
+
idx === rootCount - 1
|
|
1799
|
+
);
|
|
1800
|
+
});
|
|
1801
|
+
const errors = this.collectErrors(spans);
|
|
1802
|
+
if (errors.length > 0) {
|
|
1803
|
+
lines.push("");
|
|
1804
|
+
lines.push("=== ERRORS ===");
|
|
1805
|
+
errors.forEach((e) => lines.push(e));
|
|
1806
|
+
}
|
|
1807
|
+
return lines.join("\n");
|
|
1808
|
+
}
|
|
1544
1809
|
/**
|
|
1545
1810
|
* Formats spans into a complete digest with full content and nesting.
|
|
1546
1811
|
* @param spans - All spans for a thread
|
|
@@ -1564,7 +1829,7 @@ var JudgeSpanDigestFormatter = class {
|
|
|
1564
1829
|
totalDuration
|
|
1565
1830
|
});
|
|
1566
1831
|
const lines = [
|
|
1567
|
-
`Spans: ${spans.length} | Total Duration: ${
|
|
1832
|
+
`Spans: ${spans.length} | Total Duration: ${formatDuration(
|
|
1568
1833
|
totalDuration
|
|
1569
1834
|
)}`,
|
|
1570
1835
|
""
|
|
@@ -1590,9 +1855,7 @@ var JudgeSpanDigestFormatter = class {
|
|
|
1590
1855
|
}
|
|
1591
1856
|
sortByStartTime(spans) {
|
|
1592
1857
|
return [...spans].sort((a, b) => {
|
|
1593
|
-
|
|
1594
|
-
const bTime = this.hrTimeToMs(b.startTime);
|
|
1595
|
-
return aTime - bTime;
|
|
1858
|
+
return hrTimeToMs(a.startTime) - hrTimeToMs(b.startTime);
|
|
1596
1859
|
});
|
|
1597
1860
|
}
|
|
1598
1861
|
buildHierarchy(spans) {
|
|
@@ -1612,29 +1875,53 @@ var JudgeSpanDigestFormatter = class {
|
|
|
1612
1875
|
}
|
|
1613
1876
|
return roots;
|
|
1614
1877
|
}
|
|
1878
|
+
renderStructureNode(node, lines, depth, sequence, isLast = true) {
|
|
1879
|
+
const span = node.span;
|
|
1880
|
+
const duration = calculateSpanDuration(span);
|
|
1881
|
+
const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
|
|
1882
|
+
const status = getStatusIndicator(span);
|
|
1883
|
+
const tokens = getTokenUsage(span);
|
|
1884
|
+
const prefix = this.getTreePrefix(depth, isLast);
|
|
1885
|
+
lines.push(
|
|
1886
|
+
`${prefix}[${sequence}] ${timestamp} ${span.name} (${formatDuration(duration)}${tokens})${status}`
|
|
1887
|
+
);
|
|
1888
|
+
lines.push("");
|
|
1889
|
+
let nextSeq = sequence + 1;
|
|
1890
|
+
const childCount = node.children.length;
|
|
1891
|
+
node.children.forEach((child, idx) => {
|
|
1892
|
+
nextSeq = this.renderStructureNode(
|
|
1893
|
+
child,
|
|
1894
|
+
lines,
|
|
1895
|
+
depth + 1,
|
|
1896
|
+
nextSeq,
|
|
1897
|
+
idx === childCount - 1
|
|
1898
|
+
);
|
|
1899
|
+
});
|
|
1900
|
+
return nextSeq;
|
|
1901
|
+
}
|
|
1615
1902
|
renderNode(node, lines, depth, sequence, isLast = true) {
|
|
1616
1903
|
const span = node.span;
|
|
1617
|
-
const duration =
|
|
1618
|
-
const timestamp =
|
|
1619
|
-
const status =
|
|
1904
|
+
const duration = calculateSpanDuration(span);
|
|
1905
|
+
const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
|
|
1906
|
+
const status = getStatusIndicator(span);
|
|
1620
1907
|
const prefix = this.getTreePrefix(depth, isLast);
|
|
1621
1908
|
lines.push(
|
|
1622
|
-
`${prefix}[${sequence}] ${
|
|
1909
|
+
`${prefix}[${sequence}] ${timestamp} ${span.name} (${formatDuration(duration)})${status}`
|
|
1623
1910
|
);
|
|
1624
1911
|
const attrIndent = this.getAttrIndent(depth, isLast);
|
|
1625
|
-
const attrs =
|
|
1912
|
+
const attrs = cleanAttributes(span.attributes);
|
|
1626
1913
|
if (Object.keys(attrs).length > 0) {
|
|
1627
1914
|
for (const [key, value] of Object.entries(attrs)) {
|
|
1628
|
-
lines.push(`${attrIndent}${key}: ${this.
|
|
1915
|
+
lines.push(`${attrIndent}${key}: ${this.formatValueWithDedup(value)}`);
|
|
1629
1916
|
}
|
|
1630
1917
|
}
|
|
1631
1918
|
if (span.events.length > 0) {
|
|
1632
1919
|
for (const event of span.events) {
|
|
1633
1920
|
lines.push(`${attrIndent}[event] ${event.name}`);
|
|
1634
1921
|
if (event.attributes) {
|
|
1635
|
-
const eventAttrs =
|
|
1922
|
+
const eventAttrs = cleanAttributes(event.attributes);
|
|
1636
1923
|
for (const [key, value] of Object.entries(eventAttrs)) {
|
|
1637
|
-
lines.push(`${attrIndent} ${key}: ${this.
|
|
1924
|
+
lines.push(`${attrIndent} ${key}: ${this.formatValueWithDedup(value)}`);
|
|
1638
1925
|
}
|
|
1639
1926
|
}
|
|
1640
1927
|
}
|
|
@@ -1663,42 +1950,26 @@ var JudgeSpanDigestFormatter = class {
|
|
|
1663
1950
|
const continuation = isLast ? " " : "\u2502 ";
|
|
1664
1951
|
return "\u2502 ".repeat(depth - 1) + continuation + " ";
|
|
1665
1952
|
}
|
|
1666
|
-
|
|
1667
|
-
|
|
1668
|
-
|
|
1669
|
-
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
"langwatch.scenario.name"
|
|
1673
|
-
];
|
|
1674
|
-
for (const [key, value] of Object.entries(attrs)) {
|
|
1675
|
-
if (excludedKeys.includes(key)) {
|
|
1676
|
-
continue;
|
|
1677
|
-
}
|
|
1678
|
-
const cleanKey = key.replace(/^(langwatch)\./, "");
|
|
1679
|
-
if (!seen.has(cleanKey)) {
|
|
1680
|
-
seen.add(cleanKey);
|
|
1681
|
-
cleaned[cleanKey] = value;
|
|
1682
|
-
}
|
|
1683
|
-
}
|
|
1684
|
-
return cleaned;
|
|
1685
|
-
}
|
|
1686
|
-
formatValue(value) {
|
|
1687
|
-
const processed = this.transformValue(value);
|
|
1953
|
+
/**
|
|
1954
|
+
* Formats a value with deduplication applied. Used by the `format()` method
|
|
1955
|
+
* to reduce token usage by replacing repeated strings with markers.
|
|
1956
|
+
*/
|
|
1957
|
+
formatValueWithDedup(value) {
|
|
1958
|
+
const processed = this.transformValueWithDedup(value);
|
|
1688
1959
|
return typeof processed === "string" ? processed : JSON.stringify(processed);
|
|
1689
1960
|
}
|
|
1690
|
-
|
|
1961
|
+
transformValueWithDedup(value) {
|
|
1691
1962
|
return deepTransform(value, (v) => {
|
|
1692
1963
|
const mediaPart = truncateMediaPart(v);
|
|
1693
1964
|
if (mediaPart) return mediaPart;
|
|
1694
1965
|
if (typeof v !== "string") return v;
|
|
1695
|
-
return this.
|
|
1966
|
+
return this.transformStringWithDedup(v);
|
|
1696
1967
|
});
|
|
1697
1968
|
}
|
|
1698
|
-
|
|
1699
|
-
if (
|
|
1969
|
+
transformStringWithDedup(str) {
|
|
1970
|
+
if (looksLikeJson(str)) {
|
|
1700
1971
|
try {
|
|
1701
|
-
const processed = this.
|
|
1972
|
+
const processed = this.transformValueWithDedup(JSON.parse(str));
|
|
1702
1973
|
return JSON.stringify(processed);
|
|
1703
1974
|
} catch {
|
|
1704
1975
|
}
|
|
@@ -1707,36 +1978,12 @@ var JudgeSpanDigestFormatter = class {
|
|
|
1707
1978
|
if (truncated !== str) return truncated;
|
|
1708
1979
|
return this.deduplicator.process(str);
|
|
1709
1980
|
}
|
|
1710
|
-
looksLikeJson(str) {
|
|
1711
|
-
const t = str.trim();
|
|
1712
|
-
return t.startsWith("{") && t.endsWith("}") || t.startsWith("[") && t.endsWith("]");
|
|
1713
|
-
}
|
|
1714
|
-
hrTimeToMs(hrTime) {
|
|
1715
|
-
return hrTime[0] * 1e3 + hrTime[1] / 1e6;
|
|
1716
|
-
}
|
|
1717
|
-
calculateSpanDuration(span) {
|
|
1718
|
-
return this.hrTimeToMs(span.endTime) - this.hrTimeToMs(span.startTime);
|
|
1719
|
-
}
|
|
1720
1981
|
calculateTotalDuration(spans) {
|
|
1721
1982
|
if (spans.length === 0) return 0;
|
|
1722
|
-
const first =
|
|
1723
|
-
const last = Math.max(...spans.map((s) =>
|
|
1983
|
+
const first = hrTimeToMs(spans[0].startTime);
|
|
1984
|
+
const last = Math.max(...spans.map((s) => hrTimeToMs(s.endTime)));
|
|
1724
1985
|
return last - first;
|
|
1725
1986
|
}
|
|
1726
|
-
formatDuration(ms) {
|
|
1727
|
-
if (ms < 1e3) return `${Math.round(ms)}ms`;
|
|
1728
|
-
return `${(ms / 1e3).toFixed(2)}s`;
|
|
1729
|
-
}
|
|
1730
|
-
formatTimestamp(hrTime) {
|
|
1731
|
-
const ms = this.hrTimeToMs(hrTime);
|
|
1732
|
-
return new Date(ms).toISOString();
|
|
1733
|
-
}
|
|
1734
|
-
getStatusIndicator(span) {
|
|
1735
|
-
if (span.status.code === 2) {
|
|
1736
|
-
return ` \u26A0\uFE0F ERROR: ${span.status.message ?? "unknown"}`;
|
|
1737
|
-
}
|
|
1738
|
-
return "";
|
|
1739
|
-
}
|
|
1740
1987
|
collectErrors(spans) {
|
|
1741
1988
|
return spans.filter((s) => s.status.code === 2).map((s) => `- ${s.name}: ${s.status.message ?? "unknown error"}`);
|
|
1742
1989
|
}
|
|
@@ -1799,15 +2046,42 @@ function buildFinishTestTool(criteria) {
|
|
|
1799
2046
|
})
|
|
1800
2047
|
});
|
|
1801
2048
|
}
|
|
2049
|
+
/**
 * Creates the two tools used to explore a trace on demand.
 *
 * - `expand_trace` forwards its `index` / `range` input to `expandTrace`.
 * - `grep_trace` forwards its `pattern` input to `grepTrace`.
 *
 * Both tools operate on the `spans` captured when this function is called.
 *
 * @param {Array<object>} spans - Spans the tools will read from.
 * @returns {{expand_trace: object, grep_trace: object}} Tool definitions keyed by tool name.
 */
function buildProgressiveDiscoveryTools(spans) {
  const expandTraceTool = tool({
    description: "Expand one or more spans to see their full details (attributes, events, content). Use a single index like 5 or a range like '10-15'.",
    inputSchema: z4.object({
      index: z4.number().optional().describe("Single span index to expand"),
      range: z4.string().optional().describe('Range of span indices to expand, e.g. "10-15"')
    }),
    execute: async (input) => expandTrace(spans, { index: input.index, range: input.range })
  });

  const grepTraceTool = tool({
    description: "Search across all span attributes, events, and content for a pattern (case-insensitive). Returns matching spans with context.",
    inputSchema: z4.object({
      pattern: z4.string().describe("Search pattern (case-insensitive)")
    }),
    execute: async (input) => grepTrace(spans, input.pattern)
  });

  return {
    expand_trace: expandTraceTool,
    grep_trace: grepTraceTool
  };
}
|
|
1802
2072
|
var JudgeAgent = class extends JudgeAgentAdapter {
|
|
1803
2073
|
constructor(cfg) {
|
|
1804
2074
|
super();
|
|
1805
2075
|
this.cfg = cfg;
|
|
1806
2076
|
this.criteria = cfg.criteria ?? [];
|
|
1807
2077
|
this.spanCollector = cfg.spanCollector ?? judgeSpanCollector;
|
|
2078
|
+
this.tokenThreshold = cfg.tokenThreshold ?? DEFAULT_TOKEN_THRESHOLD;
|
|
2079
|
+
this.maxDiscoverySteps = cfg.maxDiscoverySteps ?? 10;
|
|
1808
2080
|
}
|
|
1809
2081
|
logger = new Logger("JudgeAgent");
|
|
1810
2082
|
spanCollector;
|
|
2083
|
+
tokenThreshold;
|
|
2084
|
+
maxDiscoverySteps;
|
|
1811
2085
|
role = "Judge" /* JUDGE */;
|
|
1812
2086
|
criteria;
|
|
1813
2087
|
/**
|
|
@@ -1815,7 +2089,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1815
2089
|
*/
|
|
1816
2090
|
invokeLLM = createLLMInvoker(this.logger);
|
|
1817
2091
|
async call(input) {
|
|
1818
|
-
var _a
|
|
2092
|
+
var _a;
|
|
1819
2093
|
const criteria = ((_a = input.judgmentRequest) == null ? void 0 : _a.criteria) ?? this.criteria;
|
|
1820
2094
|
this.logger.debug("call() invoked", {
|
|
1821
2095
|
threadId: input.threadId,
|
|
@@ -1823,8 +2097,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1823
2097
|
maxTurns: input.scenarioConfig.maxTurns,
|
|
1824
2098
|
judgmentRequest: input.judgmentRequest
|
|
1825
2099
|
});
|
|
1826
|
-
const
|
|
1827
|
-
|
|
2100
|
+
const spans = this.spanCollector.getSpansForThread(input.threadId);
|
|
2101
|
+
const { digest, isLargeTrace } = this.buildTraceDigest(spans);
|
|
1828
2102
|
const transcript = JudgeUtils.buildTranscriptFromMessages(input.messages);
|
|
1829
2103
|
const contentForJudge = `
|
|
1830
2104
|
<transcript>
|
|
@@ -1848,7 +2122,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1848
2122
|
});
|
|
1849
2123
|
const tools = {
|
|
1850
2124
|
continue_test: buildContinueTestTool(),
|
|
1851
|
-
finish_test: buildFinishTestTool(criteria)
|
|
2125
|
+
finish_test: buildFinishTestTool(criteria),
|
|
2126
|
+
...isLargeTrace ? buildProgressiveDiscoveryTools(spans) : {}
|
|
1852
2127
|
};
|
|
1853
2128
|
const enforceJudgement = input.judgmentRequest != null;
|
|
1854
2129
|
const hasCriteria = criteria.length && criteria.length > 0;
|
|
@@ -1865,26 +2140,70 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1865
2140
|
model: mergedConfig.model,
|
|
1866
2141
|
toolChoice,
|
|
1867
2142
|
isLastMessage,
|
|
1868
|
-
enforceJudgement
|
|
2143
|
+
enforceJudgement,
|
|
2144
|
+
isLargeTrace
|
|
1869
2145
|
});
|
|
1870
|
-
const completion = await this.
|
|
2146
|
+
const completion = await this.invokeLLMWithDiscovery({
|
|
1871
2147
|
model: mergedConfig.model,
|
|
1872
2148
|
messages,
|
|
1873
2149
|
temperature: mergedConfig.temperature ?? 0,
|
|
1874
2150
|
maxOutputTokens: mergedConfig.maxTokens,
|
|
1875
2151
|
tools,
|
|
1876
|
-
toolChoice
|
|
2152
|
+
toolChoice,
|
|
2153
|
+
isLargeTrace
|
|
2154
|
+
});
|
|
2155
|
+
return this.parseToolCalls(completion, criteria);
|
|
2156
|
+
}
|
|
2157
|
+
/**
|
|
2158
|
+
* Builds the trace digest, choosing between full inline rendering
|
|
2159
|
+
* and structure-only mode based on estimated token count.
|
|
2160
|
+
*/
|
|
2161
|
+
buildTraceDigest(spans) {
|
|
2162
|
+
const fullDigest = judgeSpanDigestFormatter.format(spans);
|
|
2163
|
+
const isLargeTrace = spans.length > 0 && estimateTokens(fullDigest) > this.tokenThreshold;
|
|
2164
|
+
const digest = isLargeTrace ? judgeSpanDigestFormatter.formatStructureOnly(spans) + "\n\nUse expand_trace(spanIndex) to see span details or grep_trace(pattern) to search across spans." : fullDigest;
|
|
2165
|
+
this.logger.debug("Trace digest built", {
|
|
2166
|
+
isLargeTrace,
|
|
2167
|
+
estimatedTokens: estimateTokens(fullDigest)
|
|
1877
2168
|
});
|
|
2169
|
+
return { digest, isLargeTrace };
|
|
2170
|
+
}
|
|
2171
|
+
/**
|
|
2172
|
+
* Invokes the LLM, enabling multi-step tool execution for large traces.
|
|
2173
|
+
* In multi-step mode, the AI SDK loops automatically: the judge can call
|
|
2174
|
+
* expand_trace/grep_trace tools multiple times before reaching a terminal
|
|
2175
|
+
* tool (finish_test/continue_test) or hitting the step limit.
|
|
2176
|
+
*/
|
|
2177
|
+
async invokeLLMWithDiscovery({
|
|
2178
|
+
isLargeTrace,
|
|
2179
|
+
...params
|
|
2180
|
+
}) {
|
|
2181
|
+
var _a, _b;
|
|
2182
|
+
if (isLargeTrace) {
|
|
2183
|
+
params.stopWhen = [
|
|
2184
|
+
stepCountIs(this.maxDiscoverySteps),
|
|
2185
|
+
hasToolCall("finish_test"),
|
|
2186
|
+
hasToolCall("continue_test")
|
|
2187
|
+
];
|
|
2188
|
+
}
|
|
2189
|
+
const completion = await this.invokeLLM(params);
|
|
1878
2190
|
this.logger.debug("LLM response received", {
|
|
1879
|
-
toolCallCount: ((
|
|
1880
|
-
toolCalls: (
|
|
2191
|
+
toolCallCount: ((_a = completion.toolCalls) == null ? void 0 : _a.length) ?? 0,
|
|
2192
|
+
toolCalls: (_b = completion.toolCalls) == null ? void 0 : _b.map((tc) => ({
|
|
1881
2193
|
toolName: tc.toolName,
|
|
1882
2194
|
args: tc.input
|
|
1883
2195
|
}))
|
|
1884
2196
|
});
|
|
2197
|
+
return completion;
|
|
2198
|
+
}
|
|
2199
|
+
parseToolCalls(completion, criteria) {
|
|
2200
|
+
var _a;
|
|
1885
2201
|
let args;
|
|
1886
|
-
if ((
|
|
1887
|
-
const
|
|
2202
|
+
if ((_a = completion.toolCalls) == null ? void 0 : _a.length) {
|
|
2203
|
+
const terminalCall = completion.toolCalls.find(
|
|
2204
|
+
(tc) => tc.toolName === "finish_test" || tc.toolName === "continue_test"
|
|
2205
|
+
);
|
|
2206
|
+
const toolCall = terminalCall ?? completion.toolCalls[0];
|
|
1888
2207
|
switch (toolCall.toolName) {
|
|
1889
2208
|
case "finish_test": {
|
|
1890
2209
|
args = toolCall.input;
|
|
@@ -1926,11 +2245,6 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1926
2245
|
unmetCriteria: criteria
|
|
1927
2246
|
};
|
|
1928
2247
|
}
|
|
1929
|
-
getOpenTelemetryTracesDigest(threadId) {
|
|
1930
|
-
const spans = this.spanCollector.getSpansForThread(threadId);
|
|
1931
|
-
const digest = judgeSpanDigestFormatter.format(spans);
|
|
1932
|
-
return digest;
|
|
1933
|
-
}
|
|
1934
2248
|
};
|
|
1935
2249
|
var judgeAgent = (cfg) => {
|
|
1936
2250
|
return new JudgeAgent(cfg ?? {});
|
|
@@ -4677,6 +4991,7 @@ export {
|
|
|
4677
4991
|
AgentAdapter,
|
|
4678
4992
|
AgentRole,
|
|
4679
4993
|
DEFAULT_MAX_TURNS,
|
|
4994
|
+
DEFAULT_TOKEN_THRESHOLD,
|
|
4680
4995
|
DEFAULT_VERBOSE,
|
|
4681
4996
|
JudgeAgentAdapter,
|
|
4682
4997
|
JudgeSpanCollector,
|
|
@@ -4690,7 +5005,10 @@ export {
|
|
|
4690
5005
|
allAgentRoles,
|
|
4691
5006
|
index_default as default,
|
|
4692
5007
|
defineConfig,
|
|
5008
|
+
estimateTokens,
|
|
5009
|
+
expandTrace,
|
|
4693
5010
|
fail,
|
|
5011
|
+
grepTrace,
|
|
4694
5012
|
judge,
|
|
4695
5013
|
judgeAgent,
|
|
4696
5014
|
judgeSpanCollector,
|