npm - @langwatch/scenario - Versions diffs - 0.4.3 → 0.4.5 - Mend

@langwatch/scenario 0.4.3 → 0.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/dist/index.mjs (changed)

@@ -944,9 +944,13 @@ var init_esm = __esm({
 // src/agents/index.ts
 var agents_exports = {};
 __export(agents_exports, {
+  DEFAULT_TOKEN_THRESHOLD: () => DEFAULT_TOKEN_THRESHOLD,
   JudgeSpanCollector: () => JudgeSpanCollector,
   JudgeSpanDigestFormatter: () => JudgeSpanDigestFormatter,
   RealtimeAgentAdapter: () => RealtimeAgentAdapter,
+  estimateTokens: () => estimateTokens,
+  expandTrace: () => expandTrace,
+  grepTrace: () => grepTrace,
   judgeAgent: () => judgeAgent,
   judgeSpanCollector: () => judgeSpanCollector,
   judgeSpanDigestFormatter: () => judgeSpanDigestFormatter,
@@ -954,7 +958,11 @@ __export(agents_exports, {
 });
 // src/agents/judge/judge-agent.ts
-import { tool } from "ai";
+import {
+  tool,
+  stepCountIs,
+  hasToolCall
+} from "ai";
 import { z as z4 } from "zod/v4";
 // src/agents/judge/judge-utils.ts
@@ -1026,6 +1034,275 @@ var JudgeUtils = {
   }
 };
+// src/agents/judge/estimate-tokens.ts
+var DEFAULT_TOKEN_THRESHOLD = 8192;
+function estimateTokens(text) {
+  const byteLength = new TextEncoder().encode(text).byteLength;
+  return Math.ceil(byteLength / 4);
+}
+// src/agents/judge/span-utils.ts
+import { attributes } from "langwatch/observability";
+// src/agents/judge/deep-transform.ts
+function deepTransform(value, fn) {
+  const result = fn(value);
+  if (result !== value) return result;
+  if (Array.isArray(value)) {
+    return value.map((v) => deepTransform(v, fn));
+  }
+  if (value !== null && typeof value === "object") {
+    const out = {};
+    for (const [k, v] of Object.entries(value)) {
+      out[k] = deepTransform(v, fn);
+    }
+    return out;
+  }
+  return value;
+}
+// src/agents/judge/truncate-media.ts
+function truncateMediaUrl(str) {
+  const match = str.match(
+    /^data:((image|audio|video)\/[a-z0-9+.-]+);base64,(.+)$/i
+  );
+  if (!match) return str;
+  const [, mimeType, category, data] = match;
+  return `[${category.toUpperCase()}: ${mimeType}, ~${data.length} bytes]`;
+}
+function truncateMediaPart(v) {
+  var _a;
+  if (v === null || typeof v !== "object" || Array.isArray(v)) return null;
+  const obj = v;
+  if (obj.type === "file" && typeof obj.mediaType === "string" && typeof obj.data === "string") {
+    const category = ((_a = obj.mediaType.split("/")[0]) == null ? void 0 : _a.toUpperCase()) ?? "FILE";
+    return {
+      ...obj,
+      data: `[${category}: ${obj.mediaType}, ~${obj.data.length} bytes]`
+    };
+  }
+  if (obj.type === "image" && typeof obj.image === "string") {
+    const imageData = obj.image;
+    const dataUrlMatch = imageData.match(
+      /^data:((image)\/[a-z0-9+.-]+);base64,(.+)$/i
+    );
+    if (dataUrlMatch) {
+      return {
+        ...obj,
+        image: `[IMAGE: ${dataUrlMatch[1]}, ~${dataUrlMatch[3].length} bytes]`
+      };
+    }
+    if (imageData.length > 1e3 && /^[A-Za-z0-9+/=]+$/.test(imageData)) {
+      return {
+        ...obj,
+        image: `[IMAGE: unknown, ~${imageData.length} bytes]`
+      };
+    }
+  }
+  return null;
+}
+// src/agents/judge/span-utils.ts
+function hrTimeToMs(hrTime) {
+  return hrTime[0] * 1e3 + hrTime[1] / 1e6;
+}
+function formatDuration(ms) {
+  if (ms < 1e3) return `${Math.round(ms)}ms`;
+  return `${(ms / 1e3).toFixed(2)}s`;
+}
+function calculateSpanDuration(span) {
+  return hrTimeToMs(span.endTime) - hrTimeToMs(span.startTime);
+}
+function getStatusIndicator(span) {
+  if (span.status.code === 2) {
+    return ` \u26A0\uFE0F ERROR: ${span.status.message ?? "unknown"}`;
+  }
+  return "";
+}
+function getTokenUsage(span) {
+  const input = span.attributes["gen_ai.usage.input_tokens"];
+  const output = span.attributes["gen_ai.usage.output_tokens"];
+  if (input == null && output == null) return "";
+  const total = (Number(input) || 0) + (Number(output) || 0);
+  return `, ${total} tokens`;
+}
+function cleanAttributes(attrs) {
+  const cleaned = {};
+  const seen = /* @__PURE__ */ new Set();
+  const excludedKeys = [
+    attributes.ATTR_LANGWATCH_THREAD_ID,
+    "langwatch.scenario.id",
+    "langwatch.scenario.name"
+  ];
+  for (const [key, value] of Object.entries(attrs)) {
+    if (excludedKeys.includes(key)) {
+      continue;
+    }
+    const cleanKey = key.replace(/^(langwatch)\./, "");
+    if (!seen.has(cleanKey)) {
+      seen.add(cleanKey);
+      cleaned[cleanKey] = value;
+    }
+  }
+  return cleaned;
+}
+function formatValue(value) {
+  const processed = transformValue(value);
+  return typeof processed === "string" ? processed : JSON.stringify(processed);
+}
+function transformValue(value) {
+  return deepTransform(value, (v) => {
+    const mediaPart = truncateMediaPart(v);
+    if (mediaPart) return mediaPart;
+    if (typeof v !== "string") return v;
+    const truncated = truncateMediaUrl(v);
+    if (truncated !== v) return truncated;
+    if (looksLikeJson(v)) {
+      try {
+        const parsed = transformValue(JSON.parse(v));
+        return JSON.stringify(parsed);
+      } catch {
+      }
+    }
+    return v;
+  });
+}
+function looksLikeJson(str) {
+  const t = str.trim();
+  return t.startsWith("{") && t.endsWith("}") || t.startsWith("[") && t.endsWith("]");
+}
+function indexSpans(spans) {
+  const sorted = [...spans].sort((a, b) => {
+    return hrTimeToMs(a.startTime) - hrTimeToMs(b.startTime);
+  });
+  return sorted.map((span) => ({
+    span,
+    children: [],
+    shortId: span.spanContext().spanId.slice(0, 8)
+  }));
+}
+// src/agents/judge/trace-tools.ts
+var TOOL_RESULT_TOKEN_BUDGET = 4096;
+var TOOL_RESULT_CHAR_BUDGET = TOOL_RESULT_TOKEN_BUDGET * 4;
+var MAX_GREP_MATCHES = 20;
+function renderFullSpanNode(node) {
+  const span = node.span;
+  const duration = calculateSpanDuration(span);
+  const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
+  const status = getStatusIndicator(span);
+  const lines = [];
+  lines.push(
+    `[${node.shortId}] ${timestamp} ${span.name} (${formatDuration(duration)})${status}`
+  );
+  const attrs = cleanAttributes(span.attributes);
+  if (Object.keys(attrs).length > 0) {
+    for (const [key, value] of Object.entries(attrs)) {
+      lines.push(`    ${key}: ${formatValue(value)}`);
+    }
+  }
+  if (span.events.length > 0) {
+    for (const event of span.events) {
+      lines.push(`    [event] ${event.name}`);
+      if (event.attributes) {
+        const eventAttrs = cleanAttributes(event.attributes);
+        for (const [key, value] of Object.entries(eventAttrs)) {
+          lines.push(`      ${key}: ${formatValue(value)}`);
+        }
+      }
+    }
+  }
+  return lines;
+}
+function truncateToCharBudget(text) {
+  if (text.length <= TOOL_RESULT_CHAR_BUDGET) return text;
+  const truncated = text.slice(0, TOOL_RESULT_CHAR_BUDGET);
+  return truncated + "\n\n[TRUNCATED] Output exceeded ~4000 token budget. Use grep_trace(pattern) to search for specific content, or expand_trace with fewer span IDs.";
+}
+function spanToSearchableText(span) {
+  const parts = [span.name];
+  const attrs = cleanAttributes(span.attributes);
+  for (const [key, value] of Object.entries(attrs)) {
+    parts.push(`${key}: ${formatValue(value)}`);
+  }
+  for (const event of span.events) {
+    parts.push(event.name);
+    if (event.attributes) {
+      const eventAttrs = cleanAttributes(event.attributes);
+      for (const [key, value] of Object.entries(eventAttrs)) {
+        parts.push(`${key}: ${formatValue(value)}`);
+      }
+    }
+  }
+  return parts.join("\n");
+}
+function expandTrace(spans, spanIds) {
+  const nodes = indexSpans(spans);
+  if (nodes.length === 0) {
+    return "No spans recorded.";
+  }
+  if (spanIds.length === 0) {
+    return "Error: provide at least one span ID.";
+  }
+  const selected = nodes.filter((n) => {
+    const fullId = n.span.spanContext().spanId;
+    return spanIds.some((prefix) => fullId.startsWith(prefix));
+  });
+  if (selected.length === 0) {
+    const available = nodes.map((n) => n.shortId).join(", ");
+    return `Error: no spans matched the given ID(s). Available span IDs: ${available}`;
+  }
+  const lines = [];
+  for (const node of selected) {
+    const spanLines = renderFullSpanNode(node);
+    lines.push(...spanLines);
+    lines.push("");
+  }
+  return truncateToCharBudget(lines.join("\n").trimEnd());
+}
+function grepTrace(spans, pattern) {
+  const nodes = indexSpans(spans);
+  if (nodes.length === 0) {
+    return "No spans recorded.";
+  }
+  const regex = new RegExp(escapeRegex(pattern), "i");
+  const matches = [];
+  for (const node of nodes) {
+    const searchText = spanToSearchableText(node.span);
+    const lines2 = searchText.split("\n");
+    const matchingLines = lines2.filter((line) => regex.test(line));
+    if (matchingLines.length > 0) {
+      matches.push({ node, matchingLines });
+    }
+  }
+  if (matches.length === 0) {
+    const spanNames = Array.from(new Set(nodes.map((n) => n.span.name)));
+    return `No matches found for "${pattern}". Available span names: ${spanNames.join(", ")}`;
+  }
+  const totalMatches = matches.length;
+  const limited = matches.slice(0, MAX_GREP_MATCHES);
+  const lines = [];
+  for (const { node, matchingLines } of limited) {
+    const duration = calculateSpanDuration(node.span);
+    lines.push(
+      `--- [${node.shortId}] ${node.span.name} (${formatDuration(duration)}) ---`
+    );
+    for (const line of matchingLines) {
+      lines.push(`  ${line}`);
+    }
+    lines.push("");
+  }
+  if (totalMatches > MAX_GREP_MATCHES) {
+    lines.push(
+      `[${totalMatches - MAX_GREP_MATCHES} more matches omitted. Refine your search pattern for more specific results.]`
+    );
+  }
+  return truncateToCharBudget(lines.join("\n").trimEnd());
+}
+function escapeRegex(str) {
+  return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+}
 // src/config/env.ts
 import { z } from "zod/v4";
@@ -1381,7 +1658,7 @@ var criterionToParamName = (criterion) => {
 };
 // src/agents/judge/judge-span-collector.ts
-import { attributes } from "langwatch/observability";
+import { attributes as attributes2 } from "langwatch/observability";
 var JudgeSpanCollector = class {
   spans = [];
   onStart() {
@@ -1424,7 +1701,7 @@ var JudgeSpanCollector = class {
       const spanId = span.spanContext().spanId;
       if (visited.has(spanId)) return false;
       visited.add(spanId);
-      if (span.attributes[attributes.ATTR_LANGWATCH_THREAD_ID] === threadId) {
+      if (span.attributes[attributes2.ATTR_LANGWATCH_THREAD_ID] === threadId) {
         return true;
       }
       const parentId = getParentSpanId(span);
@@ -1443,26 +1720,6 @@ function getParentSpanId(span) {
 }
 var judgeSpanCollector = new JudgeSpanCollector();
-// src/agents/judge/judge-span-digest-formatter.ts
-import { attributes as attributes2 } from "langwatch/observability";
-// src/agents/judge/deep-transform.ts
-function deepTransform(value, fn) {
-  const result = fn(value);
-  if (result !== value) return result;
-  if (Array.isArray(value)) {
-    return value.map((v) => deepTransform(v, fn));
-  }
-  if (value !== null && typeof value === "object") {
-    const out = {};
-    for (const [k, v] of Object.entries(value)) {
-      out[k] = deepTransform(v, fn);
-    }
-    return out;
-  }
-  return value;
-}
 // src/agents/judge/string-deduplicator.ts
 var StringDeduplicator = class {
   seen = /* @__PURE__ */ new Map();
@@ -1496,51 +1753,49 @@ var StringDeduplicator = class {
   }
 };
-// src/agents/judge/truncate-media.ts
-function truncateMediaUrl(str) {
-  const match = str.match(
-    /^data:((image|audio|video)\/[a-z0-9+.-]+);base64,(.+)$/i
-  );
-  if (!match) return str;
-  const [, mimeType, category, data] = match;
-  return `[${category.toUpperCase()}: ${mimeType}, ~${data.length} bytes]`;
-}
-function truncateMediaPart(v) {
-  var _a;
-  if (v === null || typeof v !== "object" || Array.isArray(v)) return null;
-  const obj = v;
-  if (obj.type === "file" && typeof obj.mediaType === "string" && typeof obj.data === "string") {
-    const category = ((_a = obj.mediaType.split("/")[0]) == null ? void 0 : _a.toUpperCase()) ?? "FILE";
-    return {
-      ...obj,
-      data: `[${category}: ${obj.mediaType}, ~${obj.data.length} bytes]`
-    };
-  }
-  if (obj.type === "image" && typeof obj.image === "string") {
-    const imageData = obj.image;
-    const dataUrlMatch = imageData.match(
-      /^data:((image)\/[a-z0-9+.-]+);base64,(.+)$/i
-    );
-    if (dataUrlMatch) {
-      return {
-        ...obj,
-        image: `[IMAGE: ${dataUrlMatch[1]}, ~${dataUrlMatch[3].length} bytes]`
-      };
-    }
-    if (imageData.length > 1e3 && /^[A-Za-z0-9+/=]+$/.test(imageData)) {
-      return {
-        ...obj,
-        image: `[IMAGE: unknown, ~${imageData.length} bytes]`
-      };
-    }
-  }
-  return null;
-}
 // src/agents/judge/judge-span-digest-formatter.ts
 var JudgeSpanDigestFormatter = class {
   logger = new Logger("JudgeSpanDigestFormatter");
   deduplicator = new StringDeduplicator({ threshold: 50 });
+  /**
+   * Formats spans into a structure-only digest showing span tree hierarchy
+   * without attributes, events, or content. Used for large traces that
+   * exceed the token threshold, paired with expand_trace/grep_trace tools.
+   *
+   * @param spans - All spans for a thread
+   * @returns Plain text digest with only structural information
+   */
+  formatStructureOnly(spans) {
+    this.logger.debug("formatStructureOnly() called", {
+      spanCount: spans.length
+    });
+    if (spans.length === 0) {
+      return "No spans recorded.";
+    }
+    const sortedSpans = this.sortByStartTime(spans);
+    const tree = this.buildHierarchy(sortedSpans);
+    const totalDuration = this.calculateTotalDuration(sortedSpans);
+    const lines = [
+      `Spans: ${spans.length} | Total Duration: ${formatDuration(totalDuration)}`,
+      ""
+    ];
+    const rootCount = tree.length;
+    tree.forEach((node, idx) => {
+      this.renderStructureNode(
+        node,
+        lines,
+        0,
+        idx === rootCount - 1
+      );
+    });
+    const errors = this.collectErrors(spans);
+    if (errors.length > 0) {
+      lines.push("");
+      lines.push("=== ERRORS ===");
+      errors.forEach((e) => lines.push(e));
+    }
+    return lines.join("\n");
+  }
   /**
    * Formats spans into a complete digest with full content and nesting.
    * @param spans - All spans for a thread
@@ -1564,19 +1819,17 @@ var JudgeSpanDigestFormatter = class {
       totalDuration
     });
     const lines = [
-      `Spans: ${spans.length} | Total Duration: ${this.formatDuration(
+      `Spans: ${spans.length} | Total Duration: ${formatDuration(
         totalDuration
       )}`,
       ""
     ];
-    let sequence = 1;
     const rootCount = tree.length;
     tree.forEach((node, idx) => {
-      sequence = this.renderNode(
+      this.renderNode(
         node,
         lines,
         0,
-        sequence,
         idx === rootCount - 1
       );
     });
@@ -1590,9 +1843,7 @@ var JudgeSpanDigestFormatter = class {
   }
   sortByStartTime(spans) {
     return [...spans].sort((a, b) => {
-      const aTime = this.hrTimeToMs(a.startTime);
-      const bTime = this.hrTimeToMs(b.startTime);
-      return aTime - bTime;
+      return hrTimeToMs(a.startTime) - hrTimeToMs(b.startTime);
     });
   }
   buildHierarchy(spans) {
@@ -1612,46 +1863,66 @@ var JudgeSpanDigestFormatter = class {
     }
     return roots;
   }
-  renderNode(node, lines, depth, sequence, isLast = true) {
+  renderStructureNode(node, lines, depth, isLast = true) {
     const span = node.span;
-    const duration = this.calculateSpanDuration(span);
-    const timestamp = this.formatTimestamp(span.startTime);
-    const status = this.getStatusIndicator(span);
+    const shortId = span.spanContext().spanId.slice(0, 8);
+    const duration = calculateSpanDuration(span);
+    const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
+    const status = getStatusIndicator(span);
+    const tokens = getTokenUsage(span);
     const prefix = this.getTreePrefix(depth, isLast);
     lines.push(
-      `${prefix}[${sequence}] ${new Date(timestamp).toISOString()} ${span.name} (${this.formatDuration(duration)})${status}`
+      `${prefix}[${shortId}] ${timestamp} ${span.name} (${formatDuration(duration)}${tokens})${status}`
+    );
+    lines.push("");
+    const childCount = node.children.length;
+    node.children.forEach((child, idx) => {
+      this.renderStructureNode(
+        child,
+        lines,
+        depth + 1,
+        idx === childCount - 1
+      );
+    });
+  }
+  renderNode(node, lines, depth, isLast = true) {
+    const span = node.span;
+    const shortId = span.spanContext().spanId.slice(0, 8);
+    const duration = calculateSpanDuration(span);
+    const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
+    const status = getStatusIndicator(span);
+    const prefix = this.getTreePrefix(depth, isLast);
+    lines.push(
+      `${prefix}[${shortId}] ${timestamp} ${span.name} (${formatDuration(duration)})${status}`
     );
     const attrIndent = this.getAttrIndent(depth, isLast);
-    const attrs = this.cleanAttributes(span.attributes);
+    const attrs = cleanAttributes(span.attributes);
     if (Object.keys(attrs).length > 0) {
       for (const [key, value] of Object.entries(attrs)) {
-        lines.push(`${attrIndent}${key}: ${this.formatValue(value)}`);
+        lines.push(`${attrIndent}${key}: ${this.formatValueWithDedup(value)}`);
       }
     }
     if (span.events.length > 0) {
       for (const event of span.events) {
         lines.push(`${attrIndent}[event] ${event.name}`);
         if (event.attributes) {
-          const eventAttrs = this.cleanAttributes(event.attributes);
+          const eventAttrs = cleanAttributes(event.attributes);
           for (const [key, value] of Object.entries(eventAttrs)) {
-            lines.push(`${attrIndent}  ${key}: ${this.formatValue(value)}`);
+            lines.push(`${attrIndent}  ${key}: ${this.formatValueWithDedup(value)}`);
           }
         }
       }
     }
     lines.push("");
-    let nextSeq = sequence + 1;
     const childCount = node.children.length;
     node.children.forEach((child, idx) => {
-      nextSeq = this.renderNode(
+      this.renderNode(
         child,
         lines,
         depth + 1,
-        nextSeq,
         idx === childCount - 1
       );
     });
-    return nextSeq;
   }
   getTreePrefix(depth, isLast) {
     if (depth === 0) return "";
@@ -1663,42 +1934,26 @@ var JudgeSpanDigestFormatter = class {
     const continuation = isLast ? "    " : "\u2502   ";
     return "\u2502   ".repeat(depth - 1) + continuation + "    ";
   }
-  cleanAttributes(attrs) {
-    const cleaned = {};
-    const seen = /* @__PURE__ */ new Set();
-    const excludedKeys = [
-      attributes2.ATTR_LANGWATCH_THREAD_ID,
-      "langwatch.scenario.id",
-      "langwatch.scenario.name"
-    ];
-    for (const [key, value] of Object.entries(attrs)) {
-      if (excludedKeys.includes(key)) {
-        continue;
-      }
-      const cleanKey = key.replace(/^(langwatch)\./, "");
-      if (!seen.has(cleanKey)) {
-        seen.add(cleanKey);
-        cleaned[cleanKey] = value;
-      }
-    }
-    return cleaned;
-  }
-  formatValue(value) {
-    const processed = this.transformValue(value);
+  /**
+   * Formats a value with deduplication applied. Used by the `format()` method
+   * to reduce token usage by replacing repeated strings with markers.
+   */
+  formatValueWithDedup(value) {
+    const processed = this.transformValueWithDedup(value);
     return typeof processed === "string" ? processed : JSON.stringify(processed);
   }
-  transformValue(value) {
+  transformValueWithDedup(value) {
     return deepTransform(value, (v) => {
       const mediaPart = truncateMediaPart(v);
       if (mediaPart) return mediaPart;
       if (typeof v !== "string") return v;
-      return this.transformString(v);
+      return this.transformStringWithDedup(v);
     });
   }
-  transformString(str) {
-    if (this.looksLikeJson(str)) {
+  transformStringWithDedup(str) {
+    if (looksLikeJson(str)) {
       try {
-        const processed = this.transformValue(JSON.parse(str));
+        const processed = this.transformValueWithDedup(JSON.parse(str));
         return JSON.stringify(processed);
       } catch {
       }
@@ -1707,36 +1962,12 @@ var JudgeSpanDigestFormatter = class {
     if (truncated !== str) return truncated;
     return this.deduplicator.process(str);
   }
-  looksLikeJson(str) {
-    const t = str.trim();
-    return t.startsWith("{") && t.endsWith("}") || t.startsWith("[") && t.endsWith("]");
-  }
-  hrTimeToMs(hrTime) {
-    return hrTime[0] * 1e3 + hrTime[1] / 1e6;
-  }
-  calculateSpanDuration(span) {
-    return this.hrTimeToMs(span.endTime) - this.hrTimeToMs(span.startTime);
-  }
   calculateTotalDuration(spans) {
     if (spans.length === 0) return 0;
-    const first = this.hrTimeToMs(spans[0].startTime);
-    const last = Math.max(...spans.map((s) => this.hrTimeToMs(s.endTime)));
+    const first = hrTimeToMs(spans[0].startTime);
+    const last = Math.max(...spans.map((s) => hrTimeToMs(s.endTime)));
     return last - first;
   }
-  formatDuration(ms) {
-    if (ms < 1e3) return `${Math.round(ms)}ms`;
-    return `${(ms / 1e3).toFixed(2)}s`;
-  }
-  formatTimestamp(hrTime) {
-    const ms = this.hrTimeToMs(hrTime);
-    return new Date(ms).toISOString();
-  }
-  getStatusIndicator(span) {
-    if (span.status.code === 2) {
-      return ` \u26A0\uFE0F ERROR: ${span.status.message ?? "unknown"}`;
-    }
-    return "";
-  }
   collectErrors(spans) {
     return spans.filter((s) => s.status.code === 2).map((s) => `- ${s.name}: ${s.status.message ?? "unknown error"}`);
   }
@@ -1799,15 +2030,41 @@ function buildFinishTestTool(criteria) {
     })
   });
 }
+function buildProgressiveDiscoveryTools(spans) {
+  return {
+    expand_trace: tool({
+      description: "Expand one or more spans to see their full details (attributes, events, content). Use the span ID shown in brackets in the trace skeleton.",
+      inputSchema: z4.object({
+        span_ids: z4.array(z4.string()).describe("Span IDs (or 8-char prefixes) to expand")
+      }),
+      execute: async ({ span_ids }) => {
+        return expandTrace(spans, span_ids);
+      }
+    }),
+    grep_trace: tool({
+      description: "Search across all span attributes, events, and content for a pattern (case-insensitive). Returns matching spans with context.",
+      inputSchema: z4.object({
+        pattern: z4.string().describe("Search pattern (case-insensitive)")
+      }),
+      execute: async ({ pattern }) => {
+        return grepTrace(spans, pattern);
+      }
+    })
+  };
+}
 var JudgeAgent = class extends JudgeAgentAdapter {
   constructor(cfg) {
     super();
     this.cfg = cfg;
     this.criteria = cfg.criteria ?? [];
     this.spanCollector = cfg.spanCollector ?? judgeSpanCollector;
+    this.tokenThreshold = cfg.tokenThreshold ?? DEFAULT_TOKEN_THRESHOLD;
+    this.maxDiscoverySteps = cfg.maxDiscoverySteps ?? 10;
   }
   logger = new Logger("JudgeAgent");
   spanCollector;
+  tokenThreshold;
+  maxDiscoverySteps;
   role = "Judge" /* JUDGE */;
   criteria;
   /**
@@ -1815,7 +2072,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
    */
   invokeLLM = createLLMInvoker(this.logger);
   async call(input) {
-    var _a, _b, _c, _d;
+    var _a;
     const criteria = ((_a = input.judgmentRequest) == null ? void 0 : _a.criteria) ?? this.criteria;
     this.logger.debug("call() invoked", {
       threadId: input.threadId,
@@ -1823,8 +2080,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
       maxTurns: input.scenarioConfig.maxTurns,
       judgmentRequest: input.judgmentRequest
     });
-    const digest = this.getOpenTelemetryTracesDigest(input.threadId);
-    this.logger.debug("OpenTelemetry traces built", { digest });
+    const spans = this.spanCollector.getSpansForThread(input.threadId);
+    const { digest, isLargeTrace } = this.buildTraceDigest(spans);
     const transcript = JudgeUtils.buildTranscriptFromMessages(input.messages);
     const contentForJudge = `
     <transcript>
@@ -1847,6 +2104,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
       ...cfg
     });
     const tools = {
+      ...isLargeTrace ? buildProgressiveDiscoveryTools(spans) : {},
       continue_test: buildContinueTestTool(),
       finish_test: buildFinishTestTool(criteria)
     };
@@ -1865,26 +2123,75 @@ var JudgeAgent = class extends JudgeAgentAdapter {
       model: mergedConfig.model,
       toolChoice,
       isLastMessage,
-      enforceJudgement
+      enforceJudgement,
+      isLargeTrace
     });
-    const completion = await this.invokeLLM({
+    const completion = await this.invokeLLMWithDiscovery({
       model: mergedConfig.model,
       messages,
       temperature: mergedConfig.temperature ?? 0,
       maxOutputTokens: mergedConfig.maxTokens,
       tools,
-      toolChoice
+      toolChoice,
+      isLargeTrace
+    });
+    return this.parseToolCalls(completion, criteria);
+  }
+  /**
+   * Builds the trace digest, choosing between full inline rendering
+   * and structure-only mode based on estimated token count.
+   */
+  buildTraceDigest(spans) {
+    const fullDigest = judgeSpanDigestFormatter.format(spans);
+    const isLargeTrace = spans.length > 0 && estimateTokens(fullDigest) > this.tokenThreshold;
+    const digest = isLargeTrace ? judgeSpanDigestFormatter.formatStructureOnly(spans) + "\n\nUse expand_trace(span_id) to see span details or grep_trace(pattern) to search across spans. Reference spans by the ID shown in brackets." : fullDigest;
+    this.logger.debug("Trace digest built", {
+      isLargeTrace,
+      estimatedTokens: estimateTokens(fullDigest)
     });
+    return { digest, isLargeTrace };
+  }
+  /**
+   * Invokes the LLM, enabling multi-step tool execution for large traces.
+   * In multi-step mode, the AI SDK loops automatically: the judge can call
+   * expand_trace/grep_trace tools multiple times before reaching a terminal
+   * tool (finish_test/continue_test) or hitting the step limit.
+   *
+   * When the trace is large, toolChoice is relaxed to "required" so the
+   * judge can freely pick discovery tools (expand_trace/grep_trace) before
+   * being forced to a terminal decision.
+   */
+  async invokeLLMWithDiscovery({
+    isLargeTrace,
+    ...params
+  }) {
+    var _a, _b;
+    if (isLargeTrace) {
+      params.toolChoice = "required";
+      params.stopWhen = [
+        stepCountIs(this.maxDiscoverySteps),
+        hasToolCall("finish_test"),
+        hasToolCall("continue_test")
+      ];
+    }
+    const completion = await this.invokeLLM(params);
     this.logger.debug("LLM response received", {
-      toolCallCount: ((_b = completion.toolCalls) == null ? void 0 : _b.length) ?? 0,
-      toolCalls: (_c = completion.toolCalls) == null ? void 0 : _c.map((tc) => ({
+      toolCallCount: ((_a = completion.toolCalls) == null ? void 0 : _a.length) ?? 0,
+      toolCalls: (_b = completion.toolCalls) == null ? void 0 : _b.map((tc) => ({
         toolName: tc.toolName,
         args: tc.input
       }))
     });
+    return completion;
+  }
+  parseToolCalls(completion, criteria) {
+    var _a;
     let args;
-    if ((_d = completion.toolCalls) == null ? void 0 : _d.length) {
-      const toolCall = completion.toolCalls[0];
+    if ((_a = completion.toolCalls) == null ? void 0 : _a.length) {
+      const terminalCall = completion.toolCalls.find(
+        (tc) => tc.toolName === "finish_test" || tc.toolName === "continue_test"
+      );
+      const toolCall = terminalCall ?? completion.toolCalls[0];
       switch (toolCall.toolName) {
         case "finish_test": {
           args = toolCall.input;
@@ -1926,11 +2233,6 @@ var JudgeAgent = class extends JudgeAgentAdapter {
       unmetCriteria: criteria
     };
   }
-  getOpenTelemetryTracesDigest(threadId) {
-    const spans = this.spanCollector.getSpansForThread(threadId);
-    const digest = judgeSpanDigestFormatter.format(spans);
-    return digest;
-  }
 };
 var judgeAgent = (cfg) => {
   return new JudgeAgent(cfg ?? {});
@@ -4677,6 +4979,7 @@ export {
   AgentAdapter,
   AgentRole,
   DEFAULT_MAX_TURNS,
+  DEFAULT_TOKEN_THRESHOLD,
   DEFAULT_VERBOSE,
   JudgeAgentAdapter,
   JudgeSpanCollector,
@@ -4690,7 +4993,10 @@ export {
   allAgentRoles,
   index_default as default,
   defineConfig,
+  estimateTokens,
+  expandTrace,
   fail,
+  grepTrace,
   judge,
   judgeAgent,
   judgeSpanCollector,