npm - @langwatch/scenario - Versions diffs - 0.4.4 → 0.4.5 - Mend

@langwatch/scenario 0.4.4 → 0.4.5

This diff shows the differences between publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (5) hide show

package/dist/index.d.mts CHANGED Viewed

@@ -571,6 +571,10 @@ declare class JudgeAgent extends JudgeAgentAdapter {
      * In multi-step mode, the AI SDK loops automatically: the judge can call
      * expand_trace/grep_trace tools multiple times before reaching a terminal
      * tool (finish_test/continue_test) or hitting the step limit.
+     *
+     * When the trace is large, toolChoice is relaxed to "required" so the
+     * judge can freely pick discovery tools (expand_trace/grep_trace) before
+     * being forced to a terminal decision.
      */
     private invokeLLMWithDiscovery;
     private parseToolCalls;
@@ -691,14 +695,15 @@ declare function estimateTokens(text: string): number;
  * Expands one or more spans from a trace, returning their full details
  * (attributes, events, status) with tree position context.
  *
+ * Spans are matched by prefix: the caller can pass the truncated 8-char
+ * span ID shown in the skeleton and it will match any span whose full ID
+ * starts with that prefix.
+ *
  * @param spans - The full array of ReadableSpan objects for the trace
- * @param options - Either a single `index` or a `range` string like "10-15"
- * @returns Formatted string with full span details, truncated to ~4000 tokens
+ * @param spanIds - Span IDs or prefixes to expand
+ * @returns Formatted string with full span details, truncated to ~4096 tokens
  */
-declare function expandTrace(spans: ReadableSpan[], { index, range }: {
-    index?: number;
-    range?: string;
-}): string;
+declare function expandTrace(spans: ReadableSpan[], spanIds: string[]): string;
 /**
  * Searches across all span attributes, events, and content for a pattern.
  * Returns matching spans with their tree position and matching content.

package/dist/index.d.ts CHANGED Viewed

@@ -571,6 +571,10 @@ declare class JudgeAgent extends JudgeAgentAdapter {
      * In multi-step mode, the AI SDK loops automatically: the judge can call
      * expand_trace/grep_trace tools multiple times before reaching a terminal
      * tool (finish_test/continue_test) or hitting the step limit.
+     *
+     * When the trace is large, toolChoice is relaxed to "required" so the
+     * judge can freely pick discovery tools (expand_trace/grep_trace) before
+     * being forced to a terminal decision.
      */
     private invokeLLMWithDiscovery;
     private parseToolCalls;
@@ -691,14 +695,15 @@ declare function estimateTokens(text: string): number;
  * Expands one or more spans from a trace, returning their full details
  * (attributes, events, status) with tree position context.
  *
+ * Spans are matched by prefix: the caller can pass the truncated 8-char
+ * span ID shown in the skeleton and it will match any span whose full ID
+ * starts with that prefix.
+ *
  * @param spans - The full array of ReadableSpan objects for the trace
- * @param options - Either a single `index` or a `range` string like "10-15"
- * @returns Formatted string with full span details, truncated to ~4000 tokens
+ * @param spanIds - Span IDs or prefixes to expand
+ * @returns Formatted string with full span details, truncated to ~4096 tokens
  */
-declare function expandTrace(spans: ReadableSpan[], { index, range }: {
-    index?: number;
-    range?: string;
-}): string;
+declare function expandTrace(spans: ReadableSpan[], spanIds: string[]): string;
 /**
  * Searches across all span attributes, events, and content for a pattern.
  * Returns matching spans with their tree position and matching content.

package/dist/index.js CHANGED Viewed

@@ -1235,10 +1235,10 @@ function indexSpans(spans) {
   const sorted = [...spans].sort((a, b) => {
     return hrTimeToMs(a.startTime) - hrTimeToMs(b.startTime);
   });
-  return sorted.map((span, i) => ({
+  return sorted.map((span) => ({
     span,
     children: [],
-    index: i + 1
+    shortId: span.spanContext().spanId.slice(0, 8)
   }));
 }
@@ -1253,7 +1253,7 @@ function renderFullSpanNode(node) {
   const status = getStatusIndicator(span);
   const lines = [];
   lines.push(
-    `[${node.index}] ${timestamp} ${span.name} (${formatDuration(duration)})${status}`
+    `[${node.shortId}] ${timestamp} ${span.name} (${formatDuration(duration)})${status}`
   );
   const attrs = cleanAttributes(span.attributes);
   if (Object.keys(attrs).length > 0) {
@@ -1277,7 +1277,7 @@ function renderFullSpanNode(node) {
 function truncateToCharBudget(text) {
   if (text.length <= TOOL_RESULT_CHAR_BUDGET) return text;
   const truncated = text.slice(0, TOOL_RESULT_CHAR_BUDGET);
-  return truncated + "\n\n[TRUNCATED] Output exceeded ~4000 token budget. Use grep_trace(pattern) to search for specific content, or expand_trace with a narrower range.";
+  return truncated + "\n\n[TRUNCATED] Output exceeded ~4000 token budget. Use grep_trace(pattern) to search for specific content, or expand_trace with fewer span IDs.";
 }
 function spanToSearchableText(span) {
   const parts = [span.name];
@@ -1296,30 +1296,22 @@ function spanToSearchableText(span) {
   }
   return parts.join("\n");
 }
-function expandTrace(spans, { index, range }) {
+function expandTrace(spans, spanIds) {
   const nodes = indexSpans(spans);
   if (nodes.length === 0) {
     return "No spans recorded.";
   }
-  let startIdx;
-  let endIdx;
-  if (range != null) {
-    const parts = range.split("-").map(Number);
-    startIdx = parts[0];
-    endIdx = parts[1] ?? startIdx;
-  } else if (index != null) {
-    startIdx = index;
-    endIdx = index;
-  } else {
-    return "Error: provide either index or range parameter.";
+  if (spanIds.length === 0) {
+    return "Error: provide at least one span ID.";
   }
-  const maxIndex = nodes.length;
-  if (startIdx < 1 || endIdx > maxIndex || startIdx > endIdx) {
-    return `Error: span index out of range. Valid range is 1-${maxIndex}.`;
+  const selected = nodes.filter((n) => {
+    const fullId = n.span.spanContext().spanId;
+    return spanIds.some((prefix) => fullId.startsWith(prefix));
+  });
+  if (selected.length === 0) {
+    const available = nodes.map((n) => n.shortId).join(", ");
+    return `Error: no spans matched the given ID(s). Available span IDs: ${available}`;
   }
-  const selected = nodes.filter(
-    (n) => n.index >= startIdx && n.index <= endIdx
-  );
   const lines = [];
   for (const node of selected) {
     const spanLines = renderFullSpanNode(node);
@@ -1353,7 +1345,7 @@ function grepTrace(spans, pattern) {
   for (const { node, matchingLines } of limited) {
     const duration = calculateSpanDuration(node.span);
     lines.push(
-      `--- [${node.index}] ${node.span.name} (${formatDuration(duration)}) ---`
+      `--- [${node.shortId}] ${node.span.name} (${formatDuration(duration)}) ---`
     );
     for (const line of matchingLines) {
       lines.push(`  ${line}`);
@@ -1847,14 +1839,12 @@ var JudgeSpanDigestFormatter = class {
       `Spans: ${spans.length} | Total Duration: ${formatDuration(totalDuration)}`,
       ""
     ];
-    let sequence = 1;
     const rootCount = tree.length;
     tree.forEach((node, idx) => {
-      sequence = this.renderStructureNode(
+      this.renderStructureNode(
         node,
         lines,
         0,
-        sequence,
         idx === rootCount - 1
       );
     });
@@ -1894,14 +1884,12 @@ var JudgeSpanDigestFormatter = class {
       )}`,
       ""
     ];
-    let sequence = 1;
     const rootCount = tree.length;
     tree.forEach((node, idx) => {
-      sequence = this.renderNode(
+      this.renderNode(
         node,
         lines,
         0,
-        sequence,
         idx === rootCount - 1
       );
     });
@@ -1935,38 +1923,37 @@ var JudgeSpanDigestFormatter = class {
     }
     return roots;
   }
-  renderStructureNode(node, lines, depth, sequence, isLast = true) {
+  renderStructureNode(node, lines, depth, isLast = true) {
     const span = node.span;
+    const shortId = span.spanContext().spanId.slice(0, 8);
     const duration = calculateSpanDuration(span);
     const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
     const status = getStatusIndicator(span);
     const tokens = getTokenUsage(span);
     const prefix = this.getTreePrefix(depth, isLast);
     lines.push(
-      `${prefix}[${sequence}] ${timestamp} ${span.name} (${formatDuration(duration)}${tokens})${status}`
+      `${prefix}[${shortId}] ${timestamp} ${span.name} (${formatDuration(duration)}${tokens})${status}`
     );
     lines.push("");
-    let nextSeq = sequence + 1;
     const childCount = node.children.length;
     node.children.forEach((child, idx) => {
-      nextSeq = this.renderStructureNode(
+      this.renderStructureNode(
         child,
         lines,
         depth + 1,
-        nextSeq,
         idx === childCount - 1
       );
     });
-    return nextSeq;
   }
-  renderNode(node, lines, depth, sequence, isLast = true) {
+  renderNode(node, lines, depth, isLast = true) {
     const span = node.span;
+    const shortId = span.spanContext().spanId.slice(0, 8);
     const duration = calculateSpanDuration(span);
     const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
     const status = getStatusIndicator(span);
     const prefix = this.getTreePrefix(depth, isLast);
     lines.push(
-      `${prefix}[${sequence}] ${timestamp} ${span.name} (${formatDuration(duration)})${status}`
+      `${prefix}[${shortId}] ${timestamp} ${span.name} (${formatDuration(duration)})${status}`
     );
     const attrIndent = this.getAttrIndent(depth, isLast);
     const attrs = cleanAttributes(span.attributes);
@@ -1987,18 +1974,15 @@ var JudgeSpanDigestFormatter = class {
       }
     }
     lines.push("");
-    let nextSeq = sequence + 1;
     const childCount = node.children.length;
     node.children.forEach((child, idx) => {
-      nextSeq = this.renderNode(
+      this.renderNode(
         child,
         lines,
         depth + 1,
-        nextSeq,
         idx === childCount - 1
       );
     });
-    return nextSeq;
   }
   getTreePrefix(depth, isLast) {
     if (depth === 0) return "";
@@ -2109,13 +2093,12 @@ function buildFinishTestTool(criteria) {
 function buildProgressiveDiscoveryTools(spans) {
   return {
     expand_trace: (0, import_ai2.tool)({
-      description: "Expand one or more spans to see their full details (attributes, events, content). Use a single index like 5 or a range like '10-15'.",
+      description: "Expand one or more spans to see their full details (attributes, events, content). Use the span ID shown in brackets in the trace skeleton.",
       inputSchema: import_v44.z.object({
-        index: import_v44.z.number().optional().describe("Single span index to expand"),
-        range: import_v44.z.string().optional().describe('Range of span indices to expand, e.g. "10-15"')
+        span_ids: import_v44.z.array(import_v44.z.string()).describe("Span IDs (or 8-char prefixes) to expand")
       }),
-      execute: async ({ index, range }) => {
-        return expandTrace(spans, { index, range });
+      execute: async ({ span_ids }) => {
+        return expandTrace(spans, span_ids);
       }
     }),
     grep_trace: (0, import_ai2.tool)({
@@ -2181,9 +2164,9 @@ var JudgeAgent = class extends JudgeAgentAdapter {
       ...cfg
     });
     const tools = {
+      ...isLargeTrace ? buildProgressiveDiscoveryTools(spans) : {},
       continue_test: buildContinueTestTool(),
-      finish_test: buildFinishTestTool(criteria),
-      ...isLargeTrace ? buildProgressiveDiscoveryTools(spans) : {}
+      finish_test: buildFinishTestTool(criteria)
     };
     const enforceJudgement = input.judgmentRequest != null;
     const hasCriteria = criteria.length && criteria.length > 0;
@@ -2221,7 +2204,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
   buildTraceDigest(spans) {
     const fullDigest = judgeSpanDigestFormatter.format(spans);
     const isLargeTrace = spans.length > 0 && estimateTokens(fullDigest) > this.tokenThreshold;
-    const digest = isLargeTrace ? judgeSpanDigestFormatter.formatStructureOnly(spans) + "\n\nUse expand_trace(spanIndex) to see span details or grep_trace(pattern) to search across spans." : fullDigest;
+    const digest = isLargeTrace ? judgeSpanDigestFormatter.formatStructureOnly(spans) + "\n\nUse expand_trace(span_id) to see span details or grep_trace(pattern) to search across spans. Reference spans by the ID shown in brackets." : fullDigest;
     this.logger.debug("Trace digest built", {
       isLargeTrace,
       estimatedTokens: estimateTokens(fullDigest)
@@ -2233,6 +2216,10 @@ var JudgeAgent = class extends JudgeAgentAdapter {
    * In multi-step mode, the AI SDK loops automatically: the judge can call
    * expand_trace/grep_trace tools multiple times before reaching a terminal
    * tool (finish_test/continue_test) or hitting the step limit.
+   *
+   * When the trace is large, toolChoice is relaxed to "required" so the
+   * judge can freely pick discovery tools (expand_trace/grep_trace) before
+   * being forced to a terminal decision.
    */
   async invokeLLMWithDiscovery({
     isLargeTrace,
@@ -2240,6 +2227,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
   }) {
     var _a, _b;
     if (isLargeTrace) {
+      params.toolChoice = "required";
       params.stopWhen = [
         (0, import_ai2.stepCountIs)(this.maxDiscoverySteps),
         (0, import_ai2.hasToolCall)("finish_test"),

package/dist/index.mjs CHANGED Viewed

@@ -1175,10 +1175,10 @@ function indexSpans(spans) {
   const sorted = [...spans].sort((a, b) => {
     return hrTimeToMs(a.startTime) - hrTimeToMs(b.startTime);
   });
-  return sorted.map((span, i) => ({
+  return sorted.map((span) => ({
     span,
     children: [],
-    index: i + 1
+    shortId: span.spanContext().spanId.slice(0, 8)
   }));
 }
@@ -1193,7 +1193,7 @@ function renderFullSpanNode(node) {
   const status = getStatusIndicator(span);
   const lines = [];
   lines.push(
-    `[${node.index}] ${timestamp} ${span.name} (${formatDuration(duration)})${status}`
+    `[${node.shortId}] ${timestamp} ${span.name} (${formatDuration(duration)})${status}`
   );
   const attrs = cleanAttributes(span.attributes);
   if (Object.keys(attrs).length > 0) {
@@ -1217,7 +1217,7 @@ function renderFullSpanNode(node) {
 function truncateToCharBudget(text) {
   if (text.length <= TOOL_RESULT_CHAR_BUDGET) return text;
   const truncated = text.slice(0, TOOL_RESULT_CHAR_BUDGET);
-  return truncated + "\n\n[TRUNCATED] Output exceeded ~4000 token budget. Use grep_trace(pattern) to search for specific content, or expand_trace with a narrower range.";
+  return truncated + "\n\n[TRUNCATED] Output exceeded ~4000 token budget. Use grep_trace(pattern) to search for specific content, or expand_trace with fewer span IDs.";
 }
 function spanToSearchableText(span) {
   const parts = [span.name];
@@ -1236,30 +1236,22 @@ function spanToSearchableText(span) {
   }
   return parts.join("\n");
 }
-function expandTrace(spans, { index, range }) {
+function expandTrace(spans, spanIds) {
   const nodes = indexSpans(spans);
   if (nodes.length === 0) {
     return "No spans recorded.";
   }
-  let startIdx;
-  let endIdx;
-  if (range != null) {
-    const parts = range.split("-").map(Number);
-    startIdx = parts[0];
-    endIdx = parts[1] ?? startIdx;
-  } else if (index != null) {
-    startIdx = index;
-    endIdx = index;
-  } else {
-    return "Error: provide either index or range parameter.";
+  if (spanIds.length === 0) {
+    return "Error: provide at least one span ID.";
   }
-  const maxIndex = nodes.length;
-  if (startIdx < 1 || endIdx > maxIndex || startIdx > endIdx) {
-    return `Error: span index out of range. Valid range is 1-${maxIndex}.`;
+  const selected = nodes.filter((n) => {
+    const fullId = n.span.spanContext().spanId;
+    return spanIds.some((prefix) => fullId.startsWith(prefix));
+  });
+  if (selected.length === 0) {
+    const available = nodes.map((n) => n.shortId).join(", ");
+    return `Error: no spans matched the given ID(s). Available span IDs: ${available}`;
   }
-  const selected = nodes.filter(
-    (n) => n.index >= startIdx && n.index <= endIdx
-  );
   const lines = [];
   for (const node of selected) {
     const spanLines = renderFullSpanNode(node);
@@ -1293,7 +1285,7 @@ function grepTrace(spans, pattern) {
   for (const { node, matchingLines } of limited) {
     const duration = calculateSpanDuration(node.span);
     lines.push(
-      `--- [${node.index}] ${node.span.name} (${formatDuration(duration)}) ---`
+      `--- [${node.shortId}] ${node.span.name} (${formatDuration(duration)}) ---`
     );
     for (const line of matchingLines) {
       lines.push(`  ${line}`);
@@ -1787,14 +1779,12 @@ var JudgeSpanDigestFormatter = class {
       `Spans: ${spans.length} | Total Duration: ${formatDuration(totalDuration)}`,
       ""
     ];
-    let sequence = 1;
     const rootCount = tree.length;
     tree.forEach((node, idx) => {
-      sequence = this.renderStructureNode(
+      this.renderStructureNode(
         node,
         lines,
         0,
-        sequence,
         idx === rootCount - 1
       );
     });
@@ -1834,14 +1824,12 @@ var JudgeSpanDigestFormatter = class {
       )}`,
       ""
     ];
-    let sequence = 1;
     const rootCount = tree.length;
     tree.forEach((node, idx) => {
-      sequence = this.renderNode(
+      this.renderNode(
         node,
         lines,
         0,
-        sequence,
         idx === rootCount - 1
       );
     });
@@ -1875,38 +1863,37 @@ var JudgeSpanDigestFormatter = class {
     }
     return roots;
   }
-  renderStructureNode(node, lines, depth, sequence, isLast = true) {
+  renderStructureNode(node, lines, depth, isLast = true) {
     const span = node.span;
+    const shortId = span.spanContext().spanId.slice(0, 8);
     const duration = calculateSpanDuration(span);
     const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
     const status = getStatusIndicator(span);
     const tokens = getTokenUsage(span);
     const prefix = this.getTreePrefix(depth, isLast);
     lines.push(
-      `${prefix}[${sequence}] ${timestamp} ${span.name} (${formatDuration(duration)}${tokens})${status}`
+      `${prefix}[${shortId}] ${timestamp} ${span.name} (${formatDuration(duration)}${tokens})${status}`
     );
     lines.push("");
-    let nextSeq = sequence + 1;
     const childCount = node.children.length;
     node.children.forEach((child, idx) => {
-      nextSeq = this.renderStructureNode(
+      this.renderStructureNode(
         child,
         lines,
         depth + 1,
-        nextSeq,
         idx === childCount - 1
       );
     });
-    return nextSeq;
   }
-  renderNode(node, lines, depth, sequence, isLast = true) {
+  renderNode(node, lines, depth, isLast = true) {
     const span = node.span;
+    const shortId = span.spanContext().spanId.slice(0, 8);
     const duration = calculateSpanDuration(span);
     const timestamp = new Date(hrTimeToMs(span.startTime)).toISOString();
     const status = getStatusIndicator(span);
     const prefix = this.getTreePrefix(depth, isLast);
     lines.push(
-      `${prefix}[${sequence}] ${timestamp} ${span.name} (${formatDuration(duration)})${status}`
+      `${prefix}[${shortId}] ${timestamp} ${span.name} (${formatDuration(duration)})${status}`
     );
     const attrIndent = this.getAttrIndent(depth, isLast);
     const attrs = cleanAttributes(span.attributes);
@@ -1927,18 +1914,15 @@ var JudgeSpanDigestFormatter = class {
       }
     }
     lines.push("");
-    let nextSeq = sequence + 1;
     const childCount = node.children.length;
     node.children.forEach((child, idx) => {
-      nextSeq = this.renderNode(
+      this.renderNode(
         child,
         lines,
         depth + 1,
-        nextSeq,
         idx === childCount - 1
       );
     });
-    return nextSeq;
   }
   getTreePrefix(depth, isLast) {
     if (depth === 0) return "";
@@ -2049,13 +2033,12 @@ function buildFinishTestTool(criteria) {
 function buildProgressiveDiscoveryTools(spans) {
   return {
     expand_trace: tool({
-      description: "Expand one or more spans to see their full details (attributes, events, content). Use a single index like 5 or a range like '10-15'.",
+      description: "Expand one or more spans to see their full details (attributes, events, content). Use the span ID shown in brackets in the trace skeleton.",
       inputSchema: z4.object({
-        index: z4.number().optional().describe("Single span index to expand"),
-        range: z4.string().optional().describe('Range of span indices to expand, e.g. "10-15"')
+        span_ids: z4.array(z4.string()).describe("Span IDs (or 8-char prefixes) to expand")
       }),
-      execute: async ({ index, range }) => {
-        return expandTrace(spans, { index, range });
+      execute: async ({ span_ids }) => {
+        return expandTrace(spans, span_ids);
       }
     }),
     grep_trace: tool({
@@ -2121,9 +2104,9 @@ var JudgeAgent = class extends JudgeAgentAdapter {
       ...cfg
     });
     const tools = {
+      ...isLargeTrace ? buildProgressiveDiscoveryTools(spans) : {},
       continue_test: buildContinueTestTool(),
-      finish_test: buildFinishTestTool(criteria),
-      ...isLargeTrace ? buildProgressiveDiscoveryTools(spans) : {}
+      finish_test: buildFinishTestTool(criteria)
     };
     const enforceJudgement = input.judgmentRequest != null;
     const hasCriteria = criteria.length && criteria.length > 0;
@@ -2161,7 +2144,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
   buildTraceDigest(spans) {
     const fullDigest = judgeSpanDigestFormatter.format(spans);
     const isLargeTrace = spans.length > 0 && estimateTokens(fullDigest) > this.tokenThreshold;
-    const digest = isLargeTrace ? judgeSpanDigestFormatter.formatStructureOnly(spans) + "\n\nUse expand_trace(spanIndex) to see span details or grep_trace(pattern) to search across spans." : fullDigest;
+    const digest = isLargeTrace ? judgeSpanDigestFormatter.formatStructureOnly(spans) + "\n\nUse expand_trace(span_id) to see span details or grep_trace(pattern) to search across spans. Reference spans by the ID shown in brackets." : fullDigest;
     this.logger.debug("Trace digest built", {
       isLargeTrace,
       estimatedTokens: estimateTokens(fullDigest)
@@ -2173,6 +2156,10 @@ var JudgeAgent = class extends JudgeAgentAdapter {
    * In multi-step mode, the AI SDK loops automatically: the judge can call
    * expand_trace/grep_trace tools multiple times before reaching a terminal
    * tool (finish_test/continue_test) or hitting the step limit.
+   *
+   * When the trace is large, toolChoice is relaxed to "required" so the
+   * judge can freely pick discovery tools (expand_trace/grep_trace) before
+   * being forced to a terminal decision.
    */
   async invokeLLMWithDiscovery({
     isLargeTrace,
@@ -2180,6 +2167,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
   }) {
     var _a, _b;
     if (isLargeTrace) {
+      params.toolChoice = "required";
       params.stopWhen = [
         stepCountIs(this.maxDiscoverySteps),
         hasToolCall("finish_test"),

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@langwatch/scenario",
-  "version": "0.4.4",
+  "version": "0.4.5",
   "description": "A TypeScript library for testing AI agents using scenarios",
   "main": "dist/index.js",
   "module": "dist/index.mjs",