npm - @langwatch/scenario - Versions diffs - 0.4.0 → 0.4.2 - Mend

@langwatch/scenario 0.4.0 → 0.4.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (7)

package/dist/index.d.mts +70 -47
package/dist/index.d.ts +70 -47
package/dist/index.js +153 -87
package/dist/index.mjs +153 -87
package/dist/integrations/vitest/setup.js +1 -1
package/dist/integrations/vitest/setup.mjs +1 -1
package/package.json +4 -4

package/dist/index.mjs CHANGED Viewed

@@ -131,7 +131,7 @@ var DEFAULT_TEMPERATURE = 0;
 var modelSchema = z2.object({
   model: z2.custom((val) => Boolean(val), {
     message: "A model is required. Configure it in scenario.config.js defaultModel or pass directly to the agent."
-  }).describe("The OpenAI Language Model to use for generating responses."),
+  }).describe("Language model that is used by the AI SDK Core functions."),
   temperature: z2.number().min(0).max(1).optional().describe("The temperature for the language model.").default(DEFAULT_TEMPERATURE),
   maxTokens: z2.number().optional().describe("The maximum number of tokens to generate.")
 });
@@ -397,7 +397,7 @@ var JudgeUtils = {
   /**
    * Builds a minimal transcript from messages for judge evaluation.
    * Truncates base64 media to reduce token usage.
-   * @param messages - Array of CoreMessage from conversation
+   * @param messages - Array of ModelMessage from conversation
    * @returns Plain text transcript with one message per line
    */
   buildTranscriptFromMessages(messages) {
@@ -428,52 +428,68 @@ var createLLMInvoker = (logger2) => {
 var toolMessageRole = "tool";
 var assistantMessageRole = "assistant";
 var userMessageRole = "user";
-var groupMessagesByToolBoundaries = (messages) => {
-  const segments = [];
-  let currentSegment = [];
-  for (const message2 of messages) {
-    currentSegment.push(message2);
-    if (message2.role === toolMessageRole) {
-      segments.push(currentSegment);
-      currentSegment = [];
-    }
-  }
-  if (currentSegment.length > 0) {
-    segments.push(currentSegment);
+var hasToolContent = (message2) => {
+  if (message2.role === toolMessageRole) return true;
+  if (!Array.isArray(message2.content)) return false;
+  return message2.content.some((part) => {
+    if (!part || typeof part !== "object") return false;
+    const partType = "type" in part ? part.type : void 0;
+    return partType === "tool-call" || partType === "tool-result";
+  });
+};
+var stringifyValue = (value) => {
+  if (typeof value === "string") return value;
+  if (value === void 0) return "undefined";
+  try {
+    const serialized = JSON.stringify(value);
+    return serialized === void 0 ? String(value) : serialized;
+  } catch {
+    return String(value);
   }
-  return segments;
 };
-var segmentHasToolMessages = (segment) => {
-  return segment.some((message2) => {
-    if (message2.role === toolMessageRole) return true;
-    if (message2.role === assistantMessageRole && Array.isArray(message2.content)) {
-      return message2.content.some((part) => part.type === "tool-call");
-    }
-    return false;
+var summarizeToolMessage = (message2) => {
+  if (message2.role === toolMessageRole && !Array.isArray(message2.content)) {
+    return `[Tool message: ${stringifyValue(message2.content)}]`;
+  }
+  if (message2.role === toolMessageRole) {
+    const toolResults = message2.content.filter((part) => part.type === "tool-result").map((part) => {
+      const contentPart = part;
+      const name = contentPart.toolName ?? "unknown tool";
+      const output = contentPart.output;
+      const value = output && typeof output === "object" && "value" in output && typeof output.value === "string" ? output.value : output ?? contentPart.result;
+      return `[Tool result from ${name}: ${stringifyValue(value)}]`;
+    });
+    return toolResults.length > 0 ? toolResults.join("\n") : null;
+  }
+  if (!Array.isArray(message2.content)) return null;
+  const toolCalls = message2.content.filter((part) => part.type === "tool-call").map((part) => {
+    const contentPart = part;
+    const name = contentPart.toolName ?? "unknown tool";
+    return `[Called tool ${name} with: ${stringifyValue(contentPart.input)}]`;
   });
+  return toolCalls.length > 0 ? toolCalls.join("\n") : null;
 };
-var reverseSegmentRoles = (segment) => {
-  return segment.map((message2) => {
-    const hasStringContent = typeof message2.content === "string";
-    if (!hasStringContent) return message2;
-    const roleMap = {
-      [userMessageRole]: assistantMessageRole,
-      [assistantMessageRole]: userMessageRole
-    };
+var messageRoleReversal = (messages) => {
+  const roleMap = {
+    [userMessageRole]: assistantMessageRole,
+    [assistantMessageRole]: userMessageRole
+  };
+  return messages.map((message2) => {
+    if (hasToolContent(message2)) {
+      const summary = summarizeToolMessage(message2);
+      if (!summary) return null;
+      return {
+        role: userMessageRole,
+        content: summary
+      };
+    }
     const newRole = roleMap[message2.role];
     if (!newRole) return message2;
     return {
-      role: newRole,
-      content: message2.content
+      ...message2,
+      role: newRole
     };
-  });
-};
-var messageRoleReversal = (messages) => {
-  const segments = groupMessagesByToolBoundaries(messages);
-  const processedSegments = segments.map(
-    (segment) => segmentHasToolMessages(segment) ? segment : reverseSegmentRoles(segment)
-  );
-  return processedSegments.flat();
+  }).filter((message2) => message2 !== null);
 };
 var criterionToParamName = (criterion) => {
   return criterion.replace(/"/g, "").replace(/[^a-zA-Z0-9]/g, "_").replace(/ /g, "_").toLowerCase().substring(0, 70);
@@ -835,7 +851,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
   constructor(cfg) {
     super();
     this.cfg = cfg;
-    this.criteria = cfg.criteria;
+    this.criteria = cfg.criteria ?? [];
     this.spanCollector = cfg.spanCollector ?? judgeSpanCollector;
   }
   logger = new Logger("JudgeAgent");
@@ -847,7 +863,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
    */
   invokeLLM = createLLMInvoker(this.logger);
   async call(input) {
-    var _a, _b, _c;
+    var _a, _b, _c, _d;
+    const criteria = ((_a = input.judgmentRequest) == null ? void 0 : _a.criteria) ?? this.criteria;
     this.logger.debug("call() invoked", {
       threadId: input.threadId,
       currentTurn: input.scenarioState.currentTurn,
@@ -866,7 +883,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
     </opentelemetry_traces>
     `;
     const cfg = this.cfg;
-    const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(cfg.criteria, input.scenarioConfig.description);
+    const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(criteria, input.scenarioConfig.description);
     const messages = [
       { role: "system", content: systemPrompt },
       { role: "user", content: contentForJudge }
@@ -879,10 +896,10 @@ var JudgeAgent = class extends JudgeAgentAdapter {
     });
     const tools = {
       continue_test: buildContinueTestTool(),
-      finish_test: buildFinishTestTool(cfg.criteria)
+      finish_test: buildFinishTestTool(criteria)
     };
-    const enforceJudgement = input.judgmentRequest;
-    const hasCriteria = cfg.criteria.length && cfg.criteria.length > 0;
+    const enforceJudgement = input.judgmentRequest != null;
+    const hasCriteria = criteria.length && criteria.length > 0;
     if (enforceJudgement && !hasCriteria) {
       return {
         success: false,
@@ -907,26 +924,26 @@ var JudgeAgent = class extends JudgeAgentAdapter {
       toolChoice
     });
     this.logger.debug("LLM response received", {
-      toolCallCount: ((_a = completion.toolCalls) == null ? void 0 : _a.length) ?? 0,
-      toolCalls: (_b = completion.toolCalls) == null ? void 0 : _b.map((tc) => ({
+      toolCallCount: ((_b = completion.toolCalls) == null ? void 0 : _b.length) ?? 0,
+      toolCalls: (_c = completion.toolCalls) == null ? void 0 : _c.map((tc) => ({
         toolName: tc.toolName,
         args: tc.input
       }))
     });
     let args;
-    if ((_c = completion.toolCalls) == null ? void 0 : _c.length) {
+    if ((_d = completion.toolCalls) == null ? void 0 : _d.length) {
       const toolCall = completion.toolCalls[0];
       switch (toolCall.toolName) {
         case "finish_test": {
           args = toolCall.input;
           const verdict = args.verdict || "inconclusive";
           const reasoning = args.reasoning || "No reasoning provided";
-          const criteria = args.criteria || {};
-          const criteriaValues = Object.values(criteria);
-          const metCriteria = cfg.criteria.filter(
+          const criteriaArgs = args.criteria || {};
+          const criteriaValues = Object.values(criteriaArgs);
+          const metCriteria = criteria.filter(
             (_, i) => criteriaValues[i] === "true"
           );
-          const unmetCriteria = cfg.criteria.filter(
+          const unmetCriteria = criteria.filter(
             (_, i) => criteriaValues[i] !== "true"
           );
           const result = {
@@ -946,7 +963,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
             success: false,
             reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
             metCriteria: [],
-            unmetCriteria: cfg.criteria
+            unmetCriteria: criteria
           };
       }
     }
@@ -954,7 +971,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
       success: false,
       reasoning: `JudgeAgent: No tool call found in LLM output`,
       metCriteria: [],
-      unmetCriteria: cfg.criteria
+      unmetCriteria: criteria
     };
   }
   getOpenTelemetryTracesDigest(threadId) {
@@ -964,7 +981,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
   }
 };
 var judgeAgent = (cfg) => {
-  return new JudgeAgent(cfg);
+  return new JudgeAgent(cfg ?? {});
 };
 // src/agents/user-simulator-agent.ts
@@ -2408,13 +2425,15 @@ function convertModelMessagesToAguiMessages(modelMessages) {
       }
       case msg.role === "tool":
         msg.content.map((p, i) => {
-          var _a;
+          if ("type" in p && p.type !== "tool-result") return;
           aguiMessages.push({
             trace_id: msg.traceId,
             id: `${id}-${i}`,
             role: "tool",
             toolCallId: p.toolCallId,
-            content: JSON.stringify((_a = p.output) == null ? void 0 : _a.value)
+            content: JSON.stringify(
+              p.output && "value" in p.output ? p.output.value : p.output
+            )
           });
         });
         break;
@@ -2458,6 +2477,8 @@ var ScenarioExecution = class {
   currentTurnSpan;
   /** Timestamp when execution started (for total time calculation) */
   totalStartTime = 0;
+  /** Accumulated results from inline judge checkpoints */
+  checkpointResults = [];
   /** Event stream for monitoring scenario progress */
   eventSubject = new Subject2();
   /**
@@ -2535,6 +2556,7 @@ var ScenarioExecution = class {
       totalTime: this.totalTime,
       agentTime: totalAgentTime
     };
+    return this._result;
     this.logger.debug(`[${this.config.id}] Result set`, {
       success: result.success,
       reasoning: result.reasoning,
@@ -2595,6 +2617,8 @@ var ScenarioExecution = class {
         const scriptStep = this.config.script[i];
         await this.executeScriptStep(scriptStep, i);
         if (this.result) {
+          const cp = this.compiledCheckpoints;
+          this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
           this.emitRunFinished({
             scenarioRunId,
             status: this.result.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
@@ -2603,7 +2627,22 @@ var ScenarioExecution = class {
           return this.result;
         }
       }
-      this.reachedMaxTurns(
+      if (this.checkpointResults.length > 0) {
+        const cp = this.compiledCheckpoints;
+        const result2 = this.setResult({
+          success: cp.unmetCriteria.length === 0,
+          reasoning: "All inline criteria checkpoints passed",
+          metCriteria: cp.metCriteria,
+          unmetCriteria: cp.unmetCriteria
+        });
+        this.emitRunFinished({
+          scenarioRunId,
+          status: result2.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
+          result: result2
+        });
+        return result2;
+      }
+      const result = this.reachedMaxTurns(
         [
           "Reached end of script without conclusion, add one of the following to the end of the script:",
           "- `Scenario.proceed()` to let the simulation continue to play out",
@@ -2611,11 +2650,11 @@ var ScenarioExecution = class {
           "- `Scenario.succeed()` or `Scenario.fail()` to end the test with an explicit result"
         ].join("\n")
       );
-      this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */ });
-      return this.result;
+      this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */, result });
+      return result;
     } catch (error) {
       const errorInfo = extractErrorInfo(error);
-      this.setResult({
+      const result = this.setResult({
         success: false,
         reasoning: `Scenario failed with error: ${errorInfo.message}`,
         metCriteria: [],
@@ -2625,7 +2664,7 @@ var ScenarioExecution = class {
       this.emitRunFinished({
         scenarioRunId,
         status: "ERROR" /* ERROR */,
-        result: this.result
+        result
       });
       throw error;
     } finally {
@@ -2729,7 +2768,7 @@ var ScenarioExecution = class {
    * @param judgmentRequest - Whether this is a judgment request (for judge agents)
    * @throws Error if the agent call fails
    */
-  async callAgent(idx, role, judgmentRequest = false) {
+  async callAgent(idx, role, judgmentRequest) {
     var _a;
     const agent2 = this.agents[idx];
     const agentName = agent2.name ?? agent2.constructor.name;
@@ -2920,25 +2959,26 @@ var ScenarioExecution = class {
    *
    * This method is part of the ScenarioExecutionLike interface used by script steps.
    *
-   * @param content - Optional message to pass to the judge agent for additional context
+   * @param options - Optional options with inline criteria to evaluate as a checkpoint.
    * @returns A promise that resolves with:
    *   - ScenarioResult if the judge makes a final decision, or
    *   - Null if the conversation should continue
    *
    * @example
    * ```typescript
-   * // Let judge evaluate current state
+   * // Let judge evaluate with its configured criteria
    * const result = await execution.judge();
-   * if (result) {
-   *   console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
-   * }
    *
-   * // Provide additional context to judge
-   * const result = await execution.judge("Please consider the user's satisfaction level");
+   * // Evaluate inline criteria as a checkpoint
+   * const result = await execution.judge({ criteria: ["Agent responded helpfully"] });
    * ```
    */
-  async judge(content) {
-    return await this.scriptCallAgent("Judge" /* JUDGE */, content, true);
+  async judge(options) {
+    return await this.scriptCallAgent(
+      "Judge" /* JUDGE */,
+      void 0,
+      { criteria: options == null ? void 0 : options.criteria }
+    );
   }
   /**
    * Lets the scenario proceed automatically for a specified number of turns.
@@ -3023,13 +3063,12 @@ var ScenarioExecution = class {
    * ```
    */
   async succeed(reasoning) {
-    this.setResult({
+    return this.setResult({
       success: true,
       reasoning: reasoning || "Scenario marked as successful with Scenario.succeed()",
       metCriteria: [],
       unmetCriteria: []
     });
-    return this.result;
   }
   /**
    * Immediately ends the scenario with a failure verdict.
@@ -3055,13 +3094,12 @@ var ScenarioExecution = class {
    * ```
    */
   async fail(reasoning) {
-    this.setResult({
+    return this.setResult({
       success: false,
       reasoning: reasoning || "Scenario marked as failed with Scenario.fail()",
       metCriteria: [],
       unmetCriteria: []
     });
-    return this.result;
   }
   /**
    * Adds execution time for a specific agent to the performance tracking.
@@ -3105,15 +3143,14 @@ var ScenarioExecution = class {
    *          decision, or null if the conversation should continue
    * @throws Error if no agent is found for the specified role
    */
-  async scriptCallAgent(role, content, judgmentRequest = false) {
+  async scriptCallAgent(role, content, judgmentRequest) {
     this.logger.debug(`[${this.config.id}] scriptCallAgent`, {
       role,
       hasContent: content !== void 0,
-      judgmentRequest
+      judgmentRequest: judgmentRequest != null,
+      hasInlineCriteria: (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null
     });
     this.consumeUntilRole(role);
-    let index = -1;
-    let agent2 = null;
     let nextAgent = this.getNextAgentForRole(role);
     if (!nextAgent) {
       this.newTurn();
@@ -3143,8 +3180,8 @@ var ScenarioExecution = class {
         `Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
       );
     }
-    index = nextAgent.index;
-    agent2 = nextAgent.agent;
+    const index = nextAgent.index;
+    const agent2 = nextAgent.agent;
     this.removePendingAgent(agent2);
     if (content) {
       const message2 = typeof content === "string" ? {
@@ -3156,6 +3193,25 @@ var ScenarioExecution = class {
       return null;
     }
     await this.callAgent(index, role, judgmentRequest);
+    if (this.result && (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null) {
+      this.checkpointResults.push({
+        metCriteria: this.result.metCriteria,
+        unmetCriteria: this.result.unmetCriteria
+      });
+      if (this.result.success) {
+        this._result = void 0;
+        return null;
+      } else {
+        const cp = this.compiledCheckpoints;
+        this.result.metCriteria = cp.metCriteria;
+        this.result.unmetCriteria = cp.unmetCriteria;
+        return this.result;
+      }
+    }
+    if (this.result) {
+      const cp = this.compiledCheckpoints;
+      this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
+    }
     return this.result ?? null;
   }
   /**
@@ -3188,11 +3244,22 @@ var ScenarioExecution = class {
     this.totalStartTime = Date.now();
     this.pendingMessages.clear();
     this._result = void 0;
+    this.checkpointResults = [];
     this.logger.debug(`[${this.config.id}] Reset complete`, {
       threadId: this.state.threadId,
       agentCount: this.agents.length
     });
   }
+  /** Compiles all accumulated checkpoint results into aggregated met/unmet criteria. */
+  get compiledCheckpoints() {
+    const metCriteria = [];
+    const unmetCriteria = [];
+    for (const cp of this.checkpointResults) {
+      metCriteria.push(...cp.metCriteria);
+      unmetCriteria.push(...cp.unmetCriteria);
+    }
+    return { metCriteria, unmetCriteria };
+  }
   nextAgentForRole(role) {
     for (const agent2 of this.agents) {
       if (agent2.role === role && this.pendingAgentsOnTurn.has(agent2) && this.pendingRolesOnTurn.includes(role)) {
@@ -3289,7 +3356,7 @@ var ScenarioExecution = class {
    */
   reachedMaxTurns(errorMessage) {
     var _a;
-    this.setResult({
+    return this.setResult({
       success: false,
       reasoning: errorMessage || `Reached maximum turns (${this.config.maxTurns || 10}) without conclusion`,
       metCriteria: [],
@@ -3797,9 +3864,9 @@ var message = (message2) => {
 var agent = (content) => {
   return (_state, executor) => executor.agent(content);
 };
-var judge = (content) => {
+var judge = (options) => {
   return async (_state, executor) => {
-    await executor.judge(content);
+    await executor.judge(options);
   };
 };
 var user = (content) => {
@@ -3911,7 +3978,6 @@ function formatPart(part) {
     case "reasoning":
       return `(reasoning): ${part.text}`;
     default:
-      part;
       return `Unknown content: ${JSON.stringify(part)}`;
   }
 }

package/dist/integrations/vitest/setup.js CHANGED Viewed

@@ -104,7 +104,7 @@ var DEFAULT_TEMPERATURE = 0;
 var modelSchema = import_v42.z.object({
   model: import_v42.z.custom((val) => Boolean(val), {
     message: "A model is required. Configure it in scenario.config.js defaultModel or pass directly to the agent."
-  }).describe("The OpenAI Language Model to use for generating responses."),
+  }).describe("Language model that is used by the AI SDK Core functions."),
   temperature: import_v42.z.number().min(0).max(1).optional().describe("The temperature for the language model.").default(DEFAULT_TEMPERATURE),
   maxTokens: import_v42.z.number().optional().describe("The maximum number of tokens to generate.")
 });

package/dist/integrations/vitest/setup.mjs CHANGED Viewed

@@ -87,7 +87,7 @@ var DEFAULT_TEMPERATURE = 0;
 var modelSchema = z2.object({
   model: z2.custom((val) => Boolean(val), {
     message: "A model is required. Configure it in scenario.config.js defaultModel or pass directly to the agent."
-  }).describe("The OpenAI Language Model to use for generating responses."),
+  }).describe("Language model that is used by the AI SDK Core functions."),
   temperature: z2.number().min(0).max(1).optional().describe("The temperature for the language model.").default(DEFAULT_TEMPERATURE),
   maxTokens: z2.number().optional().describe("The maximum number of tokens to generate.")
 });

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@langwatch/scenario",
-  "version": "0.4.0",
+  "version": "0.4.2",
   "description": "A TypeScript library for testing AI agents using scenarios",
   "main": "dist/index.js",
   "module": "dist/index.mjs",
@@ -29,9 +29,9 @@
   },
   "dependencies": {
     "@ag-ui/core": "^0.0.28",
-    "@ai-sdk/openai": "^2.0.74",
+    "@ai-sdk/openai": "^3.0.26",
     "@openai/agents": "^0.3.3",
-    "ai": "5.0.104",
+    "ai": "^6.0.0",
     "chalk": "^5.6.2",
     "langwatch": "0.9.0",
     "open": "11.0.0",
@@ -88,7 +88,7 @@
     }
   },
   "peerDependencies": {
-    "ai": ">=5.0.0",
+    "ai": ">=6.0.0",
     "vitest": ">=3.2.4"
   },
   "scripts": {