npm - @langwatch/scenario - Versions diffs - 0.4.1 → 0.4.2 - Mend

@langwatch/scenario 0.4.1 → 0.4.2

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (5)

package/dist/index.d.mts CHANGED Viewed

@@ -20,6 +20,18 @@ declare enum AgentRole {
     JUDGE = "Judge"
 }
 declare const allAgentRoles: readonly [AgentRole.USER, AgentRole.AGENT, AgentRole.JUDGE];
+/**
+ * Encapsulates a request for the judge agent to evaluate the conversation.
+ *
+ * When present on AgentInput, signals the judge to produce a verdict.
+ * Optionally carries inline criteria that override the judge's own criteria.
+ */
+interface JudgmentRequest {
+    /**
+     * Optional criteria to evaluate, overriding the judge agent's configured criteria.
+     */
+    criteria?: string[];
+}
 /**
  * Input provided to an agent's `call` method.
  */
@@ -41,9 +53,9 @@ interface AgentInput {
      */
     requestedRole: AgentRole;
     /**
-     * Whether a judgment is being requested in this turn.
+     * When set, requests the judge to produce a verdict, optionally with inline criteria.
      */
-    judgmentRequest: boolean;
+    judgmentRequest?: JudgmentRequest;
     /**
      * The current state of the scenario execution.
      */
@@ -215,10 +227,12 @@ interface ScenarioExecutionLike {
     agent(content?: string | ModelMessage): Promise<void>;
     /**
      * Invokes the judge agent to evaluate the current state.
-     * @param content Optional message to the judge.
+     * @param options Optional options with inline criteria to evaluate as a checkpoint.
      * @returns The result of the scenario if the judge makes a final decision.
      */
-    judge(content?: string | ModelMessage): Promise<ScenarioResult | null>;
+    judge(options?: {
+        criteria?: string[];
+    }): Promise<ScenarioResult | null>;
     /**
      * Proceeds with the scenario automatically for a number of turns.
      * @param turns The number of turns to proceed. Defaults to running until the scenario ends.
@@ -364,6 +378,7 @@ declare const domain_DEFAULT_MAX_TURNS: typeof DEFAULT_MAX_TURNS;
 declare const domain_DEFAULT_VERBOSE: typeof DEFAULT_VERBOSE;
 type domain_JudgeAgentAdapter = JudgeAgentAdapter;
 declare const domain_JudgeAgentAdapter: typeof JudgeAgentAdapter;
+type domain_JudgmentRequest = JudgmentRequest;
 type domain_ScenarioConfig = ScenarioConfig;
 type domain_ScenarioConfigFinal = ScenarioConfigFinal;
 type domain_ScenarioExecutionLike = ScenarioExecutionLike;
@@ -377,7 +392,7 @@ declare const domain_allAgentRoles: typeof allAgentRoles;
 declare const domain_defineConfig: typeof defineConfig;
 declare const domain_scenarioProjectConfigSchema: typeof scenarioProjectConfigSchema;
 declare namespace domain {
-  export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
+  export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_JudgmentRequest as JudgmentRequest, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
 }
 /**
@@ -475,7 +490,7 @@ interface JudgeAgentConfig extends TestingAgentConfig {
     /**
      * The criteria that the judge will use to evaluate the conversation.
      */
-    criteria: string[];
+    criteria?: string[];
     /**
      * Optional span collector for telemetry. Defaults to global singleton.
      */
@@ -554,7 +569,7 @@ declare class JudgeAgent extends JudgeAgentAdapter {
  * main();
  * ```
  */
-declare const judgeAgent: (cfg: JudgeAgentConfig) => JudgeAgent;
+declare const judgeAgent: (cfg?: JudgeAgentConfig) => JudgeAgent;
 /**
  * Transforms OpenTelemetry spans into a complete plain-text digest for judge evaluation.
@@ -1334,6 +1349,8 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
     private currentTurnSpan?;
     /** Timestamp when execution started (for total time calculation) */
     private totalStartTime;
+    /** Accumulated results from inline judge checkpoints */
+    private checkpointResults;
     /** Event stream for monitoring scenario progress */
     private eventSubject;
     /**
@@ -1554,24 +1571,23 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
      *
      * This method is part of the ScenarioExecutionLike interface used by script steps.
      *
-     * @param content - Optional message to pass to the judge agent for additional context
+     * @param options - Optional options with inline criteria to evaluate as a checkpoint.
      * @returns A promise that resolves with:
      *   - ScenarioResult if the judge makes a final decision, or
      *   - Null if the conversation should continue
      *
      * @example
      * ```typescript
-     * // Let judge evaluate current state
+     * // Let judge evaluate with its configured criteria
      * const result = await execution.judge();
-     * if (result) {
-     *   console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
-     * }
      *
-     * // Provide additional context to judge
-     * const result = await execution.judge("Please consider the user's satisfaction level");
+     * // Evaluate inline criteria as a checkpoint
+     * const result = await execution.judge({ criteria: ["Agent responded helpfully"] });
      * ```
      */
-    judge(content?: string | ModelMessage): Promise<ScenarioResult | null>;
+    judge(options?: {
+        criteria?: string[];
+    }): Promise<ScenarioResult | null>;
     /**
      * Lets the scenario proceed automatically for a specified number of turns.
      *
@@ -1718,6 +1734,8 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
      * - Clears the result from any previous execution
      */
     private reset;
+    /** Compiles all accumulated checkpoint results into aggregated met/unmet criteria. */
+    private get compiledCheckpoints();
     private nextAgentForRole;
     /**
      * Starts a new turn in the scenario execution.
@@ -1980,15 +1998,20 @@ declare const agent: (content?: string | ModelMessage) => ScriptStep;
 /**
  * Invoke the judge agent to evaluate the current conversation state.
  *
- * This function forces the judge agent to make a decision about whether
- * the scenario should continue or end with a success/failure verdict.
- * The judge will evaluate based on its configured criteria.
+ * When criteria are provided inline, the judge evaluates only those criteria
+ * as a checkpoint: if all pass, the scenario continues; if any fail, the
+ * scenario fails immediately. This is the preferred way to pass criteria
+ * when using scripts.
  *
- * @param content Optional message content for the judge. Usually undefined to let
- *                the judge evaluate based on its criteria.
+ * When no criteria are provided, the judge uses its own configured criteria
+ * and returns a final verdict (success or failure), ending the scenario.
+ *
+ * @param options Optional options object with inline criteria to evaluate.
  * @returns A ScriptStep function that can be used in scenario scripts.
  */
-declare const judge: (content?: string | ModelMessage) => ScriptStep;
+declare const judge: (options?: {
+    criteria: string[];
+}) => ScriptStep;
 /**
  * Generate or specify a user message in the conversation.
  *
@@ -2048,4 +2071,4 @@ declare namespace script {
 type ScenarioApi = typeof agents & typeof domain & typeof execution & typeof runner & typeof script;
 declare const scenario: ScenarioApi;
-export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type AudioResponseEvent, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, type FinishTestArgs, type InvokeLLMParams, type InvokeLLMResult, JudgeAgentAdapter, type JudgeAgentConfig, type JudgeResult, JudgeSpanCollector, JudgeSpanDigestFormatter, RealtimeAgentAdapter, type RealtimeAgentAdapterConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, judgeSpanCollector, judgeSpanDigestFormatter, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };
+export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type AudioResponseEvent, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, type FinishTestArgs, type InvokeLLMParams, type InvokeLLMResult, JudgeAgentAdapter, type JudgeAgentConfig, type JudgeResult, JudgeSpanCollector, JudgeSpanDigestFormatter, type JudgmentRequest, RealtimeAgentAdapter, type RealtimeAgentAdapterConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, judgeSpanCollector, judgeSpanDigestFormatter, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };

package/dist/index.d.ts CHANGED Viewed

@@ -20,6 +20,18 @@ declare enum AgentRole {
     JUDGE = "Judge"
 }
 declare const allAgentRoles: readonly [AgentRole.USER, AgentRole.AGENT, AgentRole.JUDGE];
+/**
+ * Encapsulates a request for the judge agent to evaluate the conversation.
+ *
+ * When present on AgentInput, signals the judge to produce a verdict.
+ * Optionally carries inline criteria that override the judge's own criteria.
+ */
+interface JudgmentRequest {
+    /**
+     * Optional criteria to evaluate, overriding the judge agent's configured criteria.
+     */
+    criteria?: string[];
+}
 /**
  * Input provided to an agent's `call` method.
  */
@@ -41,9 +53,9 @@ interface AgentInput {
      */
     requestedRole: AgentRole;
     /**
-     * Whether a judgment is being requested in this turn.
+     * When set, requests the judge to produce a verdict, optionally with inline criteria.
      */
-    judgmentRequest: boolean;
+    judgmentRequest?: JudgmentRequest;
     /**
      * The current state of the scenario execution.
      */
@@ -215,10 +227,12 @@ interface ScenarioExecutionLike {
     agent(content?: string | ModelMessage): Promise<void>;
     /**
      * Invokes the judge agent to evaluate the current state.
-     * @param content Optional message to the judge.
+     * @param options Optional options with inline criteria to evaluate as a checkpoint.
      * @returns The result of the scenario if the judge makes a final decision.
      */
-    judge(content?: string | ModelMessage): Promise<ScenarioResult | null>;
+    judge(options?: {
+        criteria?: string[];
+    }): Promise<ScenarioResult | null>;
     /**
      * Proceeds with the scenario automatically for a number of turns.
      * @param turns The number of turns to proceed. Defaults to running until the scenario ends.
@@ -364,6 +378,7 @@ declare const domain_DEFAULT_MAX_TURNS: typeof DEFAULT_MAX_TURNS;
 declare const domain_DEFAULT_VERBOSE: typeof DEFAULT_VERBOSE;
 type domain_JudgeAgentAdapter = JudgeAgentAdapter;
 declare const domain_JudgeAgentAdapter: typeof JudgeAgentAdapter;
+type domain_JudgmentRequest = JudgmentRequest;
 type domain_ScenarioConfig = ScenarioConfig;
 type domain_ScenarioConfigFinal = ScenarioConfigFinal;
 type domain_ScenarioExecutionLike = ScenarioExecutionLike;
@@ -377,7 +392,7 @@ declare const domain_allAgentRoles: typeof allAgentRoles;
 declare const domain_defineConfig: typeof defineConfig;
 declare const domain_scenarioProjectConfigSchema: typeof scenarioProjectConfigSchema;
 declare namespace domain {
-  export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
+  export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_JudgmentRequest as JudgmentRequest, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
 }
 /**
@@ -475,7 +490,7 @@ interface JudgeAgentConfig extends TestingAgentConfig {
     /**
      * The criteria that the judge will use to evaluate the conversation.
      */
-    criteria: string[];
+    criteria?: string[];
     /**
      * Optional span collector for telemetry. Defaults to global singleton.
      */
@@ -554,7 +569,7 @@ declare class JudgeAgent extends JudgeAgentAdapter {
  * main();
  * ```
  */
-declare const judgeAgent: (cfg: JudgeAgentConfig) => JudgeAgent;
+declare const judgeAgent: (cfg?: JudgeAgentConfig) => JudgeAgent;
 /**
  * Transforms OpenTelemetry spans into a complete plain-text digest for judge evaluation.
@@ -1334,6 +1349,8 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
     private currentTurnSpan?;
     /** Timestamp when execution started (for total time calculation) */
     private totalStartTime;
+    /** Accumulated results from inline judge checkpoints */
+    private checkpointResults;
     /** Event stream for monitoring scenario progress */
     private eventSubject;
     /**
@@ -1554,24 +1571,23 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
      *
      * This method is part of the ScenarioExecutionLike interface used by script steps.
      *
-     * @param content - Optional message to pass to the judge agent for additional context
+     * @param options - Optional options with inline criteria to evaluate as a checkpoint.
      * @returns A promise that resolves with:
      *   - ScenarioResult if the judge makes a final decision, or
      *   - Null if the conversation should continue
      *
      * @example
      * ```typescript
-     * // Let judge evaluate current state
+     * // Let judge evaluate with its configured criteria
      * const result = await execution.judge();
-     * if (result) {
-     *   console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
-     * }
      *
-     * // Provide additional context to judge
-     * const result = await execution.judge("Please consider the user's satisfaction level");
+     * // Evaluate inline criteria as a checkpoint
+     * const result = await execution.judge({ criteria: ["Agent responded helpfully"] });
      * ```
      */
-    judge(content?: string | ModelMessage): Promise<ScenarioResult | null>;
+    judge(options?: {
+        criteria?: string[];
+    }): Promise<ScenarioResult | null>;
     /**
      * Lets the scenario proceed automatically for a specified number of turns.
      *
@@ -1718,6 +1734,8 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
      * - Clears the result from any previous execution
      */
     private reset;
+    /** Compiles all accumulated checkpoint results into aggregated met/unmet criteria. */
+    private get compiledCheckpoints();
     private nextAgentForRole;
     /**
      * Starts a new turn in the scenario execution.
@@ -1980,15 +1998,20 @@ declare const agent: (content?: string | ModelMessage) => ScriptStep;
 /**
  * Invoke the judge agent to evaluate the current conversation state.
  *
- * This function forces the judge agent to make a decision about whether
- * the scenario should continue or end with a success/failure verdict.
- * The judge will evaluate based on its configured criteria.
+ * When criteria are provided inline, the judge evaluates only those criteria
+ * as a checkpoint: if all pass, the scenario continues; if any fail, the
+ * scenario fails immediately. This is the preferred way to pass criteria
+ * when using scripts.
  *
- * @param content Optional message content for the judge. Usually undefined to let
- *                the judge evaluate based on its criteria.
+ * When no criteria are provided, the judge uses its own configured criteria
+ * and returns a final verdict (success or failure), ending the scenario.
+ *
+ * @param options Optional options object with inline criteria to evaluate.
  * @returns A ScriptStep function that can be used in scenario scripts.
  */
-declare const judge: (content?: string | ModelMessage) => ScriptStep;
+declare const judge: (options?: {
+    criteria: string[];
+}) => ScriptStep;
 /**
  * Generate or specify a user message in the conversation.
  *
@@ -2048,4 +2071,4 @@ declare namespace script {
 type ScenarioApi = typeof agents & typeof domain & typeof execution & typeof runner & typeof script;
 declare const scenario: ScenarioApi;
-export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type AudioResponseEvent, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, type FinishTestArgs, type InvokeLLMParams, type InvokeLLMResult, JudgeAgentAdapter, type JudgeAgentConfig, type JudgeResult, JudgeSpanCollector, JudgeSpanDigestFormatter, RealtimeAgentAdapter, type RealtimeAgentAdapterConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, judgeSpanCollector, judgeSpanDigestFormatter, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };
+export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type AudioResponseEvent, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, type FinishTestArgs, type InvokeLLMParams, type InvokeLLMResult, JudgeAgentAdapter, type JudgeAgentConfig, type JudgeResult, JudgeSpanCollector, JudgeSpanDigestFormatter, type JudgmentRequest, RealtimeAgentAdapter, type RealtimeAgentAdapterConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, judgeSpanCollector, judgeSpanDigestFormatter, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };

package/dist/index.js CHANGED Viewed

@@ -486,52 +486,68 @@ var createLLMInvoker = (logger2) => {
 var toolMessageRole = "tool";
 var assistantMessageRole = "assistant";
 var userMessageRole = "user";
-var groupMessagesByToolBoundaries = (messages) => {
-  const segments = [];
-  let currentSegment = [];
-  for (const message2 of messages) {
-    currentSegment.push(message2);
-    if (message2.role === toolMessageRole) {
-      segments.push(currentSegment);
-      currentSegment = [];
-    }
-  }
-  if (currentSegment.length > 0) {
-    segments.push(currentSegment);
+var hasToolContent = (message2) => {
+  if (message2.role === toolMessageRole) return true;
+  if (!Array.isArray(message2.content)) return false;
+  return message2.content.some((part) => {
+    if (!part || typeof part !== "object") return false;
+    const partType = "type" in part ? part.type : void 0;
+    return partType === "tool-call" || partType === "tool-result";
+  });
+};
+var stringifyValue = (value) => {
+  if (typeof value === "string") return value;
+  if (value === void 0) return "undefined";
+  try {
+    const serialized = JSON.stringify(value);
+    return serialized === void 0 ? String(value) : serialized;
+  } catch {
+    return String(value);
   }
-  return segments;
 };
-var segmentHasToolMessages = (segment) => {
-  return segment.some((message2) => {
-    if (message2.role === toolMessageRole) return true;
-    if (message2.role === assistantMessageRole && Array.isArray(message2.content)) {
-      return message2.content.some((part) => part.type === "tool-call");
-    }
-    return false;
+var summarizeToolMessage = (message2) => {
+  if (message2.role === toolMessageRole && !Array.isArray(message2.content)) {
+    return `[Tool message: ${stringifyValue(message2.content)}]`;
+  }
+  if (message2.role === toolMessageRole) {
+    const toolResults = message2.content.filter((part) => part.type === "tool-result").map((part) => {
+      const contentPart = part;
+      const name = contentPart.toolName ?? "unknown tool";
+      const output = contentPart.output;
+      const value = output && typeof output === "object" && "value" in output && typeof output.value === "string" ? output.value : output ?? contentPart.result;
+      return `[Tool result from ${name}: ${stringifyValue(value)}]`;
+    });
+    return toolResults.length > 0 ? toolResults.join("\n") : null;
+  }
+  if (!Array.isArray(message2.content)) return null;
+  const toolCalls = message2.content.filter((part) => part.type === "tool-call").map((part) => {
+    const contentPart = part;
+    const name = contentPart.toolName ?? "unknown tool";
+    return `[Called tool ${name} with: ${stringifyValue(contentPart.input)}]`;
   });
+  return toolCalls.length > 0 ? toolCalls.join("\n") : null;
 };
-var reverseSegmentRoles = (segment) => {
-  return segment.map((message2) => {
-    const hasStringContent = typeof message2.content === "string";
-    if (!hasStringContent) return message2;
-    const roleMap = {
-      [userMessageRole]: assistantMessageRole,
-      [assistantMessageRole]: userMessageRole
-    };
+var messageRoleReversal = (messages) => {
+  const roleMap = {
+    [userMessageRole]: assistantMessageRole,
+    [assistantMessageRole]: userMessageRole
+  };
+  return messages.map((message2) => {
+    if (hasToolContent(message2)) {
+      const summary = summarizeToolMessage(message2);
+      if (!summary) return null;
+      return {
+        role: userMessageRole,
+        content: summary
+      };
+    }
     const newRole = roleMap[message2.role];
     if (!newRole) return message2;
     return {
-      role: newRole,
-      content: message2.content
+      ...message2,
+      role: newRole
     };
-  });
-};
-var messageRoleReversal = (messages) => {
-  const segments = groupMessagesByToolBoundaries(messages);
-  const processedSegments = segments.map(
-    (segment) => segmentHasToolMessages(segment) ? segment : reverseSegmentRoles(segment)
-  );
-  return processedSegments.flat();
+  }).filter((message2) => message2 !== null);
 };
 var criterionToParamName = (criterion) => {
   return criterion.replace(/"/g, "").replace(/[^a-zA-Z0-9]/g, "_").replace(/ /g, "_").toLowerCase().substring(0, 70);
@@ -893,7 +909,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
   constructor(cfg) {
     super();
     this.cfg = cfg;
-    this.criteria = cfg.criteria;
+    this.criteria = cfg.criteria ?? [];
     this.spanCollector = cfg.spanCollector ?? judgeSpanCollector;
   }
   logger = new Logger("JudgeAgent");
@@ -905,7 +921,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
    */
   invokeLLM = createLLMInvoker(this.logger);
   async call(input) {
-    var _a, _b, _c;
+    var _a, _b, _c, _d;
+    const criteria = ((_a = input.judgmentRequest) == null ? void 0 : _a.criteria) ?? this.criteria;
     this.logger.debug("call() invoked", {
       threadId: input.threadId,
       currentTurn: input.scenarioState.currentTurn,
@@ -924,7 +941,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
     </opentelemetry_traces>
     `;
     const cfg = this.cfg;
-    const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(cfg.criteria, input.scenarioConfig.description);
+    const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(criteria, input.scenarioConfig.description);
     const messages = [
       { role: "system", content: systemPrompt },
       { role: "user", content: contentForJudge }
@@ -937,10 +954,10 @@ var JudgeAgent = class extends JudgeAgentAdapter {
     });
     const tools = {
       continue_test: buildContinueTestTool(),
-      finish_test: buildFinishTestTool(cfg.criteria)
+      finish_test: buildFinishTestTool(criteria)
     };
-    const enforceJudgement = input.judgmentRequest;
-    const hasCriteria = cfg.criteria.length && cfg.criteria.length > 0;
+    const enforceJudgement = input.judgmentRequest != null;
+    const hasCriteria = criteria.length && criteria.length > 0;
     if (enforceJudgement && !hasCriteria) {
       return {
         success: false,
@@ -965,26 +982,26 @@ var JudgeAgent = class extends JudgeAgentAdapter {
       toolChoice
     });
     this.logger.debug("LLM response received", {
-      toolCallCount: ((_a = completion.toolCalls) == null ? void 0 : _a.length) ?? 0,
-      toolCalls: (_b = completion.toolCalls) == null ? void 0 : _b.map((tc) => ({
+      toolCallCount: ((_b = completion.toolCalls) == null ? void 0 : _b.length) ?? 0,
+      toolCalls: (_c = completion.toolCalls) == null ? void 0 : _c.map((tc) => ({
         toolName: tc.toolName,
         args: tc.input
       }))
     });
     let args;
-    if ((_c = completion.toolCalls) == null ? void 0 : _c.length) {
+    if ((_d = completion.toolCalls) == null ? void 0 : _d.length) {
       const toolCall = completion.toolCalls[0];
       switch (toolCall.toolName) {
         case "finish_test": {
           args = toolCall.input;
           const verdict = args.verdict || "inconclusive";
           const reasoning = args.reasoning || "No reasoning provided";
-          const criteria = args.criteria || {};
-          const criteriaValues = Object.values(criteria);
-          const metCriteria = cfg.criteria.filter(
+          const criteriaArgs = args.criteria || {};
+          const criteriaValues = Object.values(criteriaArgs);
+          const metCriteria = criteria.filter(
             (_, i) => criteriaValues[i] === "true"
           );
-          const unmetCriteria = cfg.criteria.filter(
+          const unmetCriteria = criteria.filter(
             (_, i) => criteriaValues[i] !== "true"
           );
           const result = {
@@ -1004,7 +1021,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
             success: false,
             reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
             metCriteria: [],
-            unmetCriteria: cfg.criteria
+            unmetCriteria: criteria
           };
       }
     }
@@ -1012,7 +1029,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
       success: false,
       reasoning: `JudgeAgent: No tool call found in LLM output`,
       metCriteria: [],
-      unmetCriteria: cfg.criteria
+      unmetCriteria: criteria
     };
   }
   getOpenTelemetryTracesDigest(threadId) {
@@ -1022,7 +1039,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
   }
 };
 var judgeAgent = (cfg) => {
-  return new JudgeAgent(cfg);
+  return new JudgeAgent(cfg ?? {});
 };
 // src/agents/user-simulator-agent.ts
@@ -2518,6 +2535,8 @@ var ScenarioExecution = class {
   currentTurnSpan;
   /** Timestamp when execution started (for total time calculation) */
   totalStartTime = 0;
+  /** Accumulated results from inline judge checkpoints */
+  checkpointResults = [];
   /** Event stream for monitoring scenario progress */
   eventSubject = new import_rxjs2.Subject();
   /**
@@ -2595,6 +2614,7 @@ var ScenarioExecution = class {
       totalTime: this.totalTime,
       agentTime: totalAgentTime
     };
+    return this._result;
     this.logger.debug(`[${this.config.id}] Result set`, {
       success: result.success,
       reasoning: result.reasoning,
@@ -2655,6 +2675,8 @@ var ScenarioExecution = class {
         const scriptStep = this.config.script[i];
         await this.executeScriptStep(scriptStep, i);
         if (this.result) {
+          const cp = this.compiledCheckpoints;
+          this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
           this.emitRunFinished({
             scenarioRunId,
             status: this.result.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
@@ -2663,7 +2685,22 @@ var ScenarioExecution = class {
           return this.result;
         }
       }
-      this.reachedMaxTurns(
+      if (this.checkpointResults.length > 0) {
+        const cp = this.compiledCheckpoints;
+        const result2 = this.setResult({
+          success: cp.unmetCriteria.length === 0,
+          reasoning: "All inline criteria checkpoints passed",
+          metCriteria: cp.metCriteria,
+          unmetCriteria: cp.unmetCriteria
+        });
+        this.emitRunFinished({
+          scenarioRunId,
+          status: result2.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
+          result: result2
+        });
+        return result2;
+      }
+      const result = this.reachedMaxTurns(
         [
           "Reached end of script without conclusion, add one of the following to the end of the script:",
           "- `Scenario.proceed()` to let the simulation continue to play out",
@@ -2671,11 +2708,11 @@ var ScenarioExecution = class {
           "- `Scenario.succeed()` or `Scenario.fail()` to end the test with an explicit result"
         ].join("\n")
       );
-      this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */ });
-      return this.result;
+      this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */, result });
+      return result;
     } catch (error) {
       const errorInfo = extractErrorInfo(error);
-      this.setResult({
+      const result = this.setResult({
         success: false,
         reasoning: `Scenario failed with error: ${errorInfo.message}`,
         metCriteria: [],
@@ -2685,7 +2722,7 @@ var ScenarioExecution = class {
       this.emitRunFinished({
         scenarioRunId,
         status: "ERROR" /* ERROR */,
-        result: this.result
+        result
       });
       throw error;
     } finally {
@@ -2789,7 +2826,7 @@ var ScenarioExecution = class {
    * @param judgmentRequest - Whether this is a judgment request (for judge agents)
    * @throws Error if the agent call fails
    */
-  async callAgent(idx, role, judgmentRequest = false) {
+  async callAgent(idx, role, judgmentRequest) {
     var _a;
     const agent2 = this.agents[idx];
     const agentName = agent2.name ?? agent2.constructor.name;
@@ -2980,25 +3017,26 @@ var ScenarioExecution = class {
    *
    * This method is part of the ScenarioExecutionLike interface used by script steps.
    *
-   * @param content - Optional message to pass to the judge agent for additional context
+   * @param options - Optional options with inline criteria to evaluate as a checkpoint.
    * @returns A promise that resolves with:
    *   - ScenarioResult if the judge makes a final decision, or
    *   - Null if the conversation should continue
    *
    * @example
    * ```typescript
-   * // Let judge evaluate current state
+   * // Let judge evaluate with its configured criteria
    * const result = await execution.judge();
-   * if (result) {
-   *   console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
-   * }
    *
-   * // Provide additional context to judge
-   * const result = await execution.judge("Please consider the user's satisfaction level");
+   * // Evaluate inline criteria as a checkpoint
+   * const result = await execution.judge({ criteria: ["Agent responded helpfully"] });
    * ```
    */
-  async judge(content) {
-    return await this.scriptCallAgent("Judge" /* JUDGE */, content, true);
+  async judge(options) {
+    return await this.scriptCallAgent(
+      "Judge" /* JUDGE */,
+      void 0,
+      { criteria: options == null ? void 0 : options.criteria }
+    );
   }
   /**
    * Lets the scenario proceed automatically for a specified number of turns.
@@ -3083,13 +3121,12 @@ var ScenarioExecution = class {
    * ```
    */
   async succeed(reasoning) {
-    this.setResult({
+    return this.setResult({
       success: true,
       reasoning: reasoning || "Scenario marked as successful with Scenario.succeed()",
       metCriteria: [],
       unmetCriteria: []
     });
-    return this.result;
   }
   /**
    * Immediately ends the scenario with a failure verdict.
@@ -3115,13 +3152,12 @@ var ScenarioExecution = class {
    * ```
    */
   async fail(reasoning) {
-    this.setResult({
+    return this.setResult({
       success: false,
       reasoning: reasoning || "Scenario marked as failed with Scenario.fail()",
       metCriteria: [],
       unmetCriteria: []
     });
-    return this.result;
   }
   /**
    * Adds execution time for a specific agent to the performance tracking.
@@ -3165,15 +3201,14 @@ var ScenarioExecution = class {
    *          decision, or null if the conversation should continue
    * @throws Error if no agent is found for the specified role
    */
-  async scriptCallAgent(role, content, judgmentRequest = false) {
+  async scriptCallAgent(role, content, judgmentRequest) {
     this.logger.debug(`[${this.config.id}] scriptCallAgent`, {
       role,
       hasContent: content !== void 0,
-      judgmentRequest
+      judgmentRequest: judgmentRequest != null,
+      hasInlineCriteria: (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null
     });
     this.consumeUntilRole(role);
-    let index = -1;
-    let agent2 = null;
     let nextAgent = this.getNextAgentForRole(role);
     if (!nextAgent) {
       this.newTurn();
@@ -3203,8 +3238,8 @@ var ScenarioExecution = class {
         `Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
       );
     }
-    index = nextAgent.index;
-    agent2 = nextAgent.agent;
+    const index = nextAgent.index;
+    const agent2 = nextAgent.agent;
     this.removePendingAgent(agent2);
     if (content) {
       const message2 = typeof content === "string" ? {
@@ -3216,6 +3251,25 @@ var ScenarioExecution = class {
       return null;
     }
     await this.callAgent(index, role, judgmentRequest);
+    if (this.result && (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null) {
+      this.checkpointResults.push({
+        metCriteria: this.result.metCriteria,
+        unmetCriteria: this.result.unmetCriteria
+      });
+      if (this.result.success) {
+        this._result = void 0;
+        return null;
+      } else {
+        const cp = this.compiledCheckpoints;
+        this.result.metCriteria = cp.metCriteria;
+        this.result.unmetCriteria = cp.unmetCriteria;
+        return this.result;
+      }
+    }
+    if (this.result) {
+      const cp = this.compiledCheckpoints;
+      this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
+    }
     return this.result ?? null;
   }
   /**
@@ -3248,11 +3302,22 @@ var ScenarioExecution = class {
     this.totalStartTime = Date.now();
     this.pendingMessages.clear();
     this._result = void 0;
+    this.checkpointResults = [];
     this.logger.debug(`[${this.config.id}] Reset complete`, {
       threadId: this.state.threadId,
       agentCount: this.agents.length
     });
   }
+  /** Compiles all accumulated checkpoint results into aggregated met/unmet criteria. */
+  get compiledCheckpoints() {
+    const metCriteria = [];
+    const unmetCriteria = [];
+    for (const cp of this.checkpointResults) {
+      metCriteria.push(...cp.metCriteria);
+      unmetCriteria.push(...cp.unmetCriteria);
+    }
+    return { metCriteria, unmetCriteria };
+  }
   nextAgentForRole(role) {
     for (const agent2 of this.agents) {
       if (agent2.role === role && this.pendingAgentsOnTurn.has(agent2) && this.pendingRolesOnTurn.includes(role)) {
@@ -3349,7 +3414,7 @@ var ScenarioExecution = class {
    */
   reachedMaxTurns(errorMessage) {
     var _a;
-    this.setResult({
+    return this.setResult({
       success: false,
       reasoning: errorMessage || `Reached maximum turns (${this.config.maxTurns || 10}) without conclusion`,
       metCriteria: [],
@@ -3850,9 +3915,9 @@ var message = (message2) => {
 var agent = (content) => {
   return (_state, executor) => executor.agent(content);
 };
-var judge = (content) => {
+var judge = (options) => {
   return async (_state, executor) => {
-    await executor.judge(content);
+    await executor.judge(options);
   };
 };
 var user = (content) => {

package/dist/index.mjs CHANGED Viewed

@@ -428,52 +428,68 @@ var createLLMInvoker = (logger2) => {
 var toolMessageRole = "tool";
 var assistantMessageRole = "assistant";
 var userMessageRole = "user";
-var groupMessagesByToolBoundaries = (messages) => {
-  const segments = [];
-  let currentSegment = [];
-  for (const message2 of messages) {
-    currentSegment.push(message2);
-    if (message2.role === toolMessageRole) {
-      segments.push(currentSegment);
-      currentSegment = [];
-    }
-  }
-  if (currentSegment.length > 0) {
-    segments.push(currentSegment);
+var hasToolContent = (message2) => {
+  if (message2.role === toolMessageRole) return true;
+  if (!Array.isArray(message2.content)) return false;
+  return message2.content.some((part) => {
+    if (!part || typeof part !== "object") return false;
+    const partType = "type" in part ? part.type : void 0;
+    return partType === "tool-call" || partType === "tool-result";
+  });
+};
+var stringifyValue = (value) => {
+  if (typeof value === "string") return value;
+  if (value === void 0) return "undefined";
+  try {
+    const serialized = JSON.stringify(value);
+    return serialized === void 0 ? String(value) : serialized;
+  } catch {
+    return String(value);
   }
-  return segments;
 };
-var segmentHasToolMessages = (segment) => {
-  return segment.some((message2) => {
-    if (message2.role === toolMessageRole) return true;
-    if (message2.role === assistantMessageRole && Array.isArray(message2.content)) {
-      return message2.content.some((part) => part.type === "tool-call");
-    }
-    return false;
+var summarizeToolMessage = (message2) => {
+  if (message2.role === toolMessageRole && !Array.isArray(message2.content)) {
+    return `[Tool message: ${stringifyValue(message2.content)}]`;
+  }
+  if (message2.role === toolMessageRole) {
+    const toolResults = message2.content.filter((part) => part.type === "tool-result").map((part) => {
+      const contentPart = part;
+      const name = contentPart.toolName ?? "unknown tool";
+      const output = contentPart.output;
+      const value = output && typeof output === "object" && "value" in output && typeof output.value === "string" ? output.value : output ?? contentPart.result;
+      return `[Tool result from ${name}: ${stringifyValue(value)}]`;
+    });
+    return toolResults.length > 0 ? toolResults.join("\n") : null;
+  }
+  if (!Array.isArray(message2.content)) return null;
+  const toolCalls = message2.content.filter((part) => part.type === "tool-call").map((part) => {
+    const contentPart = part;
+    const name = contentPart.toolName ?? "unknown tool";
+    return `[Called tool ${name} with: ${stringifyValue(contentPart.input)}]`;
   });
+  return toolCalls.length > 0 ? toolCalls.join("\n") : null;
 };
-var reverseSegmentRoles = (segment) => {
-  return segment.map((message2) => {
-    const hasStringContent = typeof message2.content === "string";
-    if (!hasStringContent) return message2;
-    const roleMap = {
-      [userMessageRole]: assistantMessageRole,
-      [assistantMessageRole]: userMessageRole
-    };
+var messageRoleReversal = (messages) => {
+  const roleMap = {
+    [userMessageRole]: assistantMessageRole,
+    [assistantMessageRole]: userMessageRole
+  };
+  return messages.map((message2) => {
+    if (hasToolContent(message2)) {
+      const summary = summarizeToolMessage(message2);
+      if (!summary) return null;
+      return {
+        role: userMessageRole,
+        content: summary
+      };
+    }
     const newRole = roleMap[message2.role];
     if (!newRole) return message2;
     return {
-      role: newRole,
-      content: message2.content
+      ...message2,
+      role: newRole
     };
-  });
-};
-var messageRoleReversal = (messages) => {
-  const segments = groupMessagesByToolBoundaries(messages);
-  const processedSegments = segments.map(
-    (segment) => segmentHasToolMessages(segment) ? segment : reverseSegmentRoles(segment)
-  );
-  return processedSegments.flat();
+  }).filter((message2) => message2 !== null);
 };
 var criterionToParamName = (criterion) => {
   return criterion.replace(/"/g, "").replace(/[^a-zA-Z0-9]/g, "_").replace(/ /g, "_").toLowerCase().substring(0, 70);
@@ -835,7 +851,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
   constructor(cfg) {
     super();
     this.cfg = cfg;
-    this.criteria = cfg.criteria;
+    this.criteria = cfg.criteria ?? [];
     this.spanCollector = cfg.spanCollector ?? judgeSpanCollector;
   }
   logger = new Logger("JudgeAgent");
@@ -847,7 +863,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
    */
   invokeLLM = createLLMInvoker(this.logger);
   async call(input) {
-    var _a, _b, _c;
+    var _a, _b, _c, _d;
+    const criteria = ((_a = input.judgmentRequest) == null ? void 0 : _a.criteria) ?? this.criteria;
     this.logger.debug("call() invoked", {
       threadId: input.threadId,
       currentTurn: input.scenarioState.currentTurn,
@@ -866,7 +883,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
     </opentelemetry_traces>
     `;
     const cfg = this.cfg;
-    const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(cfg.criteria, input.scenarioConfig.description);
+    const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(criteria, input.scenarioConfig.description);
     const messages = [
       { role: "system", content: systemPrompt },
       { role: "user", content: contentForJudge }
@@ -879,10 +896,10 @@ var JudgeAgent = class extends JudgeAgentAdapter {
     });
     const tools = {
       continue_test: buildContinueTestTool(),
-      finish_test: buildFinishTestTool(cfg.criteria)
+      finish_test: buildFinishTestTool(criteria)
     };
-    const enforceJudgement = input.judgmentRequest;
-    const hasCriteria = cfg.criteria.length && cfg.criteria.length > 0;
+    const enforceJudgement = input.judgmentRequest != null;
+    const hasCriteria = criteria.length && criteria.length > 0;
     if (enforceJudgement && !hasCriteria) {
       return {
         success: false,
@@ -907,26 +924,26 @@ var JudgeAgent = class extends JudgeAgentAdapter {
       toolChoice
     });
     this.logger.debug("LLM response received", {
-      toolCallCount: ((_a = completion.toolCalls) == null ? void 0 : _a.length) ?? 0,
-      toolCalls: (_b = completion.toolCalls) == null ? void 0 : _b.map((tc) => ({
+      toolCallCount: ((_b = completion.toolCalls) == null ? void 0 : _b.length) ?? 0,
+      toolCalls: (_c = completion.toolCalls) == null ? void 0 : _c.map((tc) => ({
         toolName: tc.toolName,
         args: tc.input
       }))
     });
     let args;
-    if ((_c = completion.toolCalls) == null ? void 0 : _c.length) {
+    if ((_d = completion.toolCalls) == null ? void 0 : _d.length) {
       const toolCall = completion.toolCalls[0];
       switch (toolCall.toolName) {
         case "finish_test": {
           args = toolCall.input;
           const verdict = args.verdict || "inconclusive";
           const reasoning = args.reasoning || "No reasoning provided";
-          const criteria = args.criteria || {};
-          const criteriaValues = Object.values(criteria);
-          const metCriteria = cfg.criteria.filter(
+          const criteriaArgs = args.criteria || {};
+          const criteriaValues = Object.values(criteriaArgs);
+          const metCriteria = criteria.filter(
             (_, i) => criteriaValues[i] === "true"
           );
-          const unmetCriteria = cfg.criteria.filter(
+          const unmetCriteria = criteria.filter(
             (_, i) => criteriaValues[i] !== "true"
           );
           const result = {
@@ -946,7 +963,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
             success: false,
             reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
             metCriteria: [],
-            unmetCriteria: cfg.criteria
+            unmetCriteria: criteria
           };
       }
     }
@@ -954,7 +971,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
       success: false,
       reasoning: `JudgeAgent: No tool call found in LLM output`,
       metCriteria: [],
-      unmetCriteria: cfg.criteria
+      unmetCriteria: criteria
     };
   }
   getOpenTelemetryTracesDigest(threadId) {
@@ -964,7 +981,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
   }
 };
 var judgeAgent = (cfg) => {
-  return new JudgeAgent(cfg);
+  return new JudgeAgent(cfg ?? {});
 };
 // src/agents/user-simulator-agent.ts
@@ -2460,6 +2477,8 @@ var ScenarioExecution = class {
   currentTurnSpan;
   /** Timestamp when execution started (for total time calculation) */
   totalStartTime = 0;
+  /** Accumulated results from inline judge checkpoints */
+  checkpointResults = [];
   /** Event stream for monitoring scenario progress */
   eventSubject = new Subject2();
   /**
@@ -2537,6 +2556,7 @@ var ScenarioExecution = class {
       totalTime: this.totalTime,
       agentTime: totalAgentTime
     };
+    return this._result;
     this.logger.debug(`[${this.config.id}] Result set`, {
       success: result.success,
       reasoning: result.reasoning,
@@ -2597,6 +2617,8 @@ var ScenarioExecution = class {
         const scriptStep = this.config.script[i];
         await this.executeScriptStep(scriptStep, i);
         if (this.result) {
+          const cp = this.compiledCheckpoints;
+          this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
           this.emitRunFinished({
             scenarioRunId,
             status: this.result.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
@@ -2605,7 +2627,22 @@ var ScenarioExecution = class {
           return this.result;
         }
       }
-      this.reachedMaxTurns(
+      if (this.checkpointResults.length > 0) {
+        const cp = this.compiledCheckpoints;
+        const result2 = this.setResult({
+          success: cp.unmetCriteria.length === 0,
+          reasoning: "All inline criteria checkpoints passed",
+          metCriteria: cp.metCriteria,
+          unmetCriteria: cp.unmetCriteria
+        });
+        this.emitRunFinished({
+          scenarioRunId,
+          status: result2.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
+          result: result2
+        });
+        return result2;
+      }
+      const result = this.reachedMaxTurns(
         [
           "Reached end of script without conclusion, add one of the following to the end of the script:",
           "- `Scenario.proceed()` to let the simulation continue to play out",
@@ -2613,11 +2650,11 @@ var ScenarioExecution = class {
           "- `Scenario.succeed()` or `Scenario.fail()` to end the test with an explicit result"
         ].join("\n")
       );
-      this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */ });
-      return this.result;
+      this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */, result });
+      return result;
     } catch (error) {
       const errorInfo = extractErrorInfo(error);
-      this.setResult({
+      const result = this.setResult({
         success: false,
         reasoning: `Scenario failed with error: ${errorInfo.message}`,
         metCriteria: [],
@@ -2627,7 +2664,7 @@ var ScenarioExecution = class {
       this.emitRunFinished({
         scenarioRunId,
         status: "ERROR" /* ERROR */,
-        result: this.result
+        result
       });
       throw error;
     } finally {
@@ -2731,7 +2768,7 @@ var ScenarioExecution = class {
    * @param judgmentRequest - Whether this is a judgment request (for judge agents)
    * @throws Error if the agent call fails
    */
-  async callAgent(idx, role, judgmentRequest = false) {
+  async callAgent(idx, role, judgmentRequest) {
     var _a;
     const agent2 = this.agents[idx];
     const agentName = agent2.name ?? agent2.constructor.name;
@@ -2922,25 +2959,26 @@ var ScenarioExecution = class {
    *
    * This method is part of the ScenarioExecutionLike interface used by script steps.
    *
-   * @param content - Optional message to pass to the judge agent for additional context
+   * @param options - Optional options with inline criteria to evaluate as a checkpoint.
    * @returns A promise that resolves with:
    *   - ScenarioResult if the judge makes a final decision, or
    *   - Null if the conversation should continue
    *
    * @example
    * ```typescript
-   * // Let judge evaluate current state
+   * // Let judge evaluate with its configured criteria
    * const result = await execution.judge();
-   * if (result) {
-   *   console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
-   * }
    *
-   * // Provide additional context to judge
-   * const result = await execution.judge("Please consider the user's satisfaction level");
+   * // Evaluate inline criteria as a checkpoint
+   * const result = await execution.judge({ criteria: ["Agent responded helpfully"] });
    * ```
    */
-  async judge(content) {
-    return await this.scriptCallAgent("Judge" /* JUDGE */, content, true);
+  async judge(options) {
+    return await this.scriptCallAgent(
+      "Judge" /* JUDGE */,
+      void 0,
+      { criteria: options == null ? void 0 : options.criteria }
+    );
   }
   /**
    * Lets the scenario proceed automatically for a specified number of turns.
@@ -3025,13 +3063,12 @@ var ScenarioExecution = class {
    * ```
    */
   async succeed(reasoning) {
-    this.setResult({
+    return this.setResult({
       success: true,
       reasoning: reasoning || "Scenario marked as successful with Scenario.succeed()",
       metCriteria: [],
       unmetCriteria: []
     });
-    return this.result;
   }
   /**
    * Immediately ends the scenario with a failure verdict.
@@ -3057,13 +3094,12 @@ var ScenarioExecution = class {
    * ```
    */
   async fail(reasoning) {
-    this.setResult({
+    return this.setResult({
       success: false,
       reasoning: reasoning || "Scenario marked as failed with Scenario.fail()",
       metCriteria: [],
       unmetCriteria: []
     });
-    return this.result;
   }
   /**
    * Adds execution time for a specific agent to the performance tracking.
@@ -3107,15 +3143,14 @@ var ScenarioExecution = class {
    *          decision, or null if the conversation should continue
    * @throws Error if no agent is found for the specified role
    */
-  async scriptCallAgent(role, content, judgmentRequest = false) {
+  async scriptCallAgent(role, content, judgmentRequest) {
     this.logger.debug(`[${this.config.id}] scriptCallAgent`, {
       role,
       hasContent: content !== void 0,
-      judgmentRequest
+      judgmentRequest: judgmentRequest != null,
+      hasInlineCriteria: (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null
     });
     this.consumeUntilRole(role);
-    let index = -1;
-    let agent2 = null;
     let nextAgent = this.getNextAgentForRole(role);
     if (!nextAgent) {
       this.newTurn();
@@ -3145,8 +3180,8 @@ var ScenarioExecution = class {
         `Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
       );
     }
-    index = nextAgent.index;
-    agent2 = nextAgent.agent;
+    const index = nextAgent.index;
+    const agent2 = nextAgent.agent;
     this.removePendingAgent(agent2);
     if (content) {
       const message2 = typeof content === "string" ? {
@@ -3158,6 +3193,25 @@ var ScenarioExecution = class {
       return null;
     }
     await this.callAgent(index, role, judgmentRequest);
+    if (this.result && (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null) {
+      this.checkpointResults.push({
+        metCriteria: this.result.metCriteria,
+        unmetCriteria: this.result.unmetCriteria
+      });
+      if (this.result.success) {
+        this._result = void 0;
+        return null;
+      } else {
+        const cp = this.compiledCheckpoints;
+        this.result.metCriteria = cp.metCriteria;
+        this.result.unmetCriteria = cp.unmetCriteria;
+        return this.result;
+      }
+    }
+    if (this.result) {
+      const cp = this.compiledCheckpoints;
+      this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
+    }
     return this.result ?? null;
   }
   /**
@@ -3190,11 +3244,22 @@ var ScenarioExecution = class {
     this.totalStartTime = Date.now();
     this.pendingMessages.clear();
     this._result = void 0;
+    this.checkpointResults = [];
     this.logger.debug(`[${this.config.id}] Reset complete`, {
       threadId: this.state.threadId,
       agentCount: this.agents.length
     });
   }
+  /** Compiles all accumulated checkpoint results into aggregated met/unmet criteria. */
+  get compiledCheckpoints() {
+    const metCriteria = [];
+    const unmetCriteria = [];
+    for (const cp of this.checkpointResults) {
+      metCriteria.push(...cp.metCriteria);
+      unmetCriteria.push(...cp.unmetCriteria);
+    }
+    return { metCriteria, unmetCriteria };
+  }
   nextAgentForRole(role) {
     for (const agent2 of this.agents) {
       if (agent2.role === role && this.pendingAgentsOnTurn.has(agent2) && this.pendingRolesOnTurn.includes(role)) {
@@ -3291,7 +3356,7 @@ var ScenarioExecution = class {
    */
   reachedMaxTurns(errorMessage) {
     var _a;
-    this.setResult({
+    return this.setResult({
       success: false,
       reasoning: errorMessage || `Reached maximum turns (${this.config.maxTurns || 10}) without conclusion`,
       metCriteria: [],
@@ -3799,9 +3864,9 @@ var message = (message2) => {
 var agent = (content) => {
   return (_state, executor) => executor.agent(content);
 };
-var judge = (content) => {
+var judge = (options) => {
   return async (_state, executor) => {
-    await executor.judge(content);
+    await executor.judge(options);
   };
 };
 var user = (content) => {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@langwatch/scenario",
-  "version": "0.4.1",
+  "version": "0.4.2",
   "description": "A TypeScript library for testing AI agents using scenarios",
   "main": "dist/index.js",
   "module": "dist/index.mjs",
@@ -29,7 +29,7 @@
   },
   "dependencies": {
     "@ag-ui/core": "^0.0.28",
-    "@ai-sdk/openai": "^2.0.74",
+    "@ai-sdk/openai": "^3.0.26",
     "@openai/agents": "^0.3.3",
     "ai": "^6.0.0",
     "chalk": "^5.6.2",