npm - @langwatch/scenario - Versions diffs - 0.4.0 → 0.4.2 - Mend

@langwatch/scenario 0.4.0 → 0.4.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (7)

package/dist/index.d.mts +70 -47
package/dist/index.d.ts +70 -47
package/dist/index.js +153 -87
package/dist/index.mjs +153 -87
package/dist/integrations/vitest/setup.js +1 -1
package/dist/integrations/vitest/setup.mjs +1 -1
package/package.json +4 -4

package/dist/index.d.mts CHANGED

@@ -1,5 +1,5 @@
 import * as ai from 'ai';
-import { CoreMessage, CoreUserMessage, CoreAssistantMessage, CoreToolMessage, LanguageModel, generateText, ModelMessage } from 'ai';
+import { ModelMessage, UserModelMessage, AssistantModelMessage, ToolModelMessage, LanguageModel, generateText } from 'ai';
 import { z } from 'zod/v4';
 import { SpanProcessor, ReadableSpan } from '@opentelemetry/sdk-trace-base';
 import { RealtimeSession } from '@openai/agents/realtime';
@@ -8,11 +8,11 @@ import { z as z$1 } from 'zod';
 /**
  * The possible return types from an agent's `call` method.
- * - string | CoreMessage | CoreMessage[]: Agent generated response
+ * - string | ModelMessage | ModelMessage[]: Agent generated response
  * - JudgeResult: Judge made a final decision
  * - null: Judge wants to continue observing (no decision yet)
  */
-type AgentReturnTypes = string | CoreMessage | CoreMessage[] | JudgeResult | null;
+type AgentReturnTypes = string | ModelMessage | ModelMessage[] | JudgeResult | null;
 declare enum AgentRole {
     USER = "User",
@@ -20,6 +20,18 @@ declare enum AgentRole {
     JUDGE = "Judge"
 }
 declare const allAgentRoles: readonly [AgentRole.USER, AgentRole.AGENT, AgentRole.JUDGE];
+/**
+ * Encapsulates a request for the judge agent to evaluate the conversation.
+ *
+ * When present on AgentInput, signals the judge to produce a verdict.
+ * Optionally carries inline criteria that override the judge's own criteria.
+ */
+interface JudgmentRequest {
+    /**
+     * Optional criteria to evaluate, overriding the judge agent's configured criteria.
+     */
+    criteria?: string[];
+}
 /**
  * Input provided to an agent's `call` method.
  */
@@ -31,19 +43,19 @@ interface AgentInput {
     /**
      * The full history of messages in the conversation.
      */
-    messages: CoreMessage[];
+    messages: ModelMessage[];
     /**
      * New messages added since the last time this agent was called.
      */
-    newMessages: CoreMessage[];
+    newMessages: ModelMessage[];
     /**
      * The role the agent is being asked to play in this turn.
      */
     requestedRole: AgentRole;
     /**
-     * Whether a judgment is being requested in this turn.
+     * When set, requests the judge to produce a verdict, optionally with inline criteria.
      */
-    judgmentRequest: boolean;
+    judgmentRequest?: JudgmentRequest;
     /**
      * The current state of the scenario execution.
      */
@@ -191,7 +203,7 @@ interface ScenarioExecutionLike {
     /**
      * The history of messages in the conversation.
      */
-    readonly messages: CoreMessage[];
+    readonly messages: ModelMessage[];
     /**
      * The ID of the conversation thread.
      */
@@ -200,25 +212,27 @@ interface ScenarioExecutionLike {
      * Adds a message to the conversation.
      * @param message The message to add.
      */
-    message(message: CoreMessage): Promise<void>;
+    message(message: ModelMessage): Promise<void>;
     /**
      * Adds a user message to the conversation.
      * If no content is provided, the user simulator will generate a message.
      * @param content The content of the user message.
      */
-    user(content?: string | CoreMessage): Promise<void>;
+    user(content?: string | ModelMessage): Promise<void>;
     /**
      * Adds an agent message to the conversation.
      * If no content is provided, the agent under test will generate a message.
      * @param content The content of the agent message.
      */
-    agent(content?: string | CoreMessage): Promise<void>;
+    agent(content?: string | ModelMessage): Promise<void>;
     /**
      * Invokes the judge agent to evaluate the current state.
-     * @param content Optional message to the judge.
+     * @param options Optional options with inline criteria to evaluate as a checkpoint.
      * @returns The result of the scenario if the judge makes a final decision.
      */
-    judge(content?: string | CoreMessage): Promise<ScenarioResult | null>;
+    judge(options?: {
+        criteria?: string[];
+    }): Promise<ScenarioResult | null>;
     /**
      * Proceeds with the scenario automatically for a number of turns.
      * @param turns The number of turns to proceed. Defaults to running until the scenario ends.
@@ -258,7 +272,7 @@ interface ScenarioResult {
     /**
      * The sequence of messages exchanged during the scenario.
      */
-    messages: CoreMessage[];
+    messages: ModelMessage[];
     /**
      * The reasoning behind the scenario's outcome.
      */
@@ -299,7 +313,7 @@ interface ScenarioExecutionStateLike {
     /**
      * The sequence of messages exchanged during the scenario.
      */
-    get messages(): CoreMessage[];
+    get messages(): ModelMessage[];
     /**
      * The unique identifier for the execution thread.
      */
@@ -313,28 +327,28 @@ interface ScenarioExecutionStateLike {
      *
      * @param message - The core message to add.
      */
-    addMessage(message: CoreMessage): void;
+    addMessage(message: ModelMessage): void;
     /**
      * Retrieves the last message from the execution state.
      * @returns The last message.
      */
-    lastMessage(): CoreMessage;
+    lastMessage(): ModelMessage;
     /**
      * Retrieves the last user message from the execution state.
      * @returns The last user message.
      */
-    lastUserMessage(): CoreUserMessage;
+    lastUserMessage(): UserModelMessage;
     /**
      * Retrieves the last agent message from the execution state.
      * @returns The last agent message.
      */
-    lastAgentMessage(): CoreAssistantMessage;
+    lastAgentMessage(): AssistantModelMessage;
     /**
      * Retrieves the last tool call message for a specific tool.
      * @param toolName - The name of the tool.
      * @returns The last tool call message.
      */
-    lastToolCall(toolName: string): CoreToolMessage;
+    lastToolCall(toolName: string): ToolModelMessage;
     /**
      * Checks if a tool call for a specific tool exists in the execution state.
      * @param toolName - The name of the tool.
@@ -364,6 +378,7 @@ declare const domain_DEFAULT_MAX_TURNS: typeof DEFAULT_MAX_TURNS;
 declare const domain_DEFAULT_VERBOSE: typeof DEFAULT_VERBOSE;
 type domain_JudgeAgentAdapter = JudgeAgentAdapter;
 declare const domain_JudgeAgentAdapter: typeof JudgeAgentAdapter;
+type domain_JudgmentRequest = JudgmentRequest;
 type domain_ScenarioConfig = ScenarioConfig;
 type domain_ScenarioConfigFinal = ScenarioConfigFinal;
 type domain_ScenarioExecutionLike = ScenarioExecutionLike;
@@ -377,7 +392,7 @@ declare const domain_allAgentRoles: typeof allAgentRoles;
 declare const domain_defineConfig: typeof defineConfig;
 declare const domain_scenarioProjectConfigSchema: typeof scenarioProjectConfigSchema;
 declare namespace domain {
-  export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
+  export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_JudgmentRequest as JudgmentRequest, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
 }
 /**
@@ -475,7 +490,7 @@ interface JudgeAgentConfig extends TestingAgentConfig {
     /**
      * The criteria that the judge will use to evaluate the conversation.
      */
-    criteria: string[];
+    criteria?: string[];
     /**
      * Optional span collector for telemetry. Defaults to global singleton.
      */
@@ -554,7 +569,7 @@ declare class JudgeAgent extends JudgeAgentAdapter {
  * main();
  * ```
  */
-declare const judgeAgent: (cfg: JudgeAgentConfig) => JudgeAgent;
+declare const judgeAgent: (cfg?: JudgeAgentConfig) => JudgeAgent;
 /**
  * Transforms OpenTelemetry spans into a complete plain-text digest for judge evaluation.
@@ -1334,6 +1349,8 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
     private currentTurnSpan?;
     /** Timestamp when execution started (for total time calculation) */
     private totalStartTime;
+    /** Accumulated results from inline judge checkpoints */
+    private checkpointResults;
     /** Event stream for monitoring scenario progress */
     private eventSubject;
     /**
@@ -1554,24 +1571,23 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
      *
      * This method is part of the ScenarioExecutionLike interface used by script steps.
      *
-     * @param content - Optional message to pass to the judge agent for additional context
+     * @param options - Optional options with inline criteria to evaluate as a checkpoint.
      * @returns A promise that resolves with:
      *   - ScenarioResult if the judge makes a final decision, or
      *   - Null if the conversation should continue
      *
      * @example
      * ```typescript
-     * // Let judge evaluate current state
+     * // Let judge evaluate with its configured criteria
      * const result = await execution.judge();
-     * if (result) {
-     *   console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
-     * }
      *
-     * // Provide additional context to judge
-     * const result = await execution.judge("Please consider the user's satisfaction level");
+     * // Evaluate inline criteria as a checkpoint
+     * const result = await execution.judge({ criteria: ["Agent responded helpfully"] });
      * ```
      */
-    judge(content?: string | ModelMessage): Promise<ScenarioResult | null>;
+    judge(options?: {
+        criteria?: string[];
+    }): Promise<ScenarioResult | null>;
     /**
      * Lets the scenario proceed automatically for a specified number of turns.
      *
@@ -1718,6 +1734,8 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
      * - Clears the result from any previous execution
      */
     private reset;
+    /** Compiles all accumulated checkpoint results into aggregated met/unmet criteria. */
+    private get compiledCheckpoints();
     private nextAgentForRole;
     /**
      * Starts a new turn in the scenario execution.
@@ -1847,7 +1865,7 @@ declare class ScenarioExecutionState implements ScenarioExecutionStateLike {
     description: string;
     config: ScenarioConfig;
     constructor(config: ScenarioConfig);
-    get messages(): CoreMessage[];
+    get messages(): ModelMessage[];
     get currentTurn(): number;
     set currentTurn(turn: number);
     get threadId(): string;
@@ -1858,10 +1876,10 @@ declare class ScenarioExecutionState implements ScenarioExecutionStateLike {
      * @param message - The message to add.
      * @param traceId - Optional trace ID to associate with the message.
      */
-    addMessage(message: CoreMessage & {
+    addMessage(message: ModelMessage & {
         traceId?: string;
     }): void;
-    lastMessage(): ai.ModelMessage & {
+    lastMessage(): ModelMessage & {
         id: string;
         traceId?: string;
     };
@@ -1869,10 +1887,10 @@ declare class ScenarioExecutionState implements ScenarioExecutionStateLike {
         id: string;
         traceId?: string;
     };
-    lastAgentMessage(): CoreAssistantMessage & {
+    lastAgentMessage(): AssistantModelMessage & {
         traceId?: string;
     };
-    lastToolCall(toolName: string): CoreToolMessage & {
+    lastToolCall(toolName: string): ToolModelMessage & {
         traceId?: string;
     };
     hasToolCall(toolName: string): boolean;
@@ -1957,14 +1975,14 @@ declare namespace runner {
 /**
  * Add a specific message to the conversation.
  *
- * This function allows you to inject any CoreMessage compatible message directly
+ * This function allows you to inject any ModelMessage compatible message directly
  * into the conversation at a specific point in the script. Useful for
  * simulating tool responses, system messages, or specific conversational states.
  *
  * @param message The message to add to the conversation.
  * @returns A ScriptStep function that can be used in scenario scripts.
  */
-declare const message: (message: CoreMessage) => ScriptStep;
+declare const message: (message: ModelMessage) => ScriptStep;
 /**
  * Generate or specify an agent response in the conversation.
  *
@@ -1976,19 +1994,24 @@ declare const message: (message: CoreMessage) => ScriptStep;
  *                If undefined, the agent under test will generate content automatically.
  * @returns A ScriptStep function that can be used in scenario scripts.
  */
-declare const agent: (content?: string | CoreMessage) => ScriptStep;
+declare const agent: (content?: string | ModelMessage) => ScriptStep;
 /**
  * Invoke the judge agent to evaluate the current conversation state.
  *
- * This function forces the judge agent to make a decision about whether
- * the scenario should continue or end with a success/failure verdict.
- * The judge will evaluate based on its configured criteria.
+ * When criteria are provided inline, the judge evaluates only those criteria
+ * as a checkpoint: if all pass, the scenario continues; if any fail, the
+ * scenario fails immediately. This is the preferred way to pass criteria
+ * when using scripts.
+ *
+ * When no criteria are provided, the judge uses its own configured criteria
+ * and returns a final verdict (success or failure), ending the scenario.
  *
- * @param content Optional message content for the judge. Usually undefined to let
- *                the judge evaluate based on its criteria.
+ * @param options Optional options object with inline criteria to evaluate.
  * @returns A ScriptStep function that can be used in scenario scripts.
  */
-declare const judge: (content?: string | CoreMessage) => ScriptStep;
+declare const judge: (options?: {
+    criteria: string[];
+}) => ScriptStep;
 /**
  * Generate or specify a user message in the conversation.
  *
@@ -2000,7 +2023,7 @@ declare const judge: (content?: string | CoreMessage) => ScriptStep;
  *                If undefined, the user simulator will generate content automatically.
  * @returns A ScriptStep function that can be used in scenario scripts.
  */
-declare const user: (content?: string | CoreMessage) => ScriptStep;
+declare const user: (content?: string | ModelMessage) => ScriptStep;
 /**
  * Let the scenario proceed automatically for a specified number of turns.
  *
@@ -2048,4 +2071,4 @@ declare namespace script {
 type ScenarioApi = typeof agents & typeof domain & typeof execution & typeof runner & typeof script;
 declare const scenario: ScenarioApi;
-export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type AudioResponseEvent, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, type FinishTestArgs, type InvokeLLMParams, type InvokeLLMResult, JudgeAgentAdapter, type JudgeAgentConfig, type JudgeResult, JudgeSpanCollector, JudgeSpanDigestFormatter, RealtimeAgentAdapter, type RealtimeAgentAdapterConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, judgeSpanCollector, judgeSpanDigestFormatter, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };
+export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type AudioResponseEvent, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, type FinishTestArgs, type InvokeLLMParams, type InvokeLLMResult, JudgeAgentAdapter, type JudgeAgentConfig, type JudgeResult, JudgeSpanCollector, JudgeSpanDigestFormatter, type JudgmentRequest, RealtimeAgentAdapter, type RealtimeAgentAdapterConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, judgeSpanCollector, judgeSpanDigestFormatter, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };