npm - @langwatch/scenario - Versions diffs - 0.3.0 → 0.4.0 - Mend

@langwatch/scenario 0.3.0 → 0.4.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (15)

package/README.md +1 -1
package/dist/index.d.mts +328 -130
package/dist/index.d.ts +328 -130
package/dist/index.js +2144 -459
package/dist/index.mjs +2576 -268
package/dist/integrations/vitest/config.mjs +0 -2
package/dist/integrations/vitest/reporter.js +22 -1
package/dist/integrations/vitest/reporter.mjs +153 -6
package/dist/integrations/vitest/setup-global.mjs +0 -2
package/dist/integrations/vitest/setup.js +21 -9
package/dist/integrations/vitest/setup.mjs +619 -18
package/package.json +46 -31
package/dist/chunk-3Z7E24UI.mjs +0 -548
package/dist/chunk-7P6ASYW6.mjs +0 -9
package/dist/chunk-RHTLQKEJ.mjs +0 -133

package/dist/index.d.mts CHANGED

@@ -1,9 +1,19 @@
 import * as ai from 'ai';
-import { CoreMessage, CoreUserMessage, CoreAssistantMessage, CoreToolMessage, LanguageModel, ModelMessage } from 'ai';
+import { CoreMessage, CoreUserMessage, CoreAssistantMessage, CoreToolMessage, LanguageModel, generateText, ModelMessage } from 'ai';
 import { z } from 'zod/v4';
+import { SpanProcessor, ReadableSpan } from '@opentelemetry/sdk-trace-base';
+import { RealtimeSession } from '@openai/agents/realtime';
 import { Observable } from 'rxjs';
 import { z as z$1 } from 'zod';
+/**
+ * The possible return types from an agent's `call` method.
+ * - string | CoreMessage | CoreMessage[]: Agent generated response
+ * - JudgeResult: Judge made a final decision
+ * - null: Judge wants to continue observing (no decision yet)
+ */
+type AgentReturnTypes = string | CoreMessage | CoreMessage[] | JudgeResult | null;
 declare enum AgentRole {
     USER = "User",
     AGENT = "Agent",
@@ -43,11 +53,6 @@ interface AgentInput {
      */
     scenarioConfig: ScenarioConfig;
 }
-/**
- * The possible return types from an agent's `call` method.
- * Can be a simple string, a single message, an array of messages, or a ScenarioResult.
- */
-type AgentReturnTypes = string | CoreMessage | CoreMessage[] | ScenarioResult;
 /**
  * Abstract base class for integrating custom agents with the Scenario framework.
  *
@@ -72,6 +77,7 @@ type AgentReturnTypes = string | CoreMessage | CoreMessage[] | ScenarioResult;
  * ```
  */
 declare abstract class AgentAdapter {
+    name?: string;
     role: AgentRole;
     /**
      * Process the input and generate a response.
@@ -89,33 +95,21 @@ declare abstract class AgentAdapter {
  * Abstract base class for user simulator agents.
  * User simulator agents are responsible for generating user messages to drive the conversation.
  */
-declare abstract class UserSimulatorAgentAdapter implements AgentAdapter {
+declare abstract class UserSimulatorAgentAdapter extends AgentAdapter {
+    name: string;
     role: AgentRole;
-    /**
-     * Process the input and generate a user message.
-     *
-     * @param input AgentInput containing conversation history, thread context, and scenario state.
-     * @returns The user's response.
-     */
-    abstract call(input: AgentInput): Promise<AgentReturnTypes>;
 }
 /**
  * Abstract base class for judge agents.
  * Judge agents are responsible for evaluating the conversation and determining success or failure.
  */
-declare abstract class JudgeAgentAdapter implements AgentAdapter {
+declare abstract class JudgeAgentAdapter extends AgentAdapter {
+    name: string;
     role: AgentRole;
     /**
      * The criteria the judge will use to evaluate the conversation.
      */
     abstract criteria: string[];
-    /**
-     * Process the input and evaluate the conversation.
-     *
-     * @param input AgentInput containing conversation history, thread context, and scenario state.
-     * @returns A ScenarioResult if the conversation should end, otherwise should continue.
-     */
-    abstract call(input: AgentInput): Promise<AgentReturnTypes>;
 }
 declare const DEFAULT_MAX_TURNS = 10;
@@ -250,7 +244,7 @@ interface ScenarioExecutionLike {
  * A step in a scenario script.
  * This is a function that takes the current state and an executor, and performs an action.
  */
-type ScriptStep = (state: ScenarioExecutionStateLike, executor: ScenarioExecutionLike) => Promise<void | ScenarioResult | null> | void | ScenarioResult | null;
+type ScriptStep = (state: ScenarioExecutionStateLike, executor: ScenarioExecutionLike) => Promise<void> | void;
 /**
  * Represents the result of a scenario execution.
@@ -349,11 +343,9 @@ interface ScenarioExecutionStateLike {
     hasToolCall(toolName: string): boolean;
 }
-/** Default temperature for language model inference */
-declare const DEFAULT_TEMPERATURE = 0;
 declare const scenarioProjectConfigSchema: z.ZodObject<{
     defaultModel: z.ZodOptional<z.ZodObject<{
-        model: z.ZodCustom<LanguageModel, LanguageModel>;
+        model: z.ZodCustom<ai.LanguageModel, ai.LanguageModel>;
         temperature: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
         maxTokens: z.ZodOptional<z.ZodNumber>;
     }, z.core.$strip>>;
@@ -369,7 +361,6 @@ type domain_AgentReturnTypes = AgentReturnTypes;
 type domain_AgentRole = AgentRole;
 declare const domain_AgentRole: typeof AgentRole;
 declare const domain_DEFAULT_MAX_TURNS: typeof DEFAULT_MAX_TURNS;
-declare const domain_DEFAULT_TEMPERATURE: typeof DEFAULT_TEMPERATURE;
 declare const domain_DEFAULT_VERBOSE: typeof DEFAULT_VERBOSE;
 type domain_JudgeAgentAdapter = JudgeAgentAdapter;
 declare const domain_JudgeAgentAdapter: typeof JudgeAgentAdapter;
@@ -386,32 +377,33 @@ declare const domain_allAgentRoles: typeof allAgentRoles;
 declare const domain_defineConfig: typeof defineConfig;
 declare const domain_scenarioProjectConfigSchema: typeof scenarioProjectConfigSchema;
 declare namespace domain {
-  export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_TEMPERATURE as DEFAULT_TEMPERATURE, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
+  export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
 }
 /**
- * Configuration for the inference parameters of a testing agent.
+ * Schema for a language model.
  */
-interface TestingAgentInferenceConfig {
-    /**
-     * The language model to use for generating responses.
-     * If not provided, a default model will be used.
-     */
-    model?: LanguageModel;
-    /**
-     * The temperature for the language model.
-     * Defaults to 0.
-     */
-    temperature?: number;
-    /**
-     * The maximum number of tokens to generate.
-     */
-    maxTokens?: number;
-}
+declare const modelSchema: z.ZodObject<{
+    model: z.ZodCustom<LanguageModel, LanguageModel>;
+    temperature: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
+    maxTokens: z.ZodOptional<z.ZodNumber>;
+}, z.core.$strip>;
+type ModelConfig = z.infer<typeof modelSchema>;
+/**
+ * Parameters for LLM invocation.
+ * Derived from generateText parameters for now.
+ */
+type InvokeLLMParams = Parameters<typeof generateText>[0];
+/**
+ * Result from LLM invocation.
+ * Derived from generateText return type for now.
+ */
+type InvokeLLMResult = Pick<Awaited<ReturnType<typeof generateText>>, "text" | "content" | "toolCalls" | "toolResults">;
 /**
  * General configuration for a testing agent.
  */
-interface TestingAgentConfig extends TestingAgentInferenceConfig {
+interface TestingAgentConfig extends Partial<ModelConfig> {
     /**
      * The name of the agent.
      */
@@ -443,6 +435,35 @@ interface FinishTestArgs {
     verdict: "success" | "failure" | "inconclusive";
 }
+interface JudgeResult {
+    success: boolean;
+    reasoning: string;
+    metCriteria: string[];
+    unmetCriteria: string[];
+}
+/**
+ * Collects OpenTelemetry spans for judge evaluation.
+ * Implements SpanProcessor to intercept spans as they complete.
+ */
+declare class JudgeSpanCollector implements SpanProcessor {
+    private spans;
+    onStart(): void;
+    onEnd(span: ReadableSpan): void;
+    forceFlush(): Promise<void>;
+    shutdown(): Promise<void>;
+    /**
+     * Retrieves all spans associated with a specific thread.
+     * @param threadId - The thread identifier to filter spans by
+     * @returns Array of spans for the given thread
+     */
+    getSpansForThread(threadId: string): ReadableSpan[];
+}
+/**
+ * Singleton instance of the judge span collector.
+ */
+declare const judgeSpanCollector: JudgeSpanCollector;
 /**
  * Configuration for the judge agent.
  */
@@ -455,6 +476,10 @@ interface JudgeAgentConfig extends TestingAgentConfig {
      * The criteria that the judge will use to evaluate the conversation.
      */
     criteria: string[];
+    /**
+     * Optional span collector for telemetry. Defaults to global singleton.
+     */
+    spanCollector?: JudgeSpanCollector;
 }
 /**
  * Agent that evaluates conversations against success criteria.
@@ -468,17 +493,16 @@ interface JudgeAgentConfig extends TestingAgentConfig {
 declare class JudgeAgent extends JudgeAgentAdapter {
     private readonly cfg;
     private logger;
+    private readonly spanCollector;
     role: AgentRole;
     criteria: string[];
+    /**
+     * LLM invocation function. Can be overridden to customize LLM behavior.
+     */
+    invokeLLM: (params: InvokeLLMParams) => Promise<InvokeLLMResult>;
     constructor(cfg: JudgeAgentConfig);
-    call(input: AgentInput): Promise<never[] | {
-        success: boolean;
-        messages: ai.ModelMessage[];
-        reasoning: string;
-        metCriteria: string[];
-        unmetCriteria: string[];
-    }>;
-    private generateText;
+    call(input: AgentInput): Promise<JudgeResult | null>;
+    private getOpenTelemetryTracesDigest;
 }
 /**
  * Factory function for creating JudgeAgent instances.
@@ -532,15 +556,54 @@ declare class JudgeAgent extends JudgeAgentAdapter {
  */
 declare const judgeAgent: (cfg: JudgeAgentConfig) => JudgeAgent;
+/**
+ * Transforms OpenTelemetry spans into a complete plain-text digest for judge evaluation.
+ * Deduplicates repeated string content to reduce token usage.
+ */
+declare class JudgeSpanDigestFormatter {
+    private readonly logger;
+    private readonly deduplicator;
+    /**
+     * Formats spans into a complete digest with full content and nesting.
+     * @param spans - All spans for a thread
+     * @returns Plain text digest
+     */
+    format(spans: ReadableSpan[]): string;
+    private sortByStartTime;
+    private buildHierarchy;
+    private renderNode;
+    private getTreePrefix;
+    private getAttrIndent;
+    private cleanAttributes;
+    private formatValue;
+    private transformValue;
+    private transformString;
+    private looksLikeJson;
+    private hrTimeToMs;
+    private calculateSpanDuration;
+    private calculateTotalDuration;
+    private formatDuration;
+    private formatTimestamp;
+    private getStatusIndicator;
+    private collectErrors;
+}
+/**
+ * Singleton instance for convenience.
+ */
+declare const judgeSpanDigestFormatter: JudgeSpanDigestFormatter;
 declare class UserSimulatorAgent extends UserSimulatorAgentAdapter {
     private readonly cfg?;
     private logger;
+    /**
+     * LLM invocation function. Can be overridden to customize LLM behavior.
+     */
+    invokeLLM: (params: InvokeLLMParams) => Promise<InvokeLLMResult>;
     constructor(cfg?: TestingAgentConfig | undefined);
     call: (input: AgentInput) => Promise<{
         role: "user";
         content: string;
     }>;
-    private generateText;
 }
 /**
  * Agent that simulates realistic user behavior in scenario conversations.
@@ -633,14 +696,169 @@ declare class UserSimulatorAgent extends UserSimulatorAgentAdapter {
  */
 declare const userSimulatorAgent: (config?: TestingAgentConfig) => UserSimulatorAgent;
+/**
+ * Event emitted when an audio response is completed
+ */
+interface AudioResponseEvent {
+    transcript: string;
+    audio: string;
+}
+/**
+ * Realtime Agent Adapter for Scenario Testing
+ *
+ * Adapts a connected RealtimeSession to the Scenario framework interface.
+ * The session must be created and connected before passing to this adapter.
+ *
+ * This ensures we test the REAL agent, not a mock, using the same session
+ * creation pattern as the browser client.
+ */
+/**
+ * Configuration for RealtimeAgentAdapter
+ */
+interface RealtimeAgentAdapterConfig {
+    /**
+     * The role of the agent
+     */
+    role: AgentRole;
+    /**
+     * A connected RealtimeSession instance
+     *
+     * The session should be created using your agent's session creator function
+     * and connected before passing to this adapter.
+     *
+     * @example
+     * ```typescript
+     * const session = createVegetarianRecipeSession();
+     * await session.connect({ apiKey: process.env.OPENAI_API_KEY });
+     * const adapter = new RealtimeAgentAdapter({
+     *   session,
+     *   role: AgentRole.AGENT,
+     *   agentName: "Vegetarian Recipe Assistant"
+     * });
+     * ```
+     */
+    session: RealtimeSession;
+    /**
+     * Name of the agent (for logging/identification)
+     */
+    agentName: string;
+    /**
+     * Timeout for waiting for agent response (ms)
+     * @default 30000
+     */
+    responseTimeout?: number;
+}
+/**
+ * Adapter that connects Scenario testing framework to OpenAI Realtime API
+ *
+ * This adapter wraps a connected RealtimeSession to provide the Scenario
+ * framework interface. The session must be created and connected externally,
+ * ensuring the same session creation pattern is used in both browser and tests.
+ *
+ * @example
+ * ```typescript
+ * // In beforeAll
+ * const session = createVegetarianRecipeSession();
+ * await session.connect({ apiKey: process.env.OPENAI_API_KEY });
+ * const adapter = new RealtimeAgentAdapter({
+ *   session,
+ *   role: AgentRole.AGENT
+ * });
+ *
+ * // In test
+ * await scenario.run({
+ *   agents: [adapter, scenario.userSimulatorAgent()],
+ *   script: [scenario.user("quick recipe"), scenario.agent()]
+ * });
+ *
+ * // In afterAll
+ * session.close();
+ * ```
+ */
+declare class RealtimeAgentAdapter extends AgentAdapter {
+    private config;
+    role: AgentRole;
+    name: string;
+    private session;
+    private eventHandler;
+    private messageProcessor;
+    private responseFormatter;
+    private audioEvents;
+    /**
+     * Creates a new RealtimeAgentAdapter instance
+     *
+     * The session can be either connected or unconnected.
+     * If unconnected, call connect() with an API key before use.
+     *
+     * @param config - Configuration for the realtime agent adapter
+     */
+    constructor(config: RealtimeAgentAdapterConfig);
+    /**
+     * Get the connect method from the session
+     */
+    connect(params?: Parameters<RealtimeSession["connect"]>[0] | undefined): Promise<void>;
+    /**
+     * Closes the session connection
+     */
+    disconnect(): Promise<void>;
+    /**
+     * Process input and generate response (implements AgentAdapter interface)
+     *
+     * This is called by Scenario framework for each agent turn.
+     * Handles both text and audio input, returns audio message with transcript.
+     *
+     * @param input - Scenario agent input with message history
+     * @returns Agent response as audio message or text
+     */
+    call(input: AgentInput): Promise<AgentReturnTypes>;
+    /**
+     * Handles the initial response when no user message exists
+     */
+    private handleInitialResponse;
+    /**
+     * Handles audio input from the user
+     */
+    private handleAudioInput;
+    /**
+     * Handles text input from the user
+     */
+    private handleTextInput;
+    /**
+     * Subscribe to audio response events
+     *
+     * @param callback - Function called when an audio response completes
+     */
+    onAudioResponse(callback: (event: AudioResponseEvent) => void): void;
+    /**
+     * Remove audio response listener
+     *
+     * @param callback - The callback function to remove
+     */
+    offAudioResponse(callback: (event: AudioResponseEvent) => void): void;
+}
+type agents_AudioResponseEvent = AudioResponseEvent;
 type agents_FinishTestArgs = FinishTestArgs;
+type agents_InvokeLLMParams = InvokeLLMParams;
+type agents_InvokeLLMResult = InvokeLLMResult;
 type agents_JudgeAgentConfig = JudgeAgentConfig;
+type agents_JudgeResult = JudgeResult;
+type agents_JudgeSpanCollector = JudgeSpanCollector;
+declare const agents_JudgeSpanCollector: typeof JudgeSpanCollector;
+type agents_JudgeSpanDigestFormatter = JudgeSpanDigestFormatter;
+declare const agents_JudgeSpanDigestFormatter: typeof JudgeSpanDigestFormatter;
+type agents_RealtimeAgentAdapter = RealtimeAgentAdapter;
+declare const agents_RealtimeAgentAdapter: typeof RealtimeAgentAdapter;
+type agents_RealtimeAgentAdapterConfig = RealtimeAgentAdapterConfig;
 type agents_TestingAgentConfig = TestingAgentConfig;
-type agents_TestingAgentInferenceConfig = TestingAgentInferenceConfig;
 declare const agents_judgeAgent: typeof judgeAgent;
+declare const agents_judgeSpanCollector: typeof judgeSpanCollector;
+declare const agents_judgeSpanDigestFormatter: typeof judgeSpanDigestFormatter;
 declare const agents_userSimulatorAgent: typeof userSimulatorAgent;
 declare namespace agents {
-  export { type agents_FinishTestArgs as FinishTestArgs, type agents_JudgeAgentConfig as JudgeAgentConfig, type agents_TestingAgentConfig as TestingAgentConfig, type agents_TestingAgentInferenceConfig as TestingAgentInferenceConfig, agents_judgeAgent as judgeAgent, agents_userSimulatorAgent as userSimulatorAgent };
+  export { type agents_AudioResponseEvent as AudioResponseEvent, type agents_FinishTestArgs as FinishTestArgs, type agents_InvokeLLMParams as InvokeLLMParams, type agents_InvokeLLMResult as InvokeLLMResult, type agents_JudgeAgentConfig as JudgeAgentConfig, type agents_JudgeResult as JudgeResult, agents_JudgeSpanCollector as JudgeSpanCollector, agents_JudgeSpanDigestFormatter as JudgeSpanDigestFormatter, agents_RealtimeAgentAdapter as RealtimeAgentAdapter, type agents_RealtimeAgentAdapterConfig as RealtimeAgentAdapterConfig, type agents_TestingAgentConfig as TestingAgentConfig, agents_judgeAgent as judgeAgent, agents_judgeSpanCollector as judgeSpanCollector, agents_judgeSpanDigestFormatter as judgeSpanDigestFormatter, agents_userSimulatorAgent as userSimulatorAgent };
 }
 /**
@@ -682,11 +900,11 @@ declare const scenarioEventSchema: z$1.ZodDiscriminatedUnion<"type", [z$1.ZodObj
         name: z$1.ZodOptional<z$1.ZodString>;
         description: z$1.ZodOptional<z$1.ZodString>;
     }, "strip", z$1.ZodTypeAny, {
-        name?: string | undefined;
         description?: string | undefined;
-    }, {
         name?: string | undefined;
+    }, {
         description?: string | undefined;
+        name?: string | undefined;
     }>;
 }, "strip", z$1.ZodTypeAny, {
     type: ScenarioEventType.RUN_STARTED;
@@ -696,8 +914,8 @@ declare const scenarioEventSchema: z$1.ZodDiscriminatedUnion<"type", [z$1.ZodObj
     scenarioRunId: string;
     scenarioSetId: string;
     metadata: {
-        name?: string | undefined;
         description?: string | undefined;
+        name?: string | undefined;
     };
     rawEvent?: any;
 }, {
@@ -707,8 +925,8 @@ declare const scenarioEventSchema: z$1.ZodDiscriminatedUnion<"type", [z$1.ZodObj
     scenarioId: string;
     scenarioRunId: string;
     metadata: {
-        name?: string | undefined;
         description?: string | undefined;
+        name?: string | undefined;
     };
     rawEvent?: any;
     scenarioSetId?: string | undefined;
@@ -1086,8 +1304,12 @@ type ScenarioEvent = z$1.infer<typeof scenarioEventSchema>;
  * ```
  */
 declare class ScenarioExecution implements ScenarioExecutionLike {
+    /** LangWatch tracer for scenario execution */
+    private tracer;
     /** The current state of the scenario execution */
     private state;
+    /** The final result of the scenario execution, set when a conclusion is reached */
+    private _result?;
     /** Logger for debugging and monitoring */
     private logger;
     /** Finalized configuration with all defaults applied */
@@ -1106,10 +1328,10 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
      * Key: agent index, Value: array of pending messages for that agent
      */
     private pendingMessages;
-    /** Intermediate result set by agents that make final decisions */
-    private partialResult;
     /** Accumulated execution time for each agent (for performance tracking) */
     private agentTimes;
+    /** Current turn span for trace context management */
+    private currentTurnSpan?;
     /** Timestamp when execution started (for total time calculation) */
     private totalStartTime;
     /** Event stream for monitoring scenario progress */
@@ -1144,6 +1366,20 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
      * @returns The thread identifier string
      */
     get threadId(): string;
+    /**
+     * Gets the result of the scenario execution if it has been set.
+     *
+     * @returns The scenario result or undefined if not yet set
+     */
+    get result(): ScenarioResult | undefined;
+    /**
+     * Sets the result of the scenario execution.
+     * This is called when the scenario reaches a conclusion (success or failure).
+     * Automatically includes messages, totalTime, and agentTime from the current execution context.
+     *
+     * @param result - The final scenario result (without messages/timing, which will be added automatically)
+     */
+    private setResult;
     /**
      * The total elapsed time for the scenario execution.
      */
@@ -1186,30 +1422,25 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
      * - Progress to the next turn if needed
      * - Find the next agent that should act
      * - Execute that agent's response
-     * - Return either new messages or a final scenario result
+     * - Set the result if the scenario concludes
      *
      * Note: This method is primarily for debugging or custom execution flows. Most users
      * will use `execute()` to run the entire scenario automatically.
      *
-     * @returns A promise that resolves with either:
-     *   - Array of new messages added during the agent interaction, or
-     *   - A final ScenarioResult if the interaction concludes the scenario
-     * @throws Error if no result is returned from the step
+     * After calling this method, check `this.result` to see if the scenario has concluded.
      *
      * @example
      * ```typescript
      * const execution = new ScenarioExecution(config, script);
      *
      * // Execute one agent interaction at a time
-     * const messages = await execution.step();
-     * if (Array.isArray(messages)) {
-     *   console.log('New messages:', messages);
-     * } else {
-     *   console.log('Scenario finished:', messages.success);
+     * await execution.step();
+     * if (execution.result) {
+     *   console.log('Scenario finished:', execution.result.success);
      * }
      * ```
      */
-    step(): Promise<ModelMessage[] | ScenarioResult>;
+    step(): Promise<void>;
     private _step;
     /**
      * Calls a specific agent to generate a response or make a decision.
@@ -1228,15 +1459,12 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
      * After the agent responds:
      * - Performance timing is recorded
      * - Pending messages for this agent are cleared (they've been processed)
-     * - If the agent returns a ScenarioResult, it's returned immediately
+     * - If the agent returns a ScenarioResult, it's set on this.result
      * - Otherwise, the agent's messages are added to the conversation and broadcast
      *
      * @param idx - The index of the agent in the agents array
      * @param role - The role the agent is being asked to play (USER, AGENT, or JUDGE)
      * @param judgmentRequest - Whether this is a judgment request (for judge agents)
-     * @returns A promise that resolves with either:
-     *   - Array of messages if the agent generated a response, or
-     *   - ScenarioResult if the agent made a final decision
      * @throws Error if the agent call fails
      */
     private callAgent;
@@ -1451,49 +1679,6 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
      * ```
      */
     addAgentTime(agentIdx: number, time: number): void;
-    /**
-     * Checks if a partial result has been set for the scenario.
-     *
-     * This method is used internally to determine if a scenario has already reached
-     * a conclusion (success or failure) but hasn't been finalized yet. Partial results
-     * are typically set by agents that make final decisions (like judge agents) and
-     * are later finalized with the complete message history.
-     *
-     * @returns True if a partial result exists, false otherwise
-     *
-     * @example
-     * ```typescript
-     * // This is typically used internally by the execution engine
-     * if (execution.hasResult()) {
-     *   console.log('Scenario has reached a conclusion');
-     * }
-     * ```
-     */
-    hasResult(): boolean;
-    /**
-     * Sets a partial result for the scenario.
-     *
-     * This method is used internally to store intermediate results that may be
-     * finalized later with the complete message history. Partial results are typically
-     * created by agents that make final decisions (like judge agents) and contain
-     * the success/failure status, reasoning, and criteria evaluation, but not the
-     * complete message history.
-     *
-     * @param result - The partial result without the messages field. Should include
-     *                success status, reasoning, and criteria evaluation.
-     *
-     * @example
-     * ```typescript
-     * // This is typically called internally by agents that make final decisions
-     * execution.setResult({
-     *   success: true,
-     *   reasoning: "Agent provided accurate weather information",
-     *   metCriteria: ["Provides accurate weather data"],
-     *   unmetCriteria: []
-     * });
-     * ```
-     */
-    setResult(result: Omit<ScenarioResult, "messages">): void;
     /**
      * Internal method to handle script step calls to agents.
      *
@@ -1506,7 +1691,7 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
      * - Progress to a new turn if no agent is available
      * - Execute the agent with the provided content or let it generate content
      * - Handle judgment requests for judge agents
-     * - Return a final result if the agent makes a decision
+     * - Set the result if the agent makes a decision
      *
      * @param role - The role of the agent to call (USER, AGENT, or JUDGE)
      * @param content - Optional content to use instead of letting the agent generate it
@@ -1530,6 +1715,7 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
      * - Starts the first turn
      * - Records the start time for performance tracking
      * - Clears any pending messages
+     * - Clears the result from any previous execution
      */
     private reset;
     private nextAgentForRole;
@@ -1554,7 +1740,7 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
      *
      * This method is called when the scenario execution reaches the maximum number
      * of turns without reaching a conclusion. It creates a failure result with
-     * appropriate reasoning and includes performance metrics.
+     * appropriate reasoning and includes performance metrics, then sets it on this.result.
      *
      * The result includes:
      * - All messages from the conversation
@@ -1564,7 +1750,6 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
      * - Total execution time and agent response times
      *
      * @param errorMessage - Optional custom error message to use instead of the default
-     * @returns A ScenarioResult indicating failure due to reaching max turns
      */
     private reachedMaxTurns;
     private getJudgeAgent;
@@ -1671,12 +1856,25 @@ declare class ScenarioExecutionState implements ScenarioExecutionStateLike {
      * Adds a message to the conversation history.
      *
      * @param message - The message to add.
+     * @param traceId - Optional trace ID to associate with the message.
      */
-    addMessage(message: CoreMessage): void;
-    lastMessage(): CoreMessage;
-    lastUserMessage(): CoreUserMessage;
-    lastAgentMessage(): CoreAssistantMessage;
-    lastToolCall(toolName: string): CoreToolMessage;
+    addMessage(message: CoreMessage & {
+        traceId?: string;
+    }): void;
+    lastMessage(): ai.ModelMessage & {
+        id: string;
+        traceId?: string;
+    };
+    lastUserMessage(): ai.UserModelMessage & {
+        id: string;
+        traceId?: string;
+    };
+    lastAgentMessage(): CoreAssistantMessage & {
+        traceId?: string;
+    };
+    lastToolCall(toolName: string): CoreToolMessage & {
+        traceId?: string;
+    };
     hasToolCall(toolName: string): boolean;
 }
@@ -1850,4 +2048,4 @@ declare namespace script {
 type ScenarioApi = typeof agents & typeof domain & typeof execution & typeof runner & typeof script;
 declare const scenario: ScenarioApi;
-export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, DEFAULT_MAX_TURNS, DEFAULT_TEMPERATURE, DEFAULT_VERBOSE, type FinishTestArgs, JudgeAgentAdapter, type JudgeAgentConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, type TestingAgentInferenceConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };
+export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type AudioResponseEvent, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, type FinishTestArgs, type InvokeLLMParams, type InvokeLLMResult, JudgeAgentAdapter, type JudgeAgentConfig, type JudgeResult, JudgeSpanCollector, JudgeSpanDigestFormatter, RealtimeAgentAdapter, type RealtimeAgentAdapterConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, judgeSpanCollector, judgeSpanDigestFormatter, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };