@langwatch/scenario 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -1,9 +1,19 @@
1
1
  import * as ai from 'ai';
2
- import { CoreMessage, CoreUserMessage, CoreAssistantMessage, CoreToolMessage, LanguageModel, ModelMessage } from 'ai';
2
+ import { CoreMessage, CoreUserMessage, CoreAssistantMessage, CoreToolMessage, LanguageModel, generateText, ModelMessage } from 'ai';
3
3
  import { z } from 'zod/v4';
4
+ import { SpanProcessor, ReadableSpan } from '@opentelemetry/sdk-trace-base';
5
+ import { RealtimeSession } from '@openai/agents/realtime';
4
6
  import { Observable } from 'rxjs';
5
7
  import { z as z$1 } from 'zod';
6
8
 
9
+ /**
10
+ * The possible return types from an agent's `call` method.
11
+ * - string | CoreMessage | CoreMessage[]: Agent generated response
12
+ * - JudgeResult: Judge made a final decision
13
+ * - null: Judge wants to continue observing (no decision yet)
14
+ */
15
+ type AgentReturnTypes = string | CoreMessage | CoreMessage[] | JudgeResult | null;
16
+
7
17
  declare enum AgentRole {
8
18
  USER = "User",
9
19
  AGENT = "Agent",
@@ -43,11 +53,6 @@ interface AgentInput {
43
53
  */
44
54
  scenarioConfig: ScenarioConfig;
45
55
  }
46
- /**
47
- * The possible return types from an agent's `call` method.
48
- * Can be a simple string, a single message, an array of messages, or a ScenarioResult.
49
- */
50
- type AgentReturnTypes = string | CoreMessage | CoreMessage[] | ScenarioResult;
51
56
  /**
52
57
  * Abstract base class for integrating custom agents with the Scenario framework.
53
58
  *
@@ -72,6 +77,7 @@ type AgentReturnTypes = string | CoreMessage | CoreMessage[] | ScenarioResult;
72
77
  * ```
73
78
  */
74
79
  declare abstract class AgentAdapter {
80
+ name?: string;
75
81
  role: AgentRole;
76
82
  /**
77
83
  * Process the input and generate a response.
@@ -89,33 +95,21 @@ declare abstract class AgentAdapter {
89
95
  * Abstract base class for user simulator agents.
90
96
  * User simulator agents are responsible for generating user messages to drive the conversation.
91
97
  */
92
- declare abstract class UserSimulatorAgentAdapter implements AgentAdapter {
98
+ declare abstract class UserSimulatorAgentAdapter extends AgentAdapter {
99
+ name: string;
93
100
  role: AgentRole;
94
- /**
95
- * Process the input and generate a user message.
96
- *
97
- * @param input AgentInput containing conversation history, thread context, and scenario state.
98
- * @returns The user's response.
99
- */
100
- abstract call(input: AgentInput): Promise<AgentReturnTypes>;
101
101
  }
102
102
  /**
103
103
  * Abstract base class for judge agents.
104
104
  * Judge agents are responsible for evaluating the conversation and determining success or failure.
105
105
  */
106
- declare abstract class JudgeAgentAdapter implements AgentAdapter {
106
+ declare abstract class JudgeAgentAdapter extends AgentAdapter {
107
+ name: string;
107
108
  role: AgentRole;
108
109
  /**
109
110
  * The criteria the judge will use to evaluate the conversation.
110
111
  */
111
112
  abstract criteria: string[];
112
- /**
113
- * Process the input and evaluate the conversation.
114
- *
115
- * @param input AgentInput containing conversation history, thread context, and scenario state.
116
- * @returns A ScenarioResult if the conversation should end, otherwise should continue.
117
- */
118
- abstract call(input: AgentInput): Promise<AgentReturnTypes>;
119
113
  }
120
114
 
121
115
  declare const DEFAULT_MAX_TURNS = 10;
@@ -250,7 +244,7 @@ interface ScenarioExecutionLike {
250
244
  * A step in a scenario script.
251
245
  * This is a function that takes the current state and an executor, and performs an action.
252
246
  */
253
- type ScriptStep = (state: ScenarioExecutionStateLike, executor: ScenarioExecutionLike) => Promise<void | ScenarioResult | null> | void | ScenarioResult | null;
247
+ type ScriptStep = (state: ScenarioExecutionStateLike, executor: ScenarioExecutionLike) => Promise<void> | void;
254
248
 
255
249
  /**
256
250
  * Represents the result of a scenario execution.
@@ -349,11 +343,9 @@ interface ScenarioExecutionStateLike {
349
343
  hasToolCall(toolName: string): boolean;
350
344
  }
351
345
 
352
- /** Default temperature for language model inference */
353
- declare const DEFAULT_TEMPERATURE = 0;
354
346
  declare const scenarioProjectConfigSchema: z.ZodObject<{
355
347
  defaultModel: z.ZodOptional<z.ZodObject<{
356
- model: z.ZodCustom<LanguageModel, LanguageModel>;
348
+ model: z.ZodCustom<ai.LanguageModel, ai.LanguageModel>;
357
349
  temperature: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
358
350
  maxTokens: z.ZodOptional<z.ZodNumber>;
359
351
  }, z.core.$strip>>;
@@ -369,7 +361,6 @@ type domain_AgentReturnTypes = AgentReturnTypes;
369
361
  type domain_AgentRole = AgentRole;
370
362
  declare const domain_AgentRole: typeof AgentRole;
371
363
  declare const domain_DEFAULT_MAX_TURNS: typeof DEFAULT_MAX_TURNS;
372
- declare const domain_DEFAULT_TEMPERATURE: typeof DEFAULT_TEMPERATURE;
373
364
  declare const domain_DEFAULT_VERBOSE: typeof DEFAULT_VERBOSE;
374
365
  type domain_JudgeAgentAdapter = JudgeAgentAdapter;
375
366
  declare const domain_JudgeAgentAdapter: typeof JudgeAgentAdapter;
@@ -386,32 +377,33 @@ declare const domain_allAgentRoles: typeof allAgentRoles;
386
377
  declare const domain_defineConfig: typeof defineConfig;
387
378
  declare const domain_scenarioProjectConfigSchema: typeof scenarioProjectConfigSchema;
388
379
  declare namespace domain {
389
- export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_TEMPERATURE as DEFAULT_TEMPERATURE, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
380
+ export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
390
381
  }
391
382
 
392
383
  /**
393
- * Configuration for the inference parameters of a testing agent.
384
+ * Schema for a language model.
394
385
  */
395
- interface TestingAgentInferenceConfig {
396
- /**
397
- * The language model to use for generating responses.
398
- * If not provided, a default model will be used.
399
- */
400
- model?: LanguageModel;
401
- /**
402
- * The temperature for the language model.
403
- * Defaults to 0.
404
- */
405
- temperature?: number;
406
- /**
407
- * The maximum number of tokens to generate.
408
- */
409
- maxTokens?: number;
410
- }
386
+ declare const modelSchema: z.ZodObject<{
387
+ model: z.ZodCustom<LanguageModel, LanguageModel>;
388
+ temperature: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
389
+ maxTokens: z.ZodOptional<z.ZodNumber>;
390
+ }, z.core.$strip>;
391
+ type ModelConfig = z.infer<typeof modelSchema>;
392
+
393
+ /**
394
+ * Parameters for LLM invocation.
395
+ * Derived from generateText parameters for now.
396
+ */
397
+ type InvokeLLMParams = Parameters<typeof generateText>[0];
398
+ /**
399
+ * Result from LLM invocation.
400
+ * Derived from generateText return type for now.
401
+ */
402
+ type InvokeLLMResult = Pick<Awaited<ReturnType<typeof generateText>>, "text" | "content" | "toolCalls" | "toolResults">;
411
403
  /**
412
404
  * General configuration for a testing agent.
413
405
  */
414
- interface TestingAgentConfig extends TestingAgentInferenceConfig {
406
+ interface TestingAgentConfig extends Partial<ModelConfig> {
415
407
  /**
416
408
  * The name of the agent.
417
409
  */
@@ -443,6 +435,35 @@ interface FinishTestArgs {
443
435
  verdict: "success" | "failure" | "inconclusive";
444
436
  }
445
437
 
438
+ interface JudgeResult {
439
+ success: boolean;
440
+ reasoning: string;
441
+ metCriteria: string[];
442
+ unmetCriteria: string[];
443
+ }
444
+
445
+ /**
446
+ * Collects OpenTelemetry spans for judge evaluation.
447
+ * Implements SpanProcessor to intercept spans as they complete.
448
+ */
449
+ declare class JudgeSpanCollector implements SpanProcessor {
450
+ private spans;
451
+ onStart(): void;
452
+ onEnd(span: ReadableSpan): void;
453
+ forceFlush(): Promise<void>;
454
+ shutdown(): Promise<void>;
455
+ /**
456
+ * Retrieves all spans associated with a specific thread.
457
+ * @param threadId - The thread identifier to filter spans by
458
+ * @returns Array of spans for the given thread
459
+ */
460
+ getSpansForThread(threadId: string): ReadableSpan[];
461
+ }
462
+ /**
463
+ * Singleton instance of the judge span collector.
464
+ */
465
+ declare const judgeSpanCollector: JudgeSpanCollector;
466
+
446
467
  /**
447
468
  * Configuration for the judge agent.
448
469
  */
@@ -455,6 +476,10 @@ interface JudgeAgentConfig extends TestingAgentConfig {
455
476
  * The criteria that the judge will use to evaluate the conversation.
456
477
  */
457
478
  criteria: string[];
479
+ /**
480
+ * Optional span collector for telemetry. Defaults to global singleton.
481
+ */
482
+ spanCollector?: JudgeSpanCollector;
458
483
  }
459
484
  /**
460
485
  * Agent that evaluates conversations against success criteria.
@@ -468,17 +493,16 @@ interface JudgeAgentConfig extends TestingAgentConfig {
468
493
  declare class JudgeAgent extends JudgeAgentAdapter {
469
494
  private readonly cfg;
470
495
  private logger;
496
+ private readonly spanCollector;
471
497
  role: AgentRole;
472
498
  criteria: string[];
499
+ /**
500
+ * LLM invocation function. Can be overridden to customize LLM behavior.
501
+ */
502
+ invokeLLM: (params: InvokeLLMParams) => Promise<InvokeLLMResult>;
473
503
  constructor(cfg: JudgeAgentConfig);
474
- call(input: AgentInput): Promise<never[] | {
475
- success: boolean;
476
- messages: ai.ModelMessage[];
477
- reasoning: string;
478
- metCriteria: string[];
479
- unmetCriteria: string[];
480
- }>;
481
- private generateText;
504
+ call(input: AgentInput): Promise<JudgeResult | null>;
505
+ private getOpenTelemetryTracesDigest;
482
506
  }
483
507
  /**
484
508
  * Factory function for creating JudgeAgent instances.
@@ -532,15 +556,54 @@ declare class JudgeAgent extends JudgeAgentAdapter {
532
556
  */
533
557
  declare const judgeAgent: (cfg: JudgeAgentConfig) => JudgeAgent;
534
558
 
559
+ /**
560
+ * Transforms OpenTelemetry spans into a complete plain-text digest for judge evaluation.
561
+ * Deduplicates repeated string content to reduce token usage.
562
+ */
563
+ declare class JudgeSpanDigestFormatter {
564
+ private readonly logger;
565
+ private readonly deduplicator;
566
+ /**
567
+ * Formats spans into a complete digest with full content and nesting.
568
+ * @param spans - All spans for a thread
569
+ * @returns Plain text digest
570
+ */
571
+ format(spans: ReadableSpan[]): string;
572
+ private sortByStartTime;
573
+ private buildHierarchy;
574
+ private renderNode;
575
+ private getTreePrefix;
576
+ private getAttrIndent;
577
+ private cleanAttributes;
578
+ private formatValue;
579
+ private transformValue;
580
+ private transformString;
581
+ private looksLikeJson;
582
+ private hrTimeToMs;
583
+ private calculateSpanDuration;
584
+ private calculateTotalDuration;
585
+ private formatDuration;
586
+ private formatTimestamp;
587
+ private getStatusIndicator;
588
+ private collectErrors;
589
+ }
590
+ /**
591
+ * Singleton instance for convenience.
592
+ */
593
+ declare const judgeSpanDigestFormatter: JudgeSpanDigestFormatter;
594
+
535
595
  declare class UserSimulatorAgent extends UserSimulatorAgentAdapter {
536
596
  private readonly cfg?;
537
597
  private logger;
598
+ /**
599
+ * LLM invocation function. Can be overridden to customize LLM behavior.
600
+ */
601
+ invokeLLM: (params: InvokeLLMParams) => Promise<InvokeLLMResult>;
538
602
  constructor(cfg?: TestingAgentConfig | undefined);
539
603
  call: (input: AgentInput) => Promise<{
540
604
  role: "user";
541
605
  content: string;
542
606
  }>;
543
- private generateText;
544
607
  }
545
608
  /**
546
609
  * Agent that simulates realistic user behavior in scenario conversations.
@@ -633,14 +696,169 @@ declare class UserSimulatorAgent extends UserSimulatorAgentAdapter {
633
696
  */
634
697
  declare const userSimulatorAgent: (config?: TestingAgentConfig) => UserSimulatorAgent;
635
698
 
699
+ /**
700
+ * Event emitted when an audio response is completed
701
+ */
702
+ interface AudioResponseEvent {
703
+ transcript: string;
704
+ audio: string;
705
+ }
706
+
707
+ /**
708
+ * Realtime Agent Adapter for Scenario Testing
709
+ *
710
+ * Adapts a connected RealtimeSession to the Scenario framework interface.
711
+ * The session must be created and connected before passing to this adapter.
712
+ *
713
+ * This ensures we test the REAL agent, not a mock, using the same session
714
+ * creation pattern as the browser client.
715
+ */
716
+
717
+ /**
718
+ * Configuration for RealtimeAgentAdapter
719
+ */
720
+ interface RealtimeAgentAdapterConfig {
721
+ /**
722
+ * The role of the agent
723
+ */
724
+ role: AgentRole;
725
+ /**
726
+ * A connected RealtimeSession instance
727
+ *
728
+ * The session should be created using your agent's session creator function
729
+ * and connected before passing to this adapter.
730
+ *
731
+ * @example
732
+ * ```typescript
733
+ * const session = createVegetarianRecipeSession();
734
+ * await session.connect({ apiKey: process.env.OPENAI_API_KEY });
735
+ * const adapter = new RealtimeAgentAdapter({
736
+ * session,
737
+ * role: AgentRole.AGENT,
738
+ * agentName: "Vegetarian Recipe Assistant"
739
+ * });
740
+ * ```
741
+ */
742
+ session: RealtimeSession;
743
+ /**
744
+ * Name of the agent (for logging/identification)
745
+ */
746
+ agentName: string;
747
+ /**
748
+ * Timeout for waiting for agent response (ms)
749
+ * @default 30000
750
+ */
751
+ responseTimeout?: number;
752
+ }
753
+ /**
754
+ * Adapter that connects Scenario testing framework to OpenAI Realtime API
755
+ *
756
+ * This adapter wraps a connected RealtimeSession to provide the Scenario
757
+ * framework interface. The session must be created and connected externally,
758
+ * ensuring the same session creation pattern is used in both browser and tests.
759
+ *
760
+ * @example
761
+ * ```typescript
762
+ * // In beforeAll
763
+ * const session = createVegetarianRecipeSession();
764
+ * await session.connect({ apiKey: process.env.OPENAI_API_KEY });
765
+ * const adapter = new RealtimeAgentAdapter({
766
+ * session,
767
+ * role: AgentRole.AGENT, agentName: "Vegetarian Recipe Assistant"
768
+ * });
769
+ *
770
+ * // In test
771
+ * await scenario.run({
772
+ * agents: [adapter, scenario.userSimulatorAgent()],
773
+ * script: [scenario.user("quick recipe"), scenario.agent()]
774
+ * });
775
+ *
776
+ * // In afterAll
777
+ * session.close();
778
+ * ```
779
+ */
780
+ declare class RealtimeAgentAdapter extends AgentAdapter {
781
+ private config;
782
+ role: AgentRole;
783
+ name: string;
784
+ private session;
785
+ private eventHandler;
786
+ private messageProcessor;
787
+ private responseFormatter;
788
+ private audioEvents;
789
+ /**
790
+ * Creates a new RealtimeAgentAdapter instance
791
+ *
792
+ * The session can be either connected or unconnected.
793
+ * If unconnected, call connect() with an API key before use.
794
+ *
795
+ * @param config - Configuration for the realtime agent adapter
796
+ */
797
+ constructor(config: RealtimeAgentAdapterConfig);
798
+ /**
799
+ * Connects the underlying session; `params` has the type of the first argument of `RealtimeSession.connect`
800
+ */
801
+ connect(params?: Parameters<RealtimeSession["connect"]>[0] | undefined): Promise<void>;
802
+ /**
803
+ * Closes the session connection
804
+ */
805
+ disconnect(): Promise<void>;
806
+ /**
807
+ * Process input and generate response (implements AgentAdapter interface)
808
+ *
809
+ * This is called by Scenario framework for each agent turn.
810
+ * Handles both text and audio input, returns audio message with transcript.
811
+ *
812
+ * @param input - Scenario agent input with message history
813
+ * @returns Agent response as audio message or text
814
+ */
815
+ call(input: AgentInput): Promise<AgentReturnTypes>;
816
+ /**
817
+ * Handles the initial response when no user message exists
818
+ */
819
+ private handleInitialResponse;
820
+ /**
821
+ * Handles audio input from the user
822
+ */
823
+ private handleAudioInput;
824
+ /**
825
+ * Handles text input from the user
826
+ */
827
+ private handleTextInput;
828
+ /**
829
+ * Subscribe to audio response events
830
+ *
831
+ * @param callback - Function called when an audio response completes
832
+ */
833
+ onAudioResponse(callback: (event: AudioResponseEvent) => void): void;
834
+ /**
835
+ * Remove audio response listener
836
+ *
837
+ * @param callback - The callback function to remove
838
+ */
839
+ offAudioResponse(callback: (event: AudioResponseEvent) => void): void;
840
+ }
841
+
842
+ type agents_AudioResponseEvent = AudioResponseEvent;
636
843
  type agents_FinishTestArgs = FinishTestArgs;
844
+ type agents_InvokeLLMParams = InvokeLLMParams;
845
+ type agents_InvokeLLMResult = InvokeLLMResult;
637
846
  type agents_JudgeAgentConfig = JudgeAgentConfig;
847
+ type agents_JudgeResult = JudgeResult;
848
+ type agents_JudgeSpanCollector = JudgeSpanCollector;
849
+ declare const agents_JudgeSpanCollector: typeof JudgeSpanCollector;
850
+ type agents_JudgeSpanDigestFormatter = JudgeSpanDigestFormatter;
851
+ declare const agents_JudgeSpanDigestFormatter: typeof JudgeSpanDigestFormatter;
852
+ type agents_RealtimeAgentAdapter = RealtimeAgentAdapter;
853
+ declare const agents_RealtimeAgentAdapter: typeof RealtimeAgentAdapter;
854
+ type agents_RealtimeAgentAdapterConfig = RealtimeAgentAdapterConfig;
638
855
  type agents_TestingAgentConfig = TestingAgentConfig;
639
- type agents_TestingAgentInferenceConfig = TestingAgentInferenceConfig;
640
856
  declare const agents_judgeAgent: typeof judgeAgent;
857
+ declare const agents_judgeSpanCollector: typeof judgeSpanCollector;
858
+ declare const agents_judgeSpanDigestFormatter: typeof judgeSpanDigestFormatter;
641
859
  declare const agents_userSimulatorAgent: typeof userSimulatorAgent;
642
860
  declare namespace agents {
643
- export { type agents_FinishTestArgs as FinishTestArgs, type agents_JudgeAgentConfig as JudgeAgentConfig, type agents_TestingAgentConfig as TestingAgentConfig, type agents_TestingAgentInferenceConfig as TestingAgentInferenceConfig, agents_judgeAgent as judgeAgent, agents_userSimulatorAgent as userSimulatorAgent };
861
+ export { type agents_AudioResponseEvent as AudioResponseEvent, type agents_FinishTestArgs as FinishTestArgs, type agents_InvokeLLMParams as InvokeLLMParams, type agents_InvokeLLMResult as InvokeLLMResult, type agents_JudgeAgentConfig as JudgeAgentConfig, type agents_JudgeResult as JudgeResult, agents_JudgeSpanCollector as JudgeSpanCollector, agents_JudgeSpanDigestFormatter as JudgeSpanDigestFormatter, agents_RealtimeAgentAdapter as RealtimeAgentAdapter, type agents_RealtimeAgentAdapterConfig as RealtimeAgentAdapterConfig, type agents_TestingAgentConfig as TestingAgentConfig, agents_judgeAgent as judgeAgent, agents_judgeSpanCollector as judgeSpanCollector, agents_judgeSpanDigestFormatter as judgeSpanDigestFormatter, agents_userSimulatorAgent as userSimulatorAgent };
644
862
  }
645
863
 
646
864
  /**
@@ -682,11 +900,11 @@ declare const scenarioEventSchema: z$1.ZodDiscriminatedUnion<"type", [z$1.ZodObj
682
900
  name: z$1.ZodOptional<z$1.ZodString>;
683
901
  description: z$1.ZodOptional<z$1.ZodString>;
684
902
  }, "strip", z$1.ZodTypeAny, {
685
- name?: string | undefined;
686
903
  description?: string | undefined;
687
- }, {
688
904
  name?: string | undefined;
905
+ }, {
689
906
  description?: string | undefined;
907
+ name?: string | undefined;
690
908
  }>;
691
909
  }, "strip", z$1.ZodTypeAny, {
692
910
  type: ScenarioEventType.RUN_STARTED;
@@ -696,8 +914,8 @@ declare const scenarioEventSchema: z$1.ZodDiscriminatedUnion<"type", [z$1.ZodObj
696
914
  scenarioRunId: string;
697
915
  scenarioSetId: string;
698
916
  metadata: {
699
- name?: string | undefined;
700
917
  description?: string | undefined;
918
+ name?: string | undefined;
701
919
  };
702
920
  rawEvent?: any;
703
921
  }, {
@@ -707,8 +925,8 @@ declare const scenarioEventSchema: z$1.ZodDiscriminatedUnion<"type", [z$1.ZodObj
707
925
  scenarioId: string;
708
926
  scenarioRunId: string;
709
927
  metadata: {
710
- name?: string | undefined;
711
928
  description?: string | undefined;
929
+ name?: string | undefined;
712
930
  };
713
931
  rawEvent?: any;
714
932
  scenarioSetId?: string | undefined;
@@ -1086,8 +1304,12 @@ type ScenarioEvent = z$1.infer<typeof scenarioEventSchema>;
1086
1304
  * ```
1087
1305
  */
1088
1306
  declare class ScenarioExecution implements ScenarioExecutionLike {
1307
+ /** LangWatch tracer for scenario execution */
1308
+ private tracer;
1089
1309
  /** The current state of the scenario execution */
1090
1310
  private state;
1311
+ /** The final result of the scenario execution, set when a conclusion is reached */
1312
+ private _result?;
1091
1313
  /** Logger for debugging and monitoring */
1092
1314
  private logger;
1093
1315
  /** Finalized configuration with all defaults applied */
@@ -1106,10 +1328,10 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
1106
1328
  * Key: agent index, Value: array of pending messages for that agent
1107
1329
  */
1108
1330
  private pendingMessages;
1109
- /** Intermediate result set by agents that make final decisions */
1110
- private partialResult;
1111
1331
  /** Accumulated execution time for each agent (for performance tracking) */
1112
1332
  private agentTimes;
1333
+ /** Current turn span for trace context management */
1334
+ private currentTurnSpan?;
1113
1335
  /** Timestamp when execution started (for total time calculation) */
1114
1336
  private totalStartTime;
1115
1337
  /** Event stream for monitoring scenario progress */
@@ -1144,6 +1366,20 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
1144
1366
  * @returns The thread identifier string
1145
1367
  */
1146
1368
  get threadId(): string;
1369
+ /**
1370
+ * Gets the result of the scenario execution if it has been set.
1371
+ *
1372
+ * @returns The scenario result or undefined if not yet set
1373
+ */
1374
+ get result(): ScenarioResult | undefined;
1375
+ /**
1376
+ * Sets the result of the scenario execution.
1377
+ * This is called when the scenario reaches a conclusion (success or failure).
1378
+ * Automatically includes messages, totalTime, and agentTime from the current execution context.
1379
+ *
1380
+ * @param result - The final scenario result (without messages/timing, which will be added automatically)
1381
+ */
1382
+ private setResult;
1147
1383
  /**
1148
1384
  * The total elapsed time for the scenario execution.
1149
1385
  */
@@ -1186,30 +1422,25 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
1186
1422
  * - Progress to the next turn if needed
1187
1423
  * - Find the next agent that should act
1188
1424
  * - Execute that agent's response
1189
- * - Return either new messages or a final scenario result
1425
+ * - Set the result if the scenario concludes
1190
1426
  *
1191
1427
  * Note: This method is primarily for debugging or custom execution flows. Most users
1192
1428
  * will use `execute()` to run the entire scenario automatically.
1193
1429
  *
1194
- * @returns A promise that resolves with either:
1195
- * - Array of new messages added during the agent interaction, or
1196
- * - A final ScenarioResult if the interaction concludes the scenario
1197
- * @throws Error if no result is returned from the step
1430
+ * After calling this method, check `this.result` to see if the scenario has concluded.
1198
1431
  *
1199
1432
  * @example
1200
1433
  * ```typescript
1201
1434
  * const execution = new ScenarioExecution(config, script);
1202
1435
  *
1203
1436
  * // Execute one agent interaction at a time
1204
- * const messages = await execution.step();
1205
- * if (Array.isArray(messages)) {
1206
- * console.log('New messages:', messages);
1207
- * } else {
1208
- * console.log('Scenario finished:', messages.success);
1437
+ * await execution.step();
1438
+ * if (execution.result) {
1439
+ * console.log('Scenario finished:', execution.result.success);
1209
1440
  * }
1210
1441
  * ```
1211
1442
  */
1212
- step(): Promise<ModelMessage[] | ScenarioResult>;
1443
+ step(): Promise<void>;
1213
1444
  private _step;
1214
1445
  /**
1215
1446
  * Calls a specific agent to generate a response or make a decision.
@@ -1228,15 +1459,12 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
1228
1459
  * After the agent responds:
1229
1460
  * - Performance timing is recorded
1230
1461
  * - Pending messages for this agent are cleared (they've been processed)
1231
- * - If the agent returns a ScenarioResult, it's returned immediately
1462
+ * - If the agent returns a final verdict (a JudgeResult, per AgentReturnTypes), the result is set on this.result
1232
1463
  * - Otherwise, the agent's messages are added to the conversation and broadcast
1233
1464
  *
1234
1465
  * @param idx - The index of the agent in the agents array
1235
1466
  * @param role - The role the agent is being asked to play (USER, AGENT, or JUDGE)
1236
1467
  * @param judgmentRequest - Whether this is a judgment request (for judge agents)
1237
- * @returns A promise that resolves with either:
1238
- * - Array of messages if the agent generated a response, or
1239
- * - ScenarioResult if the agent made a final decision
1240
1468
  * @throws Error if the agent call fails
1241
1469
  */
1242
1470
  private callAgent;
@@ -1451,49 +1679,6 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
1451
1679
  * ```
1452
1680
  */
1453
1681
  addAgentTime(agentIdx: number, time: number): void;
1454
- /**
1455
- * Checks if a partial result has been set for the scenario.
1456
- *
1457
- * This method is used internally to determine if a scenario has already reached
1458
- * a conclusion (success or failure) but hasn't been finalized yet. Partial results
1459
- * are typically set by agents that make final decisions (like judge agents) and
1460
- * are later finalized with the complete message history.
1461
- *
1462
- * @returns True if a partial result exists, false otherwise
1463
- *
1464
- * @example
1465
- * ```typescript
1466
- * // This is typically used internally by the execution engine
1467
- * if (execution.hasResult()) {
1468
- * console.log('Scenario has reached a conclusion');
1469
- * }
1470
- * ```
1471
- */
1472
- hasResult(): boolean;
1473
- /**
1474
- * Sets a partial result for the scenario.
1475
- *
1476
- * This method is used internally to store intermediate results that may be
1477
- * finalized later with the complete message history. Partial results are typically
1478
- * created by agents that make final decisions (like judge agents) and contain
1479
- * the success/failure status, reasoning, and criteria evaluation, but not the
1480
- * complete message history.
1481
- *
1482
- * @param result - The partial result without the messages field. Should include
1483
- * success status, reasoning, and criteria evaluation.
1484
- *
1485
- * @example
1486
- * ```typescript
1487
- * // This is typically called internally by agents that make final decisions
1488
- * execution.setResult({
1489
- * success: true,
1490
- * reasoning: "Agent provided accurate weather information",
1491
- * metCriteria: ["Provides accurate weather data"],
1492
- * unmetCriteria: []
1493
- * });
1494
- * ```
1495
- */
1496
- setResult(result: Omit<ScenarioResult, "messages">): void;
1497
1682
  /**
1498
1683
  * Internal method to handle script step calls to agents.
1499
1684
  *
@@ -1506,7 +1691,7 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
1506
1691
  * - Progress to a new turn if no agent is available
1507
1692
  * - Execute the agent with the provided content or let it generate content
1508
1693
  * - Handle judgment requests for judge agents
1509
- * - Return a final result if the agent makes a decision
1694
+ * - Set the result if the agent makes a decision
1510
1695
  *
1511
1696
  * @param role - The role of the agent to call (USER, AGENT, or JUDGE)
1512
1697
  * @param content - Optional content to use instead of letting the agent generate it
@@ -1530,6 +1715,7 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
1530
1715
  * - Starts the first turn
1531
1716
  * - Records the start time for performance tracking
1532
1717
  * - Clears any pending messages
1718
+ * - Clears the result from any previous execution
1533
1719
  */
1534
1720
  private reset;
1535
1721
  private nextAgentForRole;
@@ -1554,7 +1740,7 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
1554
1740
  *
1555
1741
  * This method is called when the scenario execution reaches the maximum number
1556
1742
  * of turns without reaching a conclusion. It creates a failure result with
1557
- * appropriate reasoning and includes performance metrics.
1743
+ * appropriate reasoning and includes performance metrics, then sets it on this.result.
1558
1744
  *
1559
1745
  * The result includes:
1560
1746
  * - All messages from the conversation
@@ -1564,7 +1750,6 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
1564
1750
  * - Total execution time and agent response times
1565
1751
  *
1566
1752
  * @param errorMessage - Optional custom error message to use instead of the default
1567
- * @returns A ScenarioResult indicating failure due to reaching max turns
1568
1753
  */
1569
1754
  private reachedMaxTurns;
1570
1755
  private getJudgeAgent;
@@ -1671,12 +1856,25 @@ declare class ScenarioExecutionState implements ScenarioExecutionStateLike {
1671
1856
  * Adds a message to the conversation history.
1672
1857
  *
1673
1858
  * @param message - The message to add.
1859
+ * @param traceId - Optional trace ID to associate with the message.
1674
1860
  */
1675
- addMessage(message: CoreMessage): void;
1676
- lastMessage(): CoreMessage;
1677
- lastUserMessage(): CoreUserMessage;
1678
- lastAgentMessage(): CoreAssistantMessage;
1679
- lastToolCall(toolName: string): CoreToolMessage;
1861
+ addMessage(message: CoreMessage & {
1862
+ traceId?: string;
1863
+ }): void;
1864
+ lastMessage(): ai.ModelMessage & {
1865
+ id: string;
1866
+ traceId?: string;
1867
+ };
1868
+ lastUserMessage(): ai.UserModelMessage & {
1869
+ id: string;
1870
+ traceId?: string;
1871
+ };
1872
+ lastAgentMessage(): CoreAssistantMessage & {
1873
+ traceId?: string;
1874
+ };
1875
+ lastToolCall(toolName: string): CoreToolMessage & {
1876
+ traceId?: string;
1877
+ };
1680
1878
  hasToolCall(toolName: string): boolean;
1681
1879
  }
1682
1880
 
@@ -1850,4 +2048,4 @@ declare namespace script {
1850
2048
  type ScenarioApi = typeof agents & typeof domain & typeof execution & typeof runner & typeof script;
1851
2049
  declare const scenario: ScenarioApi;
1852
2050
 
1853
- export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, DEFAULT_MAX_TURNS, DEFAULT_TEMPERATURE, DEFAULT_VERBOSE, type FinishTestArgs, JudgeAgentAdapter, type JudgeAgentConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, type TestingAgentInferenceConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };
2051
+ export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type AudioResponseEvent, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, type FinishTestArgs, type InvokeLLMParams, type InvokeLLMResult, JudgeAgentAdapter, type JudgeAgentConfig, type JudgeResult, JudgeSpanCollector, JudgeSpanDigestFormatter, RealtimeAgentAdapter, type RealtimeAgentAdapterConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, judgeSpanCollector, judgeSpanDigestFormatter, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };