@langwatch/scenario 0.4.1 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -20,6 +20,18 @@ declare enum AgentRole {
20
20
  JUDGE = "Judge"
21
21
  }
22
22
  declare const allAgentRoles: readonly [AgentRole.USER, AgentRole.AGENT, AgentRole.JUDGE];
23
+ /**
24
+ * Encapsulates a request for the judge agent to evaluate the conversation.
25
+ *
26
+ * When present on AgentInput, signals the judge to produce a verdict.
27
+ * Optionally carries inline criteria that override the judge's own criteria.
28
+ */
29
+ interface JudgmentRequest {
30
+ /**
31
+ * Optional criteria to evaluate, overriding the judge agent's configured criteria.
32
+ */
33
+ criteria?: string[];
34
+ }
23
35
  /**
24
36
  * Input provided to an agent's `call` method.
25
37
  */
@@ -41,9 +53,9 @@ interface AgentInput {
41
53
  */
42
54
  requestedRole: AgentRole;
43
55
  /**
44
- * Whether a judgment is being requested in this turn.
56
+ * When set, requests the judge to produce a verdict, optionally with inline criteria.
45
57
  */
46
- judgmentRequest: boolean;
58
+ judgmentRequest?: JudgmentRequest;
47
59
  /**
48
60
  * The current state of the scenario execution.
49
61
  */
@@ -215,10 +227,12 @@ interface ScenarioExecutionLike {
215
227
  agent(content?: string | ModelMessage): Promise<void>;
216
228
  /**
217
229
  * Invokes the judge agent to evaluate the current state.
218
- * @param content Optional message to the judge.
230
+ * @param options Optional options with inline criteria to evaluate as a checkpoint.
219
231
  * @returns The result of the scenario if the judge makes a final decision.
220
232
  */
221
- judge(content?: string | ModelMessage): Promise<ScenarioResult | null>;
233
+ judge(options?: {
234
+ criteria?: string[];
235
+ }): Promise<ScenarioResult | null>;
222
236
  /**
223
237
  * Proceeds with the scenario automatically for a number of turns.
224
238
  * @param turns The number of turns to proceed. Defaults to running until the scenario ends.
@@ -364,6 +378,7 @@ declare const domain_DEFAULT_MAX_TURNS: typeof DEFAULT_MAX_TURNS;
364
378
  declare const domain_DEFAULT_VERBOSE: typeof DEFAULT_VERBOSE;
365
379
  type domain_JudgeAgentAdapter = JudgeAgentAdapter;
366
380
  declare const domain_JudgeAgentAdapter: typeof JudgeAgentAdapter;
381
+ type domain_JudgmentRequest = JudgmentRequest;
367
382
  type domain_ScenarioConfig = ScenarioConfig;
368
383
  type domain_ScenarioConfigFinal = ScenarioConfigFinal;
369
384
  type domain_ScenarioExecutionLike = ScenarioExecutionLike;
@@ -377,7 +392,7 @@ declare const domain_allAgentRoles: typeof allAgentRoles;
377
392
  declare const domain_defineConfig: typeof defineConfig;
378
393
  declare const domain_scenarioProjectConfigSchema: typeof scenarioProjectConfigSchema;
379
394
  declare namespace domain {
380
- export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
395
+ export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_JudgmentRequest as JudgmentRequest, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
381
396
  }
382
397
 
383
398
  /**
@@ -475,7 +490,7 @@ interface JudgeAgentConfig extends TestingAgentConfig {
475
490
  /**
476
491
  * The criteria that the judge will use to evaluate the conversation.
477
492
  */
478
- criteria: string[];
493
+ criteria?: string[];
479
494
  /**
480
495
  * Optional span collector for telemetry. Defaults to global singleton.
481
496
  */
@@ -554,7 +569,7 @@ declare class JudgeAgent extends JudgeAgentAdapter {
554
569
  * main();
555
570
  * ```
556
571
  */
557
- declare const judgeAgent: (cfg: JudgeAgentConfig) => JudgeAgent;
572
+ declare const judgeAgent: (cfg?: JudgeAgentConfig) => JudgeAgent;
558
573
 
559
574
  /**
560
575
  * Transforms OpenTelemetry spans into a complete plain-text digest for judge evaluation.
@@ -1334,6 +1349,8 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
1334
1349
  private currentTurnSpan?;
1335
1350
  /** Timestamp when execution started (for total time calculation) */
1336
1351
  private totalStartTime;
1352
+ /** Accumulated results from inline judge checkpoints */
1353
+ private checkpointResults;
1337
1354
  /** Event stream for monitoring scenario progress */
1338
1355
  private eventSubject;
1339
1356
  /**
@@ -1554,24 +1571,23 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
1554
1571
  *
1555
1572
  * This method is part of the ScenarioExecutionLike interface used by script steps.
1556
1573
  *
1557
- * @param content - Optional message to pass to the judge agent for additional context
1574
+ * @param options - Optional options with inline criteria to evaluate as a checkpoint.
1558
1575
  * @returns A promise that resolves with:
1559
1576
  * - ScenarioResult if the judge makes a final decision, or
1560
1577
  * - Null if the conversation should continue
1561
1578
  *
1562
1579
  * @example
1563
1580
  * ```typescript
1564
- * // Let judge evaluate current state
1581
+ * // Let judge evaluate with its configured criteria
1565
1582
  * const result = await execution.judge();
1566
- * if (result) {
1567
- * console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
1568
- * }
1569
1583
  *
1570
- * // Provide additional context to judge
1571
- * const result = await execution.judge("Please consider the user's satisfaction level");
1584
+ * // Evaluate inline criteria as a checkpoint
1585
+ * const result = await execution.judge({ criteria: ["Agent responded helpfully"] });
1572
1586
  * ```
1573
1587
  */
1574
- judge(content?: string | ModelMessage): Promise<ScenarioResult | null>;
1588
+ judge(options?: {
1589
+ criteria?: string[];
1590
+ }): Promise<ScenarioResult | null>;
1575
1591
  /**
1576
1592
  * Lets the scenario proceed automatically for a specified number of turns.
1577
1593
  *
@@ -1718,6 +1734,8 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
1718
1734
  * - Clears the result from any previous execution
1719
1735
  */
1720
1736
  private reset;
1737
+ /** Compiles all accumulated checkpoint results into aggregated met/unmet criteria. */
1738
+ private get compiledCheckpoints();
1721
1739
  private nextAgentForRole;
1722
1740
  /**
1723
1741
  * Starts a new turn in the scenario execution.
@@ -1980,15 +1998,20 @@ declare const agent: (content?: string | ModelMessage) => ScriptStep;
1980
1998
  /**
1981
1999
  * Invoke the judge agent to evaluate the current conversation state.
1982
2000
  *
1983
- * This function forces the judge agent to make a decision about whether
1984
- * the scenario should continue or end with a success/failure verdict.
1985
- * The judge will evaluate based on its configured criteria.
2001
+ * When criteria are provided inline, the judge evaluates only those criteria
2002
+ * as a checkpoint: if all pass, the scenario continues; if any fail, the
2003
+ * scenario fails immediately. This is the preferred way to pass criteria
2004
+ * when using scripts.
1986
2005
  *
1987
- * @param content Optional message content for the judge. Usually undefined to let
1988
- * the judge evaluate based on its criteria.
2006
+ * When no criteria are provided, the judge uses its own configured criteria
2007
+ * and returns a final verdict (success or failure), ending the scenario.
2008
+ *
2009
+ * @param options Optional options object with inline criteria to evaluate.
1989
2010
  * @returns A ScriptStep function that can be used in scenario scripts.
1990
2011
  */
1991
- declare const judge: (content?: string | ModelMessage) => ScriptStep;
2012
+ declare const judge: (options?: {
2013
+ criteria: string[];
2014
+ }) => ScriptStep;
1992
2015
  /**
1993
2016
  * Generate or specify a user message in the conversation.
1994
2017
  *
@@ -2048,4 +2071,4 @@ declare namespace script {
2048
2071
  type ScenarioApi = typeof agents & typeof domain & typeof execution & typeof runner & typeof script;
2049
2072
  declare const scenario: ScenarioApi;
2050
2073
 
2051
- export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type AudioResponseEvent, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, type FinishTestArgs, type InvokeLLMParams, type InvokeLLMResult, JudgeAgentAdapter, type JudgeAgentConfig, type JudgeResult, JudgeSpanCollector, JudgeSpanDigestFormatter, RealtimeAgentAdapter, type RealtimeAgentAdapterConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, judgeSpanCollector, judgeSpanDigestFormatter, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };
2074
+ export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type AudioResponseEvent, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, type FinishTestArgs, type InvokeLLMParams, type InvokeLLMResult, JudgeAgentAdapter, type JudgeAgentConfig, type JudgeResult, JudgeSpanCollector, JudgeSpanDigestFormatter, type JudgmentRequest, RealtimeAgentAdapter, type RealtimeAgentAdapterConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, judgeSpanCollector, judgeSpanDigestFormatter, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };
package/dist/index.d.ts CHANGED
@@ -20,6 +20,18 @@ declare enum AgentRole {
20
20
  JUDGE = "Judge"
21
21
  }
22
22
  declare const allAgentRoles: readonly [AgentRole.USER, AgentRole.AGENT, AgentRole.JUDGE];
23
+ /**
24
+ * Encapsulates a request for the judge agent to evaluate the conversation.
25
+ *
26
+ * When present on AgentInput, signals the judge to produce a verdict.
27
+ * Optionally carries inline criteria that override the judge's own criteria.
28
+ */
29
+ interface JudgmentRequest {
30
+ /**
31
+ * Optional criteria to evaluate, overriding the judge agent's configured criteria.
32
+ */
33
+ criteria?: string[];
34
+ }
23
35
  /**
24
36
  * Input provided to an agent's `call` method.
25
37
  */
@@ -41,9 +53,9 @@ interface AgentInput {
41
53
  */
42
54
  requestedRole: AgentRole;
43
55
  /**
44
- * Whether a judgment is being requested in this turn.
56
+ * When set, requests the judge to produce a verdict, optionally with inline criteria.
45
57
  */
46
- judgmentRequest: boolean;
58
+ judgmentRequest?: JudgmentRequest;
47
59
  /**
48
60
  * The current state of the scenario execution.
49
61
  */
@@ -215,10 +227,12 @@ interface ScenarioExecutionLike {
215
227
  agent(content?: string | ModelMessage): Promise<void>;
216
228
  /**
217
229
  * Invokes the judge agent to evaluate the current state.
218
- * @param content Optional message to the judge.
230
+ * @param options Optional options with inline criteria to evaluate as a checkpoint.
219
231
  * @returns The result of the scenario if the judge makes a final decision.
220
232
  */
221
- judge(content?: string | ModelMessage): Promise<ScenarioResult | null>;
233
+ judge(options?: {
234
+ criteria?: string[];
235
+ }): Promise<ScenarioResult | null>;
222
236
  /**
223
237
  * Proceeds with the scenario automatically for a number of turns.
224
238
  * @param turns The number of turns to proceed. Defaults to running until the scenario ends.
@@ -364,6 +378,7 @@ declare const domain_DEFAULT_MAX_TURNS: typeof DEFAULT_MAX_TURNS;
364
378
  declare const domain_DEFAULT_VERBOSE: typeof DEFAULT_VERBOSE;
365
379
  type domain_JudgeAgentAdapter = JudgeAgentAdapter;
366
380
  declare const domain_JudgeAgentAdapter: typeof JudgeAgentAdapter;
381
+ type domain_JudgmentRequest = JudgmentRequest;
367
382
  type domain_ScenarioConfig = ScenarioConfig;
368
383
  type domain_ScenarioConfigFinal = ScenarioConfigFinal;
369
384
  type domain_ScenarioExecutionLike = ScenarioExecutionLike;
@@ -377,7 +392,7 @@ declare const domain_allAgentRoles: typeof allAgentRoles;
377
392
  declare const domain_defineConfig: typeof defineConfig;
378
393
  declare const domain_scenarioProjectConfigSchema: typeof scenarioProjectConfigSchema;
379
394
  declare namespace domain {
380
- export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
395
+ export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_JudgmentRequest as JudgmentRequest, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
381
396
  }
382
397
 
383
398
  /**
@@ -475,7 +490,7 @@ interface JudgeAgentConfig extends TestingAgentConfig {
475
490
  /**
476
491
  * The criteria that the judge will use to evaluate the conversation.
477
492
  */
478
- criteria: string[];
493
+ criteria?: string[];
479
494
  /**
480
495
  * Optional span collector for telemetry. Defaults to global singleton.
481
496
  */
@@ -554,7 +569,7 @@ declare class JudgeAgent extends JudgeAgentAdapter {
554
569
  * main();
555
570
  * ```
556
571
  */
557
- declare const judgeAgent: (cfg: JudgeAgentConfig) => JudgeAgent;
572
+ declare const judgeAgent: (cfg?: JudgeAgentConfig) => JudgeAgent;
558
573
 
559
574
  /**
560
575
  * Transforms OpenTelemetry spans into a complete plain-text digest for judge evaluation.
@@ -1334,6 +1349,8 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
1334
1349
  private currentTurnSpan?;
1335
1350
  /** Timestamp when execution started (for total time calculation) */
1336
1351
  private totalStartTime;
1352
+ /** Accumulated results from inline judge checkpoints */
1353
+ private checkpointResults;
1337
1354
  /** Event stream for monitoring scenario progress */
1338
1355
  private eventSubject;
1339
1356
  /**
@@ -1554,24 +1571,23 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
1554
1571
  *
1555
1572
  * This method is part of the ScenarioExecutionLike interface used by script steps.
1556
1573
  *
1557
- * @param content - Optional message to pass to the judge agent for additional context
1574
+ * @param options - Optional options with inline criteria to evaluate as a checkpoint.
1558
1575
  * @returns A promise that resolves with:
1559
1576
  * - ScenarioResult if the judge makes a final decision, or
1560
1577
  * - Null if the conversation should continue
1561
1578
  *
1562
1579
  * @example
1563
1580
  * ```typescript
1564
- * // Let judge evaluate current state
1581
+ * // Let judge evaluate with its configured criteria
1565
1582
  * const result = await execution.judge();
1566
- * if (result) {
1567
- * console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
1568
- * }
1569
1583
  *
1570
- * // Provide additional context to judge
1571
- * const result = await execution.judge("Please consider the user's satisfaction level");
1584
+ * // Evaluate inline criteria as a checkpoint
1585
+ * const result = await execution.judge({ criteria: ["Agent responded helpfully"] });
1572
1586
  * ```
1573
1587
  */
1574
- judge(content?: string | ModelMessage): Promise<ScenarioResult | null>;
1588
+ judge(options?: {
1589
+ criteria?: string[];
1590
+ }): Promise<ScenarioResult | null>;
1575
1591
  /**
1576
1592
  * Lets the scenario proceed automatically for a specified number of turns.
1577
1593
  *
@@ -1718,6 +1734,8 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
1718
1734
  * - Clears the result from any previous execution
1719
1735
  */
1720
1736
  private reset;
1737
+ /** Compiles all accumulated checkpoint results into aggregated met/unmet criteria. */
1738
+ private get compiledCheckpoints();
1721
1739
  private nextAgentForRole;
1722
1740
  /**
1723
1741
  * Starts a new turn in the scenario execution.
@@ -1980,15 +1998,20 @@ declare const agent: (content?: string | ModelMessage) => ScriptStep;
1980
1998
  /**
1981
1999
  * Invoke the judge agent to evaluate the current conversation state.
1982
2000
  *
1983
- * This function forces the judge agent to make a decision about whether
1984
- * the scenario should continue or end with a success/failure verdict.
1985
- * The judge will evaluate based on its configured criteria.
2001
+ * When criteria are provided inline, the judge evaluates only those criteria
2002
+ * as a checkpoint: if all pass, the scenario continues; if any fail, the
2003
+ * scenario fails immediately. This is the preferred way to pass criteria
2004
+ * when using scripts.
1986
2005
  *
1987
- * @param content Optional message content for the judge. Usually undefined to let
1988
- * the judge evaluate based on its criteria.
2006
+ * When no criteria are provided, the judge uses its own configured criteria
2007
+ * and returns a final verdict (success or failure), ending the scenario.
2008
+ *
2009
+ * @param options Optional options object with inline criteria to evaluate.
1989
2010
  * @returns A ScriptStep function that can be used in scenario scripts.
1990
2011
  */
1991
- declare const judge: (content?: string | ModelMessage) => ScriptStep;
2012
+ declare const judge: (options?: {
2013
+ criteria: string[];
2014
+ }) => ScriptStep;
1992
2015
  /**
1993
2016
  * Generate or specify a user message in the conversation.
1994
2017
  *
@@ -2048,4 +2071,4 @@ declare namespace script {
2048
2071
  type ScenarioApi = typeof agents & typeof domain & typeof execution & typeof runner & typeof script;
2049
2072
  declare const scenario: ScenarioApi;
2050
2073
 
2051
- export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type AudioResponseEvent, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, type FinishTestArgs, type InvokeLLMParams, type InvokeLLMResult, JudgeAgentAdapter, type JudgeAgentConfig, type JudgeResult, JudgeSpanCollector, JudgeSpanDigestFormatter, RealtimeAgentAdapter, type RealtimeAgentAdapterConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, judgeSpanCollector, judgeSpanDigestFormatter, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };
2074
+ export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type AudioResponseEvent, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, type FinishTestArgs, type InvokeLLMParams, type InvokeLLMResult, JudgeAgentAdapter, type JudgeAgentConfig, type JudgeResult, JudgeSpanCollector, JudgeSpanDigestFormatter, type JudgmentRequest, RealtimeAgentAdapter, type RealtimeAgentAdapterConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, judgeSpanCollector, judgeSpanDigestFormatter, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };
package/dist/index.js CHANGED
@@ -486,52 +486,68 @@ var createLLMInvoker = (logger2) => {
486
486
  var toolMessageRole = "tool";
487
487
  var assistantMessageRole = "assistant";
488
488
  var userMessageRole = "user";
489
- var groupMessagesByToolBoundaries = (messages) => {
490
- const segments = [];
491
- let currentSegment = [];
492
- for (const message2 of messages) {
493
- currentSegment.push(message2);
494
- if (message2.role === toolMessageRole) {
495
- segments.push(currentSegment);
496
- currentSegment = [];
497
- }
498
- }
499
- if (currentSegment.length > 0) {
500
- segments.push(currentSegment);
489
+ var hasToolContent = (message2) => {
490
+ if (message2.role === toolMessageRole) return true;
491
+ if (!Array.isArray(message2.content)) return false;
492
+ return message2.content.some((part) => {
493
+ if (!part || typeof part !== "object") return false;
494
+ const partType = "type" in part ? part.type : void 0;
495
+ return partType === "tool-call" || partType === "tool-result";
496
+ });
497
+ };
498
+ var stringifyValue = (value) => {
499
+ if (typeof value === "string") return value;
500
+ if (value === void 0) return "undefined";
501
+ try {
502
+ const serialized = JSON.stringify(value);
503
+ return serialized === void 0 ? String(value) : serialized;
504
+ } catch {
505
+ return String(value);
501
506
  }
502
- return segments;
503
507
  };
504
- var segmentHasToolMessages = (segment) => {
505
- return segment.some((message2) => {
506
- if (message2.role === toolMessageRole) return true;
507
- if (message2.role === assistantMessageRole && Array.isArray(message2.content)) {
508
- return message2.content.some((part) => part.type === "tool-call");
509
- }
510
- return false;
508
+ var summarizeToolMessage = (message2) => {
509
+ if (message2.role === toolMessageRole && !Array.isArray(message2.content)) {
510
+ return `[Tool message: ${stringifyValue(message2.content)}]`;
511
+ }
512
+ if (message2.role === toolMessageRole) {
513
+ const toolResults = message2.content.filter((part) => part.type === "tool-result").map((part) => {
514
+ const contentPart = part;
515
+ const name = contentPart.toolName ?? "unknown tool";
516
+ const output = contentPart.output;
517
+ const value = output && typeof output === "object" && "value" in output && typeof output.value === "string" ? output.value : output ?? contentPart.result;
518
+ return `[Tool result from ${name}: ${stringifyValue(value)}]`;
519
+ });
520
+ return toolResults.length > 0 ? toolResults.join("\n") : null;
521
+ }
522
+ if (!Array.isArray(message2.content)) return null;
523
+ const toolCalls = message2.content.filter((part) => part.type === "tool-call").map((part) => {
524
+ const contentPart = part;
525
+ const name = contentPart.toolName ?? "unknown tool";
526
+ return `[Called tool ${name} with: ${stringifyValue(contentPart.input)}]`;
511
527
  });
528
+ return toolCalls.length > 0 ? toolCalls.join("\n") : null;
512
529
  };
513
- var reverseSegmentRoles = (segment) => {
514
- return segment.map((message2) => {
515
- const hasStringContent = typeof message2.content === "string";
516
- if (!hasStringContent) return message2;
517
- const roleMap = {
518
- [userMessageRole]: assistantMessageRole,
519
- [assistantMessageRole]: userMessageRole
520
- };
530
+ var messageRoleReversal = (messages) => {
531
+ const roleMap = {
532
+ [userMessageRole]: assistantMessageRole,
533
+ [assistantMessageRole]: userMessageRole
534
+ };
535
+ return messages.map((message2) => {
536
+ if (hasToolContent(message2)) {
537
+ const summary = summarizeToolMessage(message2);
538
+ if (!summary) return null;
539
+ return {
540
+ role: userMessageRole,
541
+ content: summary
542
+ };
543
+ }
521
544
  const newRole = roleMap[message2.role];
522
545
  if (!newRole) return message2;
523
546
  return {
524
- role: newRole,
525
- content: message2.content
547
+ ...message2,
548
+ role: newRole
526
549
  };
527
- });
528
- };
529
- var messageRoleReversal = (messages) => {
530
- const segments = groupMessagesByToolBoundaries(messages);
531
- const processedSegments = segments.map(
532
- (segment) => segmentHasToolMessages(segment) ? segment : reverseSegmentRoles(segment)
533
- );
534
- return processedSegments.flat();
550
+ }).filter((message2) => message2 !== null);
535
551
  };
536
552
  var criterionToParamName = (criterion) => {
537
553
  return criterion.replace(/"/g, "").replace(/[^a-zA-Z0-9]/g, "_").replace(/ /g, "_").toLowerCase().substring(0, 70);
@@ -893,7 +909,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
893
909
  constructor(cfg) {
894
910
  super();
895
911
  this.cfg = cfg;
896
- this.criteria = cfg.criteria;
912
+ this.criteria = cfg.criteria ?? [];
897
913
  this.spanCollector = cfg.spanCollector ?? judgeSpanCollector;
898
914
  }
899
915
  logger = new Logger("JudgeAgent");
@@ -905,7 +921,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
905
921
  */
906
922
  invokeLLM = createLLMInvoker(this.logger);
907
923
  async call(input) {
908
- var _a, _b, _c;
924
+ var _a, _b, _c, _d;
925
+ const criteria = ((_a = input.judgmentRequest) == null ? void 0 : _a.criteria) ?? this.criteria;
909
926
  this.logger.debug("call() invoked", {
910
927
  threadId: input.threadId,
911
928
  currentTurn: input.scenarioState.currentTurn,
@@ -924,7 +941,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
924
941
  </opentelemetry_traces>
925
942
  `;
926
943
  const cfg = this.cfg;
927
- const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(cfg.criteria, input.scenarioConfig.description);
944
+ const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(criteria, input.scenarioConfig.description);
928
945
  const messages = [
929
946
  { role: "system", content: systemPrompt },
930
947
  { role: "user", content: contentForJudge }
@@ -937,10 +954,10 @@ var JudgeAgent = class extends JudgeAgentAdapter {
937
954
  });
938
955
  const tools = {
939
956
  continue_test: buildContinueTestTool(),
940
- finish_test: buildFinishTestTool(cfg.criteria)
957
+ finish_test: buildFinishTestTool(criteria)
941
958
  };
942
- const enforceJudgement = input.judgmentRequest;
943
- const hasCriteria = cfg.criteria.length && cfg.criteria.length > 0;
959
+ const enforceJudgement = input.judgmentRequest != null;
960
+ const hasCriteria = criteria.length && criteria.length > 0;
944
961
  if (enforceJudgement && !hasCriteria) {
945
962
  return {
946
963
  success: false,
@@ -965,26 +982,26 @@ var JudgeAgent = class extends JudgeAgentAdapter {
965
982
  toolChoice
966
983
  });
967
984
  this.logger.debug("LLM response received", {
968
- toolCallCount: ((_a = completion.toolCalls) == null ? void 0 : _a.length) ?? 0,
969
- toolCalls: (_b = completion.toolCalls) == null ? void 0 : _b.map((tc) => ({
985
+ toolCallCount: ((_b = completion.toolCalls) == null ? void 0 : _b.length) ?? 0,
986
+ toolCalls: (_c = completion.toolCalls) == null ? void 0 : _c.map((tc) => ({
970
987
  toolName: tc.toolName,
971
988
  args: tc.input
972
989
  }))
973
990
  });
974
991
  let args;
975
- if ((_c = completion.toolCalls) == null ? void 0 : _c.length) {
992
+ if ((_d = completion.toolCalls) == null ? void 0 : _d.length) {
976
993
  const toolCall = completion.toolCalls[0];
977
994
  switch (toolCall.toolName) {
978
995
  case "finish_test": {
979
996
  args = toolCall.input;
980
997
  const verdict = args.verdict || "inconclusive";
981
998
  const reasoning = args.reasoning || "No reasoning provided";
982
- const criteria = args.criteria || {};
983
- const criteriaValues = Object.values(criteria);
984
- const metCriteria = cfg.criteria.filter(
999
+ const criteriaArgs = args.criteria || {};
1000
+ const criteriaValues = Object.values(criteriaArgs);
1001
+ const metCriteria = criteria.filter(
985
1002
  (_, i) => criteriaValues[i] === "true"
986
1003
  );
987
- const unmetCriteria = cfg.criteria.filter(
1004
+ const unmetCriteria = criteria.filter(
988
1005
  (_, i) => criteriaValues[i] !== "true"
989
1006
  );
990
1007
  const result = {
@@ -1004,7 +1021,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1004
1021
  success: false,
1005
1022
  reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
1006
1023
  metCriteria: [],
1007
- unmetCriteria: cfg.criteria
1024
+ unmetCriteria: criteria
1008
1025
  };
1009
1026
  }
1010
1027
  }
@@ -1012,7 +1029,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1012
1029
  success: false,
1013
1030
  reasoning: `JudgeAgent: No tool call found in LLM output`,
1014
1031
  metCriteria: [],
1015
- unmetCriteria: cfg.criteria
1032
+ unmetCriteria: criteria
1016
1033
  };
1017
1034
  }
1018
1035
  getOpenTelemetryTracesDigest(threadId) {
@@ -1022,7 +1039,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1022
1039
  }
1023
1040
  };
1024
1041
  var judgeAgent = (cfg) => {
1025
- return new JudgeAgent(cfg);
1042
+ return new JudgeAgent(cfg ?? {});
1026
1043
  };
1027
1044
 
1028
1045
  // src/agents/user-simulator-agent.ts
@@ -2518,6 +2535,8 @@ var ScenarioExecution = class {
2518
2535
  currentTurnSpan;
2519
2536
  /** Timestamp when execution started (for total time calculation) */
2520
2537
  totalStartTime = 0;
2538
+ /** Accumulated results from inline judge checkpoints */
2539
+ checkpointResults = [];
2521
2540
  /** Event stream for monitoring scenario progress */
2522
2541
  eventSubject = new import_rxjs2.Subject();
2523
2542
  /**
@@ -2595,6 +2614,7 @@ var ScenarioExecution = class {
2595
2614
  totalTime: this.totalTime,
2596
2615
  agentTime: totalAgentTime
2597
2616
  };
2617
+ return this._result;
2598
2618
  this.logger.debug(`[${this.config.id}] Result set`, {
2599
2619
  success: result.success,
2600
2620
  reasoning: result.reasoning,
@@ -2655,6 +2675,8 @@ var ScenarioExecution = class {
2655
2675
  const scriptStep = this.config.script[i];
2656
2676
  await this.executeScriptStep(scriptStep, i);
2657
2677
  if (this.result) {
2678
+ const cp = this.compiledCheckpoints;
2679
+ this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
2658
2680
  this.emitRunFinished({
2659
2681
  scenarioRunId,
2660
2682
  status: this.result.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
@@ -2663,7 +2685,22 @@ var ScenarioExecution = class {
2663
2685
  return this.result;
2664
2686
  }
2665
2687
  }
2666
- this.reachedMaxTurns(
2688
+ if (this.checkpointResults.length > 0) {
2689
+ const cp = this.compiledCheckpoints;
2690
+ const result2 = this.setResult({
2691
+ success: cp.unmetCriteria.length === 0,
2692
+ reasoning: "All inline criteria checkpoints passed",
2693
+ metCriteria: cp.metCriteria,
2694
+ unmetCriteria: cp.unmetCriteria
2695
+ });
2696
+ this.emitRunFinished({
2697
+ scenarioRunId,
2698
+ status: result2.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
2699
+ result: result2
2700
+ });
2701
+ return result2;
2702
+ }
2703
+ const result = this.reachedMaxTurns(
2667
2704
  [
2668
2705
  "Reached end of script without conclusion, add one of the following to the end of the script:",
2669
2706
  "- `Scenario.proceed()` to let the simulation continue to play out",
@@ -2671,11 +2708,11 @@ var ScenarioExecution = class {
2671
2708
  "- `Scenario.succeed()` or `Scenario.fail()` to end the test with an explicit result"
2672
2709
  ].join("\n")
2673
2710
  );
2674
- this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */ });
2675
- return this.result;
2711
+ this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */, result });
2712
+ return result;
2676
2713
  } catch (error) {
2677
2714
  const errorInfo = extractErrorInfo(error);
2678
- this.setResult({
2715
+ const result = this.setResult({
2679
2716
  success: false,
2680
2717
  reasoning: `Scenario failed with error: ${errorInfo.message}`,
2681
2718
  metCriteria: [],
@@ -2685,7 +2722,7 @@ var ScenarioExecution = class {
2685
2722
  this.emitRunFinished({
2686
2723
  scenarioRunId,
2687
2724
  status: "ERROR" /* ERROR */,
2688
- result: this.result
2725
+ result
2689
2726
  });
2690
2727
  throw error;
2691
2728
  } finally {
@@ -2789,7 +2826,7 @@ var ScenarioExecution = class {
2789
2826
  * @param judgmentRequest - Whether this is a judgment request (for judge agents)
2790
2827
  * @throws Error if the agent call fails
2791
2828
  */
2792
- async callAgent(idx, role, judgmentRequest = false) {
2829
+ async callAgent(idx, role, judgmentRequest) {
2793
2830
  var _a;
2794
2831
  const agent2 = this.agents[idx];
2795
2832
  const agentName = agent2.name ?? agent2.constructor.name;
@@ -2980,25 +3017,26 @@ var ScenarioExecution = class {
2980
3017
  *
2981
3018
  * This method is part of the ScenarioExecutionLike interface used by script steps.
2982
3019
  *
2983
- * @param content - Optional message to pass to the judge agent for additional context
3020
+ * @param options - Optional options with inline criteria to evaluate as a checkpoint.
2984
3021
  * @returns A promise that resolves with:
2985
3022
  * - ScenarioResult if the judge makes a final decision, or
2986
3023
  * - Null if the conversation should continue
2987
3024
  *
2988
3025
  * @example
2989
3026
  * ```typescript
2990
- * // Let judge evaluate current state
3027
+ * // Let judge evaluate with its configured criteria
2991
3028
  * const result = await execution.judge();
2992
- * if (result) {
2993
- * console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
2994
- * }
2995
3029
  *
2996
- * // Provide additional context to judge
2997
- * const result = await execution.judge("Please consider the user's satisfaction level");
3030
+ * // Evaluate inline criteria as a checkpoint
3031
+ * const result = await execution.judge({ criteria: ["Agent responded helpfully"] });
2998
3032
  * ```
2999
3033
  */
3000
- async judge(content) {
3001
- return await this.scriptCallAgent("Judge" /* JUDGE */, content, true);
3034
+ async judge(options) {
3035
+ return await this.scriptCallAgent(
3036
+ "Judge" /* JUDGE */,
3037
+ void 0,
3038
+ { criteria: options == null ? void 0 : options.criteria }
3039
+ );
3002
3040
  }
3003
3041
  /**
3004
3042
  * Lets the scenario proceed automatically for a specified number of turns.
@@ -3083,13 +3121,12 @@ var ScenarioExecution = class {
3083
3121
  * ```
3084
3122
  */
3085
3123
  async succeed(reasoning) {
3086
- this.setResult({
3124
+ return this.setResult({
3087
3125
  success: true,
3088
3126
  reasoning: reasoning || "Scenario marked as successful with Scenario.succeed()",
3089
3127
  metCriteria: [],
3090
3128
  unmetCriteria: []
3091
3129
  });
3092
- return this.result;
3093
3130
  }
3094
3131
  /**
3095
3132
  * Immediately ends the scenario with a failure verdict.
@@ -3115,13 +3152,12 @@ var ScenarioExecution = class {
3115
3152
  * ```
3116
3153
  */
3117
3154
  async fail(reasoning) {
3118
- this.setResult({
3155
+ return this.setResult({
3119
3156
  success: false,
3120
3157
  reasoning: reasoning || "Scenario marked as failed with Scenario.fail()",
3121
3158
  metCriteria: [],
3122
3159
  unmetCriteria: []
3123
3160
  });
3124
- return this.result;
3125
3161
  }
3126
3162
  /**
3127
3163
  * Adds execution time for a specific agent to the performance tracking.
@@ -3165,15 +3201,14 @@ var ScenarioExecution = class {
3165
3201
  * decision, or null if the conversation should continue
3166
3202
  * @throws Error if no agent is found for the specified role
3167
3203
  */
3168
- async scriptCallAgent(role, content, judgmentRequest = false) {
3204
+ async scriptCallAgent(role, content, judgmentRequest) {
3169
3205
  this.logger.debug(`[${this.config.id}] scriptCallAgent`, {
3170
3206
  role,
3171
3207
  hasContent: content !== void 0,
3172
- judgmentRequest
3208
+ judgmentRequest: judgmentRequest != null,
3209
+ hasInlineCriteria: (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null
3173
3210
  });
3174
3211
  this.consumeUntilRole(role);
3175
- let index = -1;
3176
- let agent2 = null;
3177
3212
  let nextAgent = this.getNextAgentForRole(role);
3178
3213
  if (!nextAgent) {
3179
3214
  this.newTurn();
@@ -3203,8 +3238,8 @@ var ScenarioExecution = class {
3203
3238
  `Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
3204
3239
  );
3205
3240
  }
3206
- index = nextAgent.index;
3207
- agent2 = nextAgent.agent;
3241
+ const index = nextAgent.index;
3242
+ const agent2 = nextAgent.agent;
3208
3243
  this.removePendingAgent(agent2);
3209
3244
  if (content) {
3210
3245
  const message2 = typeof content === "string" ? {
@@ -3216,6 +3251,25 @@ var ScenarioExecution = class {
3216
3251
  return null;
3217
3252
  }
3218
3253
  await this.callAgent(index, role, judgmentRequest);
3254
+ if (this.result && (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null) {
3255
+ this.checkpointResults.push({
3256
+ metCriteria: this.result.metCriteria,
3257
+ unmetCriteria: this.result.unmetCriteria
3258
+ });
3259
+ if (this.result.success) {
3260
+ this._result = void 0;
3261
+ return null;
3262
+ } else {
3263
+ const cp = this.compiledCheckpoints;
3264
+ this.result.metCriteria = cp.metCriteria;
3265
+ this.result.unmetCriteria = cp.unmetCriteria;
3266
+ return this.result;
3267
+ }
3268
+ }
3269
+ if (this.result) {
3270
+ const cp = this.compiledCheckpoints;
3271
+ this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
3272
+ }
3219
3273
  return this.result ?? null;
3220
3274
  }
3221
3275
  /**
@@ -3248,11 +3302,22 @@ var ScenarioExecution = class {
3248
3302
  this.totalStartTime = Date.now();
3249
3303
  this.pendingMessages.clear();
3250
3304
  this._result = void 0;
3305
+ this.checkpointResults = [];
3251
3306
  this.logger.debug(`[${this.config.id}] Reset complete`, {
3252
3307
  threadId: this.state.threadId,
3253
3308
  agentCount: this.agents.length
3254
3309
  });
3255
3310
  }
3311
+ /** Compiles all accumulated checkpoint results into aggregated met/unmet criteria. */
3312
+ get compiledCheckpoints() {
3313
+ const metCriteria = [];
3314
+ const unmetCriteria = [];
3315
+ for (const cp of this.checkpointResults) {
3316
+ metCriteria.push(...cp.metCriteria);
3317
+ unmetCriteria.push(...cp.unmetCriteria);
3318
+ }
3319
+ return { metCriteria, unmetCriteria };
3320
+ }
3256
3321
  nextAgentForRole(role) {
3257
3322
  for (const agent2 of this.agents) {
3258
3323
  if (agent2.role === role && this.pendingAgentsOnTurn.has(agent2) && this.pendingRolesOnTurn.includes(role)) {
@@ -3349,7 +3414,7 @@ var ScenarioExecution = class {
3349
3414
  */
3350
3415
  reachedMaxTurns(errorMessage) {
3351
3416
  var _a;
3352
- this.setResult({
3417
+ return this.setResult({
3353
3418
  success: false,
3354
3419
  reasoning: errorMessage || `Reached maximum turns (${this.config.maxTurns || 10}) without conclusion`,
3355
3420
  metCriteria: [],
@@ -3850,9 +3915,9 @@ var message = (message2) => {
3850
3915
  var agent = (content) => {
3851
3916
  return (_state, executor) => executor.agent(content);
3852
3917
  };
3853
- var judge = (content) => {
3918
+ var judge = (options) => {
3854
3919
  return async (_state, executor) => {
3855
- await executor.judge(content);
3920
+ await executor.judge(options);
3856
3921
  };
3857
3922
  };
3858
3923
  var user = (content) => {
package/dist/index.mjs CHANGED
@@ -428,52 +428,68 @@ var createLLMInvoker = (logger2) => {
428
428
  var toolMessageRole = "tool";
429
429
  var assistantMessageRole = "assistant";
430
430
  var userMessageRole = "user";
431
- var groupMessagesByToolBoundaries = (messages) => {
432
- const segments = [];
433
- let currentSegment = [];
434
- for (const message2 of messages) {
435
- currentSegment.push(message2);
436
- if (message2.role === toolMessageRole) {
437
- segments.push(currentSegment);
438
- currentSegment = [];
439
- }
440
- }
441
- if (currentSegment.length > 0) {
442
- segments.push(currentSegment);
431
+ var hasToolContent = (message2) => {
432
+ if (message2.role === toolMessageRole) return true;
433
+ if (!Array.isArray(message2.content)) return false;
434
+ return message2.content.some((part) => {
435
+ if (!part || typeof part !== "object") return false;
436
+ const partType = "type" in part ? part.type : void 0;
437
+ return partType === "tool-call" || partType === "tool-result";
438
+ });
439
+ };
440
+ var stringifyValue = (value) => {
441
+ if (typeof value === "string") return value;
442
+ if (value === void 0) return "undefined";
443
+ try {
444
+ const serialized = JSON.stringify(value);
445
+ return serialized === void 0 ? String(value) : serialized;
446
+ } catch {
447
+ return String(value);
443
448
  }
444
- return segments;
445
449
  };
446
- var segmentHasToolMessages = (segment) => {
447
- return segment.some((message2) => {
448
- if (message2.role === toolMessageRole) return true;
449
- if (message2.role === assistantMessageRole && Array.isArray(message2.content)) {
450
- return message2.content.some((part) => part.type === "tool-call");
451
- }
452
- return false;
450
+ var summarizeToolMessage = (message2) => {
451
+ if (message2.role === toolMessageRole && !Array.isArray(message2.content)) {
452
+ return `[Tool message: ${stringifyValue(message2.content)}]`;
453
+ }
454
+ if (message2.role === toolMessageRole) {
455
+ const toolResults = message2.content.filter((part) => part.type === "tool-result").map((part) => {
456
+ const contentPart = part;
457
+ const name = contentPart.toolName ?? "unknown tool";
458
+ const output = contentPart.output;
459
+ const value = output && typeof output === "object" && "value" in output && typeof output.value === "string" ? output.value : output ?? contentPart.result;
460
+ return `[Tool result from ${name}: ${stringifyValue(value)}]`;
461
+ });
462
+ return toolResults.length > 0 ? toolResults.join("\n") : null;
463
+ }
464
+ if (!Array.isArray(message2.content)) return null;
465
+ const toolCalls = message2.content.filter((part) => part.type === "tool-call").map((part) => {
466
+ const contentPart = part;
467
+ const name = contentPart.toolName ?? "unknown tool";
468
+ return `[Called tool ${name} with: ${stringifyValue(contentPart.input)}]`;
453
469
  });
470
+ return toolCalls.length > 0 ? toolCalls.join("\n") : null;
454
471
  };
455
- var reverseSegmentRoles = (segment) => {
456
- return segment.map((message2) => {
457
- const hasStringContent = typeof message2.content === "string";
458
- if (!hasStringContent) return message2;
459
- const roleMap = {
460
- [userMessageRole]: assistantMessageRole,
461
- [assistantMessageRole]: userMessageRole
462
- };
472
+ var messageRoleReversal = (messages) => {
473
+ const roleMap = {
474
+ [userMessageRole]: assistantMessageRole,
475
+ [assistantMessageRole]: userMessageRole
476
+ };
477
+ return messages.map((message2) => {
478
+ if (hasToolContent(message2)) {
479
+ const summary = summarizeToolMessage(message2);
480
+ if (!summary) return null;
481
+ return {
482
+ role: userMessageRole,
483
+ content: summary
484
+ };
485
+ }
463
486
  const newRole = roleMap[message2.role];
464
487
  if (!newRole) return message2;
465
488
  return {
466
- role: newRole,
467
- content: message2.content
489
+ ...message2,
490
+ role: newRole
468
491
  };
469
- });
470
- };
471
- var messageRoleReversal = (messages) => {
472
- const segments = groupMessagesByToolBoundaries(messages);
473
- const processedSegments = segments.map(
474
- (segment) => segmentHasToolMessages(segment) ? segment : reverseSegmentRoles(segment)
475
- );
476
- return processedSegments.flat();
492
+ }).filter((message2) => message2 !== null);
477
493
  };
478
494
  var criterionToParamName = (criterion) => {
479
495
  return criterion.replace(/"/g, "").replace(/[^a-zA-Z0-9]/g, "_").replace(/ /g, "_").toLowerCase().substring(0, 70);
@@ -835,7 +851,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
835
851
  constructor(cfg) {
836
852
  super();
837
853
  this.cfg = cfg;
838
- this.criteria = cfg.criteria;
854
+ this.criteria = cfg.criteria ?? [];
839
855
  this.spanCollector = cfg.spanCollector ?? judgeSpanCollector;
840
856
  }
841
857
  logger = new Logger("JudgeAgent");
@@ -847,7 +863,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
847
863
  */
848
864
  invokeLLM = createLLMInvoker(this.logger);
849
865
  async call(input) {
850
- var _a, _b, _c;
866
+ var _a, _b, _c, _d;
867
+ const criteria = ((_a = input.judgmentRequest) == null ? void 0 : _a.criteria) ?? this.criteria;
851
868
  this.logger.debug("call() invoked", {
852
869
  threadId: input.threadId,
853
870
  currentTurn: input.scenarioState.currentTurn,
@@ -866,7 +883,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
866
883
  </opentelemetry_traces>
867
884
  `;
868
885
  const cfg = this.cfg;
869
- const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(cfg.criteria, input.scenarioConfig.description);
886
+ const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(criteria, input.scenarioConfig.description);
870
887
  const messages = [
871
888
  { role: "system", content: systemPrompt },
872
889
  { role: "user", content: contentForJudge }
@@ -879,10 +896,10 @@ var JudgeAgent = class extends JudgeAgentAdapter {
879
896
  });
880
897
  const tools = {
881
898
  continue_test: buildContinueTestTool(),
882
- finish_test: buildFinishTestTool(cfg.criteria)
899
+ finish_test: buildFinishTestTool(criteria)
883
900
  };
884
- const enforceJudgement = input.judgmentRequest;
885
- const hasCriteria = cfg.criteria.length && cfg.criteria.length > 0;
901
+ const enforceJudgement = input.judgmentRequest != null;
902
+ const hasCriteria = criteria.length && criteria.length > 0;
886
903
  if (enforceJudgement && !hasCriteria) {
887
904
  return {
888
905
  success: false,
@@ -907,26 +924,26 @@ var JudgeAgent = class extends JudgeAgentAdapter {
907
924
  toolChoice
908
925
  });
909
926
  this.logger.debug("LLM response received", {
910
- toolCallCount: ((_a = completion.toolCalls) == null ? void 0 : _a.length) ?? 0,
911
- toolCalls: (_b = completion.toolCalls) == null ? void 0 : _b.map((tc) => ({
927
+ toolCallCount: ((_b = completion.toolCalls) == null ? void 0 : _b.length) ?? 0,
928
+ toolCalls: (_c = completion.toolCalls) == null ? void 0 : _c.map((tc) => ({
912
929
  toolName: tc.toolName,
913
930
  args: tc.input
914
931
  }))
915
932
  });
916
933
  let args;
917
- if ((_c = completion.toolCalls) == null ? void 0 : _c.length) {
934
+ if ((_d = completion.toolCalls) == null ? void 0 : _d.length) {
918
935
  const toolCall = completion.toolCalls[0];
919
936
  switch (toolCall.toolName) {
920
937
  case "finish_test": {
921
938
  args = toolCall.input;
922
939
  const verdict = args.verdict || "inconclusive";
923
940
  const reasoning = args.reasoning || "No reasoning provided";
924
- const criteria = args.criteria || {};
925
- const criteriaValues = Object.values(criteria);
926
- const metCriteria = cfg.criteria.filter(
941
+ const criteriaArgs = args.criteria || {};
942
+ const criteriaValues = Object.values(criteriaArgs);
943
+ const metCriteria = criteria.filter(
927
944
  (_, i) => criteriaValues[i] === "true"
928
945
  );
929
- const unmetCriteria = cfg.criteria.filter(
946
+ const unmetCriteria = criteria.filter(
930
947
  (_, i) => criteriaValues[i] !== "true"
931
948
  );
932
949
  const result = {
@@ -946,7 +963,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
946
963
  success: false,
947
964
  reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
948
965
  metCriteria: [],
949
- unmetCriteria: cfg.criteria
966
+ unmetCriteria: criteria
950
967
  };
951
968
  }
952
969
  }
@@ -954,7 +971,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
954
971
  success: false,
955
972
  reasoning: `JudgeAgent: No tool call found in LLM output`,
956
973
  metCriteria: [],
957
- unmetCriteria: cfg.criteria
974
+ unmetCriteria: criteria
958
975
  };
959
976
  }
960
977
  getOpenTelemetryTracesDigest(threadId) {
@@ -964,7 +981,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
964
981
  }
965
982
  };
966
983
  var judgeAgent = (cfg) => {
967
- return new JudgeAgent(cfg);
984
+ return new JudgeAgent(cfg ?? {});
968
985
  };
969
986
 
970
987
  // src/agents/user-simulator-agent.ts
@@ -2460,6 +2477,8 @@ var ScenarioExecution = class {
2460
2477
  currentTurnSpan;
2461
2478
  /** Timestamp when execution started (for total time calculation) */
2462
2479
  totalStartTime = 0;
2480
+ /** Accumulated results from inline judge checkpoints */
2481
+ checkpointResults = [];
2463
2482
  /** Event stream for monitoring scenario progress */
2464
2483
  eventSubject = new Subject2();
2465
2484
  /**
@@ -2537,6 +2556,7 @@ var ScenarioExecution = class {
2537
2556
  totalTime: this.totalTime,
2538
2557
  agentTime: totalAgentTime
2539
2558
  };
2559
+ return this._result;
2540
2560
  this.logger.debug(`[${this.config.id}] Result set`, {
2541
2561
  success: result.success,
2542
2562
  reasoning: result.reasoning,
@@ -2597,6 +2617,8 @@ var ScenarioExecution = class {
2597
2617
  const scriptStep = this.config.script[i];
2598
2618
  await this.executeScriptStep(scriptStep, i);
2599
2619
  if (this.result) {
2620
+ const cp = this.compiledCheckpoints;
2621
+ this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
2600
2622
  this.emitRunFinished({
2601
2623
  scenarioRunId,
2602
2624
  status: this.result.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
@@ -2605,7 +2627,22 @@ var ScenarioExecution = class {
2605
2627
  return this.result;
2606
2628
  }
2607
2629
  }
2608
- this.reachedMaxTurns(
2630
+ if (this.checkpointResults.length > 0) {
2631
+ const cp = this.compiledCheckpoints;
2632
+ const result2 = this.setResult({
2633
+ success: cp.unmetCriteria.length === 0,
2634
+ reasoning: "All inline criteria checkpoints passed",
2635
+ metCriteria: cp.metCriteria,
2636
+ unmetCriteria: cp.unmetCriteria
2637
+ });
2638
+ this.emitRunFinished({
2639
+ scenarioRunId,
2640
+ status: result2.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
2641
+ result: result2
2642
+ });
2643
+ return result2;
2644
+ }
2645
+ const result = this.reachedMaxTurns(
2609
2646
  [
2610
2647
  "Reached end of script without conclusion, add one of the following to the end of the script:",
2611
2648
  "- `Scenario.proceed()` to let the simulation continue to play out",
@@ -2613,11 +2650,11 @@ var ScenarioExecution = class {
2613
2650
  "- `Scenario.succeed()` or `Scenario.fail()` to end the test with an explicit result"
2614
2651
  ].join("\n")
2615
2652
  );
2616
- this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */ });
2617
- return this.result;
2653
+ this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */, result });
2654
+ return result;
2618
2655
  } catch (error) {
2619
2656
  const errorInfo = extractErrorInfo(error);
2620
- this.setResult({
2657
+ const result = this.setResult({
2621
2658
  success: false,
2622
2659
  reasoning: `Scenario failed with error: ${errorInfo.message}`,
2623
2660
  metCriteria: [],
@@ -2627,7 +2664,7 @@ var ScenarioExecution = class {
2627
2664
  this.emitRunFinished({
2628
2665
  scenarioRunId,
2629
2666
  status: "ERROR" /* ERROR */,
2630
- result: this.result
2667
+ result
2631
2668
  });
2632
2669
  throw error;
2633
2670
  } finally {
@@ -2731,7 +2768,7 @@ var ScenarioExecution = class {
2731
2768
  * @param judgmentRequest - Whether this is a judgment request (for judge agents)
2732
2769
  * @throws Error if the agent call fails
2733
2770
  */
2734
- async callAgent(idx, role, judgmentRequest = false) {
2771
+ async callAgent(idx, role, judgmentRequest) {
2735
2772
  var _a;
2736
2773
  const agent2 = this.agents[idx];
2737
2774
  const agentName = agent2.name ?? agent2.constructor.name;
@@ -2922,25 +2959,26 @@ var ScenarioExecution = class {
2922
2959
  *
2923
2960
  * This method is part of the ScenarioExecutionLike interface used by script steps.
2924
2961
  *
2925
- * @param content - Optional message to pass to the judge agent for additional context
2962
+ * @param options - Optional options with inline criteria to evaluate as a checkpoint.
2926
2963
  * @returns A promise that resolves with:
2927
2964
  * - ScenarioResult if the judge makes a final decision, or
2928
2965
  * - Null if the conversation should continue
2929
2966
  *
2930
2967
  * @example
2931
2968
  * ```typescript
2932
- * // Let judge evaluate current state
2969
+ * // Let judge evaluate with its configured criteria
2933
2970
  * const result = await execution.judge();
2934
- * if (result) {
2935
- * console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
2936
- * }
2937
2971
  *
2938
- * // Provide additional context to judge
2939
- * const result = await execution.judge("Please consider the user's satisfaction level");
2972
+ * // Evaluate inline criteria as a checkpoint
2973
+ * const result = await execution.judge({ criteria: ["Agent responded helpfully"] });
2940
2974
  * ```
2941
2975
  */
2942
- async judge(content) {
2943
- return await this.scriptCallAgent("Judge" /* JUDGE */, content, true);
2976
+ async judge(options) {
2977
+ return await this.scriptCallAgent(
2978
+ "Judge" /* JUDGE */,
2979
+ void 0,
2980
+ { criteria: options == null ? void 0 : options.criteria }
2981
+ );
2944
2982
  }
2945
2983
  /**
2946
2984
  * Lets the scenario proceed automatically for a specified number of turns.
@@ -3025,13 +3063,12 @@ var ScenarioExecution = class {
3025
3063
  * ```
3026
3064
  */
3027
3065
  async succeed(reasoning) {
3028
- this.setResult({
3066
+ return this.setResult({
3029
3067
  success: true,
3030
3068
  reasoning: reasoning || "Scenario marked as successful with Scenario.succeed()",
3031
3069
  metCriteria: [],
3032
3070
  unmetCriteria: []
3033
3071
  });
3034
- return this.result;
3035
3072
  }
3036
3073
  /**
3037
3074
  * Immediately ends the scenario with a failure verdict.
@@ -3057,13 +3094,12 @@ var ScenarioExecution = class {
3057
3094
  * ```
3058
3095
  */
3059
3096
  async fail(reasoning) {
3060
- this.setResult({
3097
+ return this.setResult({
3061
3098
  success: false,
3062
3099
  reasoning: reasoning || "Scenario marked as failed with Scenario.fail()",
3063
3100
  metCriteria: [],
3064
3101
  unmetCriteria: []
3065
3102
  });
3066
- return this.result;
3067
3103
  }
3068
3104
  /**
3069
3105
  * Adds execution time for a specific agent to the performance tracking.
@@ -3107,15 +3143,14 @@ var ScenarioExecution = class {
3107
3143
  * decision, or null if the conversation should continue
3108
3144
  * @throws Error if no agent is found for the specified role
3109
3145
  */
3110
- async scriptCallAgent(role, content, judgmentRequest = false) {
3146
+ async scriptCallAgent(role, content, judgmentRequest) {
3111
3147
  this.logger.debug(`[${this.config.id}] scriptCallAgent`, {
3112
3148
  role,
3113
3149
  hasContent: content !== void 0,
3114
- judgmentRequest
3150
+ judgmentRequest: judgmentRequest != null,
3151
+ hasInlineCriteria: (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null
3115
3152
  });
3116
3153
  this.consumeUntilRole(role);
3117
- let index = -1;
3118
- let agent2 = null;
3119
3154
  let nextAgent = this.getNextAgentForRole(role);
3120
3155
  if (!nextAgent) {
3121
3156
  this.newTurn();
@@ -3145,8 +3180,8 @@ var ScenarioExecution = class {
3145
3180
  `Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
3146
3181
  );
3147
3182
  }
3148
- index = nextAgent.index;
3149
- agent2 = nextAgent.agent;
3183
+ const index = nextAgent.index;
3184
+ const agent2 = nextAgent.agent;
3150
3185
  this.removePendingAgent(agent2);
3151
3186
  if (content) {
3152
3187
  const message2 = typeof content === "string" ? {
@@ -3158,6 +3193,25 @@ var ScenarioExecution = class {
3158
3193
  return null;
3159
3194
  }
3160
3195
  await this.callAgent(index, role, judgmentRequest);
3196
+ if (this.result && (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null) {
3197
+ this.checkpointResults.push({
3198
+ metCriteria: this.result.metCriteria,
3199
+ unmetCriteria: this.result.unmetCriteria
3200
+ });
3201
+ if (this.result.success) {
3202
+ this._result = void 0;
3203
+ return null;
3204
+ } else {
3205
+ const cp = this.compiledCheckpoints;
3206
+ this.result.metCriteria = cp.metCriteria;
3207
+ this.result.unmetCriteria = cp.unmetCriteria;
3208
+ return this.result;
3209
+ }
3210
+ }
3211
+ if (this.result) {
3212
+ const cp = this.compiledCheckpoints;
3213
+ this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
3214
+ }
3161
3215
  return this.result ?? null;
3162
3216
  }
3163
3217
  /**
@@ -3190,11 +3244,22 @@ var ScenarioExecution = class {
3190
3244
  this.totalStartTime = Date.now();
3191
3245
  this.pendingMessages.clear();
3192
3246
  this._result = void 0;
3247
+ this.checkpointResults = [];
3193
3248
  this.logger.debug(`[${this.config.id}] Reset complete`, {
3194
3249
  threadId: this.state.threadId,
3195
3250
  agentCount: this.agents.length
3196
3251
  });
3197
3252
  }
3253
+ /** Compiles all accumulated checkpoint results into aggregated met/unmet criteria. */
3254
+ get compiledCheckpoints() {
3255
+ const metCriteria = [];
3256
+ const unmetCriteria = [];
3257
+ for (const cp of this.checkpointResults) {
3258
+ metCriteria.push(...cp.metCriteria);
3259
+ unmetCriteria.push(...cp.unmetCriteria);
3260
+ }
3261
+ return { metCriteria, unmetCriteria };
3262
+ }
3198
3263
  nextAgentForRole(role) {
3199
3264
  for (const agent2 of this.agents) {
3200
3265
  if (agent2.role === role && this.pendingAgentsOnTurn.has(agent2) && this.pendingRolesOnTurn.includes(role)) {
@@ -3291,7 +3356,7 @@ var ScenarioExecution = class {
3291
3356
  */
3292
3357
  reachedMaxTurns(errorMessage) {
3293
3358
  var _a;
3294
- this.setResult({
3359
+ return this.setResult({
3295
3360
  success: false,
3296
3361
  reasoning: errorMessage || `Reached maximum turns (${this.config.maxTurns || 10}) without conclusion`,
3297
3362
  metCriteria: [],
@@ -3799,9 +3864,9 @@ var message = (message2) => {
3799
3864
  var agent = (content) => {
3800
3865
  return (_state, executor) => executor.agent(content);
3801
3866
  };
3802
- var judge = (content) => {
3867
+ var judge = (options) => {
3803
3868
  return async (_state, executor) => {
3804
- await executor.judge(content);
3869
+ await executor.judge(options);
3805
3870
  };
3806
3871
  };
3807
3872
  var user = (content) => {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@langwatch/scenario",
3
- "version": "0.4.1",
3
+ "version": "0.4.2",
4
4
  "description": "A TypeScript library for testing AI agents using scenarios",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.mjs",
@@ -29,7 +29,7 @@
29
29
  },
30
30
  "dependencies": {
31
31
  "@ag-ui/core": "^0.0.28",
32
- "@ai-sdk/openai": "^2.0.74",
32
+ "@ai-sdk/openai": "^3.0.26",
33
33
  "@openai/agents": "^0.3.3",
34
34
  "ai": "^6.0.0",
35
35
  "chalk": "^5.6.2",