@langwatch/scenario 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +45 -22
- package/dist/index.d.ts +45 -22
- package/dist/index.js +147 -82
- package/dist/index.mjs +147 -82
- package/package.json +2 -2
package/dist/index.d.mts
CHANGED
|
@@ -20,6 +20,18 @@ declare enum AgentRole {
|
|
|
20
20
|
JUDGE = "Judge"
|
|
21
21
|
}
|
|
22
22
|
declare const allAgentRoles: readonly [AgentRole.USER, AgentRole.AGENT, AgentRole.JUDGE];
|
|
23
|
+
/**
|
|
24
|
+
* Encapsulates a request for the judge agent to evaluate the conversation.
|
|
25
|
+
*
|
|
26
|
+
* When present on AgentInput, signals the judge to produce a verdict.
|
|
27
|
+
* Optionally carries inline criteria that override the judge's own criteria.
|
|
28
|
+
*/
|
|
29
|
+
interface JudgmentRequest {
|
|
30
|
+
/**
|
|
31
|
+
* Optional criteria to evaluate, overriding the judge agent's configured criteria.
|
|
32
|
+
*/
|
|
33
|
+
criteria?: string[];
|
|
34
|
+
}
|
|
23
35
|
/**
|
|
24
36
|
* Input provided to an agent's `call` method.
|
|
25
37
|
*/
|
|
@@ -41,9 +53,9 @@ interface AgentInput {
|
|
|
41
53
|
*/
|
|
42
54
|
requestedRole: AgentRole;
|
|
43
55
|
/**
|
|
44
|
-
*
|
|
56
|
+
* When set, requests the judge to produce a verdict, optionally with inline criteria.
|
|
45
57
|
*/
|
|
46
|
-
judgmentRequest
|
|
58
|
+
judgmentRequest?: JudgmentRequest;
|
|
47
59
|
/**
|
|
48
60
|
* The current state of the scenario execution.
|
|
49
61
|
*/
|
|
@@ -215,10 +227,12 @@ interface ScenarioExecutionLike {
|
|
|
215
227
|
agent(content?: string | ModelMessage): Promise<void>;
|
|
216
228
|
/**
|
|
217
229
|
* Invokes the judge agent to evaluate the current state.
|
|
218
|
-
* @param
|
|
230
|
+
* @param options Optional options with inline criteria to evaluate as a checkpoint.
|
|
219
231
|
* @returns The result of the scenario if the judge makes a final decision.
|
|
220
232
|
*/
|
|
221
|
-
judge(
|
|
233
|
+
judge(options?: {
|
|
234
|
+
criteria?: string[];
|
|
235
|
+
}): Promise<ScenarioResult | null>;
|
|
222
236
|
/**
|
|
223
237
|
* Proceeds with the scenario automatically for a number of turns.
|
|
224
238
|
* @param turns The number of turns to proceed. Defaults to running until the scenario ends.
|
|
@@ -364,6 +378,7 @@ declare const domain_DEFAULT_MAX_TURNS: typeof DEFAULT_MAX_TURNS;
|
|
|
364
378
|
declare const domain_DEFAULT_VERBOSE: typeof DEFAULT_VERBOSE;
|
|
365
379
|
type domain_JudgeAgentAdapter = JudgeAgentAdapter;
|
|
366
380
|
declare const domain_JudgeAgentAdapter: typeof JudgeAgentAdapter;
|
|
381
|
+
type domain_JudgmentRequest = JudgmentRequest;
|
|
367
382
|
type domain_ScenarioConfig = ScenarioConfig;
|
|
368
383
|
type domain_ScenarioConfigFinal = ScenarioConfigFinal;
|
|
369
384
|
type domain_ScenarioExecutionLike = ScenarioExecutionLike;
|
|
@@ -377,7 +392,7 @@ declare const domain_allAgentRoles: typeof allAgentRoles;
|
|
|
377
392
|
declare const domain_defineConfig: typeof defineConfig;
|
|
378
393
|
declare const domain_scenarioProjectConfigSchema: typeof scenarioProjectConfigSchema;
|
|
379
394
|
declare namespace domain {
|
|
380
|
-
export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
|
|
395
|
+
export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_JudgmentRequest as JudgmentRequest, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
|
|
381
396
|
}
|
|
382
397
|
|
|
383
398
|
/**
|
|
@@ -475,7 +490,7 @@ interface JudgeAgentConfig extends TestingAgentConfig {
|
|
|
475
490
|
/**
|
|
476
491
|
* The criteria that the judge will use to evaluate the conversation.
|
|
477
492
|
*/
|
|
478
|
-
criteria
|
|
493
|
+
criteria?: string[];
|
|
479
494
|
/**
|
|
480
495
|
* Optional span collector for telemetry. Defaults to global singleton.
|
|
481
496
|
*/
|
|
@@ -554,7 +569,7 @@ declare class JudgeAgent extends JudgeAgentAdapter {
|
|
|
554
569
|
* main();
|
|
555
570
|
* ```
|
|
556
571
|
*/
|
|
557
|
-
declare const judgeAgent: (cfg
|
|
572
|
+
declare const judgeAgent: (cfg?: JudgeAgentConfig) => JudgeAgent;
|
|
558
573
|
|
|
559
574
|
/**
|
|
560
575
|
* Transforms OpenTelemetry spans into a complete plain-text digest for judge evaluation.
|
|
@@ -1334,6 +1349,8 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
|
|
|
1334
1349
|
private currentTurnSpan?;
|
|
1335
1350
|
/** Timestamp when execution started (for total time calculation) */
|
|
1336
1351
|
private totalStartTime;
|
|
1352
|
+
/** Accumulated results from inline judge checkpoints */
|
|
1353
|
+
private checkpointResults;
|
|
1337
1354
|
/** Event stream for monitoring scenario progress */
|
|
1338
1355
|
private eventSubject;
|
|
1339
1356
|
/**
|
|
@@ -1554,24 +1571,23 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
|
|
|
1554
1571
|
*
|
|
1555
1572
|
* This method is part of the ScenarioExecutionLike interface used by script steps.
|
|
1556
1573
|
*
|
|
1557
|
-
* @param
|
|
1574
|
+
* @param options - Optional options with inline criteria to evaluate as a checkpoint.
|
|
1558
1575
|
* @returns A promise that resolves with:
|
|
1559
1576
|
* - ScenarioResult if the judge makes a final decision, or
|
|
1560
1577
|
* - Null if the conversation should continue
|
|
1561
1578
|
*
|
|
1562
1579
|
* @example
|
|
1563
1580
|
* ```typescript
|
|
1564
|
-
* // Let judge evaluate
|
|
1581
|
+
* // Let judge evaluate with its configured criteria
|
|
1565
1582
|
* const result = await execution.judge();
|
|
1566
|
-
* if (result) {
|
|
1567
|
-
* console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
|
|
1568
|
-
* }
|
|
1569
1583
|
*
|
|
1570
|
-
* //
|
|
1571
|
-
* const result = await execution.judge(
|
|
1584
|
+
* // Evaluate inline criteria as a checkpoint
|
|
1585
|
+
* const result = await execution.judge({ criteria: ["Agent responded helpfully"] });
|
|
1572
1586
|
* ```
|
|
1573
1587
|
*/
|
|
1574
|
-
judge(
|
|
1588
|
+
judge(options?: {
|
|
1589
|
+
criteria?: string[];
|
|
1590
|
+
}): Promise<ScenarioResult | null>;
|
|
1575
1591
|
/**
|
|
1576
1592
|
* Lets the scenario proceed automatically for a specified number of turns.
|
|
1577
1593
|
*
|
|
@@ -1718,6 +1734,8 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
|
|
|
1718
1734
|
* - Clears the result from any previous execution
|
|
1719
1735
|
*/
|
|
1720
1736
|
private reset;
|
|
1737
|
+
/** Compiles all accumulated checkpoint results into aggregated met/unmet criteria. */
|
|
1738
|
+
private get compiledCheckpoints();
|
|
1721
1739
|
private nextAgentForRole;
|
|
1722
1740
|
/**
|
|
1723
1741
|
* Starts a new turn in the scenario execution.
|
|
@@ -1980,15 +1998,20 @@ declare const agent: (content?: string | ModelMessage) => ScriptStep;
|
|
|
1980
1998
|
/**
|
|
1981
1999
|
* Invoke the judge agent to evaluate the current conversation state.
|
|
1982
2000
|
*
|
|
1983
|
-
*
|
|
1984
|
-
*
|
|
1985
|
-
*
|
|
2001
|
+
* When criteria are provided inline, the judge evaluates only those criteria
|
|
2002
|
+
* as a checkpoint: if all pass, the scenario continues; if any fail, the
|
|
2003
|
+
* scenario fails immediately. This is the preferred way to pass criteria
|
|
2004
|
+
* when using scripts.
|
|
1986
2005
|
*
|
|
1987
|
-
*
|
|
1988
|
-
*
|
|
2006
|
+
* When no criteria are provided, the judge uses its own configured criteria
|
|
2007
|
+
* and returns a final verdict (success or failure), ending the scenario.
|
|
2008
|
+
*
|
|
2009
|
+
* @param options Optional options object with inline criteria to evaluate.
|
|
1989
2010
|
* @returns A ScriptStep function that can be used in scenario scripts.
|
|
1990
2011
|
*/
|
|
1991
|
-
declare const judge: (
|
|
2012
|
+
declare const judge: (options?: {
|
|
2013
|
+
criteria: string[];
|
|
2014
|
+
}) => ScriptStep;
|
|
1992
2015
|
/**
|
|
1993
2016
|
* Generate or specify a user message in the conversation.
|
|
1994
2017
|
*
|
|
@@ -2048,4 +2071,4 @@ declare namespace script {
|
|
|
2048
2071
|
type ScenarioApi = typeof agents & typeof domain & typeof execution & typeof runner & typeof script;
|
|
2049
2072
|
declare const scenario: ScenarioApi;
|
|
2050
2073
|
|
|
2051
|
-
export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type AudioResponseEvent, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, type FinishTestArgs, type InvokeLLMParams, type InvokeLLMResult, JudgeAgentAdapter, type JudgeAgentConfig, type JudgeResult, JudgeSpanCollector, JudgeSpanDigestFormatter, RealtimeAgentAdapter, type RealtimeAgentAdapterConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, judgeSpanCollector, judgeSpanDigestFormatter, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };
|
|
2074
|
+
export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type AudioResponseEvent, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, type FinishTestArgs, type InvokeLLMParams, type InvokeLLMResult, JudgeAgentAdapter, type JudgeAgentConfig, type JudgeResult, JudgeSpanCollector, JudgeSpanDigestFormatter, type JudgmentRequest, RealtimeAgentAdapter, type RealtimeAgentAdapterConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, judgeSpanCollector, judgeSpanDigestFormatter, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };
|
package/dist/index.d.ts
CHANGED
|
@@ -20,6 +20,18 @@ declare enum AgentRole {
|
|
|
20
20
|
JUDGE = "Judge"
|
|
21
21
|
}
|
|
22
22
|
declare const allAgentRoles: readonly [AgentRole.USER, AgentRole.AGENT, AgentRole.JUDGE];
|
|
23
|
+
/**
|
|
24
|
+
* Encapsulates a request for the judge agent to evaluate the conversation.
|
|
25
|
+
*
|
|
26
|
+
* When present on AgentInput, signals the judge to produce a verdict.
|
|
27
|
+
* Optionally carries inline criteria that override the judge's own criteria.
|
|
28
|
+
*/
|
|
29
|
+
interface JudgmentRequest {
|
|
30
|
+
/**
|
|
31
|
+
* Optional criteria to evaluate, overriding the judge agent's configured criteria.
|
|
32
|
+
*/
|
|
33
|
+
criteria?: string[];
|
|
34
|
+
}
|
|
23
35
|
/**
|
|
24
36
|
* Input provided to an agent's `call` method.
|
|
25
37
|
*/
|
|
@@ -41,9 +53,9 @@ interface AgentInput {
|
|
|
41
53
|
*/
|
|
42
54
|
requestedRole: AgentRole;
|
|
43
55
|
/**
|
|
44
|
-
*
|
|
56
|
+
* When set, requests the judge to produce a verdict, optionally with inline criteria.
|
|
45
57
|
*/
|
|
46
|
-
judgmentRequest
|
|
58
|
+
judgmentRequest?: JudgmentRequest;
|
|
47
59
|
/**
|
|
48
60
|
* The current state of the scenario execution.
|
|
49
61
|
*/
|
|
@@ -215,10 +227,12 @@ interface ScenarioExecutionLike {
|
|
|
215
227
|
agent(content?: string | ModelMessage): Promise<void>;
|
|
216
228
|
/**
|
|
217
229
|
* Invokes the judge agent to evaluate the current state.
|
|
218
|
-
* @param
|
|
230
|
+
* @param options Optional options with inline criteria to evaluate as a checkpoint.
|
|
219
231
|
* @returns The result of the scenario if the judge makes a final decision.
|
|
220
232
|
*/
|
|
221
|
-
judge(
|
|
233
|
+
judge(options?: {
|
|
234
|
+
criteria?: string[];
|
|
235
|
+
}): Promise<ScenarioResult | null>;
|
|
222
236
|
/**
|
|
223
237
|
* Proceeds with the scenario automatically for a number of turns.
|
|
224
238
|
* @param turns The number of turns to proceed. Defaults to running until the scenario ends.
|
|
@@ -364,6 +378,7 @@ declare const domain_DEFAULT_MAX_TURNS: typeof DEFAULT_MAX_TURNS;
|
|
|
364
378
|
declare const domain_DEFAULT_VERBOSE: typeof DEFAULT_VERBOSE;
|
|
365
379
|
type domain_JudgeAgentAdapter = JudgeAgentAdapter;
|
|
366
380
|
declare const domain_JudgeAgentAdapter: typeof JudgeAgentAdapter;
|
|
381
|
+
type domain_JudgmentRequest = JudgmentRequest;
|
|
367
382
|
type domain_ScenarioConfig = ScenarioConfig;
|
|
368
383
|
type domain_ScenarioConfigFinal = ScenarioConfigFinal;
|
|
369
384
|
type domain_ScenarioExecutionLike = ScenarioExecutionLike;
|
|
@@ -377,7 +392,7 @@ declare const domain_allAgentRoles: typeof allAgentRoles;
|
|
|
377
392
|
declare const domain_defineConfig: typeof defineConfig;
|
|
378
393
|
declare const domain_scenarioProjectConfigSchema: typeof scenarioProjectConfigSchema;
|
|
379
394
|
declare namespace domain {
|
|
380
|
-
export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
|
|
395
|
+
export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_JudgmentRequest as JudgmentRequest, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
|
|
381
396
|
}
|
|
382
397
|
|
|
383
398
|
/**
|
|
@@ -475,7 +490,7 @@ interface JudgeAgentConfig extends TestingAgentConfig {
|
|
|
475
490
|
/**
|
|
476
491
|
* The criteria that the judge will use to evaluate the conversation.
|
|
477
492
|
*/
|
|
478
|
-
criteria
|
|
493
|
+
criteria?: string[];
|
|
479
494
|
/**
|
|
480
495
|
* Optional span collector for telemetry. Defaults to global singleton.
|
|
481
496
|
*/
|
|
@@ -554,7 +569,7 @@ declare class JudgeAgent extends JudgeAgentAdapter {
|
|
|
554
569
|
* main();
|
|
555
570
|
* ```
|
|
556
571
|
*/
|
|
557
|
-
declare const judgeAgent: (cfg
|
|
572
|
+
declare const judgeAgent: (cfg?: JudgeAgentConfig) => JudgeAgent;
|
|
558
573
|
|
|
559
574
|
/**
|
|
560
575
|
* Transforms OpenTelemetry spans into a complete plain-text digest for judge evaluation.
|
|
@@ -1334,6 +1349,8 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
|
|
|
1334
1349
|
private currentTurnSpan?;
|
|
1335
1350
|
/** Timestamp when execution started (for total time calculation) */
|
|
1336
1351
|
private totalStartTime;
|
|
1352
|
+
/** Accumulated results from inline judge checkpoints */
|
|
1353
|
+
private checkpointResults;
|
|
1337
1354
|
/** Event stream for monitoring scenario progress */
|
|
1338
1355
|
private eventSubject;
|
|
1339
1356
|
/**
|
|
@@ -1554,24 +1571,23 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
|
|
|
1554
1571
|
*
|
|
1555
1572
|
* This method is part of the ScenarioExecutionLike interface used by script steps.
|
|
1556
1573
|
*
|
|
1557
|
-
* @param
|
|
1574
|
+
* @param options - Optional options with inline criteria to evaluate as a checkpoint.
|
|
1558
1575
|
* @returns A promise that resolves with:
|
|
1559
1576
|
* - ScenarioResult if the judge makes a final decision, or
|
|
1560
1577
|
* - Null if the conversation should continue
|
|
1561
1578
|
*
|
|
1562
1579
|
* @example
|
|
1563
1580
|
* ```typescript
|
|
1564
|
-
* // Let judge evaluate
|
|
1581
|
+
* // Let judge evaluate with its configured criteria
|
|
1565
1582
|
* const result = await execution.judge();
|
|
1566
|
-
* if (result) {
|
|
1567
|
-
* console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
|
|
1568
|
-
* }
|
|
1569
1583
|
*
|
|
1570
|
-
* //
|
|
1571
|
-
* const result = await execution.judge(
|
|
1584
|
+
* // Evaluate inline criteria as a checkpoint
|
|
1585
|
+
* const result = await execution.judge({ criteria: ["Agent responded helpfully"] });
|
|
1572
1586
|
* ```
|
|
1573
1587
|
*/
|
|
1574
|
-
judge(
|
|
1588
|
+
judge(options?: {
|
|
1589
|
+
criteria?: string[];
|
|
1590
|
+
}): Promise<ScenarioResult | null>;
|
|
1575
1591
|
/**
|
|
1576
1592
|
* Lets the scenario proceed automatically for a specified number of turns.
|
|
1577
1593
|
*
|
|
@@ -1718,6 +1734,8 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
|
|
|
1718
1734
|
* - Clears the result from any previous execution
|
|
1719
1735
|
*/
|
|
1720
1736
|
private reset;
|
|
1737
|
+
/** Compiles all accumulated checkpoint results into aggregated met/unmet criteria. */
|
|
1738
|
+
private get compiledCheckpoints();
|
|
1721
1739
|
private nextAgentForRole;
|
|
1722
1740
|
/**
|
|
1723
1741
|
* Starts a new turn in the scenario execution.
|
|
@@ -1980,15 +1998,20 @@ declare const agent: (content?: string | ModelMessage) => ScriptStep;
|
|
|
1980
1998
|
/**
|
|
1981
1999
|
* Invoke the judge agent to evaluate the current conversation state.
|
|
1982
2000
|
*
|
|
1983
|
-
*
|
|
1984
|
-
*
|
|
1985
|
-
*
|
|
2001
|
+
* When criteria are provided inline, the judge evaluates only those criteria
|
|
2002
|
+
* as a checkpoint: if all pass, the scenario continues; if any fail, the
|
|
2003
|
+
* scenario fails immediately. This is the preferred way to pass criteria
|
|
2004
|
+
* when using scripts.
|
|
1986
2005
|
*
|
|
1987
|
-
*
|
|
1988
|
-
*
|
|
2006
|
+
* When no criteria are provided, the judge uses its own configured criteria
|
|
2007
|
+
* and returns a final verdict (success or failure), ending the scenario.
|
|
2008
|
+
*
|
|
2009
|
+
* @param options Optional options object with inline criteria to evaluate.
|
|
1989
2010
|
* @returns A ScriptStep function that can be used in scenario scripts.
|
|
1990
2011
|
*/
|
|
1991
|
-
declare const judge: (
|
|
2012
|
+
declare const judge: (options?: {
|
|
2013
|
+
criteria: string[];
|
|
2014
|
+
}) => ScriptStep;
|
|
1992
2015
|
/**
|
|
1993
2016
|
* Generate or specify a user message in the conversation.
|
|
1994
2017
|
*
|
|
@@ -2048,4 +2071,4 @@ declare namespace script {
|
|
|
2048
2071
|
type ScenarioApi = typeof agents & typeof domain & typeof execution & typeof runner & typeof script;
|
|
2049
2072
|
declare const scenario: ScenarioApi;
|
|
2050
2073
|
|
|
2051
|
-
export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type AudioResponseEvent, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, type FinishTestArgs, type InvokeLLMParams, type InvokeLLMResult, JudgeAgentAdapter, type JudgeAgentConfig, type JudgeResult, JudgeSpanCollector, JudgeSpanDigestFormatter, RealtimeAgentAdapter, type RealtimeAgentAdapterConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, judgeSpanCollector, judgeSpanDigestFormatter, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };
|
|
2074
|
+
export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type AudioResponseEvent, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, type FinishTestArgs, type InvokeLLMParams, type InvokeLLMResult, JudgeAgentAdapter, type JudgeAgentConfig, type JudgeResult, JudgeSpanCollector, JudgeSpanDigestFormatter, type JudgmentRequest, RealtimeAgentAdapter, type RealtimeAgentAdapterConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, judgeSpanCollector, judgeSpanDigestFormatter, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };
|
package/dist/index.js
CHANGED
|
@@ -486,52 +486,68 @@ var createLLMInvoker = (logger2) => {
|
|
|
486
486
|
var toolMessageRole = "tool";
|
|
487
487
|
var assistantMessageRole = "assistant";
|
|
488
488
|
var userMessageRole = "user";
|
|
489
|
-
var
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
if (
|
|
500
|
-
|
|
489
|
+
var hasToolContent = (message2) => {
|
|
490
|
+
if (message2.role === toolMessageRole) return true;
|
|
491
|
+
if (!Array.isArray(message2.content)) return false;
|
|
492
|
+
return message2.content.some((part) => {
|
|
493
|
+
if (!part || typeof part !== "object") return false;
|
|
494
|
+
const partType = "type" in part ? part.type : void 0;
|
|
495
|
+
return partType === "tool-call" || partType === "tool-result";
|
|
496
|
+
});
|
|
497
|
+
};
|
|
498
|
+
var stringifyValue = (value) => {
|
|
499
|
+
if (typeof value === "string") return value;
|
|
500
|
+
if (value === void 0) return "undefined";
|
|
501
|
+
try {
|
|
502
|
+
const serialized = JSON.stringify(value);
|
|
503
|
+
return serialized === void 0 ? String(value) : serialized;
|
|
504
|
+
} catch {
|
|
505
|
+
return String(value);
|
|
501
506
|
}
|
|
502
|
-
return segments;
|
|
503
507
|
};
|
|
504
|
-
var
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
508
|
+
var summarizeToolMessage = (message2) => {
|
|
509
|
+
if (message2.role === toolMessageRole && !Array.isArray(message2.content)) {
|
|
510
|
+
return `[Tool message: ${stringifyValue(message2.content)}]`;
|
|
511
|
+
}
|
|
512
|
+
if (message2.role === toolMessageRole) {
|
|
513
|
+
const toolResults = message2.content.filter((part) => part.type === "tool-result").map((part) => {
|
|
514
|
+
const contentPart = part;
|
|
515
|
+
const name = contentPart.toolName ?? "unknown tool";
|
|
516
|
+
const output = contentPart.output;
|
|
517
|
+
const value = output && typeof output === "object" && "value" in output && typeof output.value === "string" ? output.value : output ?? contentPart.result;
|
|
518
|
+
return `[Tool result from ${name}: ${stringifyValue(value)}]`;
|
|
519
|
+
});
|
|
520
|
+
return toolResults.length > 0 ? toolResults.join("\n") : null;
|
|
521
|
+
}
|
|
522
|
+
if (!Array.isArray(message2.content)) return null;
|
|
523
|
+
const toolCalls = message2.content.filter((part) => part.type === "tool-call").map((part) => {
|
|
524
|
+
const contentPart = part;
|
|
525
|
+
const name = contentPart.toolName ?? "unknown tool";
|
|
526
|
+
return `[Called tool ${name} with: ${stringifyValue(contentPart.input)}]`;
|
|
511
527
|
});
|
|
528
|
+
return toolCalls.length > 0 ? toolCalls.join("\n") : null;
|
|
512
529
|
};
|
|
513
|
-
var
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
530
|
+
var messageRoleReversal = (messages) => {
|
|
531
|
+
const roleMap = {
|
|
532
|
+
[userMessageRole]: assistantMessageRole,
|
|
533
|
+
[assistantMessageRole]: userMessageRole
|
|
534
|
+
};
|
|
535
|
+
return messages.map((message2) => {
|
|
536
|
+
if (hasToolContent(message2)) {
|
|
537
|
+
const summary = summarizeToolMessage(message2);
|
|
538
|
+
if (!summary) return null;
|
|
539
|
+
return {
|
|
540
|
+
role: userMessageRole,
|
|
541
|
+
content: summary
|
|
542
|
+
};
|
|
543
|
+
}
|
|
521
544
|
const newRole = roleMap[message2.role];
|
|
522
545
|
if (!newRole) return message2;
|
|
523
546
|
return {
|
|
524
|
-
|
|
525
|
-
|
|
547
|
+
...message2,
|
|
548
|
+
role: newRole
|
|
526
549
|
};
|
|
527
|
-
});
|
|
528
|
-
};
|
|
529
|
-
var messageRoleReversal = (messages) => {
|
|
530
|
-
const segments = groupMessagesByToolBoundaries(messages);
|
|
531
|
-
const processedSegments = segments.map(
|
|
532
|
-
(segment) => segmentHasToolMessages(segment) ? segment : reverseSegmentRoles(segment)
|
|
533
|
-
);
|
|
534
|
-
return processedSegments.flat();
|
|
550
|
+
}).filter((message2) => message2 !== null);
|
|
535
551
|
};
|
|
536
552
|
var criterionToParamName = (criterion) => {
|
|
537
553
|
return criterion.replace(/"/g, "").replace(/[^a-zA-Z0-9]/g, "_").replace(/ /g, "_").toLowerCase().substring(0, 70);
|
|
@@ -893,7 +909,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
893
909
|
constructor(cfg) {
|
|
894
910
|
super();
|
|
895
911
|
this.cfg = cfg;
|
|
896
|
-
this.criteria = cfg.criteria;
|
|
912
|
+
this.criteria = cfg.criteria ?? [];
|
|
897
913
|
this.spanCollector = cfg.spanCollector ?? judgeSpanCollector;
|
|
898
914
|
}
|
|
899
915
|
logger = new Logger("JudgeAgent");
|
|
@@ -905,7 +921,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
905
921
|
*/
|
|
906
922
|
invokeLLM = createLLMInvoker(this.logger);
|
|
907
923
|
async call(input) {
|
|
908
|
-
var _a, _b, _c;
|
|
924
|
+
var _a, _b, _c, _d;
|
|
925
|
+
const criteria = ((_a = input.judgmentRequest) == null ? void 0 : _a.criteria) ?? this.criteria;
|
|
909
926
|
this.logger.debug("call() invoked", {
|
|
910
927
|
threadId: input.threadId,
|
|
911
928
|
currentTurn: input.scenarioState.currentTurn,
|
|
@@ -924,7 +941,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
924
941
|
</opentelemetry_traces>
|
|
925
942
|
`;
|
|
926
943
|
const cfg = this.cfg;
|
|
927
|
-
const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(
|
|
944
|
+
const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(criteria, input.scenarioConfig.description);
|
|
928
945
|
const messages = [
|
|
929
946
|
{ role: "system", content: systemPrompt },
|
|
930
947
|
{ role: "user", content: contentForJudge }
|
|
@@ -937,10 +954,10 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
937
954
|
});
|
|
938
955
|
const tools = {
|
|
939
956
|
continue_test: buildContinueTestTool(),
|
|
940
|
-
finish_test: buildFinishTestTool(
|
|
957
|
+
finish_test: buildFinishTestTool(criteria)
|
|
941
958
|
};
|
|
942
|
-
const enforceJudgement = input.judgmentRequest;
|
|
943
|
-
const hasCriteria =
|
|
959
|
+
const enforceJudgement = input.judgmentRequest != null;
|
|
960
|
+
const hasCriteria = criteria.length && criteria.length > 0;
|
|
944
961
|
if (enforceJudgement && !hasCriteria) {
|
|
945
962
|
return {
|
|
946
963
|
success: false,
|
|
@@ -965,26 +982,26 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
965
982
|
toolChoice
|
|
966
983
|
});
|
|
967
984
|
this.logger.debug("LLM response received", {
|
|
968
|
-
toolCallCount: ((
|
|
969
|
-
toolCalls: (
|
|
985
|
+
toolCallCount: ((_b = completion.toolCalls) == null ? void 0 : _b.length) ?? 0,
|
|
986
|
+
toolCalls: (_c = completion.toolCalls) == null ? void 0 : _c.map((tc) => ({
|
|
970
987
|
toolName: tc.toolName,
|
|
971
988
|
args: tc.input
|
|
972
989
|
}))
|
|
973
990
|
});
|
|
974
991
|
let args;
|
|
975
|
-
if ((
|
|
992
|
+
if ((_d = completion.toolCalls) == null ? void 0 : _d.length) {
|
|
976
993
|
const toolCall = completion.toolCalls[0];
|
|
977
994
|
switch (toolCall.toolName) {
|
|
978
995
|
case "finish_test": {
|
|
979
996
|
args = toolCall.input;
|
|
980
997
|
const verdict = args.verdict || "inconclusive";
|
|
981
998
|
const reasoning = args.reasoning || "No reasoning provided";
|
|
982
|
-
const
|
|
983
|
-
const criteriaValues = Object.values(
|
|
984
|
-
const metCriteria =
|
|
999
|
+
const criteriaArgs = args.criteria || {};
|
|
1000
|
+
const criteriaValues = Object.values(criteriaArgs);
|
|
1001
|
+
const metCriteria = criteria.filter(
|
|
985
1002
|
(_, i) => criteriaValues[i] === "true"
|
|
986
1003
|
);
|
|
987
|
-
const unmetCriteria =
|
|
1004
|
+
const unmetCriteria = criteria.filter(
|
|
988
1005
|
(_, i) => criteriaValues[i] !== "true"
|
|
989
1006
|
);
|
|
990
1007
|
const result = {
|
|
@@ -1004,7 +1021,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1004
1021
|
success: false,
|
|
1005
1022
|
reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
|
|
1006
1023
|
metCriteria: [],
|
|
1007
|
-
unmetCriteria:
|
|
1024
|
+
unmetCriteria: criteria
|
|
1008
1025
|
};
|
|
1009
1026
|
}
|
|
1010
1027
|
}
|
|
@@ -1012,7 +1029,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1012
1029
|
success: false,
|
|
1013
1030
|
reasoning: `JudgeAgent: No tool call found in LLM output`,
|
|
1014
1031
|
metCriteria: [],
|
|
1015
|
-
unmetCriteria:
|
|
1032
|
+
unmetCriteria: criteria
|
|
1016
1033
|
};
|
|
1017
1034
|
}
|
|
1018
1035
|
getOpenTelemetryTracesDigest(threadId) {
|
|
@@ -1022,7 +1039,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1022
1039
|
}
|
|
1023
1040
|
};
|
|
1024
1041
|
var judgeAgent = (cfg) => {
|
|
1025
|
-
return new JudgeAgent(cfg);
|
|
1042
|
+
return new JudgeAgent(cfg ?? {});
|
|
1026
1043
|
};
|
|
1027
1044
|
|
|
1028
1045
|
// src/agents/user-simulator-agent.ts
|
|
@@ -2518,6 +2535,8 @@ var ScenarioExecution = class {
|
|
|
2518
2535
|
currentTurnSpan;
|
|
2519
2536
|
/** Timestamp when execution started (for total time calculation) */
|
|
2520
2537
|
totalStartTime = 0;
|
|
2538
|
+
/** Accumulated results from inline judge checkpoints */
|
|
2539
|
+
checkpointResults = [];
|
|
2521
2540
|
/** Event stream for monitoring scenario progress */
|
|
2522
2541
|
eventSubject = new import_rxjs2.Subject();
|
|
2523
2542
|
/**
|
|
@@ -2595,6 +2614,7 @@ var ScenarioExecution = class {
|
|
|
2595
2614
|
totalTime: this.totalTime,
|
|
2596
2615
|
agentTime: totalAgentTime
|
|
2597
2616
|
};
|
|
2617
|
+
return this._result;
|
|
2598
2618
|
this.logger.debug(`[${this.config.id}] Result set`, {
|
|
2599
2619
|
success: result.success,
|
|
2600
2620
|
reasoning: result.reasoning,
|
|
@@ -2655,6 +2675,8 @@ var ScenarioExecution = class {
|
|
|
2655
2675
|
const scriptStep = this.config.script[i];
|
|
2656
2676
|
await this.executeScriptStep(scriptStep, i);
|
|
2657
2677
|
if (this.result) {
|
|
2678
|
+
const cp = this.compiledCheckpoints;
|
|
2679
|
+
this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
|
|
2658
2680
|
this.emitRunFinished({
|
|
2659
2681
|
scenarioRunId,
|
|
2660
2682
|
status: this.result.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
|
|
@@ -2663,7 +2685,22 @@ var ScenarioExecution = class {
|
|
|
2663
2685
|
return this.result;
|
|
2664
2686
|
}
|
|
2665
2687
|
}
|
|
2666
|
-
this.
|
|
2688
|
+
if (this.checkpointResults.length > 0) {
|
|
2689
|
+
const cp = this.compiledCheckpoints;
|
|
2690
|
+
const result2 = this.setResult({
|
|
2691
|
+
success: cp.unmetCriteria.length === 0,
|
|
2692
|
+
reasoning: "All inline criteria checkpoints passed",
|
|
2693
|
+
metCriteria: cp.metCriteria,
|
|
2694
|
+
unmetCriteria: cp.unmetCriteria
|
|
2695
|
+
});
|
|
2696
|
+
this.emitRunFinished({
|
|
2697
|
+
scenarioRunId,
|
|
2698
|
+
status: result2.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
|
|
2699
|
+
result: result2
|
|
2700
|
+
});
|
|
2701
|
+
return result2;
|
|
2702
|
+
}
|
|
2703
|
+
const result = this.reachedMaxTurns(
|
|
2667
2704
|
[
|
|
2668
2705
|
"Reached end of script without conclusion, add one of the following to the end of the script:",
|
|
2669
2706
|
"- `Scenario.proceed()` to let the simulation continue to play out",
|
|
@@ -2671,11 +2708,11 @@ var ScenarioExecution = class {
|
|
|
2671
2708
|
"- `Scenario.succeed()` or `Scenario.fail()` to end the test with an explicit result"
|
|
2672
2709
|
].join("\n")
|
|
2673
2710
|
);
|
|
2674
|
-
this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED
|
|
2675
|
-
return
|
|
2711
|
+
this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */, result });
|
|
2712
|
+
return result;
|
|
2676
2713
|
} catch (error) {
|
|
2677
2714
|
const errorInfo = extractErrorInfo(error);
|
|
2678
|
-
this.setResult({
|
|
2715
|
+
const result = this.setResult({
|
|
2679
2716
|
success: false,
|
|
2680
2717
|
reasoning: `Scenario failed with error: ${errorInfo.message}`,
|
|
2681
2718
|
metCriteria: [],
|
|
@@ -2685,7 +2722,7 @@ var ScenarioExecution = class {
|
|
|
2685
2722
|
this.emitRunFinished({
|
|
2686
2723
|
scenarioRunId,
|
|
2687
2724
|
status: "ERROR" /* ERROR */,
|
|
2688
|
-
result
|
|
2725
|
+
result
|
|
2689
2726
|
});
|
|
2690
2727
|
throw error;
|
|
2691
2728
|
} finally {
|
|
@@ -2789,7 +2826,7 @@ var ScenarioExecution = class {
|
|
|
2789
2826
|
* @param judgmentRequest - Whether this is a judgment request (for judge agents)
|
|
2790
2827
|
* @throws Error if the agent call fails
|
|
2791
2828
|
*/
|
|
2792
|
-
async callAgent(idx, role, judgmentRequest
|
|
2829
|
+
async callAgent(idx, role, judgmentRequest) {
|
|
2793
2830
|
var _a;
|
|
2794
2831
|
const agent2 = this.agents[idx];
|
|
2795
2832
|
const agentName = agent2.name ?? agent2.constructor.name;
|
|
@@ -2980,25 +3017,26 @@ var ScenarioExecution = class {
|
|
|
2980
3017
|
*
|
|
2981
3018
|
* This method is part of the ScenarioExecutionLike interface used by script steps.
|
|
2982
3019
|
*
|
|
2983
|
-
* @param
|
|
3020
|
+
* @param options - Optional options with inline criteria to evaluate as a checkpoint.
|
|
2984
3021
|
* @returns A promise that resolves with:
|
|
2985
3022
|
* - ScenarioResult if the judge makes a final decision, or
|
|
2986
3023
|
* - Null if the conversation should continue
|
|
2987
3024
|
*
|
|
2988
3025
|
* @example
|
|
2989
3026
|
* ```typescript
|
|
2990
|
-
* // Let judge evaluate
|
|
3027
|
+
* // Let judge evaluate with its configured criteria
|
|
2991
3028
|
* const result = await execution.judge();
|
|
2992
|
-
* if (result) {
|
|
2993
|
-
* console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
|
|
2994
|
-
* }
|
|
2995
3029
|
*
|
|
2996
|
-
* //
|
|
2997
|
-
* const result = await execution.judge(
|
|
3030
|
+
* // Evaluate inline criteria as a checkpoint
|
|
3031
|
+
* const result = await execution.judge({ criteria: ["Agent responded helpfully"] });
|
|
2998
3032
|
* ```
|
|
2999
3033
|
*/
|
|
3000
|
-
async judge(
|
|
3001
|
-
return await this.scriptCallAgent(
|
|
3034
|
+
async judge(options) {
|
|
3035
|
+
return await this.scriptCallAgent(
|
|
3036
|
+
"Judge" /* JUDGE */,
|
|
3037
|
+
void 0,
|
|
3038
|
+
{ criteria: options == null ? void 0 : options.criteria }
|
|
3039
|
+
);
|
|
3002
3040
|
}
|
|
3003
3041
|
/**
|
|
3004
3042
|
* Lets the scenario proceed automatically for a specified number of turns.
|
|
@@ -3083,13 +3121,12 @@ var ScenarioExecution = class {
|
|
|
3083
3121
|
* ```
|
|
3084
3122
|
*/
|
|
3085
3123
|
async succeed(reasoning) {
|
|
3086
|
-
this.setResult({
|
|
3124
|
+
return this.setResult({
|
|
3087
3125
|
success: true,
|
|
3088
3126
|
reasoning: reasoning || "Scenario marked as successful with Scenario.succeed()",
|
|
3089
3127
|
metCriteria: [],
|
|
3090
3128
|
unmetCriteria: []
|
|
3091
3129
|
});
|
|
3092
|
-
return this.result;
|
|
3093
3130
|
}
|
|
3094
3131
|
/**
|
|
3095
3132
|
* Immediately ends the scenario with a failure verdict.
|
|
@@ -3115,13 +3152,12 @@ var ScenarioExecution = class {
|
|
|
3115
3152
|
* ```
|
|
3116
3153
|
*/
|
|
3117
3154
|
async fail(reasoning) {
|
|
3118
|
-
this.setResult({
|
|
3155
|
+
return this.setResult({
|
|
3119
3156
|
success: false,
|
|
3120
3157
|
reasoning: reasoning || "Scenario marked as failed with Scenario.fail()",
|
|
3121
3158
|
metCriteria: [],
|
|
3122
3159
|
unmetCriteria: []
|
|
3123
3160
|
});
|
|
3124
|
-
return this.result;
|
|
3125
3161
|
}
|
|
3126
3162
|
/**
|
|
3127
3163
|
* Adds execution time for a specific agent to the performance tracking.
|
|
@@ -3165,15 +3201,14 @@ var ScenarioExecution = class {
|
|
|
3165
3201
|
* decision, or null if the conversation should continue
|
|
3166
3202
|
* @throws Error if no agent is found for the specified role
|
|
3167
3203
|
*/
|
|
3168
|
-
async scriptCallAgent(role, content, judgmentRequest
|
|
3204
|
+
async scriptCallAgent(role, content, judgmentRequest) {
|
|
3169
3205
|
this.logger.debug(`[${this.config.id}] scriptCallAgent`, {
|
|
3170
3206
|
role,
|
|
3171
3207
|
hasContent: content !== void 0,
|
|
3172
|
-
judgmentRequest
|
|
3208
|
+
judgmentRequest: judgmentRequest != null,
|
|
3209
|
+
hasInlineCriteria: (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null
|
|
3173
3210
|
});
|
|
3174
3211
|
this.consumeUntilRole(role);
|
|
3175
|
-
let index = -1;
|
|
3176
|
-
let agent2 = null;
|
|
3177
3212
|
let nextAgent = this.getNextAgentForRole(role);
|
|
3178
3213
|
if (!nextAgent) {
|
|
3179
3214
|
this.newTurn();
|
|
@@ -3203,8 +3238,8 @@ var ScenarioExecution = class {
|
|
|
3203
3238
|
`Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
|
|
3204
3239
|
);
|
|
3205
3240
|
}
|
|
3206
|
-
index = nextAgent.index;
|
|
3207
|
-
agent2 = nextAgent.agent;
|
|
3241
|
+
const index = nextAgent.index;
|
|
3242
|
+
const agent2 = nextAgent.agent;
|
|
3208
3243
|
this.removePendingAgent(agent2);
|
|
3209
3244
|
if (content) {
|
|
3210
3245
|
const message2 = typeof content === "string" ? {
|
|
@@ -3216,6 +3251,25 @@ var ScenarioExecution = class {
|
|
|
3216
3251
|
return null;
|
|
3217
3252
|
}
|
|
3218
3253
|
await this.callAgent(index, role, judgmentRequest);
|
|
3254
|
+
if (this.result && (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null) {
|
|
3255
|
+
this.checkpointResults.push({
|
|
3256
|
+
metCriteria: this.result.metCriteria,
|
|
3257
|
+
unmetCriteria: this.result.unmetCriteria
|
|
3258
|
+
});
|
|
3259
|
+
if (this.result.success) {
|
|
3260
|
+
this._result = void 0;
|
|
3261
|
+
return null;
|
|
3262
|
+
} else {
|
|
3263
|
+
const cp = this.compiledCheckpoints;
|
|
3264
|
+
this.result.metCriteria = cp.metCriteria;
|
|
3265
|
+
this.result.unmetCriteria = cp.unmetCriteria;
|
|
3266
|
+
return this.result;
|
|
3267
|
+
}
|
|
3268
|
+
}
|
|
3269
|
+
if (this.result) {
|
|
3270
|
+
const cp = this.compiledCheckpoints;
|
|
3271
|
+
this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
|
|
3272
|
+
}
|
|
3219
3273
|
return this.result ?? null;
|
|
3220
3274
|
}
|
|
3221
3275
|
/**
|
|
@@ -3248,11 +3302,22 @@ var ScenarioExecution = class {
|
|
|
3248
3302
|
this.totalStartTime = Date.now();
|
|
3249
3303
|
this.pendingMessages.clear();
|
|
3250
3304
|
this._result = void 0;
|
|
3305
|
+
this.checkpointResults = [];
|
|
3251
3306
|
this.logger.debug(`[${this.config.id}] Reset complete`, {
|
|
3252
3307
|
threadId: this.state.threadId,
|
|
3253
3308
|
agentCount: this.agents.length
|
|
3254
3309
|
});
|
|
3255
3310
|
}
|
|
3311
|
+
/** Compiles all accumulated checkpoint results into aggregated met/unmet criteria. */
|
|
3312
|
+
get compiledCheckpoints() {
|
|
3313
|
+
const metCriteria = [];
|
|
3314
|
+
const unmetCriteria = [];
|
|
3315
|
+
for (const cp of this.checkpointResults) {
|
|
3316
|
+
metCriteria.push(...cp.metCriteria);
|
|
3317
|
+
unmetCriteria.push(...cp.unmetCriteria);
|
|
3318
|
+
}
|
|
3319
|
+
return { metCriteria, unmetCriteria };
|
|
3320
|
+
}
|
|
3256
3321
|
nextAgentForRole(role) {
|
|
3257
3322
|
for (const agent2 of this.agents) {
|
|
3258
3323
|
if (agent2.role === role && this.pendingAgentsOnTurn.has(agent2) && this.pendingRolesOnTurn.includes(role)) {
|
|
@@ -3349,7 +3414,7 @@ var ScenarioExecution = class {
|
|
|
3349
3414
|
*/
|
|
3350
3415
|
reachedMaxTurns(errorMessage) {
|
|
3351
3416
|
var _a;
|
|
3352
|
-
this.setResult({
|
|
3417
|
+
return this.setResult({
|
|
3353
3418
|
success: false,
|
|
3354
3419
|
reasoning: errorMessage || `Reached maximum turns (${this.config.maxTurns || 10}) without conclusion`,
|
|
3355
3420
|
metCriteria: [],
|
|
@@ -3850,9 +3915,9 @@ var message = (message2) => {
|
|
|
3850
3915
|
var agent = (content) => {
|
|
3851
3916
|
return (_state, executor) => executor.agent(content);
|
|
3852
3917
|
};
|
|
3853
|
-
var judge = (
|
|
3918
|
+
var judge = (options) => {
|
|
3854
3919
|
return async (_state, executor) => {
|
|
3855
|
-
await executor.judge(
|
|
3920
|
+
await executor.judge(options);
|
|
3856
3921
|
};
|
|
3857
3922
|
};
|
|
3858
3923
|
var user = (content) => {
|
package/dist/index.mjs
CHANGED
|
@@ -428,52 +428,68 @@ var createLLMInvoker = (logger2) => {
|
|
|
428
428
|
var toolMessageRole = "tool";
|
|
429
429
|
var assistantMessageRole = "assistant";
|
|
430
430
|
var userMessageRole = "user";
|
|
431
|
-
var
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
if (
|
|
442
|
-
|
|
431
|
+
var hasToolContent = (message2) => {
|
|
432
|
+
if (message2.role === toolMessageRole) return true;
|
|
433
|
+
if (!Array.isArray(message2.content)) return false;
|
|
434
|
+
return message2.content.some((part) => {
|
|
435
|
+
if (!part || typeof part !== "object") return false;
|
|
436
|
+
const partType = "type" in part ? part.type : void 0;
|
|
437
|
+
return partType === "tool-call" || partType === "tool-result";
|
|
438
|
+
});
|
|
439
|
+
};
|
|
440
|
+
var stringifyValue = (value) => {
|
|
441
|
+
if (typeof value === "string") return value;
|
|
442
|
+
if (value === void 0) return "undefined";
|
|
443
|
+
try {
|
|
444
|
+
const serialized = JSON.stringify(value);
|
|
445
|
+
return serialized === void 0 ? String(value) : serialized;
|
|
446
|
+
} catch {
|
|
447
|
+
return String(value);
|
|
443
448
|
}
|
|
444
|
-
return segments;
|
|
445
449
|
};
|
|
446
|
-
var
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
450
|
+
var summarizeToolMessage = (message2) => {
|
|
451
|
+
if (message2.role === toolMessageRole && !Array.isArray(message2.content)) {
|
|
452
|
+
return `[Tool message: ${stringifyValue(message2.content)}]`;
|
|
453
|
+
}
|
|
454
|
+
if (message2.role === toolMessageRole) {
|
|
455
|
+
const toolResults = message2.content.filter((part) => part.type === "tool-result").map((part) => {
|
|
456
|
+
const contentPart = part;
|
|
457
|
+
const name = contentPart.toolName ?? "unknown tool";
|
|
458
|
+
const output = contentPart.output;
|
|
459
|
+
const value = output && typeof output === "object" && "value" in output && typeof output.value === "string" ? output.value : output ?? contentPart.result;
|
|
460
|
+
return `[Tool result from ${name}: ${stringifyValue(value)}]`;
|
|
461
|
+
});
|
|
462
|
+
return toolResults.length > 0 ? toolResults.join("\n") : null;
|
|
463
|
+
}
|
|
464
|
+
if (!Array.isArray(message2.content)) return null;
|
|
465
|
+
const toolCalls = message2.content.filter((part) => part.type === "tool-call").map((part) => {
|
|
466
|
+
const contentPart = part;
|
|
467
|
+
const name = contentPart.toolName ?? "unknown tool";
|
|
468
|
+
return `[Called tool ${name} with: ${stringifyValue(contentPart.input)}]`;
|
|
453
469
|
});
|
|
470
|
+
return toolCalls.length > 0 ? toolCalls.join("\n") : null;
|
|
454
471
|
};
|
|
455
|
-
var
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
472
|
+
var messageRoleReversal = (messages) => {
|
|
473
|
+
const roleMap = {
|
|
474
|
+
[userMessageRole]: assistantMessageRole,
|
|
475
|
+
[assistantMessageRole]: userMessageRole
|
|
476
|
+
};
|
|
477
|
+
return messages.map((message2) => {
|
|
478
|
+
if (hasToolContent(message2)) {
|
|
479
|
+
const summary = summarizeToolMessage(message2);
|
|
480
|
+
if (!summary) return null;
|
|
481
|
+
return {
|
|
482
|
+
role: userMessageRole,
|
|
483
|
+
content: summary
|
|
484
|
+
};
|
|
485
|
+
}
|
|
463
486
|
const newRole = roleMap[message2.role];
|
|
464
487
|
if (!newRole) return message2;
|
|
465
488
|
return {
|
|
466
|
-
|
|
467
|
-
|
|
489
|
+
...message2,
|
|
490
|
+
role: newRole
|
|
468
491
|
};
|
|
469
|
-
});
|
|
470
|
-
};
|
|
471
|
-
var messageRoleReversal = (messages) => {
|
|
472
|
-
const segments = groupMessagesByToolBoundaries(messages);
|
|
473
|
-
const processedSegments = segments.map(
|
|
474
|
-
(segment) => segmentHasToolMessages(segment) ? segment : reverseSegmentRoles(segment)
|
|
475
|
-
);
|
|
476
|
-
return processedSegments.flat();
|
|
492
|
+
}).filter((message2) => message2 !== null);
|
|
477
493
|
};
|
|
478
494
|
var criterionToParamName = (criterion) => {
|
|
479
495
|
return criterion.replace(/"/g, "").replace(/[^a-zA-Z0-9]/g, "_").replace(/ /g, "_").toLowerCase().substring(0, 70);
|
|
@@ -835,7 +851,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
835
851
|
constructor(cfg) {
|
|
836
852
|
super();
|
|
837
853
|
this.cfg = cfg;
|
|
838
|
-
this.criteria = cfg.criteria;
|
|
854
|
+
this.criteria = cfg.criteria ?? [];
|
|
839
855
|
this.spanCollector = cfg.spanCollector ?? judgeSpanCollector;
|
|
840
856
|
}
|
|
841
857
|
logger = new Logger("JudgeAgent");
|
|
@@ -847,7 +863,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
847
863
|
*/
|
|
848
864
|
invokeLLM = createLLMInvoker(this.logger);
|
|
849
865
|
async call(input) {
|
|
850
|
-
var _a, _b, _c;
|
|
866
|
+
var _a, _b, _c, _d;
|
|
867
|
+
const criteria = ((_a = input.judgmentRequest) == null ? void 0 : _a.criteria) ?? this.criteria;
|
|
851
868
|
this.logger.debug("call() invoked", {
|
|
852
869
|
threadId: input.threadId,
|
|
853
870
|
currentTurn: input.scenarioState.currentTurn,
|
|
@@ -866,7 +883,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
866
883
|
</opentelemetry_traces>
|
|
867
884
|
`;
|
|
868
885
|
const cfg = this.cfg;
|
|
869
|
-
const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(
|
|
886
|
+
const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(criteria, input.scenarioConfig.description);
|
|
870
887
|
const messages = [
|
|
871
888
|
{ role: "system", content: systemPrompt },
|
|
872
889
|
{ role: "user", content: contentForJudge }
|
|
@@ -879,10 +896,10 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
879
896
|
});
|
|
880
897
|
const tools = {
|
|
881
898
|
continue_test: buildContinueTestTool(),
|
|
882
|
-
finish_test: buildFinishTestTool(
|
|
899
|
+
finish_test: buildFinishTestTool(criteria)
|
|
883
900
|
};
|
|
884
|
-
const enforceJudgement = input.judgmentRequest;
|
|
885
|
-
const hasCriteria =
|
|
901
|
+
const enforceJudgement = input.judgmentRequest != null;
|
|
902
|
+
const hasCriteria = criteria.length && criteria.length > 0;
|
|
886
903
|
if (enforceJudgement && !hasCriteria) {
|
|
887
904
|
return {
|
|
888
905
|
success: false,
|
|
@@ -907,26 +924,26 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
907
924
|
toolChoice
|
|
908
925
|
});
|
|
909
926
|
this.logger.debug("LLM response received", {
|
|
910
|
-
toolCallCount: ((
|
|
911
|
-
toolCalls: (
|
|
927
|
+
toolCallCount: ((_b = completion.toolCalls) == null ? void 0 : _b.length) ?? 0,
|
|
928
|
+
toolCalls: (_c = completion.toolCalls) == null ? void 0 : _c.map((tc) => ({
|
|
912
929
|
toolName: tc.toolName,
|
|
913
930
|
args: tc.input
|
|
914
931
|
}))
|
|
915
932
|
});
|
|
916
933
|
let args;
|
|
917
|
-
if ((
|
|
934
|
+
if ((_d = completion.toolCalls) == null ? void 0 : _d.length) {
|
|
918
935
|
const toolCall = completion.toolCalls[0];
|
|
919
936
|
switch (toolCall.toolName) {
|
|
920
937
|
case "finish_test": {
|
|
921
938
|
args = toolCall.input;
|
|
922
939
|
const verdict = args.verdict || "inconclusive";
|
|
923
940
|
const reasoning = args.reasoning || "No reasoning provided";
|
|
924
|
-
const
|
|
925
|
-
const criteriaValues = Object.values(
|
|
926
|
-
const metCriteria =
|
|
941
|
+
const criteriaArgs = args.criteria || {};
|
|
942
|
+
const criteriaValues = Object.values(criteriaArgs);
|
|
943
|
+
const metCriteria = criteria.filter(
|
|
927
944
|
(_, i) => criteriaValues[i] === "true"
|
|
928
945
|
);
|
|
929
|
-
const unmetCriteria =
|
|
946
|
+
const unmetCriteria = criteria.filter(
|
|
930
947
|
(_, i) => criteriaValues[i] !== "true"
|
|
931
948
|
);
|
|
932
949
|
const result = {
|
|
@@ -946,7 +963,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
946
963
|
success: false,
|
|
947
964
|
reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
|
|
948
965
|
metCriteria: [],
|
|
949
|
-
unmetCriteria:
|
|
966
|
+
unmetCriteria: criteria
|
|
950
967
|
};
|
|
951
968
|
}
|
|
952
969
|
}
|
|
@@ -954,7 +971,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
954
971
|
success: false,
|
|
955
972
|
reasoning: `JudgeAgent: No tool call found in LLM output`,
|
|
956
973
|
metCriteria: [],
|
|
957
|
-
unmetCriteria:
|
|
974
|
+
unmetCriteria: criteria
|
|
958
975
|
};
|
|
959
976
|
}
|
|
960
977
|
getOpenTelemetryTracesDigest(threadId) {
|
|
@@ -964,7 +981,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
964
981
|
}
|
|
965
982
|
};
|
|
966
983
|
var judgeAgent = (cfg) => {
|
|
967
|
-
return new JudgeAgent(cfg);
|
|
984
|
+
return new JudgeAgent(cfg ?? {});
|
|
968
985
|
};
|
|
969
986
|
|
|
970
987
|
// src/agents/user-simulator-agent.ts
|
|
@@ -2460,6 +2477,8 @@ var ScenarioExecution = class {
|
|
|
2460
2477
|
currentTurnSpan;
|
|
2461
2478
|
/** Timestamp when execution started (for total time calculation) */
|
|
2462
2479
|
totalStartTime = 0;
|
|
2480
|
+
/** Accumulated results from inline judge checkpoints */
|
|
2481
|
+
checkpointResults = [];
|
|
2463
2482
|
/** Event stream for monitoring scenario progress */
|
|
2464
2483
|
eventSubject = new Subject2();
|
|
2465
2484
|
/**
|
|
@@ -2537,6 +2556,7 @@ var ScenarioExecution = class {
|
|
|
2537
2556
|
totalTime: this.totalTime,
|
|
2538
2557
|
agentTime: totalAgentTime
|
|
2539
2558
|
};
|
|
2559
|
+
return this._result;
|
|
2540
2560
|
this.logger.debug(`[${this.config.id}] Result set`, {
|
|
2541
2561
|
success: result.success,
|
|
2542
2562
|
reasoning: result.reasoning,
|
|
@@ -2597,6 +2617,8 @@ var ScenarioExecution = class {
|
|
|
2597
2617
|
const scriptStep = this.config.script[i];
|
|
2598
2618
|
await this.executeScriptStep(scriptStep, i);
|
|
2599
2619
|
if (this.result) {
|
|
2620
|
+
const cp = this.compiledCheckpoints;
|
|
2621
|
+
this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
|
|
2600
2622
|
this.emitRunFinished({
|
|
2601
2623
|
scenarioRunId,
|
|
2602
2624
|
status: this.result.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
|
|
@@ -2605,7 +2627,22 @@ var ScenarioExecution = class {
|
|
|
2605
2627
|
return this.result;
|
|
2606
2628
|
}
|
|
2607
2629
|
}
|
|
2608
|
-
this.
|
|
2630
|
+
if (this.checkpointResults.length > 0) {
|
|
2631
|
+
const cp = this.compiledCheckpoints;
|
|
2632
|
+
const result2 = this.setResult({
|
|
2633
|
+
success: cp.unmetCriteria.length === 0,
|
|
2634
|
+
reasoning: "All inline criteria checkpoints passed",
|
|
2635
|
+
metCriteria: cp.metCriteria,
|
|
2636
|
+
unmetCriteria: cp.unmetCriteria
|
|
2637
|
+
});
|
|
2638
|
+
this.emitRunFinished({
|
|
2639
|
+
scenarioRunId,
|
|
2640
|
+
status: result2.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
|
|
2641
|
+
result: result2
|
|
2642
|
+
});
|
|
2643
|
+
return result2;
|
|
2644
|
+
}
|
|
2645
|
+
const result = this.reachedMaxTurns(
|
|
2609
2646
|
[
|
|
2610
2647
|
"Reached end of script without conclusion, add one of the following to the end of the script:",
|
|
2611
2648
|
"- `Scenario.proceed()` to let the simulation continue to play out",
|
|
@@ -2613,11 +2650,11 @@ var ScenarioExecution = class {
|
|
|
2613
2650
|
"- `Scenario.succeed()` or `Scenario.fail()` to end the test with an explicit result"
|
|
2614
2651
|
].join("\n")
|
|
2615
2652
|
);
|
|
2616
|
-
this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED
|
|
2617
|
-
return
|
|
2653
|
+
this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */, result });
|
|
2654
|
+
return result;
|
|
2618
2655
|
} catch (error) {
|
|
2619
2656
|
const errorInfo = extractErrorInfo(error);
|
|
2620
|
-
this.setResult({
|
|
2657
|
+
const result = this.setResult({
|
|
2621
2658
|
success: false,
|
|
2622
2659
|
reasoning: `Scenario failed with error: ${errorInfo.message}`,
|
|
2623
2660
|
metCriteria: [],
|
|
@@ -2627,7 +2664,7 @@ var ScenarioExecution = class {
|
|
|
2627
2664
|
this.emitRunFinished({
|
|
2628
2665
|
scenarioRunId,
|
|
2629
2666
|
status: "ERROR" /* ERROR */,
|
|
2630
|
-
result
|
|
2667
|
+
result
|
|
2631
2668
|
});
|
|
2632
2669
|
throw error;
|
|
2633
2670
|
} finally {
|
|
@@ -2731,7 +2768,7 @@ var ScenarioExecution = class {
|
|
|
2731
2768
|
* @param judgmentRequest - Whether this is a judgment request (for judge agents)
|
|
2732
2769
|
* @throws Error if the agent call fails
|
|
2733
2770
|
*/
|
|
2734
|
-
async callAgent(idx, role, judgmentRequest
|
|
2771
|
+
async callAgent(idx, role, judgmentRequest) {
|
|
2735
2772
|
var _a;
|
|
2736
2773
|
const agent2 = this.agents[idx];
|
|
2737
2774
|
const agentName = agent2.name ?? agent2.constructor.name;
|
|
@@ -2922,25 +2959,26 @@ var ScenarioExecution = class {
|
|
|
2922
2959
|
*
|
|
2923
2960
|
* This method is part of the ScenarioExecutionLike interface used by script steps.
|
|
2924
2961
|
*
|
|
2925
|
-
* @param
|
|
2962
|
+
* @param options - Optional options with inline criteria to evaluate as a checkpoint.
|
|
2926
2963
|
* @returns A promise that resolves with:
|
|
2927
2964
|
* - ScenarioResult if the judge makes a final decision, or
|
|
2928
2965
|
* - Null if the conversation should continue
|
|
2929
2966
|
*
|
|
2930
2967
|
* @example
|
|
2931
2968
|
* ```typescript
|
|
2932
|
-
* // Let judge evaluate
|
|
2969
|
+
* // Let judge evaluate with its configured criteria
|
|
2933
2970
|
* const result = await execution.judge();
|
|
2934
|
-
* if (result) {
|
|
2935
|
-
* console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
|
|
2936
|
-
* }
|
|
2937
2971
|
*
|
|
2938
|
-
* //
|
|
2939
|
-
* const result = await execution.judge(
|
|
2972
|
+
* // Evaluate inline criteria as a checkpoint
|
|
2973
|
+
* const result = await execution.judge({ criteria: ["Agent responded helpfully"] });
|
|
2940
2974
|
* ```
|
|
2941
2975
|
*/
|
|
2942
|
-
async judge(
|
|
2943
|
-
return await this.scriptCallAgent(
|
|
2976
|
+
async judge(options) {
|
|
2977
|
+
return await this.scriptCallAgent(
|
|
2978
|
+
"Judge" /* JUDGE */,
|
|
2979
|
+
void 0,
|
|
2980
|
+
{ criteria: options == null ? void 0 : options.criteria }
|
|
2981
|
+
);
|
|
2944
2982
|
}
|
|
2945
2983
|
/**
|
|
2946
2984
|
* Lets the scenario proceed automatically for a specified number of turns.
|
|
@@ -3025,13 +3063,12 @@ var ScenarioExecution = class {
|
|
|
3025
3063
|
* ```
|
|
3026
3064
|
*/
|
|
3027
3065
|
async succeed(reasoning) {
|
|
3028
|
-
this.setResult({
|
|
3066
|
+
return this.setResult({
|
|
3029
3067
|
success: true,
|
|
3030
3068
|
reasoning: reasoning || "Scenario marked as successful with Scenario.succeed()",
|
|
3031
3069
|
metCriteria: [],
|
|
3032
3070
|
unmetCriteria: []
|
|
3033
3071
|
});
|
|
3034
|
-
return this.result;
|
|
3035
3072
|
}
|
|
3036
3073
|
/**
|
|
3037
3074
|
* Immediately ends the scenario with a failure verdict.
|
|
@@ -3057,13 +3094,12 @@ var ScenarioExecution = class {
|
|
|
3057
3094
|
* ```
|
|
3058
3095
|
*/
|
|
3059
3096
|
async fail(reasoning) {
|
|
3060
|
-
this.setResult({
|
|
3097
|
+
return this.setResult({
|
|
3061
3098
|
success: false,
|
|
3062
3099
|
reasoning: reasoning || "Scenario marked as failed with Scenario.fail()",
|
|
3063
3100
|
metCriteria: [],
|
|
3064
3101
|
unmetCriteria: []
|
|
3065
3102
|
});
|
|
3066
|
-
return this.result;
|
|
3067
3103
|
}
|
|
3068
3104
|
/**
|
|
3069
3105
|
* Adds execution time for a specific agent to the performance tracking.
|
|
@@ -3107,15 +3143,14 @@ var ScenarioExecution = class {
|
|
|
3107
3143
|
* decision, or null if the conversation should continue
|
|
3108
3144
|
* @throws Error if no agent is found for the specified role
|
|
3109
3145
|
*/
|
|
3110
|
-
async scriptCallAgent(role, content, judgmentRequest
|
|
3146
|
+
async scriptCallAgent(role, content, judgmentRequest) {
|
|
3111
3147
|
this.logger.debug(`[${this.config.id}] scriptCallAgent`, {
|
|
3112
3148
|
role,
|
|
3113
3149
|
hasContent: content !== void 0,
|
|
3114
|
-
judgmentRequest
|
|
3150
|
+
judgmentRequest: judgmentRequest != null,
|
|
3151
|
+
hasInlineCriteria: (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null
|
|
3115
3152
|
});
|
|
3116
3153
|
this.consumeUntilRole(role);
|
|
3117
|
-
let index = -1;
|
|
3118
|
-
let agent2 = null;
|
|
3119
3154
|
let nextAgent = this.getNextAgentForRole(role);
|
|
3120
3155
|
if (!nextAgent) {
|
|
3121
3156
|
this.newTurn();
|
|
@@ -3145,8 +3180,8 @@ var ScenarioExecution = class {
|
|
|
3145
3180
|
`Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
|
|
3146
3181
|
);
|
|
3147
3182
|
}
|
|
3148
|
-
index = nextAgent.index;
|
|
3149
|
-
agent2 = nextAgent.agent;
|
|
3183
|
+
const index = nextAgent.index;
|
|
3184
|
+
const agent2 = nextAgent.agent;
|
|
3150
3185
|
this.removePendingAgent(agent2);
|
|
3151
3186
|
if (content) {
|
|
3152
3187
|
const message2 = typeof content === "string" ? {
|
|
@@ -3158,6 +3193,25 @@ var ScenarioExecution = class {
|
|
|
3158
3193
|
return null;
|
|
3159
3194
|
}
|
|
3160
3195
|
await this.callAgent(index, role, judgmentRequest);
|
|
3196
|
+
if (this.result && (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null) {
|
|
3197
|
+
this.checkpointResults.push({
|
|
3198
|
+
metCriteria: this.result.metCriteria,
|
|
3199
|
+
unmetCriteria: this.result.unmetCriteria
|
|
3200
|
+
});
|
|
3201
|
+
if (this.result.success) {
|
|
3202
|
+
this._result = void 0;
|
|
3203
|
+
return null;
|
|
3204
|
+
} else {
|
|
3205
|
+
const cp = this.compiledCheckpoints;
|
|
3206
|
+
this.result.metCriteria = cp.metCriteria;
|
|
3207
|
+
this.result.unmetCriteria = cp.unmetCriteria;
|
|
3208
|
+
return this.result;
|
|
3209
|
+
}
|
|
3210
|
+
}
|
|
3211
|
+
if (this.result) {
|
|
3212
|
+
const cp = this.compiledCheckpoints;
|
|
3213
|
+
this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
|
|
3214
|
+
}
|
|
3161
3215
|
return this.result ?? null;
|
|
3162
3216
|
}
|
|
3163
3217
|
/**
|
|
@@ -3190,11 +3244,22 @@ var ScenarioExecution = class {
|
|
|
3190
3244
|
this.totalStartTime = Date.now();
|
|
3191
3245
|
this.pendingMessages.clear();
|
|
3192
3246
|
this._result = void 0;
|
|
3247
|
+
this.checkpointResults = [];
|
|
3193
3248
|
this.logger.debug(`[${this.config.id}] Reset complete`, {
|
|
3194
3249
|
threadId: this.state.threadId,
|
|
3195
3250
|
agentCount: this.agents.length
|
|
3196
3251
|
});
|
|
3197
3252
|
}
|
|
3253
|
+
/** Compiles all accumulated checkpoint results into aggregated met/unmet criteria. */
|
|
3254
|
+
get compiledCheckpoints() {
|
|
3255
|
+
const metCriteria = [];
|
|
3256
|
+
const unmetCriteria = [];
|
|
3257
|
+
for (const cp of this.checkpointResults) {
|
|
3258
|
+
metCriteria.push(...cp.metCriteria);
|
|
3259
|
+
unmetCriteria.push(...cp.unmetCriteria);
|
|
3260
|
+
}
|
|
3261
|
+
return { metCriteria, unmetCriteria };
|
|
3262
|
+
}
|
|
3198
3263
|
nextAgentForRole(role) {
|
|
3199
3264
|
for (const agent2 of this.agents) {
|
|
3200
3265
|
if (agent2.role === role && this.pendingAgentsOnTurn.has(agent2) && this.pendingRolesOnTurn.includes(role)) {
|
|
@@ -3291,7 +3356,7 @@ var ScenarioExecution = class {
|
|
|
3291
3356
|
*/
|
|
3292
3357
|
reachedMaxTurns(errorMessage) {
|
|
3293
3358
|
var _a;
|
|
3294
|
-
this.setResult({
|
|
3359
|
+
return this.setResult({
|
|
3295
3360
|
success: false,
|
|
3296
3361
|
reasoning: errorMessage || `Reached maximum turns (${this.config.maxTurns || 10}) without conclusion`,
|
|
3297
3362
|
metCriteria: [],
|
|
@@ -3799,9 +3864,9 @@ var message = (message2) => {
|
|
|
3799
3864
|
var agent = (content) => {
|
|
3800
3865
|
return (_state, executor) => executor.agent(content);
|
|
3801
3866
|
};
|
|
3802
|
-
var judge = (
|
|
3867
|
+
var judge = (options) => {
|
|
3803
3868
|
return async (_state, executor) => {
|
|
3804
|
-
await executor.judge(
|
|
3869
|
+
await executor.judge(options);
|
|
3805
3870
|
};
|
|
3806
3871
|
};
|
|
3807
3872
|
var user = (content) => {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@langwatch/scenario",
|
|
3
|
-
"version": "0.4.
|
|
3
|
+
"version": "0.4.2",
|
|
4
4
|
"description": "A TypeScript library for testing AI agents using scenarios",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"module": "dist/index.mjs",
|
|
@@ -29,7 +29,7 @@
|
|
|
29
29
|
},
|
|
30
30
|
"dependencies": {
|
|
31
31
|
"@ag-ui/core": "^0.0.28",
|
|
32
|
-
"@ai-sdk/openai": "^
|
|
32
|
+
"@ai-sdk/openai": "^3.0.26",
|
|
33
33
|
"@openai/agents": "^0.3.3",
|
|
34
34
|
"ai": "^6.0.0",
|
|
35
35
|
"chalk": "^5.6.2",
|