@langwatch/scenario 0.4.0 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +70 -47
- package/dist/index.d.ts +70 -47
- package/dist/index.js +153 -87
- package/dist/index.mjs +153 -87
- package/dist/integrations/vitest/setup.js +1 -1
- package/dist/integrations/vitest/setup.mjs +1 -1
- package/package.json +4 -4
package/dist/index.js
CHANGED
|
@@ -189,7 +189,7 @@ var DEFAULT_TEMPERATURE = 0;
|
|
|
189
189
|
// Zod object schema for language-model settings:
//  - model:       required; any truthy value passes the custom check.
//  - temperature: optional number between 0 and 1, defaulting to DEFAULT_TEMPERATURE.
//  - maxTokens:   optional number.
var modelSchema = import_v42.z.object({
  model: import_v42.z.custom((candidate) => Boolean(candidate), {
    message: "A model is required. Configure it in scenario.config.js defaultModel or pass directly to the agent."
  }).describe("Language model that is used by the AI SDK Core functions."),
  temperature: import_v42.z.number().min(0).max(1).optional().describe("The temperature for the language model.").default(DEFAULT_TEMPERATURE),
  maxTokens: import_v42.z.number().optional().describe("The maximum number of tokens to generate.")
});
|
|
@@ -455,7 +455,7 @@ var JudgeUtils = {
|
|
|
455
455
|
/**
|
|
456
456
|
* Builds a minimal transcript from messages for judge evaluation.
|
|
457
457
|
* Truncates base64 media to reduce token usage.
|
|
458
|
-
* @param messages - Array of
|
|
458
|
+
* @param messages - Array of ModelMessage from conversation
|
|
459
459
|
* @returns Plain text transcript with one message per line
|
|
460
460
|
*/
|
|
461
461
|
buildTranscriptFromMessages(messages) {
|
|
@@ -486,52 +486,68 @@ var createLLMInvoker = (logger2) => {
|
|
|
486
486
|
var toolMessageRole = "tool";
|
|
487
487
|
var assistantMessageRole = "assistant";
|
|
488
488
|
var userMessageRole = "user";
|
|
489
|
-
var
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
if (
|
|
500
|
-
|
|
489
|
+
/**
 * Reports whether a message carries tool-related content.
 *
 * A message qualifies when its role equals the tool role, or when its content
 * is an array holding at least one object part whose `type` is "tool-call" or
 * "tool-result".
 *
 * @param message2 - Message whose `role` and `content` are inspected.
 * @returns `true` when the message is tool-related, otherwise `false`.
 */
var hasToolContent = (message2) => {
  if (message2.role === toolMessageRole) {
    return true;
  }
  if (!Array.isArray(message2.content)) {
    return false;
  }
  for (const entry of message2.content) {
    // Null, primitive, and untyped parts can never be tool parts.
    const isTypedObject = Boolean(entry) && typeof entry === "object" && "type" in entry;
    if (isTypedObject && (entry.type === "tool-call" || entry.type === "tool-result")) {
      return true;
    }
  }
  return false;
};
|
|
498
|
+
/**
 * Converts an arbitrary value to a string without ever throwing.
 *
 * Strings are returned unchanged and `undefined` becomes "undefined".
 * Everything else is serialized with JSON.stringify; when that throws
 * (cycles, BigInt) or yields `undefined` (functions, symbols), the value is
 * coerced with String(). If String() itself throws (for example on an object
 * that has no usable toString/valueOf, such as a null-prototype object), the
 * generic Object.prototype.toString tag is returned instead.
 *
 * @param value - Any value.
 * @returns A string representation of `value`.
 */
var stringifyValue = (value) => {
  if (typeof value === "string") return value;
  if (value === void 0) return "undefined";
  try {
    const serialized = JSON.stringify(value);
    if (serialized !== void 0) return serialized;
  } catch {
    // Not JSON-serializable; fall through to plain string coercion.
  }
  try {
    return String(value);
  } catch {
    // String() throws when the value cannot be converted to a primitive.
    return Object.prototype.toString.call(value);
  }
};
|
|
504
|
-
var
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
508
|
+
/**
 * Produces a plain-text summary of the tool activity in a message.
 *
 * - Tool-role message with non-array content: a single
 *   `[Tool message: ...]` line.
 * - Tool-role message with array content: one `[Tool result from ...]` line
 *   per "tool-result" part.
 * - Any other message with array content: one `[Called tool ...]` line per
 *   "tool-call" part.
 *
 * Content parts that are null or not objects are ignored rather than
 * dereferenced, matching the guard used by hasToolContent.
 *
 * @param message2 - Message whose `role` and `content` are inspected.
 * @returns The summary lines joined with "\n", or `null` when there is
 *   nothing to summarize.
 */
var summarizeToolMessage = (message2) => {
  // Safe type test: a null/primitive part must not throw on `.type`.
  const isPartOfType = (part, type) => Boolean(part) && typeof part === "object" && part.type === type;
  if (message2.role === toolMessageRole && !Array.isArray(message2.content)) {
    return `[Tool message: ${stringifyValue(message2.content)}]`;
  }
  if (message2.role === toolMessageRole) {
    const toolResults = message2.content.filter((part) => isPartOfType(part, "tool-result")).map((part) => {
      const name = part.toolName ?? "unknown tool";
      const output = part.output;
      // Prefer a string `output.value`; otherwise use the output itself, and
      // fall back to `result` when no output is present.
      const value = output && typeof output === "object" && "value" in output && typeof output.value === "string" ? output.value : output ?? part.result;
      return `[Tool result from ${name}: ${stringifyValue(value)}]`;
    });
    return toolResults.length > 0 ? toolResults.join("\n") : null;
  }
  if (!Array.isArray(message2.content)) return null;
  const toolCalls = message2.content.filter((part) => isPartOfType(part, "tool-call")).map((part) => {
    const name = part.toolName ?? "unknown tool";
    return `[Called tool ${name} with: ${stringifyValue(part.input)}]`;
  });
  return toolCalls.length > 0 ? toolCalls.join("\n") : null;
};
|
|
513
|
-
var
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
530
|
+
/**
 * Builds a copy of the conversation with user and assistant roles swapped.
 *
 * Messages with tool content (per hasToolContent) are replaced by a
 * user-role message containing their text summary, or dropped when
 * summarizeToolMessage yields nothing. Messages whose role is neither user
 * nor assistant are passed through unchanged.
 *
 * @param messages - Array of conversation messages.
 * @returns A new array of messages; the input array is not modified.
 */
var messageRoleReversal = (messages) => {
  const oppositeRole = {
    [userMessageRole]: assistantMessageRole,
    [assistantMessageRole]: userMessageRole
  };
  const reversed = [];
  messages.forEach((original) => {
    if (hasToolContent(original)) {
      const toolSummary = summarizeToolMessage(original);
      if (toolSummary) {
        reversed.push({ role: userMessageRole, content: toolSummary });
      }
      return;
    }
    const flippedRole = oppositeRole[original.role];
    reversed.push(flippedRole ? { ...original, role: flippedRole } : original);
  });
  return reversed;
};
|
|
536
552
|
var criterionToParamName = (criterion) => {
|
|
537
553
|
return criterion.replace(/"/g, "").replace(/[^a-zA-Z0-9]/g, "_").replace(/ /g, "_").toLowerCase().substring(0, 70);
|
|
@@ -893,7 +909,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
893
909
|
constructor(cfg) {
|
|
894
910
|
super();
|
|
895
911
|
this.cfg = cfg;
|
|
896
|
-
this.criteria = cfg.criteria;
|
|
912
|
+
this.criteria = cfg.criteria ?? [];
|
|
897
913
|
this.spanCollector = cfg.spanCollector ?? judgeSpanCollector;
|
|
898
914
|
}
|
|
899
915
|
logger = new Logger("JudgeAgent");
|
|
@@ -905,7 +921,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
905
921
|
*/
|
|
906
922
|
invokeLLM = createLLMInvoker(this.logger);
|
|
907
923
|
async call(input) {
|
|
908
|
-
var _a, _b, _c;
|
|
924
|
+
var _a, _b, _c, _d;
|
|
925
|
+
const criteria = ((_a = input.judgmentRequest) == null ? void 0 : _a.criteria) ?? this.criteria;
|
|
909
926
|
this.logger.debug("call() invoked", {
|
|
910
927
|
threadId: input.threadId,
|
|
911
928
|
currentTurn: input.scenarioState.currentTurn,
|
|
@@ -924,7 +941,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
924
941
|
</opentelemetry_traces>
|
|
925
942
|
`;
|
|
926
943
|
const cfg = this.cfg;
|
|
927
|
-
const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(
|
|
944
|
+
const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(criteria, input.scenarioConfig.description);
|
|
928
945
|
const messages = [
|
|
929
946
|
{ role: "system", content: systemPrompt },
|
|
930
947
|
{ role: "user", content: contentForJudge }
|
|
@@ -937,10 +954,10 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
937
954
|
});
|
|
938
955
|
const tools = {
|
|
939
956
|
continue_test: buildContinueTestTool(),
|
|
940
|
-
finish_test: buildFinishTestTool(
|
|
957
|
+
finish_test: buildFinishTestTool(criteria)
|
|
941
958
|
};
|
|
942
|
-
const enforceJudgement = input.judgmentRequest;
|
|
943
|
-
const hasCriteria =
|
|
959
|
+
const enforceJudgement = input.judgmentRequest != null;
|
|
960
|
+
const hasCriteria = criteria.length && criteria.length > 0;
|
|
944
961
|
if (enforceJudgement && !hasCriteria) {
|
|
945
962
|
return {
|
|
946
963
|
success: false,
|
|
@@ -965,26 +982,26 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
965
982
|
toolChoice
|
|
966
983
|
});
|
|
967
984
|
this.logger.debug("LLM response received", {
|
|
968
|
-
toolCallCount: ((
|
|
969
|
-
toolCalls: (
|
|
985
|
+
toolCallCount: ((_b = completion.toolCalls) == null ? void 0 : _b.length) ?? 0,
|
|
986
|
+
toolCalls: (_c = completion.toolCalls) == null ? void 0 : _c.map((tc) => ({
|
|
970
987
|
toolName: tc.toolName,
|
|
971
988
|
args: tc.input
|
|
972
989
|
}))
|
|
973
990
|
});
|
|
974
991
|
let args;
|
|
975
|
-
if ((
|
|
992
|
+
if ((_d = completion.toolCalls) == null ? void 0 : _d.length) {
|
|
976
993
|
const toolCall = completion.toolCalls[0];
|
|
977
994
|
switch (toolCall.toolName) {
|
|
978
995
|
case "finish_test": {
|
|
979
996
|
args = toolCall.input;
|
|
980
997
|
const verdict = args.verdict || "inconclusive";
|
|
981
998
|
const reasoning = args.reasoning || "No reasoning provided";
|
|
982
|
-
const
|
|
983
|
-
const criteriaValues = Object.values(
|
|
984
|
-
const metCriteria =
|
|
999
|
+
const criteriaArgs = args.criteria || {};
|
|
1000
|
+
const criteriaValues = Object.values(criteriaArgs);
|
|
1001
|
+
const metCriteria = criteria.filter(
|
|
985
1002
|
(_, i) => criteriaValues[i] === "true"
|
|
986
1003
|
);
|
|
987
|
-
const unmetCriteria =
|
|
1004
|
+
const unmetCriteria = criteria.filter(
|
|
988
1005
|
(_, i) => criteriaValues[i] !== "true"
|
|
989
1006
|
);
|
|
990
1007
|
const result = {
|
|
@@ -1004,7 +1021,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1004
1021
|
success: false,
|
|
1005
1022
|
reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
|
|
1006
1023
|
metCriteria: [],
|
|
1007
|
-
unmetCriteria:
|
|
1024
|
+
unmetCriteria: criteria
|
|
1008
1025
|
};
|
|
1009
1026
|
}
|
|
1010
1027
|
}
|
|
@@ -1012,7 +1029,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1012
1029
|
success: false,
|
|
1013
1030
|
reasoning: `JudgeAgent: No tool call found in LLM output`,
|
|
1014
1031
|
metCriteria: [],
|
|
1015
|
-
unmetCriteria:
|
|
1032
|
+
unmetCriteria: criteria
|
|
1016
1033
|
};
|
|
1017
1034
|
}
|
|
1018
1035
|
getOpenTelemetryTracesDigest(threadId) {
|
|
@@ -1022,7 +1039,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
1022
1039
|
}
|
|
1023
1040
|
};
|
|
1024
1041
|
/**
 * Factory for JudgeAgent instances.
 *
 * @param cfg - Optional configuration; null/undefined is replaced by `{}`.
 * @returns A new JudgeAgent.
 */
var judgeAgent = (cfg) => new JudgeAgent(cfg ?? {});
|
|
1027
1044
|
|
|
1028
1045
|
// src/agents/user-simulator-agent.ts
|
|
@@ -2466,13 +2483,15 @@ function convertModelMessagesToAguiMessages(modelMessages) {
|
|
|
2466
2483
|
}
|
|
2467
2484
|
case msg.role === "tool":
|
|
2468
2485
|
msg.content.map((p, i) => {
|
|
2469
|
-
|
|
2486
|
+
if ("type" in p && p.type !== "tool-result") return;
|
|
2470
2487
|
aguiMessages.push({
|
|
2471
2488
|
trace_id: msg.traceId,
|
|
2472
2489
|
id: `${id}-${i}`,
|
|
2473
2490
|
role: "tool",
|
|
2474
2491
|
toolCallId: p.toolCallId,
|
|
2475
|
-
content: JSON.stringify(
|
|
2492
|
+
content: JSON.stringify(
|
|
2493
|
+
p.output && "value" in p.output ? p.output.value : p.output
|
|
2494
|
+
)
|
|
2476
2495
|
});
|
|
2477
2496
|
});
|
|
2478
2497
|
break;
|
|
@@ -2516,6 +2535,8 @@ var ScenarioExecution = class {
|
|
|
2516
2535
|
currentTurnSpan;
|
|
2517
2536
|
/** Timestamp when execution started (for total time calculation) */
|
|
2518
2537
|
totalStartTime = 0;
|
|
2538
|
+
/** Accumulated results from inline judge checkpoints */
|
|
2539
|
+
checkpointResults = [];
|
|
2519
2540
|
/** Event stream for monitoring scenario progress */
|
|
2520
2541
|
eventSubject = new import_rxjs2.Subject();
|
|
2521
2542
|
/**
|
|
@@ -2593,6 +2614,7 @@ var ScenarioExecution = class {
|
|
|
2593
2614
|
totalTime: this.totalTime,
|
|
2594
2615
|
agentTime: totalAgentTime
|
|
2595
2616
|
};
|
|
2617
|
+
return this._result;
|
|
2596
2618
|
this.logger.debug(`[${this.config.id}] Result set`, {
|
|
2597
2619
|
success: result.success,
|
|
2598
2620
|
reasoning: result.reasoning,
|
|
@@ -2653,6 +2675,8 @@ var ScenarioExecution = class {
|
|
|
2653
2675
|
const scriptStep = this.config.script[i];
|
|
2654
2676
|
await this.executeScriptStep(scriptStep, i);
|
|
2655
2677
|
if (this.result) {
|
|
2678
|
+
const cp = this.compiledCheckpoints;
|
|
2679
|
+
this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
|
|
2656
2680
|
this.emitRunFinished({
|
|
2657
2681
|
scenarioRunId,
|
|
2658
2682
|
status: this.result.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
|
|
@@ -2661,7 +2685,22 @@ var ScenarioExecution = class {
|
|
|
2661
2685
|
return this.result;
|
|
2662
2686
|
}
|
|
2663
2687
|
}
|
|
2664
|
-
this.
|
|
2688
|
+
if (this.checkpointResults.length > 0) {
|
|
2689
|
+
const cp = this.compiledCheckpoints;
|
|
2690
|
+
const result2 = this.setResult({
|
|
2691
|
+
success: cp.unmetCriteria.length === 0,
|
|
2692
|
+
reasoning: "All inline criteria checkpoints passed",
|
|
2693
|
+
metCriteria: cp.metCriteria,
|
|
2694
|
+
unmetCriteria: cp.unmetCriteria
|
|
2695
|
+
});
|
|
2696
|
+
this.emitRunFinished({
|
|
2697
|
+
scenarioRunId,
|
|
2698
|
+
status: result2.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
|
|
2699
|
+
result: result2
|
|
2700
|
+
});
|
|
2701
|
+
return result2;
|
|
2702
|
+
}
|
|
2703
|
+
const result = this.reachedMaxTurns(
|
|
2665
2704
|
[
|
|
2666
2705
|
"Reached end of script without conclusion, add one of the following to the end of the script:",
|
|
2667
2706
|
"- `Scenario.proceed()` to let the simulation continue to play out",
|
|
@@ -2669,11 +2708,11 @@ var ScenarioExecution = class {
|
|
|
2669
2708
|
"- `Scenario.succeed()` or `Scenario.fail()` to end the test with an explicit result"
|
|
2670
2709
|
].join("\n")
|
|
2671
2710
|
);
|
|
2672
|
-
this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED
|
|
2673
|
-
return
|
|
2711
|
+
this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */, result });
|
|
2712
|
+
return result;
|
|
2674
2713
|
} catch (error) {
|
|
2675
2714
|
const errorInfo = extractErrorInfo(error);
|
|
2676
|
-
this.setResult({
|
|
2715
|
+
const result = this.setResult({
|
|
2677
2716
|
success: false,
|
|
2678
2717
|
reasoning: `Scenario failed with error: ${errorInfo.message}`,
|
|
2679
2718
|
metCriteria: [],
|
|
@@ -2683,7 +2722,7 @@ var ScenarioExecution = class {
|
|
|
2683
2722
|
this.emitRunFinished({
|
|
2684
2723
|
scenarioRunId,
|
|
2685
2724
|
status: "ERROR" /* ERROR */,
|
|
2686
|
-
result
|
|
2725
|
+
result
|
|
2687
2726
|
});
|
|
2688
2727
|
throw error;
|
|
2689
2728
|
} finally {
|
|
@@ -2787,7 +2826,7 @@ var ScenarioExecution = class {
|
|
|
2787
2826
|
* @param judgmentRequest - Whether this is a judgment request (for judge agents)
|
|
2788
2827
|
* @throws Error if the agent call fails
|
|
2789
2828
|
*/
|
|
2790
|
-
async callAgent(idx, role, judgmentRequest
|
|
2829
|
+
async callAgent(idx, role, judgmentRequest) {
|
|
2791
2830
|
var _a;
|
|
2792
2831
|
const agent2 = this.agents[idx];
|
|
2793
2832
|
const agentName = agent2.name ?? agent2.constructor.name;
|
|
@@ -2978,25 +3017,26 @@ var ScenarioExecution = class {
|
|
|
2978
3017
|
*
|
|
2979
3018
|
* This method is part of the ScenarioExecutionLike interface used by script steps.
|
|
2980
3019
|
*
|
|
2981
|
-
* @param
|
|
3020
|
+
* @param options - Optional options with inline criteria to evaluate as a checkpoint.
|
|
2982
3021
|
* @returns A promise that resolves with:
|
|
2983
3022
|
* - ScenarioResult if the judge makes a final decision, or
|
|
2984
3023
|
* - Null if the conversation should continue
|
|
2985
3024
|
*
|
|
2986
3025
|
* @example
|
|
2987
3026
|
* ```typescript
|
|
2988
|
-
* // Let judge evaluate
|
|
3027
|
+
* // Let judge evaluate with its configured criteria
|
|
2989
3028
|
* const result = await execution.judge();
|
|
2990
|
-
* if (result) {
|
|
2991
|
-
* console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
|
|
2992
|
-
* }
|
|
2993
3029
|
*
|
|
2994
|
-
* //
|
|
2995
|
-
* const result = await execution.judge(
|
|
3030
|
+
* // Evaluate inline criteria as a checkpoint
|
|
3031
|
+
* const result = await execution.judge({ criteria: ["Agent responded helpfully"] });
|
|
2996
3032
|
* ```
|
|
2997
3033
|
*/
|
|
2998
|
-
async judge(
|
|
2999
|
-
return await this.scriptCallAgent(
|
|
3034
|
+
async judge(options) {
|
|
3035
|
+
return await this.scriptCallAgent(
|
|
3036
|
+
"Judge" /* JUDGE */,
|
|
3037
|
+
void 0,
|
|
3038
|
+
{ criteria: options == null ? void 0 : options.criteria }
|
|
3039
|
+
);
|
|
3000
3040
|
}
|
|
3001
3041
|
/**
|
|
3002
3042
|
* Lets the scenario proceed automatically for a specified number of turns.
|
|
@@ -3081,13 +3121,12 @@ var ScenarioExecution = class {
|
|
|
3081
3121
|
* ```
|
|
3082
3122
|
*/
|
|
3083
3123
|
async succeed(reasoning) {
|
|
3084
|
-
this.setResult({
|
|
3124
|
+
return this.setResult({
|
|
3085
3125
|
success: true,
|
|
3086
3126
|
reasoning: reasoning || "Scenario marked as successful with Scenario.succeed()",
|
|
3087
3127
|
metCriteria: [],
|
|
3088
3128
|
unmetCriteria: []
|
|
3089
3129
|
});
|
|
3090
|
-
return this.result;
|
|
3091
3130
|
}
|
|
3092
3131
|
/**
|
|
3093
3132
|
* Immediately ends the scenario with a failure verdict.
|
|
@@ -3113,13 +3152,12 @@ var ScenarioExecution = class {
|
|
|
3113
3152
|
* ```
|
|
3114
3153
|
*/
|
|
3115
3154
|
async fail(reasoning) {
|
|
3116
|
-
this.setResult({
|
|
3155
|
+
return this.setResult({
|
|
3117
3156
|
success: false,
|
|
3118
3157
|
reasoning: reasoning || "Scenario marked as failed with Scenario.fail()",
|
|
3119
3158
|
metCriteria: [],
|
|
3120
3159
|
unmetCriteria: []
|
|
3121
3160
|
});
|
|
3122
|
-
return this.result;
|
|
3123
3161
|
}
|
|
3124
3162
|
/**
|
|
3125
3163
|
* Adds execution time for a specific agent to the performance tracking.
|
|
@@ -3163,15 +3201,14 @@ var ScenarioExecution = class {
|
|
|
3163
3201
|
* decision, or null if the conversation should continue
|
|
3164
3202
|
* @throws Error if no agent is found for the specified role
|
|
3165
3203
|
*/
|
|
3166
|
-
async scriptCallAgent(role, content, judgmentRequest
|
|
3204
|
+
async scriptCallAgent(role, content, judgmentRequest) {
|
|
3167
3205
|
this.logger.debug(`[${this.config.id}] scriptCallAgent`, {
|
|
3168
3206
|
role,
|
|
3169
3207
|
hasContent: content !== void 0,
|
|
3170
|
-
judgmentRequest
|
|
3208
|
+
judgmentRequest: judgmentRequest != null,
|
|
3209
|
+
hasInlineCriteria: (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null
|
|
3171
3210
|
});
|
|
3172
3211
|
this.consumeUntilRole(role);
|
|
3173
|
-
let index = -1;
|
|
3174
|
-
let agent2 = null;
|
|
3175
3212
|
let nextAgent = this.getNextAgentForRole(role);
|
|
3176
3213
|
if (!nextAgent) {
|
|
3177
3214
|
this.newTurn();
|
|
@@ -3201,8 +3238,8 @@ var ScenarioExecution = class {
|
|
|
3201
3238
|
`Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
|
|
3202
3239
|
);
|
|
3203
3240
|
}
|
|
3204
|
-
index = nextAgent.index;
|
|
3205
|
-
agent2 = nextAgent.agent;
|
|
3241
|
+
const index = nextAgent.index;
|
|
3242
|
+
const agent2 = nextAgent.agent;
|
|
3206
3243
|
this.removePendingAgent(agent2);
|
|
3207
3244
|
if (content) {
|
|
3208
3245
|
const message2 = typeof content === "string" ? {
|
|
@@ -3214,6 +3251,25 @@ var ScenarioExecution = class {
|
|
|
3214
3251
|
return null;
|
|
3215
3252
|
}
|
|
3216
3253
|
await this.callAgent(index, role, judgmentRequest);
|
|
3254
|
+
if (this.result && (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null) {
|
|
3255
|
+
this.checkpointResults.push({
|
|
3256
|
+
metCriteria: this.result.metCriteria,
|
|
3257
|
+
unmetCriteria: this.result.unmetCriteria
|
|
3258
|
+
});
|
|
3259
|
+
if (this.result.success) {
|
|
3260
|
+
this._result = void 0;
|
|
3261
|
+
return null;
|
|
3262
|
+
} else {
|
|
3263
|
+
const cp = this.compiledCheckpoints;
|
|
3264
|
+
this.result.metCriteria = cp.metCriteria;
|
|
3265
|
+
this.result.unmetCriteria = cp.unmetCriteria;
|
|
3266
|
+
return this.result;
|
|
3267
|
+
}
|
|
3268
|
+
}
|
|
3269
|
+
if (this.result) {
|
|
3270
|
+
const cp = this.compiledCheckpoints;
|
|
3271
|
+
this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
|
|
3272
|
+
}
|
|
3217
3273
|
return this.result ?? null;
|
|
3218
3274
|
}
|
|
3219
3275
|
/**
|
|
@@ -3246,11 +3302,22 @@ var ScenarioExecution = class {
|
|
|
3246
3302
|
this.totalStartTime = Date.now();
|
|
3247
3303
|
this.pendingMessages.clear();
|
|
3248
3304
|
this._result = void 0;
|
|
3305
|
+
this.checkpointResults = [];
|
|
3249
3306
|
this.logger.debug(`[${this.config.id}] Reset complete`, {
|
|
3250
3307
|
threadId: this.state.threadId,
|
|
3251
3308
|
agentCount: this.agents.length
|
|
3252
3309
|
});
|
|
3253
3310
|
}
|
|
3311
|
+
/** Compiles all accumulated checkpoint results into aggregated met/unmet criteria. */
|
|
3312
|
+
get compiledCheckpoints() {
|
|
3313
|
+
const metCriteria = [];
|
|
3314
|
+
const unmetCriteria = [];
|
|
3315
|
+
for (const cp of this.checkpointResults) {
|
|
3316
|
+
metCriteria.push(...cp.metCriteria);
|
|
3317
|
+
unmetCriteria.push(...cp.unmetCriteria);
|
|
3318
|
+
}
|
|
3319
|
+
return { metCriteria, unmetCriteria };
|
|
3320
|
+
}
|
|
3254
3321
|
nextAgentForRole(role) {
|
|
3255
3322
|
for (const agent2 of this.agents) {
|
|
3256
3323
|
if (agent2.role === role && this.pendingAgentsOnTurn.has(agent2) && this.pendingRolesOnTurn.includes(role)) {
|
|
@@ -3347,7 +3414,7 @@ var ScenarioExecution = class {
|
|
|
3347
3414
|
*/
|
|
3348
3415
|
reachedMaxTurns(errorMessage) {
|
|
3349
3416
|
var _a;
|
|
3350
|
-
this.setResult({
|
|
3417
|
+
return this.setResult({
|
|
3351
3418
|
success: false,
|
|
3352
3419
|
reasoning: errorMessage || `Reached maximum turns (${this.config.maxTurns || 10}) without conclusion`,
|
|
3353
3420
|
metCriteria: [],
|
|
@@ -3848,9 +3915,9 @@ var message = (message2) => {
|
|
|
3848
3915
|
/**
 * Creates a script step that delegates to the executor's `agent` method.
 *
 * @param content - Value forwarded unchanged to `executor.agent`.
 * @returns A step function `(_state, executor)` returning whatever
 *   `executor.agent(content)` returns.
 */
var agent = (content) => (_state, executor) => executor.agent(content);
|
|
3851
|
-
var judge = (
|
|
3918
|
+
/**
 * Creates a script step that asks the executor's judge to evaluate.
 *
 * @param options - Value forwarded unchanged to `executor.judge`.
 * @returns An async step function `(_state, executor)`; it awaits
 *   `executor.judge(options)` and resolves to `undefined`, discarding the
 *   judge's return value.
 */
var judge = (options) => async (_state, executor) => {
  await executor.judge(options);
};
|
|
3856
3923
|
var user = (content) => {
|
|
@@ -3962,7 +4029,6 @@ function formatPart(part) {
|
|
|
3962
4029
|
case "reasoning":
|
|
3963
4030
|
return `(reasoning): ${part.text}`;
|
|
3964
4031
|
default:
|
|
3965
|
-
part;
|
|
3966
4032
|
return `Unknown content: ${JSON.stringify(part)}`;
|
|
3967
4033
|
}
|
|
3968
4034
|
}
|