@langwatch/scenario 0.4.0 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +70 -47
- package/dist/index.d.ts +70 -47
- package/dist/index.js +153 -87
- package/dist/index.mjs +153 -87
- package/dist/integrations/vitest/setup.js +1 -1
- package/dist/integrations/vitest/setup.mjs +1 -1
- package/package.json +4 -4
package/dist/index.mjs
CHANGED
|
@@ -131,7 +131,7 @@ var DEFAULT_TEMPERATURE = 0;
|
|
|
131
131
|
var modelSchema = z2.object({
|
|
132
132
|
model: z2.custom((val) => Boolean(val), {
|
|
133
133
|
message: "A model is required. Configure it in scenario.config.js defaultModel or pass directly to the agent."
|
|
134
|
-
}).describe("
|
|
134
|
+
}).describe("Language model that is used by the AI SDK Core functions."),
|
|
135
135
|
temperature: z2.number().min(0).max(1).optional().describe("The temperature for the language model.").default(DEFAULT_TEMPERATURE),
|
|
136
136
|
maxTokens: z2.number().optional().describe("The maximum number of tokens to generate.")
|
|
137
137
|
});
|
|
@@ -397,7 +397,7 @@ var JudgeUtils = {
|
|
|
397
397
|
/**
|
|
398
398
|
* Builds a minimal transcript from messages for judge evaluation.
|
|
399
399
|
* Truncates base64 media to reduce token usage.
|
|
400
|
-
* @param messages - Array of
|
|
400
|
+
* @param messages - Array of ModelMessage from conversation
|
|
401
401
|
* @returns Plain text transcript with one message per line
|
|
402
402
|
*/
|
|
403
403
|
buildTranscriptFromMessages(messages) {
|
|
@@ -428,52 +428,68 @@ var createLLMInvoker = (logger2) => {
|
|
|
428
428
|
var toolMessageRole = "tool";
|
|
429
429
|
var assistantMessageRole = "assistant";
|
|
430
430
|
var userMessageRole = "user";
|
|
431
|
-
var
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
if (
|
|
442
|
-
|
|
431
|
+
var hasToolContent = (message2) => {
|
|
432
|
+
if (message2.role === toolMessageRole) return true;
|
|
433
|
+
if (!Array.isArray(message2.content)) return false;
|
|
434
|
+
return message2.content.some((part) => {
|
|
435
|
+
if (!part || typeof part !== "object") return false;
|
|
436
|
+
const partType = "type" in part ? part.type : void 0;
|
|
437
|
+
return partType === "tool-call" || partType === "tool-result";
|
|
438
|
+
});
|
|
439
|
+
};
|
|
440
|
+
var stringifyValue = (value) => {
|
|
441
|
+
if (typeof value === "string") return value;
|
|
442
|
+
if (value === void 0) return "undefined";
|
|
443
|
+
try {
|
|
444
|
+
const serialized = JSON.stringify(value);
|
|
445
|
+
return serialized === void 0 ? String(value) : serialized;
|
|
446
|
+
} catch {
|
|
447
|
+
return String(value);
|
|
443
448
|
}
|
|
444
|
-
return segments;
|
|
445
449
|
};
|
|
446
|
-
var
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
450
|
+
var summarizeToolMessage = (message2) => {
|
|
451
|
+
if (message2.role === toolMessageRole && !Array.isArray(message2.content)) {
|
|
452
|
+
return `[Tool message: ${stringifyValue(message2.content)}]`;
|
|
453
|
+
}
|
|
454
|
+
if (message2.role === toolMessageRole) {
|
|
455
|
+
const toolResults = message2.content.filter((part) => part.type === "tool-result").map((part) => {
|
|
456
|
+
const contentPart = part;
|
|
457
|
+
const name = contentPart.toolName ?? "unknown tool";
|
|
458
|
+
const output = contentPart.output;
|
|
459
|
+
const value = output && typeof output === "object" && "value" in output && typeof output.value === "string" ? output.value : output ?? contentPart.result;
|
|
460
|
+
return `[Tool result from ${name}: ${stringifyValue(value)}]`;
|
|
461
|
+
});
|
|
462
|
+
return toolResults.length > 0 ? toolResults.join("\n") : null;
|
|
463
|
+
}
|
|
464
|
+
if (!Array.isArray(message2.content)) return null;
|
|
465
|
+
const toolCalls = message2.content.filter((part) => part.type === "tool-call").map((part) => {
|
|
466
|
+
const contentPart = part;
|
|
467
|
+
const name = contentPart.toolName ?? "unknown tool";
|
|
468
|
+
return `[Called tool ${name} with: ${stringifyValue(contentPart.input)}]`;
|
|
453
469
|
});
|
|
470
|
+
return toolCalls.length > 0 ? toolCalls.join("\n") : null;
|
|
454
471
|
};
|
|
455
|
-
var
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
472
|
+
var messageRoleReversal = (messages) => {
|
|
473
|
+
const roleMap = {
|
|
474
|
+
[userMessageRole]: assistantMessageRole,
|
|
475
|
+
[assistantMessageRole]: userMessageRole
|
|
476
|
+
};
|
|
477
|
+
return messages.map((message2) => {
|
|
478
|
+
if (hasToolContent(message2)) {
|
|
479
|
+
const summary = summarizeToolMessage(message2);
|
|
480
|
+
if (!summary) return null;
|
|
481
|
+
return {
|
|
482
|
+
role: userMessageRole,
|
|
483
|
+
content: summary
|
|
484
|
+
};
|
|
485
|
+
}
|
|
463
486
|
const newRole = roleMap[message2.role];
|
|
464
487
|
if (!newRole) return message2;
|
|
465
488
|
return {
|
|
466
|
-
|
|
467
|
-
|
|
489
|
+
...message2,
|
|
490
|
+
role: newRole
|
|
468
491
|
};
|
|
469
|
-
});
|
|
470
|
-
};
|
|
471
|
-
var messageRoleReversal = (messages) => {
|
|
472
|
-
const segments = groupMessagesByToolBoundaries(messages);
|
|
473
|
-
const processedSegments = segments.map(
|
|
474
|
-
(segment) => segmentHasToolMessages(segment) ? segment : reverseSegmentRoles(segment)
|
|
475
|
-
);
|
|
476
|
-
return processedSegments.flat();
|
|
492
|
+
}).filter((message2) => message2 !== null);
|
|
477
493
|
};
|
|
478
494
|
var criterionToParamName = (criterion) => {
|
|
479
495
|
return criterion.replace(/"/g, "").replace(/[^a-zA-Z0-9]/g, "_").replace(/ /g, "_").toLowerCase().substring(0, 70);
|
|
@@ -835,7 +851,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
835
851
|
constructor(cfg) {
|
|
836
852
|
super();
|
|
837
853
|
this.cfg = cfg;
|
|
838
|
-
this.criteria = cfg.criteria;
|
|
854
|
+
this.criteria = cfg.criteria ?? [];
|
|
839
855
|
this.spanCollector = cfg.spanCollector ?? judgeSpanCollector;
|
|
840
856
|
}
|
|
841
857
|
logger = new Logger("JudgeAgent");
|
|
@@ -847,7 +863,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
847
863
|
*/
|
|
848
864
|
invokeLLM = createLLMInvoker(this.logger);
|
|
849
865
|
async call(input) {
|
|
850
|
-
var _a, _b, _c;
|
|
866
|
+
var _a, _b, _c, _d;
|
|
867
|
+
const criteria = ((_a = input.judgmentRequest) == null ? void 0 : _a.criteria) ?? this.criteria;
|
|
851
868
|
this.logger.debug("call() invoked", {
|
|
852
869
|
threadId: input.threadId,
|
|
853
870
|
currentTurn: input.scenarioState.currentTurn,
|
|
@@ -866,7 +883,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
866
883
|
</opentelemetry_traces>
|
|
867
884
|
`;
|
|
868
885
|
const cfg = this.cfg;
|
|
869
|
-
const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(
|
|
886
|
+
const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(criteria, input.scenarioConfig.description);
|
|
870
887
|
const messages = [
|
|
871
888
|
{ role: "system", content: systemPrompt },
|
|
872
889
|
{ role: "user", content: contentForJudge }
|
|
@@ -879,10 +896,10 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
879
896
|
});
|
|
880
897
|
const tools = {
|
|
881
898
|
continue_test: buildContinueTestTool(),
|
|
882
|
-
finish_test: buildFinishTestTool(
|
|
899
|
+
finish_test: buildFinishTestTool(criteria)
|
|
883
900
|
};
|
|
884
|
-
const enforceJudgement = input.judgmentRequest;
|
|
885
|
-
const hasCriteria =
|
|
901
|
+
const enforceJudgement = input.judgmentRequest != null;
|
|
902
|
+
const hasCriteria = criteria.length && criteria.length > 0;
|
|
886
903
|
if (enforceJudgement && !hasCriteria) {
|
|
887
904
|
return {
|
|
888
905
|
success: false,
|
|
@@ -907,26 +924,26 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
907
924
|
toolChoice
|
|
908
925
|
});
|
|
909
926
|
this.logger.debug("LLM response received", {
|
|
910
|
-
toolCallCount: ((
|
|
911
|
-
toolCalls: (
|
|
927
|
+
toolCallCount: ((_b = completion.toolCalls) == null ? void 0 : _b.length) ?? 0,
|
|
928
|
+
toolCalls: (_c = completion.toolCalls) == null ? void 0 : _c.map((tc) => ({
|
|
912
929
|
toolName: tc.toolName,
|
|
913
930
|
args: tc.input
|
|
914
931
|
}))
|
|
915
932
|
});
|
|
916
933
|
let args;
|
|
917
|
-
if ((
|
|
934
|
+
if ((_d = completion.toolCalls) == null ? void 0 : _d.length) {
|
|
918
935
|
const toolCall = completion.toolCalls[0];
|
|
919
936
|
switch (toolCall.toolName) {
|
|
920
937
|
case "finish_test": {
|
|
921
938
|
args = toolCall.input;
|
|
922
939
|
const verdict = args.verdict || "inconclusive";
|
|
923
940
|
const reasoning = args.reasoning || "No reasoning provided";
|
|
924
|
-
const
|
|
925
|
-
const criteriaValues = Object.values(
|
|
926
|
-
const metCriteria =
|
|
941
|
+
const criteriaArgs = args.criteria || {};
|
|
942
|
+
const criteriaValues = Object.values(criteriaArgs);
|
|
943
|
+
const metCriteria = criteria.filter(
|
|
927
944
|
(_, i) => criteriaValues[i] === "true"
|
|
928
945
|
);
|
|
929
|
-
const unmetCriteria =
|
|
946
|
+
const unmetCriteria = criteria.filter(
|
|
930
947
|
(_, i) => criteriaValues[i] !== "true"
|
|
931
948
|
);
|
|
932
949
|
const result = {
|
|
@@ -946,7 +963,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
946
963
|
success: false,
|
|
947
964
|
reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
|
|
948
965
|
metCriteria: [],
|
|
949
|
-
unmetCriteria:
|
|
966
|
+
unmetCriteria: criteria
|
|
950
967
|
};
|
|
951
968
|
}
|
|
952
969
|
}
|
|
@@ -954,7 +971,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
954
971
|
success: false,
|
|
955
972
|
reasoning: `JudgeAgent: No tool call found in LLM output`,
|
|
956
973
|
metCriteria: [],
|
|
957
|
-
unmetCriteria:
|
|
974
|
+
unmetCriteria: criteria
|
|
958
975
|
};
|
|
959
976
|
}
|
|
960
977
|
getOpenTelemetryTracesDigest(threadId) {
|
|
@@ -964,7 +981,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
|
|
|
964
981
|
}
|
|
965
982
|
};
|
|
966
983
|
var judgeAgent = (cfg) => {
|
|
967
|
-
return new JudgeAgent(cfg);
|
|
984
|
+
return new JudgeAgent(cfg ?? {});
|
|
968
985
|
};
|
|
969
986
|
|
|
970
987
|
// src/agents/user-simulator-agent.ts
|
|
@@ -2408,13 +2425,15 @@ function convertModelMessagesToAguiMessages(modelMessages) {
|
|
|
2408
2425
|
}
|
|
2409
2426
|
case msg.role === "tool":
|
|
2410
2427
|
msg.content.map((p, i) => {
|
|
2411
|
-
|
|
2428
|
+
if ("type" in p && p.type !== "tool-result") return;
|
|
2412
2429
|
aguiMessages.push({
|
|
2413
2430
|
trace_id: msg.traceId,
|
|
2414
2431
|
id: `${id}-${i}`,
|
|
2415
2432
|
role: "tool",
|
|
2416
2433
|
toolCallId: p.toolCallId,
|
|
2417
|
-
content: JSON.stringify(
|
|
2434
|
+
content: JSON.stringify(
|
|
2435
|
+
p.output && "value" in p.output ? p.output.value : p.output
|
|
2436
|
+
)
|
|
2418
2437
|
});
|
|
2419
2438
|
});
|
|
2420
2439
|
break;
|
|
@@ -2458,6 +2477,8 @@ var ScenarioExecution = class {
|
|
|
2458
2477
|
currentTurnSpan;
|
|
2459
2478
|
/** Timestamp when execution started (for total time calculation) */
|
|
2460
2479
|
totalStartTime = 0;
|
|
2480
|
+
/** Accumulated results from inline judge checkpoints */
|
|
2481
|
+
checkpointResults = [];
|
|
2461
2482
|
/** Event stream for monitoring scenario progress */
|
|
2462
2483
|
eventSubject = new Subject2();
|
|
2463
2484
|
/**
|
|
@@ -2535,6 +2556,7 @@ var ScenarioExecution = class {
|
|
|
2535
2556
|
totalTime: this.totalTime,
|
|
2536
2557
|
agentTime: totalAgentTime
|
|
2537
2558
|
};
|
|
2559
|
+
return this._result;
|
|
2538
2560
|
this.logger.debug(`[${this.config.id}] Result set`, {
|
|
2539
2561
|
success: result.success,
|
|
2540
2562
|
reasoning: result.reasoning,
|
|
@@ -2595,6 +2617,8 @@ var ScenarioExecution = class {
|
|
|
2595
2617
|
const scriptStep = this.config.script[i];
|
|
2596
2618
|
await this.executeScriptStep(scriptStep, i);
|
|
2597
2619
|
if (this.result) {
|
|
2620
|
+
const cp = this.compiledCheckpoints;
|
|
2621
|
+
this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
|
|
2598
2622
|
this.emitRunFinished({
|
|
2599
2623
|
scenarioRunId,
|
|
2600
2624
|
status: this.result.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
|
|
@@ -2603,7 +2627,22 @@ var ScenarioExecution = class {
|
|
|
2603
2627
|
return this.result;
|
|
2604
2628
|
}
|
|
2605
2629
|
}
|
|
2606
|
-
this.
|
|
2630
|
+
if (this.checkpointResults.length > 0) {
|
|
2631
|
+
const cp = this.compiledCheckpoints;
|
|
2632
|
+
const result2 = this.setResult({
|
|
2633
|
+
success: cp.unmetCriteria.length === 0,
|
|
2634
|
+
reasoning: "All inline criteria checkpoints passed",
|
|
2635
|
+
metCriteria: cp.metCriteria,
|
|
2636
|
+
unmetCriteria: cp.unmetCriteria
|
|
2637
|
+
});
|
|
2638
|
+
this.emitRunFinished({
|
|
2639
|
+
scenarioRunId,
|
|
2640
|
+
status: result2.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
|
|
2641
|
+
result: result2
|
|
2642
|
+
});
|
|
2643
|
+
return result2;
|
|
2644
|
+
}
|
|
2645
|
+
const result = this.reachedMaxTurns(
|
|
2607
2646
|
[
|
|
2608
2647
|
"Reached end of script without conclusion, add one of the following to the end of the script:",
|
|
2609
2648
|
"- `Scenario.proceed()` to let the simulation continue to play out",
|
|
@@ -2611,11 +2650,11 @@ var ScenarioExecution = class {
|
|
|
2611
2650
|
"- `Scenario.succeed()` or `Scenario.fail()` to end the test with an explicit result"
|
|
2612
2651
|
].join("\n")
|
|
2613
2652
|
);
|
|
2614
|
-
this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED
|
|
2615
|
-
return
|
|
2653
|
+
this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */, result });
|
|
2654
|
+
return result;
|
|
2616
2655
|
} catch (error) {
|
|
2617
2656
|
const errorInfo = extractErrorInfo(error);
|
|
2618
|
-
this.setResult({
|
|
2657
|
+
const result = this.setResult({
|
|
2619
2658
|
success: false,
|
|
2620
2659
|
reasoning: `Scenario failed with error: ${errorInfo.message}`,
|
|
2621
2660
|
metCriteria: [],
|
|
@@ -2625,7 +2664,7 @@ var ScenarioExecution = class {
|
|
|
2625
2664
|
this.emitRunFinished({
|
|
2626
2665
|
scenarioRunId,
|
|
2627
2666
|
status: "ERROR" /* ERROR */,
|
|
2628
|
-
result
|
|
2667
|
+
result
|
|
2629
2668
|
});
|
|
2630
2669
|
throw error;
|
|
2631
2670
|
} finally {
|
|
@@ -2729,7 +2768,7 @@ var ScenarioExecution = class {
|
|
|
2729
2768
|
* @param judgmentRequest - Whether this is a judgment request (for judge agents)
|
|
2730
2769
|
* @throws Error if the agent call fails
|
|
2731
2770
|
*/
|
|
2732
|
-
async callAgent(idx, role, judgmentRequest
|
|
2771
|
+
async callAgent(idx, role, judgmentRequest) {
|
|
2733
2772
|
var _a;
|
|
2734
2773
|
const agent2 = this.agents[idx];
|
|
2735
2774
|
const agentName = agent2.name ?? agent2.constructor.name;
|
|
@@ -2920,25 +2959,26 @@ var ScenarioExecution = class {
|
|
|
2920
2959
|
*
|
|
2921
2960
|
* This method is part of the ScenarioExecutionLike interface used by script steps.
|
|
2922
2961
|
*
|
|
2923
|
-
* @param
|
|
2962
|
+
* @param options - Optional options with inline criteria to evaluate as a checkpoint.
|
|
2924
2963
|
* @returns A promise that resolves with:
|
|
2925
2964
|
* - ScenarioResult if the judge makes a final decision, or
|
|
2926
2965
|
* - Null if the conversation should continue
|
|
2927
2966
|
*
|
|
2928
2967
|
* @example
|
|
2929
2968
|
* ```typescript
|
|
2930
|
-
* // Let judge evaluate
|
|
2969
|
+
* // Let judge evaluate with its configured criteria
|
|
2931
2970
|
* const result = await execution.judge();
|
|
2932
|
-
* if (result) {
|
|
2933
|
-
* console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
|
|
2934
|
-
* }
|
|
2935
2971
|
*
|
|
2936
|
-
* //
|
|
2937
|
-
* const result = await execution.judge(
|
|
2972
|
+
* // Evaluate inline criteria as a checkpoint
|
|
2973
|
+
* const result = await execution.judge({ criteria: ["Agent responded helpfully"] });
|
|
2938
2974
|
* ```
|
|
2939
2975
|
*/
|
|
2940
|
-
async judge(
|
|
2941
|
-
return await this.scriptCallAgent(
|
|
2976
|
+
async judge(options) {
|
|
2977
|
+
return await this.scriptCallAgent(
|
|
2978
|
+
"Judge" /* JUDGE */,
|
|
2979
|
+
void 0,
|
|
2980
|
+
{ criteria: options == null ? void 0 : options.criteria }
|
|
2981
|
+
);
|
|
2942
2982
|
}
|
|
2943
2983
|
/**
|
|
2944
2984
|
* Lets the scenario proceed automatically for a specified number of turns.
|
|
@@ -3023,13 +3063,12 @@ var ScenarioExecution = class {
|
|
|
3023
3063
|
* ```
|
|
3024
3064
|
*/
|
|
3025
3065
|
async succeed(reasoning) {
|
|
3026
|
-
this.setResult({
|
|
3066
|
+
return this.setResult({
|
|
3027
3067
|
success: true,
|
|
3028
3068
|
reasoning: reasoning || "Scenario marked as successful with Scenario.succeed()",
|
|
3029
3069
|
metCriteria: [],
|
|
3030
3070
|
unmetCriteria: []
|
|
3031
3071
|
});
|
|
3032
|
-
return this.result;
|
|
3033
3072
|
}
|
|
3034
3073
|
/**
|
|
3035
3074
|
* Immediately ends the scenario with a failure verdict.
|
|
@@ -3055,13 +3094,12 @@ var ScenarioExecution = class {
|
|
|
3055
3094
|
* ```
|
|
3056
3095
|
*/
|
|
3057
3096
|
async fail(reasoning) {
|
|
3058
|
-
this.setResult({
|
|
3097
|
+
return this.setResult({
|
|
3059
3098
|
success: false,
|
|
3060
3099
|
reasoning: reasoning || "Scenario marked as failed with Scenario.fail()",
|
|
3061
3100
|
metCriteria: [],
|
|
3062
3101
|
unmetCriteria: []
|
|
3063
3102
|
});
|
|
3064
|
-
return this.result;
|
|
3065
3103
|
}
|
|
3066
3104
|
/**
|
|
3067
3105
|
* Adds execution time for a specific agent to the performance tracking.
|
|
@@ -3105,15 +3143,14 @@ var ScenarioExecution = class {
|
|
|
3105
3143
|
* decision, or null if the conversation should continue
|
|
3106
3144
|
* @throws Error if no agent is found for the specified role
|
|
3107
3145
|
*/
|
|
3108
|
-
async scriptCallAgent(role, content, judgmentRequest
|
|
3146
|
+
async scriptCallAgent(role, content, judgmentRequest) {
|
|
3109
3147
|
this.logger.debug(`[${this.config.id}] scriptCallAgent`, {
|
|
3110
3148
|
role,
|
|
3111
3149
|
hasContent: content !== void 0,
|
|
3112
|
-
judgmentRequest
|
|
3150
|
+
judgmentRequest: judgmentRequest != null,
|
|
3151
|
+
hasInlineCriteria: (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null
|
|
3113
3152
|
});
|
|
3114
3153
|
this.consumeUntilRole(role);
|
|
3115
|
-
let index = -1;
|
|
3116
|
-
let agent2 = null;
|
|
3117
3154
|
let nextAgent = this.getNextAgentForRole(role);
|
|
3118
3155
|
if (!nextAgent) {
|
|
3119
3156
|
this.newTurn();
|
|
@@ -3143,8 +3180,8 @@ var ScenarioExecution = class {
|
|
|
3143
3180
|
`Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
|
|
3144
3181
|
);
|
|
3145
3182
|
}
|
|
3146
|
-
index = nextAgent.index;
|
|
3147
|
-
agent2 = nextAgent.agent;
|
|
3183
|
+
const index = nextAgent.index;
|
|
3184
|
+
const agent2 = nextAgent.agent;
|
|
3148
3185
|
this.removePendingAgent(agent2);
|
|
3149
3186
|
if (content) {
|
|
3150
3187
|
const message2 = typeof content === "string" ? {
|
|
@@ -3156,6 +3193,25 @@ var ScenarioExecution = class {
|
|
|
3156
3193
|
return null;
|
|
3157
3194
|
}
|
|
3158
3195
|
await this.callAgent(index, role, judgmentRequest);
|
|
3196
|
+
if (this.result && (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null) {
|
|
3197
|
+
this.checkpointResults.push({
|
|
3198
|
+
metCriteria: this.result.metCriteria,
|
|
3199
|
+
unmetCriteria: this.result.unmetCriteria
|
|
3200
|
+
});
|
|
3201
|
+
if (this.result.success) {
|
|
3202
|
+
this._result = void 0;
|
|
3203
|
+
return null;
|
|
3204
|
+
} else {
|
|
3205
|
+
const cp = this.compiledCheckpoints;
|
|
3206
|
+
this.result.metCriteria = cp.metCriteria;
|
|
3207
|
+
this.result.unmetCriteria = cp.unmetCriteria;
|
|
3208
|
+
return this.result;
|
|
3209
|
+
}
|
|
3210
|
+
}
|
|
3211
|
+
if (this.result) {
|
|
3212
|
+
const cp = this.compiledCheckpoints;
|
|
3213
|
+
this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
|
|
3214
|
+
}
|
|
3159
3215
|
return this.result ?? null;
|
|
3160
3216
|
}
|
|
3161
3217
|
/**
|
|
@@ -3188,11 +3244,22 @@ var ScenarioExecution = class {
|
|
|
3188
3244
|
this.totalStartTime = Date.now();
|
|
3189
3245
|
this.pendingMessages.clear();
|
|
3190
3246
|
this._result = void 0;
|
|
3247
|
+
this.checkpointResults = [];
|
|
3191
3248
|
this.logger.debug(`[${this.config.id}] Reset complete`, {
|
|
3192
3249
|
threadId: this.state.threadId,
|
|
3193
3250
|
agentCount: this.agents.length
|
|
3194
3251
|
});
|
|
3195
3252
|
}
|
|
3253
|
+
/** Compiles all accumulated checkpoint results into aggregated met/unmet criteria. */
|
|
3254
|
+
get compiledCheckpoints() {
|
|
3255
|
+
const metCriteria = [];
|
|
3256
|
+
const unmetCriteria = [];
|
|
3257
|
+
for (const cp of this.checkpointResults) {
|
|
3258
|
+
metCriteria.push(...cp.metCriteria);
|
|
3259
|
+
unmetCriteria.push(...cp.unmetCriteria);
|
|
3260
|
+
}
|
|
3261
|
+
return { metCriteria, unmetCriteria };
|
|
3262
|
+
}
|
|
3196
3263
|
nextAgentForRole(role) {
|
|
3197
3264
|
for (const agent2 of this.agents) {
|
|
3198
3265
|
if (agent2.role === role && this.pendingAgentsOnTurn.has(agent2) && this.pendingRolesOnTurn.includes(role)) {
|
|
@@ -3289,7 +3356,7 @@ var ScenarioExecution = class {
|
|
|
3289
3356
|
*/
|
|
3290
3357
|
reachedMaxTurns(errorMessage) {
|
|
3291
3358
|
var _a;
|
|
3292
|
-
this.setResult({
|
|
3359
|
+
return this.setResult({
|
|
3293
3360
|
success: false,
|
|
3294
3361
|
reasoning: errorMessage || `Reached maximum turns (${this.config.maxTurns || 10}) without conclusion`,
|
|
3295
3362
|
metCriteria: [],
|
|
@@ -3797,9 +3864,9 @@ var message = (message2) => {
|
|
|
3797
3864
|
var agent = (content) => {
|
|
3798
3865
|
return (_state, executor) => executor.agent(content);
|
|
3799
3866
|
};
|
|
3800
|
-
var judge = (
|
|
3867
|
+
var judge = (options) => {
|
|
3801
3868
|
return async (_state, executor) => {
|
|
3802
|
-
await executor.judge(
|
|
3869
|
+
await executor.judge(options);
|
|
3803
3870
|
};
|
|
3804
3871
|
};
|
|
3805
3872
|
var user = (content) => {
|
|
@@ -3911,7 +3978,6 @@ function formatPart(part) {
|
|
|
3911
3978
|
case "reasoning":
|
|
3912
3979
|
return `(reasoning): ${part.text}`;
|
|
3913
3980
|
default:
|
|
3914
|
-
part;
|
|
3915
3981
|
return `Unknown content: ${JSON.stringify(part)}`;
|
|
3916
3982
|
}
|
|
3917
3983
|
}
|
|
@@ -104,7 +104,7 @@ var DEFAULT_TEMPERATURE = 0;
|
|
|
104
104
|
var modelSchema = import_v42.z.object({
|
|
105
105
|
model: import_v42.z.custom((val) => Boolean(val), {
|
|
106
106
|
message: "A model is required. Configure it in scenario.config.js defaultModel or pass directly to the agent."
|
|
107
|
-
}).describe("
|
|
107
|
+
}).describe("Language model that is used by the AI SDK Core functions."),
|
|
108
108
|
temperature: import_v42.z.number().min(0).max(1).optional().describe("The temperature for the language model.").default(DEFAULT_TEMPERATURE),
|
|
109
109
|
maxTokens: import_v42.z.number().optional().describe("The maximum number of tokens to generate.")
|
|
110
110
|
});
|
|
@@ -87,7 +87,7 @@ var DEFAULT_TEMPERATURE = 0;
|
|
|
87
87
|
var modelSchema = z2.object({
|
|
88
88
|
model: z2.custom((val) => Boolean(val), {
|
|
89
89
|
message: "A model is required. Configure it in scenario.config.js defaultModel or pass directly to the agent."
|
|
90
|
-
}).describe("
|
|
90
|
+
}).describe("Language model that is used by the AI SDK Core functions."),
|
|
91
91
|
temperature: z2.number().min(0).max(1).optional().describe("The temperature for the language model.").default(DEFAULT_TEMPERATURE),
|
|
92
92
|
maxTokens: z2.number().optional().describe("The maximum number of tokens to generate.")
|
|
93
93
|
});
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@langwatch/scenario",
|
|
3
|
-
"version": "0.4.0",
|
|
3
|
+
"version": "0.4.2",
|
|
4
4
|
"description": "A TypeScript library for testing AI agents using scenarios",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"module": "dist/index.mjs",
|
|
@@ -29,9 +29,9 @@
|
|
|
29
29
|
},
|
|
30
30
|
"dependencies": {
|
|
31
31
|
"@ag-ui/core": "^0.0.28",
|
|
32
|
-
"@ai-sdk/openai": "^
|
|
32
|
+
"@ai-sdk/openai": "^3.0.26",
|
|
33
33
|
"@openai/agents": "^0.3.3",
|
|
34
|
-
"ai": "
|
|
34
|
+
"ai": "^6.0.0",
|
|
35
35
|
"chalk": "^5.6.2",
|
|
36
36
|
"langwatch": "0.9.0",
|
|
37
37
|
"open": "11.0.0",
|
|
@@ -88,7 +88,7 @@
|
|
|
88
88
|
}
|
|
89
89
|
},
|
|
90
90
|
"peerDependencies": {
|
|
91
|
-
"ai": ">=
|
|
91
|
+
"ai": ">=6.0.0",
|
|
92
92
|
"vitest": ">=3.2.4"
|
|
93
93
|
},
|
|
94
94
|
"scripts": {
|