@langwatch/scenario 0.4.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -189,7 +189,7 @@ var DEFAULT_TEMPERATURE = 0;
189
189
  var modelSchema = import_v42.z.object({
190
190
  model: import_v42.z.custom((val) => Boolean(val), {
191
191
  message: "A model is required. Configure it in scenario.config.js defaultModel or pass directly to the agent."
192
- }).describe("The OpenAI Language Model to use for generating responses."),
192
+ }).describe("Language model that is used by the AI SDK Core functions."),
193
193
  temperature: import_v42.z.number().min(0).max(1).optional().describe("The temperature for the language model.").default(DEFAULT_TEMPERATURE),
194
194
  maxTokens: import_v42.z.number().optional().describe("The maximum number of tokens to generate.")
195
195
  });
@@ -455,7 +455,7 @@ var JudgeUtils = {
455
455
  /**
456
456
  * Builds a minimal transcript from messages for judge evaluation.
457
457
  * Truncates base64 media to reduce token usage.
458
- * @param messages - Array of CoreMessage from conversation
458
+ * @param messages - Array of ModelMessage from conversation
459
459
  * @returns Plain text transcript with one message per line
460
460
  */
461
461
  buildTranscriptFromMessages(messages) {
@@ -486,52 +486,68 @@ var createLLMInvoker = (logger2) => {
486
486
  var toolMessageRole = "tool";
487
487
  var assistantMessageRole = "assistant";
488
488
  var userMessageRole = "user";
489
- var groupMessagesByToolBoundaries = (messages) => {
490
- const segments = [];
491
- let currentSegment = [];
492
- for (const message2 of messages) {
493
- currentSegment.push(message2);
494
- if (message2.role === toolMessageRole) {
495
- segments.push(currentSegment);
496
- currentSegment = [];
497
- }
498
- }
499
- if (currentSegment.length > 0) {
500
- segments.push(currentSegment);
489
+ var hasToolContent = (message2) => {
490
+ if (message2.role === toolMessageRole) return true;
491
+ if (!Array.isArray(message2.content)) return false;
492
+ return message2.content.some((part) => {
493
+ if (!part || typeof part !== "object") return false;
494
+ const partType = "type" in part ? part.type : void 0;
495
+ return partType === "tool-call" || partType === "tool-result";
496
+ });
497
+ };
498
+ var stringifyValue = (value) => {
499
+ if (typeof value === "string") return value;
500
+ if (value === void 0) return "undefined";
501
+ try {
502
+ const serialized = JSON.stringify(value);
503
+ return serialized === void 0 ? String(value) : serialized;
504
+ } catch {
505
+ return String(value);
501
506
  }
502
- return segments;
503
507
  };
504
- var segmentHasToolMessages = (segment) => {
505
- return segment.some((message2) => {
506
- if (message2.role === toolMessageRole) return true;
507
- if (message2.role === assistantMessageRole && Array.isArray(message2.content)) {
508
- return message2.content.some((part) => part.type === "tool-call");
509
- }
510
- return false;
508
+ var summarizeToolMessage = (message2) => {
509
+ if (message2.role === toolMessageRole && !Array.isArray(message2.content)) {
510
+ return `[Tool message: ${stringifyValue(message2.content)}]`;
511
+ }
512
+ if (message2.role === toolMessageRole) {
513
+ const toolResults = message2.content.filter((part) => part.type === "tool-result").map((part) => {
514
+ const contentPart = part;
515
+ const name = contentPart.toolName ?? "unknown tool";
516
+ const output = contentPart.output;
517
+ const value = output && typeof output === "object" && "value" in output && typeof output.value === "string" ? output.value : output ?? contentPart.result;
518
+ return `[Tool result from ${name}: ${stringifyValue(value)}]`;
519
+ });
520
+ return toolResults.length > 0 ? toolResults.join("\n") : null;
521
+ }
522
+ if (!Array.isArray(message2.content)) return null;
523
+ const toolCalls = message2.content.filter((part) => part.type === "tool-call").map((part) => {
524
+ const contentPart = part;
525
+ const name = contentPart.toolName ?? "unknown tool";
526
+ return `[Called tool ${name} with: ${stringifyValue(contentPart.input)}]`;
511
527
  });
528
+ return toolCalls.length > 0 ? toolCalls.join("\n") : null;
512
529
  };
513
- var reverseSegmentRoles = (segment) => {
514
- return segment.map((message2) => {
515
- const hasStringContent = typeof message2.content === "string";
516
- if (!hasStringContent) return message2;
517
- const roleMap = {
518
- [userMessageRole]: assistantMessageRole,
519
- [assistantMessageRole]: userMessageRole
520
- };
530
+ var messageRoleReversal = (messages) => {
531
+ const roleMap = {
532
+ [userMessageRole]: assistantMessageRole,
533
+ [assistantMessageRole]: userMessageRole
534
+ };
535
+ return messages.map((message2) => {
536
+ if (hasToolContent(message2)) {
537
+ const summary = summarizeToolMessage(message2);
538
+ if (!summary) return null;
539
+ return {
540
+ role: userMessageRole,
541
+ content: summary
542
+ };
543
+ }
521
544
  const newRole = roleMap[message2.role];
522
545
  if (!newRole) return message2;
523
546
  return {
524
- role: newRole,
525
- content: message2.content
547
+ ...message2,
548
+ role: newRole
526
549
  };
527
- });
528
- };
529
- var messageRoleReversal = (messages) => {
530
- const segments = groupMessagesByToolBoundaries(messages);
531
- const processedSegments = segments.map(
532
- (segment) => segmentHasToolMessages(segment) ? segment : reverseSegmentRoles(segment)
533
- );
534
- return processedSegments.flat();
550
+ }).filter((message2) => message2 !== null);
535
551
  };
536
552
  var criterionToParamName = (criterion) => {
537
553
  return criterion.replace(/"/g, "").replace(/[^a-zA-Z0-9]/g, "_").replace(/ /g, "_").toLowerCase().substring(0, 70);
@@ -893,7 +909,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
893
909
  constructor(cfg) {
894
910
  super();
895
911
  this.cfg = cfg;
896
- this.criteria = cfg.criteria;
912
+ this.criteria = cfg.criteria ?? [];
897
913
  this.spanCollector = cfg.spanCollector ?? judgeSpanCollector;
898
914
  }
899
915
  logger = new Logger("JudgeAgent");
@@ -905,7 +921,8 @@ var JudgeAgent = class extends JudgeAgentAdapter {
905
921
  */
906
922
  invokeLLM = createLLMInvoker(this.logger);
907
923
  async call(input) {
908
- var _a, _b, _c;
924
+ var _a, _b, _c, _d;
925
+ const criteria = ((_a = input.judgmentRequest) == null ? void 0 : _a.criteria) ?? this.criteria;
909
926
  this.logger.debug("call() invoked", {
910
927
  threadId: input.threadId,
911
928
  currentTurn: input.scenarioState.currentTurn,
@@ -924,7 +941,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
924
941
  </opentelemetry_traces>
925
942
  `;
926
943
  const cfg = this.cfg;
927
- const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(cfg.criteria, input.scenarioConfig.description);
944
+ const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(criteria, input.scenarioConfig.description);
928
945
  const messages = [
929
946
  { role: "system", content: systemPrompt },
930
947
  { role: "user", content: contentForJudge }
@@ -937,10 +954,10 @@ var JudgeAgent = class extends JudgeAgentAdapter {
937
954
  });
938
955
  const tools = {
939
956
  continue_test: buildContinueTestTool(),
940
- finish_test: buildFinishTestTool(cfg.criteria)
957
+ finish_test: buildFinishTestTool(criteria)
941
958
  };
942
- const enforceJudgement = input.judgmentRequest;
943
- const hasCriteria = cfg.criteria.length && cfg.criteria.length > 0;
959
+ const enforceJudgement = input.judgmentRequest != null;
960
+ const hasCriteria = criteria.length && criteria.length > 0;
944
961
  if (enforceJudgement && !hasCriteria) {
945
962
  return {
946
963
  success: false,
@@ -965,26 +982,26 @@ var JudgeAgent = class extends JudgeAgentAdapter {
965
982
  toolChoice
966
983
  });
967
984
  this.logger.debug("LLM response received", {
968
- toolCallCount: ((_a = completion.toolCalls) == null ? void 0 : _a.length) ?? 0,
969
- toolCalls: (_b = completion.toolCalls) == null ? void 0 : _b.map((tc) => ({
985
+ toolCallCount: ((_b = completion.toolCalls) == null ? void 0 : _b.length) ?? 0,
986
+ toolCalls: (_c = completion.toolCalls) == null ? void 0 : _c.map((tc) => ({
970
987
  toolName: tc.toolName,
971
988
  args: tc.input
972
989
  }))
973
990
  });
974
991
  let args;
975
- if ((_c = completion.toolCalls) == null ? void 0 : _c.length) {
992
+ if ((_d = completion.toolCalls) == null ? void 0 : _d.length) {
976
993
  const toolCall = completion.toolCalls[0];
977
994
  switch (toolCall.toolName) {
978
995
  case "finish_test": {
979
996
  args = toolCall.input;
980
997
  const verdict = args.verdict || "inconclusive";
981
998
  const reasoning = args.reasoning || "No reasoning provided";
982
- const criteria = args.criteria || {};
983
- const criteriaValues = Object.values(criteria);
984
- const metCriteria = cfg.criteria.filter(
999
+ const criteriaArgs = args.criteria || {};
1000
+ const criteriaValues = Object.values(criteriaArgs);
1001
+ const metCriteria = criteria.filter(
985
1002
  (_, i) => criteriaValues[i] === "true"
986
1003
  );
987
- const unmetCriteria = cfg.criteria.filter(
1004
+ const unmetCriteria = criteria.filter(
988
1005
  (_, i) => criteriaValues[i] !== "true"
989
1006
  );
990
1007
  const result = {
@@ -1004,7 +1021,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1004
1021
  success: false,
1005
1022
  reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
1006
1023
  metCriteria: [],
1007
- unmetCriteria: cfg.criteria
1024
+ unmetCriteria: criteria
1008
1025
  };
1009
1026
  }
1010
1027
  }
@@ -1012,7 +1029,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1012
1029
  success: false,
1013
1030
  reasoning: `JudgeAgent: No tool call found in LLM output`,
1014
1031
  metCriteria: [],
1015
- unmetCriteria: cfg.criteria
1032
+ unmetCriteria: criteria
1016
1033
  };
1017
1034
  }
1018
1035
  getOpenTelemetryTracesDigest(threadId) {
@@ -1022,7 +1039,7 @@ var JudgeAgent = class extends JudgeAgentAdapter {
1022
1039
  }
1023
1040
  };
1024
1041
  var judgeAgent = (cfg) => {
1025
- return new JudgeAgent(cfg);
1042
+ return new JudgeAgent(cfg ?? {});
1026
1043
  };
1027
1044
 
1028
1045
  // src/agents/user-simulator-agent.ts
@@ -2466,13 +2483,15 @@ function convertModelMessagesToAguiMessages(modelMessages) {
2466
2483
  }
2467
2484
  case msg.role === "tool":
2468
2485
  msg.content.map((p, i) => {
2469
- var _a;
2486
+ if ("type" in p && p.type !== "tool-result") return;
2470
2487
  aguiMessages.push({
2471
2488
  trace_id: msg.traceId,
2472
2489
  id: `${id}-${i}`,
2473
2490
  role: "tool",
2474
2491
  toolCallId: p.toolCallId,
2475
- content: JSON.stringify((_a = p.output) == null ? void 0 : _a.value)
2492
+ content: JSON.stringify(
2493
+ p.output && "value" in p.output ? p.output.value : p.output
2494
+ )
2476
2495
  });
2477
2496
  });
2478
2497
  break;
@@ -2516,6 +2535,8 @@ var ScenarioExecution = class {
2516
2535
  currentTurnSpan;
2517
2536
  /** Timestamp when execution started (for total time calculation) */
2518
2537
  totalStartTime = 0;
2538
+ /** Accumulated results from inline judge checkpoints */
2539
+ checkpointResults = [];
2519
2540
  /** Event stream for monitoring scenario progress */
2520
2541
  eventSubject = new import_rxjs2.Subject();
2521
2542
  /**
@@ -2593,6 +2614,7 @@ var ScenarioExecution = class {
2593
2614
  totalTime: this.totalTime,
2594
2615
  agentTime: totalAgentTime
2595
2616
  };
2617
+ return this._result;
2596
2618
  this.logger.debug(`[${this.config.id}] Result set`, {
2597
2619
  success: result.success,
2598
2620
  reasoning: result.reasoning,
@@ -2653,6 +2675,8 @@ var ScenarioExecution = class {
2653
2675
  const scriptStep = this.config.script[i];
2654
2676
  await this.executeScriptStep(scriptStep, i);
2655
2677
  if (this.result) {
2678
+ const cp = this.compiledCheckpoints;
2679
+ this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
2656
2680
  this.emitRunFinished({
2657
2681
  scenarioRunId,
2658
2682
  status: this.result.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
@@ -2661,7 +2685,22 @@ var ScenarioExecution = class {
2661
2685
  return this.result;
2662
2686
  }
2663
2687
  }
2664
- this.reachedMaxTurns(
2688
+ if (this.checkpointResults.length > 0) {
2689
+ const cp = this.compiledCheckpoints;
2690
+ const result2 = this.setResult({
2691
+ success: cp.unmetCriteria.length === 0,
2692
+ reasoning: "All inline criteria checkpoints passed",
2693
+ metCriteria: cp.metCriteria,
2694
+ unmetCriteria: cp.unmetCriteria
2695
+ });
2696
+ this.emitRunFinished({
2697
+ scenarioRunId,
2698
+ status: result2.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
2699
+ result: result2
2700
+ });
2701
+ return result2;
2702
+ }
2703
+ const result = this.reachedMaxTurns(
2665
2704
  [
2666
2705
  "Reached end of script without conclusion, add one of the following to the end of the script:",
2667
2706
  "- `Scenario.proceed()` to let the simulation continue to play out",
@@ -2669,11 +2708,11 @@ var ScenarioExecution = class {
2669
2708
  "- `Scenario.succeed()` or `Scenario.fail()` to end the test with an explicit result"
2670
2709
  ].join("\n")
2671
2710
  );
2672
- this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */ });
2673
- return this.result;
2711
+ this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */, result });
2712
+ return result;
2674
2713
  } catch (error) {
2675
2714
  const errorInfo = extractErrorInfo(error);
2676
- this.setResult({
2715
+ const result = this.setResult({
2677
2716
  success: false,
2678
2717
  reasoning: `Scenario failed with error: ${errorInfo.message}`,
2679
2718
  metCriteria: [],
@@ -2683,7 +2722,7 @@ var ScenarioExecution = class {
2683
2722
  this.emitRunFinished({
2684
2723
  scenarioRunId,
2685
2724
  status: "ERROR" /* ERROR */,
2686
- result: this.result
2725
+ result
2687
2726
  });
2688
2727
  throw error;
2689
2728
  } finally {
@@ -2787,7 +2826,7 @@ var ScenarioExecution = class {
2787
2826
  * @param judgmentRequest - Whether this is a judgment request (for judge agents)
2788
2827
  * @throws Error if the agent call fails
2789
2828
  */
2790
- async callAgent(idx, role, judgmentRequest = false) {
2829
+ async callAgent(idx, role, judgmentRequest) {
2791
2830
  var _a;
2792
2831
  const agent2 = this.agents[idx];
2793
2832
  const agentName = agent2.name ?? agent2.constructor.name;
@@ -2978,25 +3017,26 @@ var ScenarioExecution = class {
2978
3017
  *
2979
3018
  * This method is part of the ScenarioExecutionLike interface used by script steps.
2980
3019
  *
2981
- * @param content - Optional message to pass to the judge agent for additional context
3020
+ * @param options - Optional options with inline criteria to evaluate as a checkpoint.
2982
3021
  * @returns A promise that resolves with:
2983
3022
  * - ScenarioResult if the judge makes a final decision, or
2984
3023
  * - Null if the conversation should continue
2985
3024
  *
2986
3025
  * @example
2987
3026
  * ```typescript
2988
- * // Let judge evaluate current state
3027
+ * // Let judge evaluate with its configured criteria
2989
3028
  * const result = await execution.judge();
2990
- * if (result) {
2991
- * console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
2992
- * }
2993
3029
  *
2994
- * // Provide additional context to judge
2995
- * const result = await execution.judge("Please consider the user's satisfaction level");
3030
+ * // Evaluate inline criteria as a checkpoint
3031
+ * const result = await execution.judge({ criteria: ["Agent responded helpfully"] });
2996
3032
  * ```
2997
3033
  */
2998
- async judge(content) {
2999
- return await this.scriptCallAgent("Judge" /* JUDGE */, content, true);
3034
+ async judge(options) {
3035
+ return await this.scriptCallAgent(
3036
+ "Judge" /* JUDGE */,
3037
+ void 0,
3038
+ { criteria: options == null ? void 0 : options.criteria }
3039
+ );
3000
3040
  }
3001
3041
  /**
3002
3042
  * Lets the scenario proceed automatically for a specified number of turns.
@@ -3081,13 +3121,12 @@ var ScenarioExecution = class {
3081
3121
  * ```
3082
3122
  */
3083
3123
  async succeed(reasoning) {
3084
- this.setResult({
3124
+ return this.setResult({
3085
3125
  success: true,
3086
3126
  reasoning: reasoning || "Scenario marked as successful with Scenario.succeed()",
3087
3127
  metCriteria: [],
3088
3128
  unmetCriteria: []
3089
3129
  });
3090
- return this.result;
3091
3130
  }
3092
3131
  /**
3093
3132
  * Immediately ends the scenario with a failure verdict.
@@ -3113,13 +3152,12 @@ var ScenarioExecution = class {
3113
3152
  * ```
3114
3153
  */
3115
3154
  async fail(reasoning) {
3116
- this.setResult({
3155
+ return this.setResult({
3117
3156
  success: false,
3118
3157
  reasoning: reasoning || "Scenario marked as failed with Scenario.fail()",
3119
3158
  metCriteria: [],
3120
3159
  unmetCriteria: []
3121
3160
  });
3122
- return this.result;
3123
3161
  }
3124
3162
  /**
3125
3163
  * Adds execution time for a specific agent to the performance tracking.
@@ -3163,15 +3201,14 @@ var ScenarioExecution = class {
3163
3201
  * decision, or null if the conversation should continue
3164
3202
  * @throws Error if no agent is found for the specified role
3165
3203
  */
3166
- async scriptCallAgent(role, content, judgmentRequest = false) {
3204
+ async scriptCallAgent(role, content, judgmentRequest) {
3167
3205
  this.logger.debug(`[${this.config.id}] scriptCallAgent`, {
3168
3206
  role,
3169
3207
  hasContent: content !== void 0,
3170
- judgmentRequest
3208
+ judgmentRequest: judgmentRequest != null,
3209
+ hasInlineCriteria: (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null
3171
3210
  });
3172
3211
  this.consumeUntilRole(role);
3173
- let index = -1;
3174
- let agent2 = null;
3175
3212
  let nextAgent = this.getNextAgentForRole(role);
3176
3213
  if (!nextAgent) {
3177
3214
  this.newTurn();
@@ -3201,8 +3238,8 @@ var ScenarioExecution = class {
3201
3238
  `Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
3202
3239
  );
3203
3240
  }
3204
- index = nextAgent.index;
3205
- agent2 = nextAgent.agent;
3241
+ const index = nextAgent.index;
3242
+ const agent2 = nextAgent.agent;
3206
3243
  this.removePendingAgent(agent2);
3207
3244
  if (content) {
3208
3245
  const message2 = typeof content === "string" ? {
@@ -3214,6 +3251,25 @@ var ScenarioExecution = class {
3214
3251
  return null;
3215
3252
  }
3216
3253
  await this.callAgent(index, role, judgmentRequest);
3254
+ if (this.result && (judgmentRequest == null ? void 0 : judgmentRequest.criteria) != null) {
3255
+ this.checkpointResults.push({
3256
+ metCriteria: this.result.metCriteria,
3257
+ unmetCriteria: this.result.unmetCriteria
3258
+ });
3259
+ if (this.result.success) {
3260
+ this._result = void 0;
3261
+ return null;
3262
+ } else {
3263
+ const cp = this.compiledCheckpoints;
3264
+ this.result.metCriteria = cp.metCriteria;
3265
+ this.result.unmetCriteria = cp.unmetCriteria;
3266
+ return this.result;
3267
+ }
3268
+ }
3269
+ if (this.result) {
3270
+ const cp = this.compiledCheckpoints;
3271
+ this.result.metCriteria = [...cp.metCriteria, ...this.result.metCriteria];
3272
+ }
3217
3273
  return this.result ?? null;
3218
3274
  }
3219
3275
  /**
@@ -3246,11 +3302,22 @@ var ScenarioExecution = class {
3246
3302
  this.totalStartTime = Date.now();
3247
3303
  this.pendingMessages.clear();
3248
3304
  this._result = void 0;
3305
+ this.checkpointResults = [];
3249
3306
  this.logger.debug(`[${this.config.id}] Reset complete`, {
3250
3307
  threadId: this.state.threadId,
3251
3308
  agentCount: this.agents.length
3252
3309
  });
3253
3310
  }
3311
+ /** Compiles all accumulated checkpoint results into aggregated met/unmet criteria. */
3312
+ get compiledCheckpoints() {
3313
+ const metCriteria = [];
3314
+ const unmetCriteria = [];
3315
+ for (const cp of this.checkpointResults) {
3316
+ metCriteria.push(...cp.metCriteria);
3317
+ unmetCriteria.push(...cp.unmetCriteria);
3318
+ }
3319
+ return { metCriteria, unmetCriteria };
3320
+ }
3254
3321
  nextAgentForRole(role) {
3255
3322
  for (const agent2 of this.agents) {
3256
3323
  if (agent2.role === role && this.pendingAgentsOnTurn.has(agent2) && this.pendingRolesOnTurn.includes(role)) {
@@ -3347,7 +3414,7 @@ var ScenarioExecution = class {
3347
3414
  */
3348
3415
  reachedMaxTurns(errorMessage) {
3349
3416
  var _a;
3350
- this.setResult({
3417
+ return this.setResult({
3351
3418
  success: false,
3352
3419
  reasoning: errorMessage || `Reached maximum turns (${this.config.maxTurns || 10}) without conclusion`,
3353
3420
  metCriteria: [],
@@ -3848,9 +3915,9 @@ var message = (message2) => {
3848
3915
  var agent = (content) => {
3849
3916
  return (_state, executor) => executor.agent(content);
3850
3917
  };
3851
- var judge = (content) => {
3918
+ var judge = (options) => {
3852
3919
  return async (_state, executor) => {
3853
- await executor.judge(content);
3920
+ await executor.judge(options);
3854
3921
  };
3855
3922
  };
3856
3923
  var user = (content) => {
@@ -3962,7 +4029,6 @@ function formatPart(part) {
3962
4029
  case "reasoning":
3963
4030
  return `(reasoning): ${part.text}`;
3964
4031
  default:
3965
- part;
3966
4032
  return `Unknown content: ${JSON.stringify(part)}`;
3967
4033
  }
3968
4034
  }