@orq-ai/evaluatorq 1.2.2 → 1.2.3-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/dist/lib/integrations/ai-sdk/index.d.ts +2 -0
  2. package/dist/lib/integrations/ai-sdk/index.d.ts.map +1 -1
  3. package/dist/lib/integrations/ai-sdk/index.js +1 -0
  4. package/dist/lib/integrations/ai-sdk/simulation-adapter.d.ts +47 -0
  5. package/dist/lib/integrations/ai-sdk/simulation-adapter.d.ts.map +1 -0
  6. package/dist/lib/integrations/ai-sdk/simulation-adapter.js +58 -0
  7. package/dist/lib/integrations/langchain/index.d.ts +2 -0
  8. package/dist/lib/integrations/langchain/index.d.ts.map +1 -1
  9. package/dist/lib/integrations/langchain/index.js +1 -0
  10. package/dist/lib/integrations/langchain/simulation-adapter.d.ts +49 -0
  11. package/dist/lib/integrations/langchain/simulation-adapter.d.ts.map +1 -0
  12. package/dist/lib/integrations/langchain/simulation-adapter.js +110 -0
  13. package/dist/lib/integrations/simulation/adapters.d.ts +57 -0
  14. package/dist/lib/integrations/simulation/adapters.d.ts.map +1 -0
  15. package/dist/lib/integrations/simulation/adapters.js +64 -0
  16. package/dist/lib/integrations/simulation/agents/base.d.ts +90 -0
  17. package/dist/lib/integrations/simulation/agents/base.d.ts.map +1 -0
  18. package/dist/lib/integrations/simulation/agents/base.js +227 -0
  19. package/dist/lib/integrations/simulation/agents/index.d.ts +10 -0
  20. package/dist/lib/integrations/simulation/agents/index.d.ts.map +1 -0
  21. package/dist/lib/integrations/simulation/agents/index.js +6 -0
  22. package/dist/lib/integrations/simulation/agents/judge.d.ts +50 -0
  23. package/dist/lib/integrations/simulation/agents/judge.d.ts.map +1 -0
  24. package/dist/lib/integrations/simulation/agents/judge.js +313 -0
  25. package/dist/lib/integrations/simulation/agents/user-simulator.d.ts +41 -0
  26. package/dist/lib/integrations/simulation/agents/user-simulator.d.ts.map +1 -0
  27. package/dist/lib/integrations/simulation/agents/user-simulator.js +82 -0
  28. package/dist/lib/integrations/simulation/convert.d.ts +22 -0
  29. package/dist/lib/integrations/simulation/convert.d.ts.map +1 -0
  30. package/dist/lib/integrations/simulation/convert.js +124 -0
  31. package/dist/lib/integrations/simulation/evaluators/index.d.ts +50 -0
  32. package/dist/lib/integrations/simulation/evaluators/index.d.ts.map +1 -0
  33. package/dist/lib/integrations/simulation/evaluators/index.js +100 -0
  34. package/dist/lib/integrations/simulation/generators/datapoint-generator.d.ts +60 -0
  35. package/dist/lib/integrations/simulation/generators/datapoint-generator.d.ts.map +1 -0
  36. package/dist/lib/integrations/simulation/generators/datapoint-generator.js +223 -0
  37. package/dist/lib/integrations/simulation/generators/first-message-generator.d.ts +38 -0
  38. package/dist/lib/integrations/simulation/generators/first-message-generator.d.ts.map +1 -0
  39. package/dist/lib/integrations/simulation/generators/first-message-generator.js +131 -0
  40. package/dist/lib/integrations/simulation/generators/index.d.ts +15 -0
  41. package/dist/lib/integrations/simulation/generators/index.d.ts.map +1 -0
  42. package/dist/lib/integrations/simulation/generators/index.js +10 -0
  43. package/dist/lib/integrations/simulation/generators/persona-generator.d.ts +60 -0
  44. package/dist/lib/integrations/simulation/generators/persona-generator.d.ts.map +1 -0
  45. package/dist/lib/integrations/simulation/generators/persona-generator.js +333 -0
  46. package/dist/lib/integrations/simulation/generators/scenario-generator.d.ts +77 -0
  47. package/dist/lib/integrations/simulation/generators/scenario-generator.d.ts.map +1 -0
  48. package/dist/lib/integrations/simulation/generators/scenario-generator.js +545 -0
  49. package/dist/lib/integrations/simulation/index.d.ts +33 -0
  50. package/dist/lib/integrations/simulation/index.d.ts.map +1 -0
  51. package/dist/lib/integrations/simulation/index.js +35 -0
  52. package/dist/lib/integrations/simulation/quality/index.d.ts +5 -0
  53. package/dist/lib/integrations/simulation/quality/index.d.ts.map +1 -0
  54. package/dist/lib/integrations/simulation/quality/index.js +4 -0
  55. package/dist/lib/integrations/simulation/quality/message-perturbation.d.ts +25 -0
  56. package/dist/lib/integrations/simulation/quality/message-perturbation.d.ts.map +1 -0
  57. package/dist/lib/integrations/simulation/quality/message-perturbation.js +150 -0
  58. package/dist/lib/integrations/simulation/runner/index.d.ts +5 -0
  59. package/dist/lib/integrations/simulation/runner/index.d.ts.map +1 -0
  60. package/dist/lib/integrations/simulation/runner/index.js +4 -0
  61. package/dist/lib/integrations/simulation/runner/simulation.d.ts +57 -0
  62. package/dist/lib/integrations/simulation/runner/simulation.d.ts.map +1 -0
  63. package/dist/lib/integrations/simulation/runner/simulation.js +336 -0
  64. package/dist/lib/integrations/simulation/schemas.d.ts +104 -0
  65. package/dist/lib/integrations/simulation/schemas.d.ts.map +1 -0
  66. package/dist/lib/integrations/simulation/schemas.js +76 -0
  67. package/dist/lib/integrations/simulation/simulation/index.d.ts +49 -0
  68. package/dist/lib/integrations/simulation/simulation/index.d.ts.map +1 -0
  69. package/dist/lib/integrations/simulation/simulation/index.js +159 -0
  70. package/dist/lib/integrations/simulation/types.d.ts +101 -0
  71. package/dist/lib/integrations/simulation/types.d.ts.map +1 -0
  72. package/dist/lib/integrations/simulation/types.js +90 -0
  73. package/dist/lib/integrations/simulation/utils/dataset-export.d.ts +31 -0
  74. package/dist/lib/integrations/simulation/utils/dataset-export.d.ts.map +1 -0
  75. package/dist/lib/integrations/simulation/utils/dataset-export.js +146 -0
  76. package/dist/lib/integrations/simulation/utils/extract-json.d.ts +17 -0
  77. package/dist/lib/integrations/simulation/utils/extract-json.d.ts.map +1 -0
  78. package/dist/lib/integrations/simulation/utils/extract-json.js +106 -0
  79. package/dist/lib/integrations/simulation/utils/prompt-builders.d.ts +34 -0
  80. package/dist/lib/integrations/simulation/utils/prompt-builders.d.ts.map +1 -0
  81. package/dist/lib/integrations/simulation/utils/prompt-builders.js +147 -0
  82. package/dist/lib/integrations/simulation/utils/sanitize.d.ts +15 -0
  83. package/dist/lib/integrations/simulation/utils/sanitize.d.ts.map +1 -0
  84. package/dist/lib/integrations/simulation/utils/sanitize.js +20 -0
  85. package/dist/lib/integrations/simulation/wrap-agent.d.ts +65 -0
  86. package/dist/lib/integrations/simulation/wrap-agent.d.ts.map +1 -0
  87. package/dist/lib/integrations/simulation/wrap-agent.js +140 -0
  88. package/dist/lib/send-results.d.ts.map +1 -1
  89. package/dist/lib/send-results.js +17 -2
  90. package/dist/lib/types.d.ts +2 -2
  91. package/dist/lib/types.d.ts.map +1 -1
  92. package/dist/tsconfig.lib.tsbuildinfo +1 -1
  93. package/package.json +24 -2
@@ -0,0 +1,227 @@
1
+ /**
2
+ * Base agent class for simulation agents.
3
+ *
4
+ * Provides common functionality for all agents in the simulation system,
5
+ * including LLM interaction with retry logic.
6
+ */
7
+ import OpenAI from "openai";
// ---------------------------------------------------------------------------
// Retry / timeout tuning
// ---------------------------------------------------------------------------
/** Total number of LLM call attempts (the first try plus any retries). */
const MAX_RETRY_ATTEMPTS = 5;
/** Backoff delay before the first retry, in milliseconds; doubled for each later retry. */
const RETRY_MIN_WAIT_MS = 2 * 1000;
/** Upper bound on the backoff delay (before jitter is added), in milliseconds. */
const RETRY_MAX_WAIT_MS = 60 * 1000;
/** Per-request timeout, in seconds, used when the caller does not supply one. */
const DEFAULT_TIMEOUT_S = 60;
/**
 * Report whether a failed request with the given HTTP status is worth retrying.
 *
 * @param status - HTTP status code of the failed response, or `undefined`
 *   when the error carried no status.
 * @returns `true` for 429 (rate limited) and for any status of 500 or above;
 *   `false` otherwise, including when `status` is `undefined`.
 */
function isRetryableStatus(status) {
    return status !== undefined && (status === 429 || status >= 500);
}
/**
 * Abstract base class for simulation agents.
 *
 * Provides common LLM interaction functionality with exponential-backoff
 * retry logic and cumulative token-usage tracking.
 *
 * The class reads `this.name` (used in error messages) and
 * `this.systemPrompt` (sent as the leading system message) without defining
 * them, so concrete agents must supply both.
 *
 * **Client injection**: pass an existing `OpenAI` client via `config.client`
 * to share a single HTTP connection across multiple agents. The agent will
 * NOT close an injected client -- the caller is responsible for its lifecycle.
 */
export class BaseAgent {
    /** Model identifier sent with every chat-completion request. */
    model;
    /** Client used for all chat-completion requests. */
    client;
    /** `true` when the constructor created `client` itself; gates `close()`. */
    clientOwned;
    /** Running `prompt_tokens` / `completion_tokens` / `total_tokens` totals. */
    usage;
    /**
     * @param config - Optional settings: `model` (defaults to
     *   `"azure/gpt-4o-mini"`), `client` (an existing client to reuse) and
     *   `apiKey` (used only when no client is injected; falls back to the
     *   `ORQ_API_KEY` environment variable). When the agent builds its own
     *   client, the base URL comes from `ROUTER_BASE_URL` or defaults to
     *   `https://api.orq.ai/v2/router`.
     * @throws {Error} If no client is injected and no API key can be resolved
     */
    constructor(config) {
        this.model = config?.model ?? "azure/gpt-4o-mini";
        if (config?.client) {
            this.client = config.client;
            this.clientOwned = false;
        }
        else {
            const resolvedApiKey = config?.apiKey ?? process.env.ORQ_API_KEY;
            if (!resolvedApiKey) {
                throw new Error("ORQ_API_KEY environment variable is not set. Set it or pass apiKey in AgentConfig.");
            }
            this.client = new OpenAI({
                baseURL: process.env.ROUTER_BASE_URL ?? "https://api.orq.ai/v2/router",
                apiKey: resolvedApiKey,
            });
            this.clientOwned = true;
        }
        this.usage = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
    }
    // ---------------------------------------------------------------------------
    // Public API
    // ---------------------------------------------------------------------------
    /**
     * Generate a text response for a conversation.
     *
     * @param messages - Conversation history
     * @param options - Temperature, maxTokens, timeout and abort-signal overrides
     * @returns The agent's response text
     * @throws {Error} If the LLM call returns no content
     */
    async respondAsync(messages, options) {
        const result = await this.callLLM(messages, {
            temperature: options?.temperature,
            maxTokens: options?.maxTokens,
            timeout: options?.timeout,
            signal: options?.signal,
        });
        if (!result.content) {
            throw new Error(`${this.name}: LLM call failed -- no content in response`);
        }
        return result.content;
    }
    /**
     * Get cumulative token usage for this agent.
     *
     * @returns A copy of the counters, so callers cannot mutate internal state
     */
    getUsage() {
        return { ...this.usage };
    }
    /**
     * Reset token usage counters to zero.
     */
    resetUsage() {
        this.usage = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
    }
    /**
     * Close the underlying HTTP client.
     *
     * Only closes clients that the agent created itself (not injected ones).
     */
    async close() {
        // The OpenAI Node SDK does not currently expose a public close() method,
        // but we guard against future changes and respect ownership semantics.
        if (this.clientOwned && typeof this.client.close === "function") {
            await this.client.close();
        }
    }
    // ---------------------------------------------------------------------------
    // Protected helpers
    // ---------------------------------------------------------------------------
    /**
     * Call the LLM with retry logic (exponential backoff with jitter).
     *
     * Retries on rate-limit (429), server errors (500+) and transient network
     * failures. All other errors -- and any cancellation or per-request
     * timeout -- are raised immediately.
     *
     * @param messages - Conversation history; only `role` and `content` of
     *   each entry are forwarded, after a system message built from
     *   `this.systemPrompt`
     * @param options - `temperature` (default 0.7), `maxTokens` (default 2048),
     *   `timeout` in seconds (default `DEFAULT_TIMEOUT_S`), an optional abort
     *   `signal`, and optional `tools` (sent with `tool_choice: "auto"`)
     * @returns `{ content }`, plus `tool_calls` when the model requested any
     * @throws The last error once all attempts are exhausted, or the first
     *   non-retryable error
     */
    async callLLM(messages, options) {
        const temperature = options?.temperature ?? 0.7;
        const maxTokens = options?.maxTokens ?? 2048;
        const timeoutS = options?.timeout ?? DEFAULT_TIMEOUT_S;
        const fullMessages = [
            { role: "system", content: this.systemPrompt },
            ...messages.map((m) => ({
                role: m.role,
                content: m.content,
            })),
        ];
        let lastError;
        for (let attempt = 1; attempt <= MAX_RETRY_ATTEMPTS; attempt++) {
            try {
                // Bail immediately if already cancelled
                if (options?.signal?.aborted) {
                    throw new Error("Cancelled");
                }
                const controller = new AbortController();
                const timer = setTimeout(() => controller.abort(), timeoutS * 1000);
                // Link external signal to this request's controller
                const onAbort = () => controller.abort();
                options?.signal?.addEventListener("abort", onAbort, { once: true });
                try {
                    const params = {
                        model: this.model,
                        messages: fullMessages,
                        temperature,
                        max_tokens: maxTokens,
                    };
                    if (options?.tools && options.tools.length > 0) {
                        params.tools = options.tools;
                        params.tool_choice = "auto";
                    }
                    const response = await this.client.chat.completions.create(params, {
                        signal: controller.signal,
                    });
                    const choice = response.choices[0];
                    if (!choice) {
                        throw new Error(`${this.name}: No choices in response`);
                    }
                    const message = choice.message;
                    // Accumulate token usage. A missing counter is treated as 0 so
                    // that one partial usage payload cannot turn the totals into NaN.
                    if (response.usage) {
                        this.usage.prompt_tokens += response.usage.prompt_tokens ?? 0;
                        this.usage.completion_tokens += response.usage.completion_tokens ?? 0;
                        this.usage.total_tokens += response.usage.total_tokens ?? 0;
                    }
                    const result = {
                        content: message.content ?? "",
                    };
                    if (message.tool_calls && message.tool_calls.length > 0) {
                        result.tool_calls = message.tool_calls;
                    }
                    return result;
                }
                finally {
                    // Runs on success and failure alike: stop the timeout timer and
                    // detach from the caller's signal so listeners do not pile up.
                    clearTimeout(timer);
                    options?.signal?.removeEventListener("abort", onAbort);
                }
            }
            catch (err) {
                lastError = err;
                // Abort errors (from timeout cancellation) should never be retried
                if (err instanceof Error && err.name === "AbortError") {
                    throw err;
                }
                // Determine if retryable
                const isApiError = err instanceof OpenAI.APIError;
                const status = isApiError ? err.status : undefined;
                // The SDK wraps transport failures in APIConnectionError, which is
                // an APIError without a status; it must be recognised explicitly
                // or connection drops would never be retried. User aborts use a
                // different class and stay non-retryable.
                const isSdkConnectionError = typeof OpenAI.APIConnectionError === "function" &&
                    err instanceof OpenAI.APIConnectionError;
                // For non-SDK errors, look at the errno-style code on the error or
                // on its `cause` (where fetch puts the underlying socket error).
                // APIError.code is an API-level code, hence the !isApiError guard.
                const hasTransientErrno = !isApiError &&
                    err instanceof Error &&
                    [err.code, err.cause?.code].some((code) => typeof code === "string" &&
                        /^E(CONN|TIMEDOUT|TIMEOUT|NOTFOUND|RESET|AI_AGAIN)/.test(code));
                const isNetworkError = isSdkConnectionError || hasTransientErrno;
                // Re-throw immediately for external cancellation
                if (options?.signal?.aborted)
                    throw err;
                if (!isRetryableStatus(status) && !isNetworkError) {
                    throw err;
                }
                if (attempt < MAX_RETRY_ATTEMPTS) {
                    const baseWait = RETRY_MIN_WAIT_MS * 2 ** (attempt - 1);
                    const waitMs = Math.min(baseWait, RETRY_MAX_WAIT_MS);
                    // Add jitter (0-25% of wait time)
                    const jitter = Math.random() * waitMs * 0.25;
                    await sleepCancellable(waitMs + jitter, options?.signal);
                }
            }
        }
        throw (lastError ??
            new Error(`${this.name}: Max retries (${MAX_RETRY_ATTEMPTS}) exceeded`));
    }
}
208
+ // ---------------------------------------------------------------------------
209
+ // Utility
210
+ // ---------------------------------------------------------------------------
/**
 * Wait for `ms` milliseconds unless `signal` is aborted first.
 *
 * @param ms - Delay in milliseconds
 * @param signal - Optional abort signal that cuts the wait short
 * @returns A promise that resolves with `undefined` after the delay, or
 *   rejects with `Error("Cancelled")` if the signal is already aborted or
 *   aborts during the wait
 */
function sleepCancellable(ms, signal) {
    return new Promise((resolve, reject) => {
        if (signal?.aborted) {
            reject(new Error("Cancelled"));
            return;
        }
        let handle;
        const cancel = () => {
            // Stop the pending timer so the promise cannot also resolve later.
            clearTimeout(handle);
            reject(new Error("Cancelled"));
        };
        handle = setTimeout(() => {
            // Normal completion: detach from the signal before resolving.
            signal?.removeEventListener("abort", cancel);
            resolve(undefined);
        }, ms);
        signal?.addEventListener("abort", cancel, { once: true });
    });
}
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Agent exports for the simulation framework.
3
+ */
4
+ export type { AgentConfig, LLMResult } from "./base.js";
5
+ export { BaseAgent } from "./base.js";
6
+ export type { JudgeAgentConfig } from "./judge.js";
7
+ export { DEFAULT_JUDGE_PROMPT, JUDGE_TOOLS, JudgeAgent } from "./judge.js";
8
+ export type { UserSimulatorAgentConfig } from "./user-simulator.js";
9
+ export { DEFAULT_USER_SIMULATOR_PROMPT, UserSimulatorAgent, } from "./user-simulator.js";
10
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../../src/lib/integrations/simulation/agents/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,YAAY,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AACxD,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AACtC,YAAY,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AACnD,OAAO,EAAE,oBAAoB,EAAE,WAAW,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AAC3E,YAAY,EAAE,wBAAwB,EAAE,MAAM,qBAAqB,CAAC;AACpE,OAAO,EACL,6BAA6B,EAC7B,kBAAkB,GACnB,MAAM,qBAAqB,CAAC"}
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Agent exports for the simulation framework.
3
+ */
4
+ export { BaseAgent } from "./base.js";
5
+ export { DEFAULT_JUDGE_PROMPT, JUDGE_TOOLS, JudgeAgent } from "./judge.js";
6
+ export { DEFAULT_USER_SIMULATOR_PROMPT, UserSimulatorAgent, } from "./user-simulator.js";
@@ -0,0 +1,50 @@
1
+ /**
2
+ * Judge agent for conversation evaluation.
3
+ *
4
+ * Evaluates conversations and decides when to terminate based on
5
+ * goal achievement or rule violations.
6
+ */
7
+ import type OpenAI from "openai";
8
+ import type { ChatMessage, Criterion, Judgment } from "../types.js";
9
+ import type { AgentConfig } from "./base.js";
10
+ import { BaseAgent } from "./base.js";
11
+ export declare const JUDGE_TOOLS: OpenAI.Chat.Completions.ChatCompletionTool[];
12
+ export declare const DEFAULT_JUDGE_PROMPT = "You are a conversation judge. Your role is to evaluate conversations between a user and an AI agent.\n\nYou will be given:\n1. The conversation history\n2. The user's goal\n3. Criteria that should or should not be satisfied\n\nYour task:\n- Evaluate whether the conversation should continue or end\n- Determine if the user's goal has been achieved\n- Check if any rules/criteria have been violated\n\nDecision rules:\n1. FINISH if the user's goal is clearly achieved\n2. FINISH if any \"must_not_happen\" criteria are violated\n3. CONTINUE if the goal is not yet achieved and no rules are broken\n4. CONTINUE if progress is being made toward the goal\n\nFor EVERY evaluation (continue or finish), also assess the agent's LAST response:\n- response_quality: How helpful, accurate, and complete was the response? (0.0=poor, 1.0=excellent)\n- hallucination_risk: Did the agent make up information not grounded in the conversation? (0.0=none, 1.0=high risk)\n- tone_appropriateness: Was the agent's tone appropriate for the situation? (0.0=inappropriate, 1.0=perfect)\n- factual_accuracy: If GROUND TRUTH is provided below, score how accurate the agent's response is against it (0.0=wrong, 1.0=correct). Skip if no ground truth.\n\nYou MUST call one of the provided tools to make your decision.";
13
+ export interface JudgeAgentConfig extends AgentConfig { // Options accepted by the JudgeAgent constructor, in addition to the base AgentConfig fields.
14
+ goal?: string; // The simulated user's goal; inserted into the judge's system prompt. Treated as "" when omitted.
15
+ criteria?: Criterion[]; // Criteria listed in the system prompt under MUST HAPPEN / MUST NOT HAPPEN. Treated as [] when omitted.
16
+ groundTruth?: string; // Reference answer for scoring factual_accuracy; the GROUND TRUTH prompt section is left out when this is empty.
17
+ }
18
+ /**
19
+ * Agent that evaluates conversations and decides termination.
20
+ *
21
+ * Uses tool calling to make structured decisions about whether a conversation
22
+ * should continue or end.
23
+ */
24
+ export declare class JudgeAgent extends BaseAgent {
25
+ private goal; // User goal text from the config ("" when not provided).
26
+ private criteria; // Evaluation criteria from the config ([] when not provided).
27
+ private groundTruth; // Ground-truth text from the config ("" when not provided).
28
+ constructor(config?: JudgeAgentConfig);
29
+ get name(): string; // Always "JudgeAgent".
30
+ get systemPrompt(): string; // DEFAULT_JUDGE_PROMPT followed by the goal, the formatted criteria and, when set, the ground truth.
31
+ /**
32
+ * Evaluate a conversation and decide next action.
33
+ *
34
+ * @param messages - Conversation history to evaluate
35
+ * @returns Judgment with termination decision and reasoning
36
+ */
37
+ evaluate(messages: ChatMessage[], options?: {
38
+ signal?: AbortSignal;
39
+ }): Promise<Judgment>;
40
+ private parseJudgment; // Turns the LLM result into a Judgment; yields a terminating Judgment when no tool call is present, its arguments cannot be parsed, or the tool name is unknown.
41
+ /**
42
+ * Extract and clamp quality scores from tool call arguments.
43
+ */
44
+ private static extractQualityScores;
45
+ /**
46
+ * Format criteria for the system prompt.
47
+ */
48
+ private formatCriteria;
49
+ }
50
+ //# sourceMappingURL=judge.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"judge.d.ts","sourceRoot":"","sources":["../../../../../src/lib/integrations/simulation/agents/judge.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,MAAM,MAAM,QAAQ,CAAC;AAEjC,OAAO,KAAK,EAAE,WAAW,EAAE,SAAS,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAEpE,OAAO,KAAK,EAAE,WAAW,EAAa,MAAM,WAAW,CAAC;AACxD,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AAiCtC,eAAO,MAAM,WAAW,EAAE,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,kBAAkB,EA2DnE,CAAC;AAMF,eAAO,MAAM,oBAAoB,+wCAwB8B,CAAC;AAMhE,MAAM,WAAW,gBAAiB,SAAQ,WAAW;IACnD,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,QAAQ,CAAC,EAAE,SAAS,EAAE,CAAC;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAmBD;;;;;GAKG;AACH,qBAAa,UAAW,SAAQ,SAAS;IACvC,OAAO,CAAC,IAAI,CAAS;IACrB,OAAO,CAAC,QAAQ,CAAc;IAC9B,OAAO,CAAC,WAAW,CAAS;gBAEhB,MAAM,CAAC,EAAE,gBAAgB;IAOrC,IAAI,IAAI,IAAI,MAAM,CAEjB;IAED,IAAI,YAAY,IAAI,MAAM,CASzB;IAED;;;;;OAKG;IACG,QAAQ,CACZ,QAAQ,EAAE,WAAW,EAAE,EACvB,OAAO,CAAC,EAAE;QAAE,MAAM,CAAC,EAAE,WAAW,CAAA;KAAE,GACjC,OAAO,CAAC,QAAQ,CAAC;IAuBpB,OAAO,CAAC,aAAa;IAiGrB;;OAEG;IACH,OAAO,CAAC,MAAM,CAAC,oBAAoB;IAkBnC;;OAEG;IACH,OAAO,CAAC,cAAc;CAuBvB"}
@@ -0,0 +1,313 @@
1
+ /**
2
+ * Judge agent for conversation evaluation.
3
+ *
4
+ * Evaluates conversations and decides when to terminate based on
5
+ * goal achievement or rule violations.
6
+ */
7
+ import { delimit } from "../utils/sanitize.js";
8
+ import { BaseAgent } from "./base.js";
9
+ // ---------------------------------------------------------------------------
10
+ // Quality score property definitions (shared by both judge tools)
11
+ // ---------------------------------------------------------------------------
12
+ const QUALITY_SCORE_PROPERTIES = {
13
+ response_quality: {
14
+ type: "number",
15
+ description: "Quality of the agent's last response: helpful, accurate, complete (0.0=poor, 1.0=excellent)",
16
+ },
17
+ hallucination_risk: {
18
+ type: "number",
19
+ description: "Risk that the agent fabricated information not grounded in the conversation (0.0=none, 1.0=high risk)",
20
+ },
21
+ tone_appropriateness: {
22
+ type: "number",
23
+ description: "How appropriate the agent's tone was for the situation (0.0=inappropriate, 1.0=perfect)",
24
+ },
25
+ factual_accuracy: {
26
+ type: "number",
27
+ description: "Accuracy of the agent's response against the provided ground truth (0.0=completely wrong, 1.0=fully correct). Only score this if ground truth is provided.",
28
+ },
29
+ };
30
+ // ---------------------------------------------------------------------------
31
+ // Judge tools for structured decision making
32
+ // ---------------------------------------------------------------------------
/**
 * Tool definitions offered to the judge model.
 *
 * - `continue_conversation`: let the simulation go on (requires `reason`).
 * - `finish_conversation`: end the simulation (requires `reason`,
 *   `goal_achieved`, `rules_broken` and `goal_completion_score`).
 *
 * Each tool's parameters are its own properties followed by the shared
 * quality-score properties.
 */
export const JUDGE_TOOLS = [
    {
        name: "continue_conversation",
        description: "Allow the conversation to continue. Use when the goal is not yet achieved and no rules are broken.",
        properties: {
            reason: {
                type: "string",
                description: "Brief explanation of why the conversation should continue",
            },
        },
        required: ["reason"],
    },
    {
        name: "finish_conversation",
        description: "Terminate the conversation. Use when the goal is achieved OR a rule is broken.",
        properties: {
            reason: {
                type: "string",
                description: "Explanation of why the conversation should end",
            },
            goal_achieved: {
                type: "boolean",
                description: "Whether the user's goal was successfully achieved",
            },
            rules_broken: {
                type: "array",
                items: { type: "string" },
                description: "List of criteria that were violated (empty if none)",
            },
            goal_completion_score: {
                type: "number",
                description: "How much of the goal was achieved, from 0.0 (none) to 1.0 (fully achieved). Use intermediate values for partial completion.",
            },
        },
        required: [
            "reason",
            "goal_achieved",
            "rules_broken",
            "goal_completion_score",
        ],
    },
].map(({ name, description, properties, required }) => ({
    type: "function",
    function: {
        name,
        description,
        parameters: {
            type: "object",
            // Tool-specific properties first, shared quality scores after.
            properties: { ...properties, ...QUALITY_SCORE_PROPERTIES },
            required,
        },
    },
}));
89
+ // ---------------------------------------------------------------------------
90
+ // Default judge system prompt
91
+ // ---------------------------------------------------------------------------
/**
 * Base system prompt for the judge. It explains the judge's inputs, the
 * finish/continue decision rules and the quality scores to report, and
 * instructs the model to answer through one of the provided tools.
 */
export const DEFAULT_JUDGE_PROMPT = [
    "You are a conversation judge. Your role is to evaluate conversations between a user and an AI agent.",
    "",
    "You will be given:",
    "1. The conversation history",
    "2. The user's goal",
    "3. Criteria that should or should not be satisfied",
    "",
    "Your task:",
    "- Evaluate whether the conversation should continue or end",
    "- Determine if the user's goal has been achieved",
    "- Check if any rules/criteria have been violated",
    "",
    "Decision rules:",
    "1. FINISH if the user's goal is clearly achieved",
    '2. FINISH if any "must_not_happen" criteria are violated',
    "3. CONTINUE if the goal is not yet achieved and no rules are broken",
    "4. CONTINUE if progress is being made toward the goal",
    "",
    "For EVERY evaluation (continue or finish), also assess the agent's LAST response:",
    "- response_quality: How helpful, accurate, and complete was the response? (0.0=poor, 1.0=excellent)",
    "- hallucination_risk: Did the agent make up information not grounded in the conversation? (0.0=none, 1.0=high risk)",
    "- tone_appropriateness: Was the agent's tone appropriate for the situation? (0.0=inappropriate, 1.0=perfect)",
    "- factual_accuracy: If GROUND TRUTH is provided below, score how accurate the agent's response is against it (0.0=wrong, 1.0=correct). Skip if no ground truth.",
    "",
    "You MUST call one of the provided tools to make your decision.",
].join("\n");
117
+ // ---------------------------------------------------------------------------
118
+ // Quality score field names
119
+ // ---------------------------------------------------------------------------
/**
 * Names of the quality-score arguments that are copied from a judge tool
 * call into the resulting judgment, in the order they are read.
 */
const QUALITY_SCORE_FIELDS = ["response_quality", "hallucination_risk", "tone_appropriateness", "factual_accuracy"];
126
+ // ---------------------------------------------------------------------------
127
+ // JudgeAgent
128
+ // ---------------------------------------------------------------------------
/**
 * Agent that evaluates conversations and decides termination.
 *
 * Uses tool calling to make structured decisions about whether a conversation
 * should continue or end. Whenever the model's answer cannot be interpreted
 * (no tool call, malformed tool call, unparseable arguments, unknown tool)
 * the agent terminates the conversation rather than letting it run on.
 */
export class JudgeAgent extends BaseAgent {
    /** The simulated user's goal ("" when not configured). */
    goal;
    /** Criteria to check, each with a `type` and `description`. */
    criteria;
    /** Reference answer for scoring `factual_accuracy` ("" when not configured). */
    groundTruth;
    /**
     * @param config - Base agent settings plus optional `goal`, `criteria`
     *   and `groundTruth`
     */
    constructor(config) {
        super(config);
        this.goal = config?.goal ?? "";
        this.criteria = config?.criteria ?? [];
        this.groundTruth = config?.groundTruth ?? "";
    }
    /** Agent name used in error messages. */
    get name() {
        return "JudgeAgent";
    }
    /**
     * System prompt: the default judge prompt followed by the user's goal,
     * the formatted criteria and, when set, the ground truth. Goal and
     * ground truth are passed through `delimit` before being embedded.
     */
    get systemPrompt() {
        const criteriaText = this.formatCriteria();
        let groundTruthText = "";
        if (this.groundTruth) {
            groundTruthText = `\n\nGROUND TRUTH (use this to score factual_accuracy):\n${delimit(this.groundTruth)}`;
        }
        return `${DEFAULT_JUDGE_PROMPT}\n\n---\n\nUSER'S GOAL: ${delimit(this.goal)}\n\nEVALUATION CRITERIA:\n${criteriaText}${groundTruthText}`;
    }
    /**
     * Evaluate a conversation and decide next action.
     *
     * @param messages - Conversation history to evaluate
     * @param options - Optional abort `signal` forwarded to the LLM call
     * @returns Judgment with termination decision and reasoning
     */
    async evaluate(messages, options) {
        const evalMessages = [
            ...messages,
            {
                role: "user",
                content: "Evaluate the conversation above. Should it continue or end? Use the appropriate tool.",
            },
        ];
        // Temperature 0 keeps the verdict as repeatable as the model allows.
        const result = await this.callLLM(evalMessages, {
            temperature: 0.0,
            tools: JUDGE_TOOLS,
            signal: options?.signal,
        });
        return this.parseJudgment(result);
    }
    // ---------------------------------------------------------------------------
    // Private helpers
    // ---------------------------------------------------------------------------
    /**
     * Convert the raw LLM result into a judgment.
     *
     * Only the first tool call is considered.
     *
     * @param result - LLM result with `content` and optional `tool_calls`
     * @returns The parsed judgment, or a terminating fallback judgment when
     *   the result cannot be interpreted
     */
    parseJudgment(result) {
        // Shared shape for every "could not interpret the judge" outcome.
        const terminateForSafety = (reason) => ({
            should_terminate: true,
            reason,
            goal_achieved: false,
            rules_broken: [],
            goal_completion_score: 0.0,
        });
        const toolCalls = result.tool_calls;
        if (!toolCalls || toolCalls.length === 0) {
            const content = (result.content ?? "").slice(0, 200);
            console.warn(`JudgeAgent: No tool call in response (LLM may have failed). ` +
                `Content: ${JSON.stringify(content)}. Defaulting to TERMINATE to prevent runaway conversations.`);
            return terminateForSafety("Judge failed to make explicit decision - terminating for safety");
        }
        const toolCall = toolCalls[0];
        // A tool call without a function payload would otherwise raise a
        // TypeError below; treat it like any other uninterpretable verdict.
        const fn = toolCall?.function;
        if (!fn || typeof fn.name !== "string") {
            console.warn("JudgeAgent: Tool call has no function payload - terminating for safety");
            return terminateForSafety("Judge returned a malformed tool call - terminating for safety");
        }
        const functionName = fn.name;
        const argumentsStr = fn.arguments;
        let args;
        try {
            const parsed = JSON.parse(argumentsStr);
            if (typeof parsed !== "object" ||
                parsed === null ||
                Array.isArray(parsed)) {
                throw new TypeError(`Expected object, got ${typeof parsed}`);
            }
            args = parsed;
        }
        catch (err) {
            console.error(`JudgeAgent: Failed to parse tool arguments: ${String(err)} (raw: ${JSON.stringify(argumentsStr)})`);
            return terminateForSafety("Failed to parse judgment decision - terminating for safety");
        }
        // Extract quality scores (shared by both tools)
        const qualityScores = JudgeAgent.extractQualityScores(args);
        if (functionName === "continue_conversation") {
            return {
                should_terminate: false,
                reason: typeof args.reason === "string" ? args.reason : "",
                goal_achieved: false,
                rules_broken: [],
                goal_completion_score: 0.0,
                ...qualityScores,
            };
        }
        if (functionName === "finish_conversation") {
            const goalAchieved = typeof args.goal_achieved === "boolean" ? args.goal_achieved : false;
            // Clamp goal_completion_score to [0.0, 1.0]; when the model omits
            // it, fall back to 1.0 or 0.0 according to goal_achieved.
            const rawScore = args.goal_completion_score;
            const defaultScore = goalAchieved ? 1.0 : 0.0;
            const goalCompletionScore = clamp(toNumber(rawScore, defaultScore));
            const rulesBroken = Array.isArray(args.rules_broken)
                ? args.rules_broken.map(String)
                : [];
            return {
                should_terminate: true,
                reason: typeof args.reason === "string" ? args.reason : "",
                goal_achieved: goalAchieved,
                rules_broken: rulesBroken,
                goal_completion_score: goalCompletionScore,
                ...qualityScores,
            };
        }
        // Unknown function -- terminate for safety
        console.warn(`JudgeAgent: Unknown function ${functionName} - terminating for safety`);
        return terminateForSafety(`Unknown function '${functionName}' - terminating for safety`);
    }
    /**
     * Extract and clamp quality scores from tool call arguments.
     *
     * Only numbers and non-blank numeric strings are accepted. Other values
     * (booleans, empty strings, arrays, objects, NaN) are skipped so that
     * they are not silently coerced into a score.
     *
     * @param args - Parsed tool-call arguments
     * @returns An object holding each valid quality score, clamped to [0, 1]
     */
    static extractQualityScores(args) {
        const scores = {};
        for (const field of QUALITY_SCORE_FIELDS) {
            const raw = args[field];
            let num;
            if (typeof raw === "number") {
                num = raw;
            }
            else if (typeof raw === "string" && raw.trim() !== "") {
                num = Number(raw);
            }
            else {
                continue;
            }
            if (!Number.isNaN(num)) {
                scores[field] = clamp(num);
            }
        }
        return scores;
    }
    /**
     * Format criteria for the system prompt.
     *
     * @returns A "MUST HAPPEN" list and/or a "MUST NOT HAPPEN" list, or
     *   "No specific criteria defined." when neither list has entries
     */
    formatCriteria() {
        if (this.criteria.length === 0) {
            return "No specific criteria defined.";
        }
        const mustHappen = this.criteria
            .filter((c) => c.type === "must_happen")
            .map((c) => delimit(c.description));
        const mustNot = this.criteria
            .filter((c) => c.type === "must_not_happen")
            .map((c) => delimit(c.description));
        let text = "";
        if (mustHappen.length > 0) {
            text += `MUST HAPPEN:\n${mustHappen.map((c) => `- ${c}`).join("\n")}\n\n`;
        }
        if (mustNot.length > 0) {
            text += `MUST NOT HAPPEN:\n${mustNot.map((c) => `- ${c}`).join("\n")}`;
        }
        // Criteria of any other type are not listed; trim() drops the
        // trailing blank lines left when only MUST HAPPEN entries exist.
        return text.trim() || "No specific criteria defined.";
    }
}
296
+ // ---------------------------------------------------------------------------
297
+ // Utility helpers
298
+ // ---------------------------------------------------------------------------
/**
 * Restrict a number to the closed interval [0.0, 1.0].
 *
 * @param value - Number to restrict
 * @returns 0 for values below 0, 1 for values above 1, otherwise `value`;
 *   NaN is returned unchanged
 */
function clamp(value) {
    const lowerBound = 0.0;
    const upperBound = 1.0;
    const raisedToFloor = Math.max(lowerBound, value);
    return Math.min(upperBound, raisedToFloor);
}
/**
 * Safely convert an unknown value to a number, falling back to a default.
 *
 * @param value - Candidate value, typically taken from parsed JSON
 * @param fallback - Number returned when `value` is not usable
 * @returns `value` itself when it is a non-NaN number; the parsed number
 *   when it is a non-blank numeric string; otherwise `fallback`
 */
function toNumber(value, fallback) {
    if (typeof value === "number" && !Number.isNaN(value))
        return value;
    // Number("") and Number("  ") are 0, so blank strings must be rejected
    // explicitly or they would bypass the fallback.
    if (typeof value === "string" && value.trim() !== "") {
        const n = Number(value);
        if (!Number.isNaN(n))
            return n;
    }
    return fallback;
}
@@ -0,0 +1,41 @@
1
+ /**
2
+ * User simulator agent.
3
+ *
4
+ * Simulates user behavior based on a persona and scenario,
5
+ * generating realistic user messages in conversations.
6
+ */
7
+ import type { ChatMessage } from "../types.js";
8
+ import type { AgentConfig } from "./base.js";
9
+ import { BaseAgent } from "./base.js";
10
+ export declare const DEFAULT_USER_SIMULATOR_PROMPT = "You are a user simulator. Your role is to simulate realistic user behavior in a conversation with an AI agent.\n\nYou will be given:\n1. A persona describing who you are and how you behave\n2. A scenario describing your goal and context\n\nYour task:\n- Generate realistic user messages based on your persona and scenario\n- Stay in character throughout the conversation\n- Work towards achieving your goal naturally\n- React authentically to the agent's responses\n- Do not break character or acknowledge that you are a simulation\n\nResponse format:\n- Respond only with the user's message\n- Do not include any meta-commentary or explanations\n- Keep responses natural and conversational";
11
+ export interface UserSimulatorAgentConfig extends AgentConfig { // Options accepted by the UserSimulatorAgent constructor, in addition to the base AgentConfig fields.
12
+ /** Custom system prompt to append to the default prompt. */
13
+ systemPrompt?: string;
14
+ }
15
+ /**
16
+ * Agent that simulates user behavior.
17
+ *
18
+ * Uses a persona and scenario to generate realistic user messages
19
+ * in a conversation with the agent being tested.
20
+ */
21
+ export declare class UserSimulatorAgent extends BaseAgent {
22
+ private customSystemPrompt; // NOTE(review): presumably holds config.systemPrompt; the implementation is not shown here -- confirm in user-simulator.js.
23
+ constructor(config?: UserSimulatorAgentConfig);
24
+ get name(): string; // Agent name; BaseAgent reads it when building error messages.
25
+ get systemPrompt(): string; // System prompt; BaseAgent sends it as the leading system message. NOTE(review): exact composition is defined in user-simulator.js -- confirm there.
26
+ /**
27
+ * Generate the first message to start a conversation.
28
+ *
29
+ * @param messages - Optional context messages
30
+ * @returns First user message to start the conversation
31
+ */
32
+ generateFirstMessage(messages?: ChatMessage[]): Promise<string>;
33
+ /**
34
+ * Update the persona and scenario context.
35
+ *
36
+ * @param personaContext - Persona-specific context
37
+ * @param scenarioContext - Scenario-specific context
38
+ */
39
+ updateContext(personaContext?: string, scenarioContext?: string): void;
40
+ }
41
+ //# sourceMappingURL=user-simulator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"user-simulator.d.ts","sourceRoot":"","sources":["../../../../../src/lib/integrations/simulation/agents/user-simulator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAC/C,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AAC7C,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AAMtC,eAAO,MAAM,6BAA6B,urBAgBE,CAAC;AAM7C,MAAM,WAAW,wBAAyB,SAAQ,WAAW;IAC3D,4DAA4D;IAC5D,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AAMD;;;;;GAKG;AACH,qBAAa,kBAAmB,SAAQ,SAAS;IAC/C,OAAO,CAAC,kBAAkB,CAAgB;gBAE9B,MAAM,CAAC,EAAE,wBAAwB;IAK7C,IAAI,IAAI,IAAI,MAAM,CAEjB;IAED,IAAI,YAAY,IAAI,MAAM,CAKzB;IAED;;;;;OAKG;IACG,oBAAoB,CAAC,QAAQ,CAAC,EAAE,WAAW,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC;IAWrE;;;;;OAKG;IACH,aAAa,CAAC,cAAc,CAAC,EAAE,MAAM,EAAE,eAAe,CAAC,EAAE,MAAM,GAAG,IAAI;CAWvE"}