@orq-ai/evaluatorq 1.2.2 → 1.2.3-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/integrations/ai-sdk/index.d.ts +2 -0
- package/dist/lib/integrations/ai-sdk/index.d.ts.map +1 -1
- package/dist/lib/integrations/ai-sdk/index.js +1 -0
- package/dist/lib/integrations/ai-sdk/simulation-adapter.d.ts +47 -0
- package/dist/lib/integrations/ai-sdk/simulation-adapter.d.ts.map +1 -0
- package/dist/lib/integrations/ai-sdk/simulation-adapter.js +58 -0
- package/dist/lib/integrations/langchain/index.d.ts +2 -0
- package/dist/lib/integrations/langchain/index.d.ts.map +1 -1
- package/dist/lib/integrations/langchain/index.js +1 -0
- package/dist/lib/integrations/langchain/simulation-adapter.d.ts +49 -0
- package/dist/lib/integrations/langchain/simulation-adapter.d.ts.map +1 -0
- package/dist/lib/integrations/langchain/simulation-adapter.js +110 -0
- package/dist/lib/integrations/simulation/adapters.d.ts +57 -0
- package/dist/lib/integrations/simulation/adapters.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/adapters.js +64 -0
- package/dist/lib/integrations/simulation/agents/base.d.ts +90 -0
- package/dist/lib/integrations/simulation/agents/base.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/agents/base.js +227 -0
- package/dist/lib/integrations/simulation/agents/index.d.ts +10 -0
- package/dist/lib/integrations/simulation/agents/index.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/agents/index.js +6 -0
- package/dist/lib/integrations/simulation/agents/judge.d.ts +50 -0
- package/dist/lib/integrations/simulation/agents/judge.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/agents/judge.js +313 -0
- package/dist/lib/integrations/simulation/agents/user-simulator.d.ts +41 -0
- package/dist/lib/integrations/simulation/agents/user-simulator.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/agents/user-simulator.js +82 -0
- package/dist/lib/integrations/simulation/convert.d.ts +22 -0
- package/dist/lib/integrations/simulation/convert.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/convert.js +124 -0
- package/dist/lib/integrations/simulation/evaluators/index.d.ts +50 -0
- package/dist/lib/integrations/simulation/evaluators/index.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/evaluators/index.js +100 -0
- package/dist/lib/integrations/simulation/generators/datapoint-generator.d.ts +60 -0
- package/dist/lib/integrations/simulation/generators/datapoint-generator.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/generators/datapoint-generator.js +223 -0
- package/dist/lib/integrations/simulation/generators/first-message-generator.d.ts +38 -0
- package/dist/lib/integrations/simulation/generators/first-message-generator.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/generators/first-message-generator.js +131 -0
- package/dist/lib/integrations/simulation/generators/index.d.ts +15 -0
- package/dist/lib/integrations/simulation/generators/index.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/generators/index.js +10 -0
- package/dist/lib/integrations/simulation/generators/persona-generator.d.ts +60 -0
- package/dist/lib/integrations/simulation/generators/persona-generator.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/generators/persona-generator.js +333 -0
- package/dist/lib/integrations/simulation/generators/scenario-generator.d.ts +77 -0
- package/dist/lib/integrations/simulation/generators/scenario-generator.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/generators/scenario-generator.js +545 -0
- package/dist/lib/integrations/simulation/index.d.ts +33 -0
- package/dist/lib/integrations/simulation/index.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/index.js +35 -0
- package/dist/lib/integrations/simulation/quality/index.d.ts +5 -0
- package/dist/lib/integrations/simulation/quality/index.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/quality/index.js +4 -0
- package/dist/lib/integrations/simulation/quality/message-perturbation.d.ts +25 -0
- package/dist/lib/integrations/simulation/quality/message-perturbation.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/quality/message-perturbation.js +150 -0
- package/dist/lib/integrations/simulation/runner/index.d.ts +5 -0
- package/dist/lib/integrations/simulation/runner/index.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/runner/index.js +4 -0
- package/dist/lib/integrations/simulation/runner/simulation.d.ts +57 -0
- package/dist/lib/integrations/simulation/runner/simulation.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/runner/simulation.js +336 -0
- package/dist/lib/integrations/simulation/schemas.d.ts +104 -0
- package/dist/lib/integrations/simulation/schemas.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/schemas.js +76 -0
- package/dist/lib/integrations/simulation/simulation/index.d.ts +49 -0
- package/dist/lib/integrations/simulation/simulation/index.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/simulation/index.js +159 -0
- package/dist/lib/integrations/simulation/types.d.ts +101 -0
- package/dist/lib/integrations/simulation/types.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/types.js +90 -0
- package/dist/lib/integrations/simulation/utils/dataset-export.d.ts +31 -0
- package/dist/lib/integrations/simulation/utils/dataset-export.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/utils/dataset-export.js +146 -0
- package/dist/lib/integrations/simulation/utils/extract-json.d.ts +17 -0
- package/dist/lib/integrations/simulation/utils/extract-json.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/utils/extract-json.js +106 -0
- package/dist/lib/integrations/simulation/utils/prompt-builders.d.ts +34 -0
- package/dist/lib/integrations/simulation/utils/prompt-builders.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/utils/prompt-builders.js +147 -0
- package/dist/lib/integrations/simulation/utils/sanitize.d.ts +15 -0
- package/dist/lib/integrations/simulation/utils/sanitize.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/utils/sanitize.js +20 -0
- package/dist/lib/integrations/simulation/wrap-agent.d.ts +65 -0
- package/dist/lib/integrations/simulation/wrap-agent.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/wrap-agent.js +140 -0
- package/dist/lib/send-results.d.ts.map +1 -1
- package/dist/lib/send-results.js +17 -2
- package/dist/lib/types.d.ts +2 -2
- package/dist/lib/types.d.ts.map +1 -1
- package/dist/tsconfig.lib.tsbuildinfo +1 -1
- package/package.json +24 -2
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* User simulator agent.
|
|
3
|
+
*
|
|
4
|
+
* Simulates user behavior based on a persona and scenario,
|
|
5
|
+
* generating realistic user messages in conversations.
|
|
6
|
+
*/
|
|
7
|
+
import { BaseAgent } from "./base.js";
|
|
8
|
+
// ---------------------------------------------------------------------------
|
|
9
|
+
// Default user simulator system prompt
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
/**
 * Base system prompt for the user simulator.
 *
 * Written as one entry per prompt line; the entries are joined with newlines,
 * and empty entries are the blank separator lines between sections.
 */
export const DEFAULT_USER_SIMULATOR_PROMPT = [
    "You are a user simulator. Your role is to simulate realistic user behavior in a conversation with an AI agent.",
    "",
    "You will be given:",
    "1. A persona describing who you are and how you behave",
    "2. A scenario describing your goal and context",
    "",
    "Your task:",
    "- Generate realistic user messages based on your persona and scenario",
    "- Stay in character throughout the conversation",
    "- Work towards achieving your goal naturally",
    "- React authentically to the agent's responses",
    "- Do not break character or acknowledge that you are a simulation",
    "",
    "Response format:",
    "- Respond only with the user's message",
    "- Do not include any meta-commentary or explanations",
    "- Keep responses natural and conversational",
].join("\n");
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
// UserSimulatorAgent
|
|
30
|
+
// ---------------------------------------------------------------------------
|
|
31
|
+
/**
 * Agent that plays the role of the end user in a simulated conversation.
 *
 * Its system prompt is DEFAULT_USER_SIMULATOR_PROMPT, optionally followed by
 * a custom section (persona/scenario text or a caller-supplied prompt).
 */
export class UserSimulatorAgent extends BaseAgent {
    /** Extra prompt text appended after the default prompt; null when unset. */
    customSystemPrompt;

    /**
     * @param config - Optional agent configuration, passed unchanged to BaseAgent.
     *   Its `systemPrompt`, when present, becomes the initial custom section.
     */
    constructor(config) {
        super(config);
        const initialPrompt = config?.systemPrompt;
        this.customSystemPrompt = initialPrompt ?? null;
    }

    /** Fixed display name of this agent. */
    get name() {
        return "UserSimulator";
    }

    /**
     * Effective system prompt: the default prompt alone, or the default prompt
     * and the custom section separated by a `---` rule.
     */
    get systemPrompt() {
        const extra = this.customSystemPrompt;
        return extra
            ? `${DEFAULT_USER_SIMULATOR_PROMPT}\n\n---\n\n${extra}`
            : DEFAULT_USER_SIMULATOR_PROMPT;
    }

    /**
     * Generate the opening user message of a conversation.
     *
     * @param messages - Optional context messages; the array is copied, not mutated
     * @returns Whatever `respondAsync` resolves to for the kickoff request
     */
    async generateFirstMessage(messages) {
        const kickoff = {
            role: "user",
            content: "Generate your first message to start the conversation. Remember your goal and persona.",
        };
        return this.respondAsync([...(messages ?? []), kickoff], { temperature: 0.8 });
    }

    /**
     * Replace the custom prompt section with persona and scenario context.
     *
     * Any previously stored custom section is overwritten; when both arguments
     * are empty the custom section is cleared (set to null).
     *
     * @param personaContext - Persona-specific context
     * @param scenarioContext - Scenario-specific context
     */
    updateContext(personaContext, scenarioContext) {
        const sections = [];
        if (personaContext) {
            sections.push(`PERSONA:\n${personaContext}`);
        }
        if (scenarioContext) {
            sections.push(`SCENARIO:\n${scenarioContext}`);
        }
        this.customSystemPrompt = sections.join("\n\n").trim() || null;
    }
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Conversion functions from SimulationResult to OpenResponses format.
|
|
3
|
+
*/
|
|
4
|
+
import type { ResponseResource } from "../openresponses/index.js";
|
|
5
|
+
import type { SimulationResult } from "./types.js";
|
|
6
|
+
/**
|
|
7
|
+
* Converts a SimulationResult to OpenResponses format.
|
|
8
|
+
*
|
|
9
|
+
* Mapping:
|
|
10
|
+
* - messages with role "user" → input[] as Message with input_text content
|
|
11
|
+
* - messages with role "assistant" → output[] as Message with output_text content
|
|
12
|
+
* - messages with role "system" → input[] as Message with input_text content
|
|
13
|
+
* - token_usage → Usage
|
|
14
|
+
* - terminated_by → status ("judge" → "completed", "error" → "failed", others → "incomplete")
|
|
15
|
+
* - goal_achieved, rules_broken, criteria_results, turn_metrics → metadata
|
|
16
|
+
*
|
|
17
|
+
* @param result - The simulation result to convert
|
|
18
|
+
* @param model - Optional model name to include in the response
|
|
19
|
+
* @returns A ResponseResource in OpenResponses format
|
|
20
|
+
*/
|
|
21
|
+
export declare function toOpenResponses(result: SimulationResult, model?: string): ResponseResource;
|
|
22
|
+
//# sourceMappingURL=convert.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"convert.d.ts","sourceRoot":"","sources":["../../../../src/lib/integrations/simulation/convert.ts"],"names":[],"mappings":"AAAA;;GAEG;AAGH,OAAO,KAAK,EAGV,gBAAgB,EAEjB,MAAM,2BAA2B,CAAC;AACnC,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AAEnD;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,eAAe,CAC7B,MAAM,EAAE,gBAAgB,EACxB,KAAK,SAAe,GACnB,gBAAgB,CA8GlB"}
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Conversion functions from SimulationResult to OpenResponses format.
|
|
3
|
+
*/
|
|
4
|
+
import { generateItemId } from "../common/utils.js";
|
|
5
|
+
/**
 * Converts a SimulationResult to OpenResponses format.
 *
 * Mapping:
 * - messages with role "user" or "system" → input[] as Message with input_text content
 * - messages with role "assistant" → output[] as Message with output_text content
 * - messages with any other role are omitted
 * - token_usage → usage (null when missing or when total_tokens is not positive)
 * - terminated_by → status ("judge" → "completed", "error" → "failed", others → "incomplete")
 * - goal_achieved, goal_completion_score, terminated_by, reason, turn_count,
 *   rules_broken, criteria_results, turn_metrics → metadata
 *
 * A result that lacks `messages`, `token_usage` or `turn_metrics` is treated as
 * having none of them instead of raising a TypeError.
 *
 * @param result - The simulation result to convert
 * @param model - Optional model name to include in the response
 * @returns A ResponseResource in OpenResponses format
 */
export function toOpenResponses(result, model = "simulation") {
    const now = Math.floor(Date.now() / 1000);
    const inputItems = [];
    const outputItems = [];
    for (const msg of result.messages ?? []) {
        if (msg.role === "user" || msg.role === "system") {
            inputItems.push({
                type: "message",
                id: generateItemId("msg"),
                role: msg.role,
                status: "completed",
                content: [{ type: "input_text", text: msg.content }],
            });
        }
        else if (msg.role === "assistant") {
            outputItems.push({
                type: "message",
                id: generateItemId("msg"),
                role: "assistant",
                status: "completed",
                content: [
                    {
                        type: "output_text",
                        text: msg.content,
                        annotations: [],
                        logprobs: [],
                    },
                ],
            });
        }
    }
    // Map terminated_by to status
    const status = result.terminated_by === "judge"
        ? "completed"
        : result.terminated_by === "error"
            ? "failed"
            : "incomplete";
    // Only an "incomplete" response carries incomplete_details; failures use `error`.
    const incompleteDetails = status === "incomplete"
        ? { reason: `${result.terminated_by}: ${result.reason}` }
        : null;
    // Build usage from token_usage; a missing object compares as `undefined > 0`, i.e. false.
    const tokenUsage = result.token_usage;
    let usageData = null;
    if (tokenUsage?.total_tokens > 0) {
        usageData = {
            input_tokens: tokenUsage.prompt_tokens,
            input_tokens_details: { cached_tokens: 0 },
            output_tokens: tokenUsage.completion_tokens,
            output_tokens_details: { reasoning_tokens: 0 },
            total_tokens: tokenUsage.total_tokens,
        };
    }
    return {
        id: generateItemId("resp"),
        object: "response",
        created_at: now,
        completed_at: status === "completed" ? now : null,
        status,
        incomplete_details: incompleteDetails,
        model,
        previous_response_id: null,
        instructions: null,
        input: inputItems,
        output: outputItems,
        error: result.terminated_by === "error" ? { message: result.reason } : null,
        tools: [],
        tool_choice: "auto",
        truncation: "disabled",
        parallel_tool_calls: false,
        text: {
            format: { type: "text" },
        },
        top_p: 1,
        presence_penalty: 0,
        frequency_penalty: 0,
        top_logprobs: 0,
        temperature: 1,
        reasoning: null,
        user: null,
        usage: usageData,
        max_output_tokens: null,
        max_tool_calls: null,
        store: false,
        background: false,
        service_tier: "default",
        metadata: {
            framework: "simulation",
            goal_achieved: result.goal_achieved,
            goal_completion_score: result.goal_completion_score,
            terminated_by: result.terminated_by,
            reason: result.reason,
            turn_count: result.turn_count,
            rules_broken: result.rules_broken,
            // Optional sections are spread in only when they carry data.
            ...(result.criteria_results && {
                criteria_results: result.criteria_results,
            }),
            ...(result.turn_metrics?.length > 0 && {
                turn_metrics: result.turn_metrics,
            }),
        },
        safety_identifier: null,
        prompt_cache_key: null,
    };
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Built-in evaluators for agent simulation.
|
|
3
|
+
*
|
|
4
|
+
* These evaluators assess simulation results using a scorer pattern
|
|
5
|
+
* compatible with evaluatorq integration.
|
|
6
|
+
*/
|
|
7
|
+
import type { SimulationResult } from "../types.js";
|
|
8
|
+
export type SimulationScorer = (result: SimulationResult) => number;
|
|
9
|
+
/**
|
|
10
|
+
* Evaluate if the simulation goal was achieved.
|
|
11
|
+
* Returns 1 if achieved, 0 otherwise.
|
|
12
|
+
*/
|
|
13
|
+
export declare const goalAchievedScorer: SimulationScorer;
|
|
14
|
+
/**
|
|
15
|
+
* Evaluate how many criteria were met.
|
|
16
|
+
* Returns a value between 0 and 1 based on the ratio of met criteria.
|
|
17
|
+
*
|
|
18
|
+
* Uses criteria_results from the simulation result when present and non-empty; otherwise returns 1.0.
|
|
19
|
+
*/
|
|
20
|
+
export declare const criteriaMetScorer: SimulationScorer;
|
|
21
|
+
/**
|
|
22
|
+
* Evaluate conversation efficiency (fewer turns = better).
|
|
23
|
+
* Returns a value between 0 and 1.
|
|
24
|
+
*/
|
|
25
|
+
export declare const turnEfficiencyScorer: SimulationScorer;
|
|
26
|
+
/**
|
|
27
|
+
* Evaluate overall conversation quality.
|
|
28
|
+
*
|
|
29
|
+
* Composite score based on:
|
|
30
|
+
* - Goal achievement (40%)
|
|
31
|
+
* - Criteria met (30%)
|
|
32
|
+
* - Turn efficiency (30%)
|
|
33
|
+
*/
|
|
34
|
+
export declare const conversationQualityScorer: SimulationScorer;
|
|
35
|
+
export declare const SIMULATION_EVALUATORS: Record<string, SimulationScorer>;
|
|
36
|
+
/**
|
|
37
|
+
* Get a built-in simulation evaluator by name.
|
|
38
|
+
*
|
|
39
|
+
* @param name - Evaluator name (goal_achieved, criteria_met, etc.)
|
|
40
|
+
* @returns The scorer function
|
|
41
|
+
* @throws Error if evaluator not found
|
|
42
|
+
*/
|
|
43
|
+
export declare function getEvaluator(name: string): SimulationScorer;
|
|
44
|
+
/**
|
|
45
|
+
* Get all built-in simulation evaluators.
|
|
46
|
+
*
|
|
47
|
+
* @returns Record of evaluator name to scorer function
|
|
48
|
+
*/
|
|
49
|
+
export declare function getAllEvaluators(): Record<string, SimulationScorer>;
|
|
50
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../../src/lib/integrations/simulation/evaluators/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAMpD,MAAM,MAAM,gBAAgB,GAAG,CAAC,MAAM,EAAE,gBAAgB,KAAK,MAAM,CAAC;AAMpE;;;GAGG;AACH,eAAO,MAAM,kBAAkB,EAAE,gBAEhC,CAAC;AAEF;;;;;GAKG;AACH,eAAO,MAAM,iBAAiB,EAAE,gBAY/B,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,oBAAoB,EAAE,gBAqBlC,CAAC;AAEF;;;;;;;GAOG;AACH,eAAO,MAAM,yBAAyB,EAAE,gBAOvC,CAAC;AAMF,eAAO,MAAM,qBAAqB,EAAE,MAAM,CAAC,MAAM,EAAE,gBAAgB,CAKlE,CAAC;AAEF;;;;;;GAMG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,gBAAgB,CAO3D;AAED;;;;GAIG;AACH,wBAAgB,gBAAgB,IAAI,MAAM,CAAC,MAAM,EAAE,gBAAgB,CAAC,CAEnE"}
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Built-in evaluators for agent simulation.
|
|
3
|
+
*
|
|
4
|
+
* These evaluators assess simulation results using a scorer pattern
|
|
5
|
+
* compatible with evaluatorq integration.
|
|
6
|
+
*/
|
|
7
|
+
// ---------------------------------------------------------------------------
|
|
8
|
+
// Individual scorers
|
|
9
|
+
// ---------------------------------------------------------------------------
|
|
10
|
+
/**
 * Binary goal score.
 *
 * @param result - Simulation result; only `goal_achieved` is read
 * @returns 1 when `goal_achieved` is truthy, 0 otherwise
 */
export const goalAchievedScorer = (result) => {
    if (result.goal_achieved) {
        return 1;
    }
    return 0;
};
|
|
17
|
+
/**
 * Fraction of evaluation criteria that were met.
 *
 * Reads `criteria_results` from the simulation result and counts its truthy
 * values. When the field is missing or has no entries the score is 1.0.
 *
 * @param result - Simulation result; only `criteria_results` is read
 * @returns A value between 0 and 1 (met criteria / total criteria)
 */
export const criteriaMetScorer = (result) => {
    const verdicts = Object.values(result.criteria_results ?? {});
    if (verdicts.length === 0) {
        return 1.0;
    }
    let metCount = 0;
    for (const verdict of verdicts) {
        if (verdict) {
            metCount += 1;
        }
    }
    return metCount / verdicts.length;
};
|
|
33
|
+
/**
 * Evaluate conversation efficiency (fewer turns = better).
 *
 * Scores, for a conversation whose goal was achieved:
 * - up to 2 turns → 1.0
 * - up to 4 turns → 0.9
 * - up to 6 turns → 0.7
 * - beyond 6 turns → 0.1 less per extra turn, starting from 0.7, never below 0.3
 *
 * The tail previously restarted from 1.0, which scored 7 and 8 turns higher
 * than 5–6 turns; it now continues from 0.7 so the score never increases as
 * the turn count grows.
 *
 * @param result - Simulation result; `goal_achieved` and `turn_count` are read
 * @returns 0 when the goal was not achieved, otherwise a value between 0.3 and 1
 */
export const turnEfficiencyScorer = (result) => {
    if (!result.goal_achieved) {
        return 0.0;
    }
    const totalTurns = result.turn_count;
    if (totalTurns <= 2) {
        return 1.0;
    }
    if (totalTurns <= 4) {
        return 0.9;
    }
    if (totalTurns <= 6) {
        return 0.7;
    }
    // Work in tenths (7 - extraTurns, floored at 3) so results are exact
    // one-decimal values rather than accumulated floating-point differences.
    return Math.max(3, 13 - totalTurns) / 10;
};
|
|
54
|
+
/**
 * Composite conversation quality score.
 *
 * Weighted sum of the three individual scorers, rounded to two decimals:
 * - goal achievement: 40%
 * - criteria met: 30%
 * - turn efficiency: 30%
 *
 * @param result - Simulation result, passed unchanged to each scorer
 * @returns The weighted score rounded to two decimal places
 */
export const conversationQualityScorer = (result) => {
    const goalPart = goalAchievedScorer(result) * 0.4;
    const criteriaPart = criteriaMetScorer(result) * 0.3;
    const efficiencyPart = turnEfficiencyScorer(result) * 0.3;
    const weighted = goalPart + criteriaPart + efficiencyPart;
    return Math.round(weighted * 100) / 100;
};
|
|
69
|
+
// ---------------------------------------------------------------------------
|
|
70
|
+
// Registry
|
|
71
|
+
// ---------------------------------------------------------------------------
|
|
72
|
+
/**
 * Registry of built-in scorers, keyed by the evaluator name accepted by
 * getEvaluator. Key order is the order reported in "Available: ..." errors.
 */
export const SIMULATION_EVALUATORS = {
    goal_achieved: goalAchievedScorer,
    criteria_met: criteriaMetScorer,
    turn_efficiency: turnEfficiencyScorer,
    conversation_quality: conversationQualityScorer,
};
|
|
78
|
+
/**
 * Get a built-in simulation evaluator by name.
 *
 * Only the registry's own keys are considered, so names that happen to exist
 * on Object.prototype (for example "constructor" or "toString") are rejected
 * like any other unknown name instead of returning a built-in function.
 *
 * @param name - Evaluator name (goal_achieved, criteria_met, etc.)
 * @returns The scorer function
 * @throws Error if evaluator not found; the message lists the available names
 */
export function getEvaluator(name) {
    const evaluator = Object.hasOwn(SIMULATION_EVALUATORS, name)
        ? SIMULATION_EVALUATORS[name]
        : undefined;
    if (!evaluator) {
        const available = Object.keys(SIMULATION_EVALUATORS).join(", ");
        throw new Error(`Unknown evaluator: ${name}. Available: ${available}`);
    }
    return evaluator;
}
|
|
93
|
+
/**
 * Get all built-in simulation evaluators.
 *
 * @returns A new object mapping evaluator name to scorer function; changing
 *   the returned object does not alter the shared registry
 */
export function getAllEvaluators() {
    return Object.assign({}, SIMULATION_EVALUATORS);
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Datapoint generator for creating test datasets.
|
|
3
|
+
*
|
|
4
|
+
* Combines personas and scenarios into complete datapoints with first messages.
|
|
5
|
+
*/
|
|
6
|
+
import type { Datapoint, Persona, Scenario } from "../types.js";
|
|
7
|
+
/**
 * Configuration for DatapointGenerator. Every field is optional.
 */
export interface DatapointGeneratorConfig {
    /** Model identifier; the implementation falls back to "azure/gpt-4o-mini" when omitted. */
    model?: string;
    /** Delay between API calls; falls back to 100 when omitted (milliseconds, per the implementation's comment). */
    rateLimitDelay?: number;
    /** Concurrency limit given to the generator's semaphore; falls back to 5 when omitted. */
    maxConcurrentCalls?: number;
}
|
|
15
|
+
/**
|
|
16
|
+
* Generates complete datapoints for simulation.
|
|
17
|
+
*
|
|
18
|
+
* Orchestrates persona, scenario, and first message generation
|
|
19
|
+
* to produce ready-to-use test datapoints.
|
|
20
|
+
*/
|
|
21
|
+
export declare class DatapointGenerator {
|
|
22
|
+
private model;
|
|
23
|
+
private rateLimitDelay;
|
|
24
|
+
private semaphore;
|
|
25
|
+
private personaGenerator;
|
|
26
|
+
private scenarioGenerator;
|
|
27
|
+
private firstMessageGenerator;
|
|
28
|
+
constructor(config?: DatapointGeneratorConfig);
|
|
29
|
+
/**
|
|
30
|
+
* Generate datapoints from agent description.
|
|
31
|
+
*
|
|
32
|
+
* Creates personas and scenarios, then combines them into datapoints.
|
|
33
|
+
* Total datapoints = numPersonas x (numScenarios + boundary + security)
|
|
34
|
+
*/
|
|
35
|
+
generateFromDescription(params: {
|
|
36
|
+
agentDescription: string;
|
|
37
|
+
context?: string;
|
|
38
|
+
numPersonas?: number;
|
|
39
|
+
numScenarios?: number;
|
|
40
|
+
edgeCasePercentage?: number;
|
|
41
|
+
perturbationRate?: number;
|
|
42
|
+
includeBoundary?: boolean;
|
|
43
|
+
numBoundary?: number;
|
|
44
|
+
includeSecurity?: boolean;
|
|
45
|
+
numSecurity?: number;
|
|
46
|
+
securitySeedExamples?: Record<string, unknown>[];
|
|
47
|
+
securityCategories?: string[];
|
|
48
|
+
}): Promise<Datapoint[]>;
|
|
49
|
+
/**
|
|
50
|
+
* Generate datapoints from persona-scenario combinations.
|
|
51
|
+
*/
|
|
52
|
+
generateFromCombinations(personas: Persona[], scenarios: Scenario[]): Promise<Datapoint[]>;
|
|
53
|
+
/**
|
|
54
|
+
* Apply random input perturbations to first messages for robustness testing.
|
|
55
|
+
*
|
|
56
|
+
* Uses the shared message-perturbation module.
|
|
57
|
+
*/
|
|
58
|
+
private static applyPerturbations;
|
|
59
|
+
}
|
|
60
|
+
//# sourceMappingURL=datapoint-generator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"datapoint-generator.d.ts","sourceRoot":"","sources":["../../../../../src/lib/integrations/simulation/generators/datapoint-generator.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,OAAO,KAAK,EAAE,SAAS,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAgDhE;;GAEG;AACH,MAAM,WAAW,wBAAwB;IACvC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,kBAAkB,CAAC,EAAE,MAAM,CAAC;CAC7B;AAED;;;;;GAKG;AACH,qBAAa,kBAAkB;IAC7B,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,cAAc,CAAS;IAC/B,OAAO,CAAC,SAAS,CAAY;IAC7B,OAAO,CAAC,gBAAgB,CAAmB;IAC3C,OAAO,CAAC,iBAAiB,CAAoB;IAC7C,OAAO,CAAC,qBAAqB,CAAwB;gBAEzC,MAAM,CAAC,EAAE,wBAAwB;IAa7C;;;;;OAKG;IACG,uBAAuB,CAAC,MAAM,EAAE;QACpC,gBAAgB,EAAE,MAAM,CAAC;QACzB,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,kBAAkB,CAAC,EAAE,MAAM,CAAC;QAC5B,gBAAgB,CAAC,EAAE,MAAM,CAAC;QAC1B,eAAe,CAAC,EAAE,OAAO,CAAC;QAC1B,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,eAAe,CAAC,EAAE,OAAO,CAAC;QAC1B,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,oBAAoB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAAE,CAAC;QACjD,kBAAkB,CAAC,EAAE,MAAM,EAAE,CAAC;KAC/B,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC;IAyHxB;;OAEG;IACG,wBAAwB,CAC5B,QAAQ,EAAE,OAAO,EAAE,EACnB,SAAS,EAAE,QAAQ,EAAE,GACpB,OAAO,CAAC,SAAS,EAAE,CAAC;IAuCvB;;;;OAIG;IACH,OAAO,CAAC,MAAM,CAAC,kBAAkB;CA2BlC"}
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Datapoint generator for creating test datasets.
|
|
3
|
+
*
|
|
4
|
+
* Combines personas and scenarios into complete datapoints with first messages.
|
|
5
|
+
*/
|
|
6
|
+
import { applyRandomPerturbation } from "../quality/message-perturbation.js";
|
|
7
|
+
import { generateDatapoint } from "../utils/prompt-builders.js";
|
|
8
|
+
import { FirstMessageGenerator } from "./first-message-generator.js";
|
|
9
|
+
import { PersonaGenerator } from "./persona-generator.js";
|
|
10
|
+
import { ScenarioGenerator } from "./scenario-generator.js";
|
|
11
|
+
// Fallback pacing and concurrency settings, applied when the generator config omits them.
const DEFAULT_RATE_LIMIT_DELAY = 100; // milliseconds between API calls
const DEFAULT_MAX_CONCURRENT_CALLS = 5; // limit passed to the Semaphore
|
|
14
|
+
/**
 * Small counting semaphore used to cap concurrency.
 *
 * `acquire()` resolves immediately while fewer than `max` holders are running;
 * otherwise the caller waits in a FIFO queue until a holder calls `release()`.
 */
class Semaphore {
    /** Maximum number of simultaneous holders. */
    max;
    /** Wake-up callbacks of waiting callers, oldest first. */
    queue = [];
    /** Number of holders currently counted as running. */
    running = 0;

    /**
     * @param max - Maximum number of simultaneous holders
     */
    constructor(max) {
        this.max = max;
    }

    /**
     * Take a slot, waiting for one to be released if none is free.
     *
     * @returns A promise that resolves once the slot is held
     */
    async acquire() {
        const hasFreeSlot = this.running < this.max;
        if (!hasFreeSlot) {
            // The slot is counted only when the waiter is actually woken up.
            return new Promise((grant) => {
                const wake = () => {
                    this.running += 1;
                    grant();
                };
                this.queue.push(wake);
            });
        }
        this.running += 1;
    }

    /**
     * Give a slot back and hand it to the longest-waiting caller, if any.
     */
    release() {
        this.running -= 1;
        this.queue.shift()?.();
    }
}
|
|
44
|
+
/**
 * Pause for a given number of milliseconds.
 *
 * @param ms - Delay handed to setTimeout
 * @returns A promise that resolves (with no value) when the timer fires
 */
function sleep(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
|
|
50
|
+
/**
|
|
51
|
+
* Generates complete datapoints for simulation.
|
|
52
|
+
*
|
|
53
|
+
* Orchestrates persona, scenario, and first message generation
|
|
54
|
+
* to produce ready-to-use test datapoints.
|
|
55
|
+
*/
|
|
56
|
+
export class DatapointGenerator {
|
|
57
|
+
model;
|
|
58
|
+
rateLimitDelay;
|
|
59
|
+
semaphore;
|
|
60
|
+
personaGenerator;
|
|
61
|
+
scenarioGenerator;
|
|
62
|
+
firstMessageGenerator;
|
|
63
|
+
constructor(config) {
|
|
64
|
+
this.model = config?.model ?? "azure/gpt-4o-mini";
|
|
65
|
+
this.rateLimitDelay = config?.rateLimitDelay ?? DEFAULT_RATE_LIMIT_DELAY;
|
|
66
|
+
this.semaphore = new Semaphore(config?.maxConcurrentCalls ?? DEFAULT_MAX_CONCURRENT_CALLS);
|
|
67
|
+
this.personaGenerator = new PersonaGenerator({ model: this.model });
|
|
68
|
+
this.scenarioGenerator = new ScenarioGenerator({ model: this.model });
|
|
69
|
+
this.firstMessageGenerator = new FirstMessageGenerator({
|
|
70
|
+
model: this.model,
|
|
71
|
+
});
|
|
72
|
+
}
|
|
73
|
+
/**
|
|
74
|
+
* Generate datapoints from agent description.
|
|
75
|
+
*
|
|
76
|
+
* Creates personas and scenarios, then combines them into datapoints.
|
|
77
|
+
* Total datapoints = numPersonas x (numScenarios + boundary + security)
|
|
78
|
+
*/
|
|
79
|
+
async generateFromDescription(params) {
|
|
80
|
+
const { agentDescription, context = "", numPersonas = 3, numScenarios = 5, edgeCasePercentage = 0.2, perturbationRate = 0.0, includeBoundary = false, numBoundary = 5, includeSecurity = false, numSecurity = 5, securitySeedExamples, securityCategories, } = params;
|
|
81
|
+
console.log(`Generating ${numPersonas} personas and ${numScenarios} scenarios...`);
|
|
82
|
+
// Build named tasks for parallel generation
|
|
83
|
+
const namedTasks = {
|
|
84
|
+
personas: this.personaGenerator.generate({
|
|
85
|
+
agentDescription,
|
|
86
|
+
context,
|
|
87
|
+
numPersonas,
|
|
88
|
+
edgeCasePercentage,
|
|
89
|
+
}),
|
|
90
|
+
scenarios: this.scenarioGenerator.generate({
|
|
91
|
+
agentDescription,
|
|
92
|
+
context,
|
|
93
|
+
numScenarios,
|
|
94
|
+
edgeCasePercentage,
|
|
95
|
+
}),
|
|
96
|
+
};
|
|
97
|
+
if (includeBoundary) {
|
|
98
|
+
console.log(`Including ${numBoundary} boundary scenarios`);
|
|
99
|
+
namedTasks.boundary = this.scenarioGenerator.generateBoundaryScenarios({
|
|
100
|
+
agentDescription,
|
|
101
|
+
numScenarios: numBoundary,
|
|
102
|
+
});
|
|
103
|
+
}
|
|
104
|
+
if (includeSecurity) {
|
|
105
|
+
console.log(`Including ${numSecurity} security scenarios`);
|
|
106
|
+
namedTasks.security = this.scenarioGenerator.generateSecurityScenarios({
|
|
107
|
+
agentDescription,
|
|
108
|
+
seedExamples: securitySeedExamples,
|
|
109
|
+
categories: securityCategories,
|
|
110
|
+
numScenarios: numSecurity,
|
|
111
|
+
});
|
|
112
|
+
}
|
|
113
|
+
const taskKeys = Object.keys(namedTasks);
|
|
114
|
+
const taskPromises = Object.values(namedTasks);
|
|
115
|
+
const rawResults = await Promise.all(taskPromises);
|
|
116
|
+
const results = {};
|
|
117
|
+
for (let i = 0; i < taskKeys.length; i++) {
|
|
118
|
+
const key = taskKeys[i];
|
|
119
|
+
results[key] = rawResults[i];
|
|
120
|
+
}
|
|
121
|
+
let personas = results.personas;
|
|
122
|
+
let scenarios = results.scenarios;
|
|
123
|
+
// Merge additional scenario types
|
|
124
|
+
if (results.boundary) {
|
|
125
|
+
const boundary = results.boundary;
|
|
126
|
+
console.log(`Generated ${boundary.length} boundary scenarios`);
|
|
127
|
+
scenarios = [...scenarios, ...boundary];
|
|
128
|
+
}
|
|
129
|
+
if (results.security) {
|
|
130
|
+
const security = results.security;
|
|
131
|
+
console.log(`Generated ${security.length} security scenarios`);
|
|
132
|
+
scenarios = [...scenarios, ...security];
|
|
133
|
+
}
|
|
134
|
+
if (personas.length === 0) {
|
|
135
|
+
console.warn("No personas generated, using defaults");
|
|
136
|
+
personas = [
|
|
137
|
+
{
|
|
138
|
+
name: "Default User",
|
|
139
|
+
patience: 0.5,
|
|
140
|
+
assertiveness: 0.5,
|
|
141
|
+
politeness: 0.5,
|
|
142
|
+
technical_level: 0.5,
|
|
143
|
+
communication_style: "casual",
|
|
144
|
+
background: "",
|
|
145
|
+
},
|
|
146
|
+
];
|
|
147
|
+
}
|
|
148
|
+
if (scenarios.length === 0) {
|
|
149
|
+
console.warn("No scenarios generated, using defaults");
|
|
150
|
+
scenarios = [
|
|
151
|
+
{
|
|
152
|
+
name: "Default Scenario",
|
|
153
|
+
goal: "Get help",
|
|
154
|
+
context: "User needs general assistance",
|
|
155
|
+
starting_emotion: "neutral",
|
|
156
|
+
criteria: [],
|
|
157
|
+
},
|
|
158
|
+
];
|
|
159
|
+
}
|
|
160
|
+
// Generate datapoints from all combinations
|
|
161
|
+
let datapoints = await this.generateFromCombinations(personas, scenarios);
|
|
162
|
+
// Apply message perturbations for robustness testing
|
|
163
|
+
if (perturbationRate > 0.0) {
|
|
164
|
+
datapoints = DatapointGenerator.applyPerturbations(datapoints, perturbationRate);
|
|
165
|
+
}
|
|
166
|
+
return datapoints;
|
|
167
|
+
}
|
|
168
|
+
/**
|
|
169
|
+
* Generate datapoints from persona-scenario combinations.
|
|
170
|
+
*/
|
|
171
|
+
async generateFromCombinations(personas, scenarios) {
|
|
172
|
+
// Build all combinations
|
|
173
|
+
const combinations = [];
|
|
174
|
+
for (const persona of personas) {
|
|
175
|
+
for (const scenario of scenarios) {
|
|
176
|
+
combinations.push([persona, scenario]);
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
console.log(`Generating ${combinations.length} datapoints from ${personas.length} personas x ${scenarios.length} scenarios`);
|
|
180
|
+
const generateSingle = async (persona, scenario) => {
|
|
181
|
+
await this.semaphore.acquire();
|
|
182
|
+
try {
|
|
183
|
+
const firstMessage = await this.firstMessageGenerator.generate(persona, scenario);
|
|
184
|
+
// Small delay to prevent overwhelming the API
|
|
185
|
+
await sleep(this.rateLimitDelay);
|
|
186
|
+
return generateDatapoint(persona, scenario, firstMessage);
|
|
187
|
+
}
|
|
188
|
+
finally {
|
|
189
|
+
this.semaphore.release();
|
|
190
|
+
}
|
|
191
|
+
};
|
|
192
|
+
// Generate all datapoints with bounded concurrency
|
|
193
|
+
const tasks = combinations.map(([p, s]) => generateSingle(p, s));
|
|
194
|
+
const datapoints = await Promise.all(tasks);
|
|
195
|
+
console.log(`Generated ${datapoints.length} datapoints`);
|
|
196
|
+
return datapoints;
|
|
197
|
+
}
|
|
198
|
+
/**
|
|
199
|
+
* Apply random input perturbations to first messages for robustness testing.
|
|
200
|
+
*
|
|
201
|
+
* Uses the shared message-perturbation module.
|
|
202
|
+
*/
|
|
203
|
+
static applyPerturbations(datapoints, perturbationRate) {
|
|
204
|
+
let perturbedCount = 0;
|
|
205
|
+
const result = datapoints.map((dp) => {
|
|
206
|
+
if (dp.first_message && Math.random() < perturbationRate) {
|
|
207
|
+
const [perturbedMsg, pType] = applyRandomPerturbation(dp.first_message);
|
|
208
|
+
perturbedCount++;
|
|
209
|
+
console.debug(`Applied ${pType} perturbation to: ${dp.scenario.name}`);
|
|
210
|
+
// Create a new datapoint with the perturbed message (immutable update)
|
|
211
|
+
return {
|
|
212
|
+
...dp,
|
|
213
|
+
first_message: perturbedMsg,
|
|
214
|
+
};
|
|
215
|
+
}
|
|
216
|
+
return dp;
|
|
217
|
+
});
|
|
218
|
+
if (perturbedCount > 0) {
|
|
219
|
+
console.log(`Applied perturbations to ${perturbedCount}/${datapoints.length} first messages`);
|
|
220
|
+
}
|
|
221
|
+
return result;
|
|
222
|
+
}
|
|
223
|
+
}
|