@orq-ai/evaluatorq 1.3.1 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/integrations/simulation/adapters.d.ts +28 -5
- package/dist/lib/integrations/simulation/adapters.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/adapters.js +113 -7
- package/dist/lib/integrations/simulation/agents/base.d.ts +3 -0
- package/dist/lib/integrations/simulation/agents/base.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/agents/base.js +104 -82
- package/dist/lib/integrations/simulation/agents/judge.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/agents/judge.js +1 -0
- package/dist/lib/integrations/simulation/agents/user-simulator.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/agents/user-simulator.js +4 -1
- package/dist/lib/integrations/simulation/generators/first-message-generator.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/generators/first-message-generator.js +51 -28
- package/dist/lib/integrations/simulation/generators/persona-generator.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/generators/persona-generator.js +144 -102
- package/dist/lib/integrations/simulation/generators/scenario-generator.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/generators/scenario-generator.js +274 -169
- package/dist/lib/integrations/simulation/index.d.ts +1 -1
- package/dist/lib/integrations/simulation/index.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/index.js +1 -1
- package/dist/lib/integrations/simulation/runner/simulation.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/runner/simulation.js +147 -85
- package/dist/lib/integrations/simulation/simulation/index.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/simulation/index.js +81 -27
- package/dist/lib/integrations/simulation/tracing.d.ts +111 -0
- package/dist/lib/integrations/simulation/tracing.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/tracing.js +310 -0
- package/dist/lib/integrations/simulation/wrap-agent.js +2 -2
- package/dist/tsconfig.lib.tsbuildinfo +1 -1
- package/package.json +1 -1
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
import OpenAI from "openai";
|
|
8
8
|
import { JudgeAgent } from "../agents/judge.js";
|
|
9
9
|
import { UserSimulatorAgent } from "../agents/user-simulator.js";
|
|
10
|
+
import { recordLLMInput, recordLLMOutput, recordTokenUsage, setSpanAttrs, withSimulationSpan, } from "../tracing.js";
|
|
10
11
|
import { buildDatapointSystemPrompt } from "../utils/prompt-builders.js";
|
|
11
12
|
// ---------------------------------------------------------------------------
|
|
12
13
|
// Helpers: create SimulationResult variants
|
|
@@ -111,96 +112,157 @@ export class SimulationRunner {
|
|
|
111
112
|
// Declare usage helper references — initialized inside try after agents are created
|
|
112
113
|
let getTotalUsage;
|
|
113
114
|
try {
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
prompt_tokens: usageAfter.prompt_tokens - usageBefore.prompt_tokens,
|
|
145
|
-
completion_tokens: usageAfter.completion_tokens - usageBefore.completion_tokens,
|
|
146
|
-
total_tokens: usageAfter.total_tokens - usageBefore.total_tokens,
|
|
147
|
-
},
|
|
148
|
-
response_quality: judgment.response_quality ?? null,
|
|
149
|
-
hallucination_risk: judgment.hallucination_risk ?? null,
|
|
150
|
-
tone_appropriateness: judgment.tone_appropriateness ?? null,
|
|
151
|
-
factual_accuracy: judgment.factual_accuracy ?? null,
|
|
152
|
-
judge_reason: judgment.reason,
|
|
115
|
+
return await withSimulationSpan("orq.simulation.run", {
|
|
116
|
+
"orq.simulation.persona": persona?.name,
|
|
117
|
+
"orq.simulation.scenario": scenario?.name,
|
|
118
|
+
"orq.simulation.max_turns": maxTurns,
|
|
119
|
+
"orq.simulation.model": this.model,
|
|
120
|
+
}, async (runSpan) => {
|
|
121
|
+
// Use stored system prompt if available, otherwise build from persona+scenario
|
|
122
|
+
const systemPrompt = storedSystemPrompt ??
|
|
123
|
+
buildDatapointSystemPrompt(persona, scenario);
|
|
124
|
+
const client = this.getSharedClient();
|
|
125
|
+
// Always create fresh agents per simulation (no shared state between concurrent runs)
|
|
126
|
+
const userSimulator = new UserSimulatorAgent({
|
|
127
|
+
model: this.model,
|
|
128
|
+
client,
|
|
129
|
+
systemPrompt: systemPrompt,
|
|
130
|
+
});
|
|
131
|
+
const judge = new JudgeAgent({
|
|
132
|
+
model: this.model,
|
|
133
|
+
client,
|
|
134
|
+
goal: scenario?.goal,
|
|
135
|
+
criteria: scenario?.criteria ?? [],
|
|
136
|
+
groundTruth: scenario?.ground_truth ?? "",
|
|
137
|
+
});
|
|
138
|
+
getTotalUsage = () => {
|
|
139
|
+
const usage = userSimulator.getUsage();
|
|
140
|
+
const judgeUsage = judge.getUsage();
|
|
141
|
+
usage.prompt_tokens += judgeUsage.prompt_tokens;
|
|
142
|
+
usage.completion_tokens += judgeUsage.completion_tokens;
|
|
143
|
+
usage.total_tokens += judgeUsage.total_tokens;
|
|
144
|
+
return usage;
|
|
153
145
|
};
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
const checkCancelled = () => {
|
|
157
|
-
if (signal?.aborted) {
|
|
158
|
-
throw new Error("Simulation cancelled");
|
|
159
|
-
}
|
|
160
|
-
};
|
|
161
|
-
checkCancelled();
|
|
162
|
-
// Generate or use first message
|
|
163
|
-
const firstMsg = firstMessage
|
|
164
|
-
? firstMessage
|
|
165
|
-
: await userSimulator.generateFirstMessage();
|
|
166
|
-
messages.push({ role: "user", content: firstMsg });
|
|
167
|
-
let lastJudgment;
|
|
168
|
-
for (let turn = 0; turn < maxTurns; turn++) {
|
|
169
|
-
checkCancelled();
|
|
170
|
-
const usageBefore = getTotalUsage();
|
|
171
|
-
// 1. Target agent responds
|
|
172
|
-
const agentResponse = await this.getTargetResponse(messages.map((m) => ({ role: m.role, content: m.content })));
|
|
173
|
-
messages.push({ role: "assistant", content: agentResponse });
|
|
174
|
-
checkCancelled();
|
|
175
|
-
// 2. Judge evaluates
|
|
176
|
-
const judgment = await judge.evaluate(messages.map((m) => ({ role: m.role, content: m.content })), { signal });
|
|
177
|
-
turnMetricsList.push(buildTurnMetrics(turn + 1, judgment, usageBefore));
|
|
178
|
-
lastJudgment = judgment;
|
|
179
|
-
if (judgment.should_terminate) {
|
|
146
|
+
const buildTurnMetrics = (turnNum, judgment, usageBefore) => {
|
|
147
|
+
const usageAfter = getTotalUsage();
|
|
180
148
|
return {
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
149
|
+
turn_number: turnNum,
|
|
150
|
+
token_usage: {
|
|
151
|
+
prompt_tokens: usageAfter.prompt_tokens - usageBefore.prompt_tokens,
|
|
152
|
+
completion_tokens: usageAfter.completion_tokens - usageBefore.completion_tokens,
|
|
153
|
+
total_tokens: usageAfter.total_tokens - usageBefore.total_tokens,
|
|
154
|
+
},
|
|
155
|
+
response_quality: judgment.response_quality ?? null,
|
|
156
|
+
hallucination_risk: judgment.hallucination_risk ?? null,
|
|
157
|
+
tone_appropriateness: judgment.tone_appropriateness ?? null,
|
|
158
|
+
factual_accuracy: judgment.factual_accuracy ?? null,
|
|
159
|
+
judge_reason: judgment.reason,
|
|
192
160
|
};
|
|
193
|
-
}
|
|
194
|
-
|
|
195
|
-
|
|
161
|
+
};
|
|
162
|
+
/** Check if this run has been cancelled (timeout). */
|
|
163
|
+
const checkCancelled = () => {
|
|
164
|
+
if (signal?.aborted) {
|
|
165
|
+
throw new Error("Simulation cancelled");
|
|
166
|
+
}
|
|
167
|
+
};
|
|
168
|
+
checkCancelled();
|
|
169
|
+
// Generate or use first message
|
|
170
|
+
const firstMsg = firstMessage
|
|
171
|
+
? firstMessage
|
|
172
|
+
: await withSimulationSpan("orq.simulation.first_message_generation", {
|
|
173
|
+
"orq.simulation.persona": persona?.name,
|
|
174
|
+
"orq.simulation.scenario": scenario?.name,
|
|
175
|
+
"orq.simulation.model": this.model,
|
|
176
|
+
}, async () => userSimulator.generateFirstMessage());
|
|
177
|
+
messages.push({ role: "user", content: firstMsg });
|
|
178
|
+
let lastJudgment;
|
|
179
|
+
for (let turn = 0; turn < maxTurns; turn++) {
|
|
196
180
|
checkCancelled();
|
|
197
|
-
const
|
|
198
|
-
|
|
181
|
+
const usageBefore = getTotalUsage();
|
|
182
|
+
await withSimulationSpan("orq.simulation.turn", {
|
|
183
|
+
"orq.simulation.turn": turn + 1,
|
|
184
|
+
"orq.simulation.max_turns": maxTurns,
|
|
185
|
+
}, async (turnSpan) => {
|
|
186
|
+
// 1. Target agent responds
|
|
187
|
+
const targetMessages = messages.map((m) => ({
|
|
188
|
+
role: m.role,
|
|
189
|
+
content: m.content,
|
|
190
|
+
}));
|
|
191
|
+
const agentResponse = await withSimulationSpan("orq.simulation.target_call", undefined, async (targetSpan) => {
|
|
192
|
+
recordLLMInput(targetSpan, targetMessages);
|
|
193
|
+
const response = await this.getTargetResponse(targetMessages);
|
|
194
|
+
recordLLMOutput(targetSpan, response);
|
|
195
|
+
return response;
|
|
196
|
+
});
|
|
197
|
+
messages.push({ role: "assistant", content: agentResponse });
|
|
198
|
+
checkCancelled();
|
|
199
|
+
// 2. Judge evaluates
|
|
200
|
+
const judgment = await withSimulationSpan("orq.simulation.judge_evaluation", undefined, async () => judge.evaluate(messages.map((m) => ({
|
|
201
|
+
role: m.role,
|
|
202
|
+
content: m.content,
|
|
203
|
+
})), { signal }));
|
|
204
|
+
turnMetricsList.push(buildTurnMetrics(turn + 1, judgment, usageBefore));
|
|
205
|
+
lastJudgment = judgment;
|
|
206
|
+
setSpanAttrs(turnSpan, {
|
|
207
|
+
"orq.simulation.goal_achieved": judgment.goal_achieved,
|
|
208
|
+
"orq.simulation.goal_completion_score": judgment.goal_completion_score,
|
|
209
|
+
"orq.simulation.should_terminate": judgment.should_terminate,
|
|
210
|
+
});
|
|
211
|
+
if (!judgment.should_terminate && turn < maxTurns - 1) {
|
|
212
|
+
// 3. User simulator continues
|
|
213
|
+
checkCancelled();
|
|
214
|
+
const userResponse = await withSimulationSpan("orq.simulation.user_simulator_call", undefined, async () => userSimulator.respondAsync(messages.map((m) => ({
|
|
215
|
+
role: m.role,
|
|
216
|
+
content: m.content,
|
|
217
|
+
})), { signal, llmPurpose: "user_simulator" }));
|
|
218
|
+
messages.push({ role: "user", content: userResponse });
|
|
219
|
+
}
|
|
220
|
+
});
|
|
221
|
+
// Check if judge terminated after the turn span completes
|
|
222
|
+
if (lastJudgment?.should_terminate) {
|
|
223
|
+
const finalUsage = getTotalUsage();
|
|
224
|
+
recordTokenUsage(runSpan, {
|
|
225
|
+
promptTokens: finalUsage.prompt_tokens,
|
|
226
|
+
completionTokens: finalUsage.completion_tokens,
|
|
227
|
+
totalTokens: finalUsage.total_tokens,
|
|
228
|
+
});
|
|
229
|
+
setSpanAttrs(runSpan, {
|
|
230
|
+
"orq.simulation.terminated_by": "judge",
|
|
231
|
+
"orq.simulation.goal_achieved": lastJudgment.goal_achieved,
|
|
232
|
+
"orq.simulation.turn_count": turn + 1,
|
|
233
|
+
});
|
|
234
|
+
return {
|
|
235
|
+
messages,
|
|
236
|
+
terminated_by: "judge",
|
|
237
|
+
reason: lastJudgment.reason,
|
|
238
|
+
goal_achieved: lastJudgment.goal_achieved,
|
|
239
|
+
goal_completion_score: lastJudgment.goal_completion_score,
|
|
240
|
+
rules_broken: lastJudgment.rules_broken,
|
|
241
|
+
turn_count: turn + 1,
|
|
242
|
+
turn_metrics: turnMetricsList,
|
|
243
|
+
token_usage: finalUsage,
|
|
244
|
+
criteria_results: this.buildCriteriaResults(scenario, lastJudgment),
|
|
245
|
+
metadata: {
|
|
246
|
+
persona: persona?.name,
|
|
247
|
+
scenario: scenario?.name,
|
|
248
|
+
},
|
|
249
|
+
};
|
|
250
|
+
}
|
|
199
251
|
}
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
252
|
+
// Max turns reached
|
|
253
|
+
const finalUsage = getTotalUsage();
|
|
254
|
+
recordTokenUsage(runSpan, {
|
|
255
|
+
promptTokens: finalUsage.prompt_tokens,
|
|
256
|
+
completionTokens: finalUsage.completion_tokens,
|
|
257
|
+
totalTokens: finalUsage.total_tokens,
|
|
258
|
+
});
|
|
259
|
+
setSpanAttrs(runSpan, {
|
|
260
|
+
"orq.simulation.terminated_by": "max_turns",
|
|
261
|
+
"orq.simulation.goal_achieved": lastJudgment?.goal_achieved ?? false,
|
|
262
|
+
"orq.simulation.turn_count": maxTurns,
|
|
263
|
+
});
|
|
264
|
+
return maxTurnsResult(maxTurns, messages, turnMetricsList, finalUsage, persona, scenario, lastJudgment);
|
|
265
|
+
});
|
|
204
266
|
}
|
|
205
267
|
catch (e) {
|
|
206
268
|
console.error("SimulationRunner.run() failed:", e);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../../src/lib/integrations/simulation/simulation/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../../src/lib/integrations/simulation/simulation/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAcH,OAAO,KAAK,EACV,WAAW,EACX,SAAS,EACT,OAAO,EACP,QAAQ,EACR,gBAAgB,EACjB,MAAM,aAAa,CAAC;AAOrB,MAAM,WAAW,cAAc;IAC7B,cAAc,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,cAAc,CAAC,EAAE,CAAC,QAAQ,EAAE,WAAW,EAAE,KAAK,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IACvE,QAAQ,CAAC,EAAE,OAAO,EAAE,CAAC;IACrB,SAAS,CAAC,EAAE,QAAQ,EAAE,CAAC;IACvB,UAAU,CAAC,EAAE,SAAS,EAAE,CAAC;IACzB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;;;;;;;GAQG;AACH,wBAAsB,QAAQ,CAC5B,MAAM,EAAE,cAAc,GACrB,OAAO,CAAC,gBAAgB,EAAE,CAAC,CAkB7B;AA2JD,MAAM,WAAW,yBAAyB;IACxC,cAAc,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,gBAAgB,EAAE,MAAM,CAAC;IACzB,cAAc,CAAC,EAAE,CAAC,QAAQ,EAAE,WAAW,EAAE,KAAK,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IACvE,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;;;GAIG;AACH,wBAAsB,mBAAmB,CACvC,MAAM,EAAE,yBAAyB,GAChC,OAAO,CAAC,gBAAgB,EAAE,CAAC,CAuG7B;AAGD,OAAO,EACL,gBAAgB,EAChB,YAAY,EACZ,qBAAqB,GACtB,MAAM,wBAAwB,CAAC"}
|
|
@@ -5,10 +5,12 @@
|
|
|
5
5
|
* either standalone or within the evaluatorq framework.
|
|
6
6
|
*/
|
|
7
7
|
import OpenAI from "openai";
|
|
8
|
-
import {
|
|
8
|
+
import { flushTracing, initTracingIfNeeded } from "../../../tracing/setup.js";
|
|
9
|
+
import { fromOrqAgent } from "../adapters.js";
|
|
9
10
|
import { getEvaluator } from "../evaluators/index.js";
|
|
10
11
|
import { FirstMessageGenerator } from "../generators/first-message-generator.js";
|
|
11
12
|
import { SimulationRunner } from "../runner/simulation.js";
|
|
13
|
+
import { recordTokenUsage, setSpanAttrs, withSimulationSpan, } from "../tracing.js";
|
|
12
14
|
import { generateDatapoint } from "../utils/prompt-builders.js";
|
|
13
15
|
/**
|
|
14
16
|
* High-level function to run agent simulations.
|
|
@@ -20,6 +22,24 @@ import { generateDatapoint } from "../utils/prompt-builders.js";
|
|
|
20
22
|
* - Applying evaluators to results
|
|
21
23
|
*/
|
|
22
24
|
export async function simulate(params) {
|
|
25
|
+
// Initialize OTel tracing (no-op if already initialized or not configured)
|
|
26
|
+
await initTracingIfNeeded();
|
|
27
|
+
try {
|
|
28
|
+
return await withSimulationSpan("orq.simulation.pipeline", {
|
|
29
|
+
"orq.simulation.evaluation_name": params.evaluationName,
|
|
30
|
+
"orq.simulation.max_turns": params.maxTurns ?? 10,
|
|
31
|
+
"orq.simulation.parallelism": params.parallelism ?? 5,
|
|
32
|
+
}, (pipelineSpan) => _simulateCore(params, pipelineSpan));
|
|
33
|
+
}
|
|
34
|
+
finally {
|
|
35
|
+
// Flush pending spans to ensure they're exported before the process exits
|
|
36
|
+
await flushTracing();
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
// ---------------------------------------------------------------------------
|
|
40
|
+
// Core simulation logic (shared by simulate and generateAndSimulate)
|
|
41
|
+
// ---------------------------------------------------------------------------
|
|
42
|
+
async function _simulateCore(params, pipelineSpan) {
|
|
23
43
|
const { targetCallback, personas, scenarios, maxTurns = 10, model = "azure/gpt-4o-mini", evaluators: evaluatorNames, parallelism = 5, } = params;
|
|
24
44
|
let { datapoints } = params;
|
|
25
45
|
// Validate evaluator names early — throw on unknown names
|
|
@@ -69,10 +89,13 @@ export async function simulate(params) {
|
|
|
69
89
|
if (!datapoints || datapoints.length === 0) {
|
|
70
90
|
throw new Error("No datapoints to simulate — persona or scenario generation may have failed");
|
|
71
91
|
}
|
|
92
|
+
setSpanAttrs(pipelineSpan, {
|
|
93
|
+
"orq.simulation.datapoints_count": datapoints.length,
|
|
94
|
+
});
|
|
72
95
|
// Bridge agentKey to invoke() if no callback is provided
|
|
73
96
|
let resolvedCallback = targetCallback;
|
|
74
97
|
if (!resolvedCallback && params.agentKey) {
|
|
75
|
-
resolvedCallback =
|
|
98
|
+
resolvedCallback = fromOrqAgent(params.agentKey);
|
|
76
99
|
}
|
|
77
100
|
if (!resolvedCallback) {
|
|
78
101
|
throw new Error("Either targetCallback or agentKey is required");
|
|
@@ -98,6 +121,21 @@ export async function simulate(params) {
|
|
|
98
121
|
}
|
|
99
122
|
result.metadata.evaluator_scores = scores;
|
|
100
123
|
}
|
|
124
|
+
// Record aggregate token usage on the pipeline span
|
|
125
|
+
const totalUsage = results.reduce((acc, r) => ({
|
|
126
|
+
prompt: acc.prompt + (r.token_usage?.prompt_tokens ?? 0),
|
|
127
|
+
completion: acc.completion + (r.token_usage?.completion_tokens ?? 0),
|
|
128
|
+
total: acc.total + (r.token_usage?.total_tokens ?? 0),
|
|
129
|
+
}), { prompt: 0, completion: 0, total: 0 });
|
|
130
|
+
recordTokenUsage(pipelineSpan, {
|
|
131
|
+
promptTokens: totalUsage.prompt,
|
|
132
|
+
completionTokens: totalUsage.completion,
|
|
133
|
+
totalTokens: totalUsage.total,
|
|
134
|
+
});
|
|
135
|
+
setSpanAttrs(pipelineSpan, {
|
|
136
|
+
"orq.simulation.results_count": results.length,
|
|
137
|
+
"orq.simulation.goal_achieved_count": results.filter((r) => r.goal_achieved).length,
|
|
138
|
+
});
|
|
101
139
|
return results;
|
|
102
140
|
}
|
|
103
141
|
finally {
|
|
@@ -110,11 +148,13 @@ export async function simulate(params) {
|
|
|
110
148
|
* Convenience function that combines generation and simulation.
|
|
111
149
|
*/
|
|
112
150
|
export async function generateAndSimulate(params) {
|
|
151
|
+
// Initialize tracing early so generation spans are captured
|
|
152
|
+
await initTracingIfNeeded();
|
|
113
153
|
const { evaluationName, agentDescription, targetCallback, numPersonas = 5, numScenarios = 5, maxTurns = 10, model = "azure/gpt-4o-mini", evaluators, parallelism = 5, } = params;
|
|
114
154
|
// Bridge agentKey to invoke() if no callback is provided
|
|
115
155
|
let resolvedCallback = targetCallback;
|
|
116
156
|
if (!resolvedCallback && params.agentKey) {
|
|
117
|
-
resolvedCallback =
|
|
157
|
+
resolvedCallback = fromOrqAgent(params.agentKey);
|
|
118
158
|
}
|
|
119
159
|
if (!resolvedCallback) {
|
|
120
160
|
throw new Error("Either targetCallback or agentKey is required for generateAndSimulate");
|
|
@@ -130,30 +170,44 @@ export async function generateAndSimulate(params) {
|
|
|
130
170
|
catch (err) {
|
|
131
171
|
throw new Error("Generators module not available. Install generators or provide pre-built datapoints using simulate() instead.", { cause: err });
|
|
132
172
|
}
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
173
|
+
try {
|
|
174
|
+
return await withSimulationSpan("orq.simulation.pipeline", {
|
|
175
|
+
"orq.simulation.evaluation_name": evaluationName,
|
|
176
|
+
"orq.simulation.mode": "generate_and_simulate",
|
|
177
|
+
"orq.simulation.num_personas": numPersonas,
|
|
178
|
+
"orq.simulation.num_scenarios": numScenarios,
|
|
179
|
+
"orq.simulation.max_turns": maxTurns,
|
|
180
|
+
"orq.simulation.parallelism": parallelism,
|
|
181
|
+
}, async (pipelineSpan) => {
|
|
182
|
+
// Generate personas and scenarios in parallel (under the pipeline span)
|
|
183
|
+
const personaGen = new PersonaGenerator({ model });
|
|
184
|
+
const scenarioGen = new ScenarioGenerator({ model });
|
|
185
|
+
const [personas, scenarios] = await Promise.all([
|
|
186
|
+
personaGen.generate({
|
|
187
|
+
agentDescription,
|
|
188
|
+
numPersonas,
|
|
189
|
+
}),
|
|
190
|
+
scenarioGen.generate({
|
|
191
|
+
agentDescription,
|
|
192
|
+
numScenarios,
|
|
193
|
+
}),
|
|
194
|
+
]);
|
|
195
|
+
// Delegate to core logic (no duplicate pipeline span)
|
|
196
|
+
return _simulateCore({
|
|
197
|
+
evaluationName,
|
|
198
|
+
targetCallback: resolvedCallback,
|
|
199
|
+
personas,
|
|
200
|
+
scenarios,
|
|
201
|
+
maxTurns,
|
|
202
|
+
model,
|
|
203
|
+
evaluators,
|
|
204
|
+
parallelism,
|
|
205
|
+
}, pipelineSpan);
|
|
206
|
+
});
|
|
207
|
+
}
|
|
208
|
+
finally {
|
|
209
|
+
await flushTracing();
|
|
210
|
+
}
|
|
157
211
|
}
|
|
158
212
|
// Re-export evaluator utilities for convenience
|
|
159
213
|
export { getAllEvaluators, getEvaluator, SIMULATION_EVALUATORS, } from "../evaluators/index.js";
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenTelemetry tracing utilities for the agent simulation module.
|
|
3
|
+
*
|
|
4
|
+
* Provides span creation helpers that mirror the redteam module's tracing
|
|
5
|
+
* patterns, adapted for the TypeScript simulation module. All functions
|
|
6
|
+
* gracefully degrade to no-ops when tracing is not enabled.
|
|
7
|
+
*
|
|
8
|
+
* Span hierarchy:
|
|
9
|
+
* orq.simulation.pipeline (root)
|
|
10
|
+
* ├── orq.simulation.persona_generation
|
|
11
|
+
* ├── orq.simulation.scenario_generation
|
|
12
|
+
* ├── orq.simulation.run (per datapoint)
|
|
13
|
+
* │ ├── orq.simulation.first_message_generation
|
|
14
|
+
* │ └── orq.simulation.turn (per turn)
|
|
15
|
+
* │ ├── orq.simulation.target_call
|
|
16
|
+
* │ ├── orq.simulation.judge_evaluation
|
|
17
|
+
* │ └── orq.simulation.user_simulator_call
|
|
18
|
+
*/
|
|
19
|
+
import type { Span } from "@opentelemetry/api";
|
|
20
|
+
/**
|
|
21
|
+
* Execute a function within a simulation span (SpanKind.INTERNAL).
|
|
22
|
+
*
|
|
23
|
+
* Gracefully returns `fn(undefined)` when tracing is not enabled.
|
|
24
|
+
* Automatically records errors and sets span status.
|
|
25
|
+
*/
|
|
26
|
+
export declare function withSimulationSpan<T>(name: string, attributes: Record<string, string | number | boolean | undefined> | undefined, fn: (span: Span | undefined) => Promise<T>): Promise<T>;
|
|
27
|
+
export interface LLMSpanOptions {
|
|
28
|
+
model: string;
|
|
29
|
+
operation?: string;
|
|
30
|
+
provider?: string;
|
|
31
|
+
temperature?: number;
|
|
32
|
+
maxTokens?: number;
|
|
33
|
+
purpose?: string;
|
|
34
|
+
}
|
|
35
|
+
/**
|
|
36
|
+
* Execute a function within a GenAI LLM span (SpanKind.CLIENT).
|
|
37
|
+
*
|
|
38
|
+
* Follows OTel GenAI semantic conventions for client inference spans.
|
|
39
|
+
* Span name is derived as `"{operation} {model}"`.
|
|
40
|
+
*/
|
|
41
|
+
export declare function withLLMSpan<T>(options: LLMSpanOptions, fn: (span: Span | undefined) => Promise<T>): Promise<T>;
|
|
42
|
+
export interface TokenUsageAttrs {
|
|
43
|
+
promptTokens?: number;
|
|
44
|
+
completionTokens?: number;
|
|
45
|
+
totalTokens?: number;
|
|
46
|
+
cacheReadInputTokens?: number;
|
|
47
|
+
cacheCreationInputTokens?: number;
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* Record token usage attributes on a span.
|
|
51
|
+
*
|
|
52
|
+
* Sets both OTel GenAI names and bare attribute keys for platform
|
|
53
|
+
* compatibility (matches the redteam module's dual-naming convention).
|
|
54
|
+
*/
|
|
55
|
+
export declare function recordTokenUsage(span: Span | undefined, usage: TokenUsageAttrs): void;
|
|
56
|
+
/**
|
|
57
|
+
* Record LLM input messages on a span.
|
|
58
|
+
*
|
|
59
|
+
* Sets both `gen_ai.input.messages` (OTel GenAI convention) and `input`
|
|
60
|
+
* (platform fallback), matching the redteam module's dual-attribute pattern.
|
|
61
|
+
* Suppressed when `OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=false`.
|
|
62
|
+
*/
|
|
63
|
+
export declare function recordLLMInput(span: Span | undefined, messages: Array<{
|
|
64
|
+
role: string;
|
|
65
|
+
content: string;
|
|
66
|
+
}>): void;
|
|
67
|
+
/**
|
|
68
|
+
* Record a single LLM output string on a span.
|
|
69
|
+
*
|
|
70
|
+
* Sets `gen_ai.output.messages` and `output` (platform fallback). Suppressed
|
|
71
|
+
* when `OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=false`.
|
|
72
|
+
*/
|
|
73
|
+
export declare function recordLLMOutput(span: Span | undefined, output: string): void;
|
|
74
|
+
/**
|
|
75
|
+
* Record LLM response attributes on a span from an OpenAI-compatible response.
|
|
76
|
+
*
|
|
77
|
+
* Sets `gen_ai.output.messages` and `output` with the response content,
|
|
78
|
+
* plus token usage, finish reasons, and response metadata.
|
|
79
|
+
*/
|
|
80
|
+
export declare function recordLLMResponse(span: Span | undefined, response: {
|
|
81
|
+
id?: string;
|
|
82
|
+
model?: string;
|
|
83
|
+
usage?: {
|
|
84
|
+
prompt_tokens: number;
|
|
85
|
+
completion_tokens: number;
|
|
86
|
+
total_tokens: number;
|
|
87
|
+
prompt_tokens_details?: {
|
|
88
|
+
cached_tokens?: number;
|
|
89
|
+
} | null;
|
|
90
|
+
} | null;
|
|
91
|
+
choices?: Array<{
|
|
92
|
+
finish_reason?: string | null;
|
|
93
|
+
message?: {
|
|
94
|
+
role?: string;
|
|
95
|
+
content?: string | null;
|
|
96
|
+
};
|
|
97
|
+
}>;
|
|
98
|
+
}): void;
|
|
99
|
+
/**
|
|
100
|
+
* Batch set multiple attributes on a span. Skips undefined values.
|
|
101
|
+
*/
|
|
102
|
+
export declare function setSpanAttrs(span: Span | undefined, attrs: Record<string, string | number | boolean | undefined>): void;
|
|
103
|
+
/**
|
|
104
|
+
* Get W3C trace context headers (traceparent/tracestate) for the current
|
|
105
|
+
* active span. Returns an empty object when tracing is not available.
|
|
106
|
+
*
|
|
107
|
+
* Used to propagate trace context into outgoing HTTP requests so the
|
|
108
|
+
* router can create child spans under the current simulation span.
|
|
109
|
+
*/
|
|
110
|
+
export declare function getTraceContextHeaders(): Promise<Record<string, string>>;
|
|
111
|
+
//# sourceMappingURL=tracing.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tracing.d.ts","sourceRoot":"","sources":["../../../../src/lib/integrations/simulation/tracing.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;GAiBG;AAEH,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,oBAAoB,CAAC;AAQ/C;;;;;GAKG;AACH,wBAAsB,kBAAkB,CAAC,CAAC,EACxC,IAAI,EAAE,MAAM,EACZ,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,GAAG,OAAO,GAAG,SAAS,CAAC,GAAG,SAAS,EAC7E,EAAE,EAAE,CAAC,IAAI,EAAE,IAAI,GAAG,SAAS,KAAK,OAAO,CAAC,CAAC,CAAC,GACzC,OAAO,CAAC,CAAC,CAAC,CAgDZ;AAMD,MAAM,WAAW,cAAc;IAC7B,KAAK,EAAE,MAAM,CAAC;IACd,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED;;;;;GAKG;AACH,wBAAsB,WAAW,CAAC,CAAC,EACjC,OAAO,EAAE,cAAc,EACvB,EAAE,EAAE,CAAC,IAAI,EAAE,IAAI,GAAG,SAAS,KAAK,OAAO,CAAC,CAAC,CAAC,GACzC,OAAO,CAAC,CAAC,CAAC,CA4DZ;AAMD,MAAM,WAAW,eAAe;IAC9B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAC9B,wBAAwB,CAAC,EAAE,MAAM,CAAC;CACnC;AAED;;;;;GAKG;AACH,wBAAgB,gBAAgB,CAC9B,IAAI,EAAE,IAAI,GAAG,SAAS,EACtB,KAAK,EAAE,eAAe,GACrB,IAAI,CAiCN;AAgCD;;;;;;GAMG;AACH,wBAAgB,cAAc,CAC5B,IAAI,EAAE,IAAI,GAAG,SAAS,EACtB,QAAQ,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,OAAO,EAAE,MAAM,CAAA;CAAE,CAAC,GACjD,IAAI,CAON;AAED;;;;;GAKG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,IAAI,GAAG,SAAS,EAAE,MAAM,EAAE,MAAM,GAAG,IAAI,CAS5E;AAED;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAC/B,IAAI,EAAE,IAAI,GAAG,SAAS,EACtB,QAAQ,EAAE;IACR,EAAE,CAAC,EAAE,MAAM,CAAC;IACZ,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE;QACN,aAAa,EAAE,MAAM,CAAC;QACtB,iBAAiB,EAAE,MAAM,CAAC;QAC1B,YAAY,EAAE,MAAM,CAAC;QACrB,qBAAqB,CAAC,EAAE;YAAE,aAAa,CAAC,EAAE,MAAM,CAAA;SAAE,GAAG,IAAI,CAAC;KAC3D,GAAG,IAAI,CAAC;IACT,OAAO,CAAC,EAAE,KAAK,CAAC;QACd,aAAa,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;QAC9B,OAAO,CAAC,EAAE;YAAE,IAAI,CAAC,EAAE,MAAM,CAAC;YAAC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,CAAA;SAAE,CAAC;KACtD,CAAC,CAAC;CACJ,GACA,IAAI,CAwCN;AAMD;;GAEG;AACH,wBAAgB,YAAY,CAC1B,IAAI,EAAE,IAAI,GAAG,SAAS,EACtB,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,GAAG,OAAO,GAAG,SAAS,CAAC,GAC3D,IAAI,CAON;AAED;;;;;;GAMG;AACH,wBAAsB,sBAAsB,IAAI,OAAO,CACrD,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CACvB,CASA"}
|