@orq-ai/evaluatorq 1.3.1 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/integrations/simulation/adapters.d.ts +28 -5
- package/dist/lib/integrations/simulation/adapters.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/adapters.js +113 -7
- package/dist/lib/integrations/simulation/agents/base.d.ts +3 -0
- package/dist/lib/integrations/simulation/agents/base.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/agents/base.js +104 -82
- package/dist/lib/integrations/simulation/agents/judge.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/agents/judge.js +1 -0
- package/dist/lib/integrations/simulation/agents/user-simulator.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/agents/user-simulator.js +4 -1
- package/dist/lib/integrations/simulation/generators/first-message-generator.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/generators/first-message-generator.js +51 -28
- package/dist/lib/integrations/simulation/generators/persona-generator.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/generators/persona-generator.js +144 -102
- package/dist/lib/integrations/simulation/generators/scenario-generator.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/generators/scenario-generator.js +274 -169
- package/dist/lib/integrations/simulation/index.d.ts +1 -1
- package/dist/lib/integrations/simulation/index.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/index.js +1 -1
- package/dist/lib/integrations/simulation/runner/simulation.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/runner/simulation.js +147 -85
- package/dist/lib/integrations/simulation/simulation/index.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/simulation/index.js +81 -27
- package/dist/lib/integrations/simulation/tracing.d.ts +111 -0
- package/dist/lib/integrations/simulation/tracing.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/tracing.js +310 -0
- package/dist/lib/integrations/simulation/wrap-agent.js +2 -2
- package/dist/tsconfig.lib.tsbuildinfo +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenTelemetry tracing utilities for the agent simulation module.
|
|
3
|
+
*
|
|
4
|
+
* Provides span creation helpers that mirror the redteam module's tracing
|
|
5
|
+
* patterns, adapted for the TypeScript simulation module. All functions
|
|
6
|
+
* gracefully degrade to no-ops when tracing is not enabled.
|
|
7
|
+
*
|
|
8
|
+
* Span hierarchy:
|
|
9
|
+
* orq.simulation.pipeline (root)
|
|
10
|
+
* ├── orq.simulation.persona_generation
|
|
11
|
+
* ├── orq.simulation.scenario_generation
|
|
12
|
+
* ├── orq.simulation.run (per datapoint)
|
|
13
|
+
* │ ├── orq.simulation.first_message_generation
|
|
14
|
+
* │ └── orq.simulation.turn (per turn)
|
|
15
|
+
* │ ├── orq.simulation.target_call
|
|
16
|
+
* │ ├── orq.simulation.judge_evaluation
|
|
17
|
+
* │ └── orq.simulation.user_simulator_call
|
|
18
|
+
*/
|
|
19
|
+
import { getTracer } from "../../tracing/setup.js";
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
// Internal span: orq.simulation.*
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
/**
 * Run `fn` inside an active span named `name`.
 *
 * Falls back to plain `fn(undefined)` when no tracer is configured or when
 * `@opentelemetry/api` cannot be loaded. No explicit span kind is passed, so
 * the tracer's default kind applies.
 *
 * On success the span status is set to OK. On failure the span status is set
 * to ERROR, the exception is recorded, `error.type` is set for `Error`
 * instances, and the original error is re-thrown. The span is always ended.
 *
 * @param {string} name - Span name.
 * @param {object} [attributes] - Initial attributes; `undefined` values are dropped.
 * @param {Function} fn - Callback receiving the span (or `undefined`).
 * @returns {Promise<*>} Whatever `fn` resolves to.
 */
export async function withSimulationSpan(name, attributes, fn) {
    const tracer = getTracer();
    if (!tracer) {
        return fn(undefined);
    }
    let otelApi;
    try {
        otelApi = await import("@opentelemetry/api");
    }
    catch {
        // OTEL not available, run without span
        return fn(undefined);
    }
    const { SpanStatusCode } = otelApi;
    // Copy only the attributes that actually carry a value.
    const definedAttrs = {};
    if (attributes) {
        for (const key of Object.keys(attributes)) {
            const value = attributes[key];
            if (value !== undefined) {
                definedAttrs[key] = value;
            }
        }
    }
    const runInSpan = async (span) => {
        try {
            const outcome = await fn(span);
            span.setStatus({ code: SpanStatusCode.OK });
            return outcome;
        }
        catch (error) {
            const isError = error instanceof Error;
            span.setStatus({
                code: SpanStatusCode.ERROR,
                message: isError ? error.message : String(error),
            });
            span.recordException(isError ? error : new Error(String(error)));
            if (isError) {
                span.setAttribute("error.type", error.constructor.name);
            }
            throw error;
        }
        finally {
            span.end();
        }
    };
    return tracer.startActiveSpan(name, { attributes: definedAttrs }, runInSpan);
}
|
|
72
|
+
/**
 * Run `fn` inside an active GenAI client span (SpanKind.CLIENT).
 *
 * The span is named `"{operation} {model}"`, where the operation defaults to
 * `"chat"`. The provider comes from `options.provider` or, when that is
 * nullish, from `deriveProvider(options.model)`. Temperature, max tokens and
 * purpose are attached as attributes only when supplied.
 *
 * Falls back to plain `fn(undefined)` when no tracer is configured or when
 * `@opentelemetry/api` cannot be loaded. Errors thrown by `fn` are recorded
 * on the span and re-thrown; the span is always ended.
 *
 * @param {object} options - `model`, plus optional `operation`, `provider`,
 *   `temperature`, `maxTokens`, `purpose`.
 * @param {Function} fn - Callback receiving the span (or `undefined`).
 * @returns {Promise<*>} Whatever `fn` resolves to.
 */
export async function withLLMSpan(options, fn) {
    const tracer = getTracer();
    if (!tracer) {
        return fn(undefined);
    }
    let otelApi;
    try {
        otelApi = await import("@opentelemetry/api");
    }
    catch {
        return fn(undefined);
    }
    const { SpanKind, SpanStatusCode } = otelApi;
    const operation = options.operation ?? "chat";
    const provider = options.provider ?? deriveProvider(options.model);
    const spanName = `${operation} ${options.model}`;
    const spanAttributes = {
        "gen_ai.operation.name": operation,
        "gen_ai.system": provider,
        "gen_ai.provider.name": provider,
        "gen_ai.request.model": options.model,
    };
    const { temperature, maxTokens, purpose } = options;
    if (temperature !== undefined) {
        spanAttributes["gen_ai.request.temperature"] = temperature;
    }
    if (maxTokens !== undefined) {
        spanAttributes["gen_ai.request.max_tokens"] = maxTokens;
    }
    if (purpose) {
        spanAttributes["orq.simulation.llm_purpose"] = purpose;
    }
    const runInSpan = async (span) => {
        try {
            const outcome = await fn(span);
            span.setStatus({ code: SpanStatusCode.OK });
            return outcome;
        }
        catch (error) {
            const isError = error instanceof Error;
            span.setStatus({
                code: SpanStatusCode.ERROR,
                message: isError ? error.message : String(error),
            });
            span.recordException(isError ? error : new Error(String(error)));
            if (isError) {
                span.setAttribute("error.type", error.constructor.name);
            }
            throw error;
        }
        finally {
            span.end();
        }
    };
    return tracer.startActiveSpan(spanName, { kind: SpanKind.CLIENT, attributes: spanAttributes }, runInSpan);
}
|
|
131
|
+
/**
 * Write token usage counters onto a span.
 *
 * Missing prompt/completion counts default to 0 and a missing total defaults
 * to their sum. The counts are written under the OTel GenAI names first,
 * then the optional cache counters, then a set of alias keys kept for
 * platform compatibility (matches the redteam module's dual-naming convention).
 *
 * @param {object} [span] - Span to annotate; nothing happens when falsy.
 * @param {object} usage - `promptTokens`, `completionTokens`, `totalTokens`,
 *   `cacheReadInputTokens`, `cacheCreationInputTokens` (all optional).
 */
export function recordTokenUsage(span, usage) {
    if (!span) {
        return;
    }
    const inputTokens = usage.promptTokens ?? 0;
    const outputTokens = usage.completionTokens ?? 0;
    const totalTokens = usage.totalTokens ?? inputTokens + outputTokens;
    const writeAll = (pairs) => {
        for (const [key, value] of pairs) {
            span.setAttribute(key, value);
        }
    };
    // OTel GenAI semantic convention names
    writeAll([
        ["gen_ai.usage.input_tokens", inputTokens],
        ["gen_ai.usage.output_tokens", outputTokens],
        ["gen_ai.usage.total_tokens", totalTokens],
    ]);
    if (usage.cacheReadInputTokens !== undefined) {
        span.setAttribute("gen_ai.usage.cache_read.input_tokens", usage.cacheReadInputTokens);
    }
    if (usage.cacheCreationInputTokens !== undefined) {
        span.setAttribute("gen_ai.usage.cache_creation.input_tokens", usage.cacheCreationInputTokens);
    }
    // Aliases for platform compatibility
    writeAll([
        ["gen_ai.usage.prompt_tokens", inputTokens],
        ["gen_ai.usage.completion_tokens", outputTokens],
        ["prompt_tokens", inputTokens],
        ["completion_tokens", outputTokens],
        ["input_tokens", inputTokens],
        ["output_tokens", outputTokens],
        ["total_tokens", totalTokens],
    ]);
}
|
|
162
|
+
// Max content length per message to avoid oversized spans (matches redteam)
const MAX_CONTENT_LEN = 2000;
/**
 * Cap a message's text content at MAX_CONTENT_LEN characters.
 *
 * Strings longer than the cap are cut and suffixed with an ellipsis ("…").
 * Non-string content (for example `null`/`undefined`, or structured content
 * parts) is returned untouched: reading `.length` on it would otherwise
 * throw for nullish values, and slicing an array would coerce it to a string.
 *
 * @param {*} text - Message content, normally a string.
 * @returns {*} The possibly shortened string, or the input as-is when it is
 *   not a string.
 */
function truncate(text) {
    if (typeof text !== "string") {
        // Tracing must never break the traced call; leave unknown shapes alone.
        return text;
    }
    if (text.length <= MAX_CONTENT_LEN) {
        return text;
    }
    return `${text.slice(0, MAX_CONTENT_LEN)}…`;
}
|
|
169
|
+
/**
 * Turn a list of chat messages into a JSON string for use as a span
 * attribute. Only `role` and `content` are kept, and each content value is
 * passed through `truncate`.
 *
 * @param {Array<object>} messages - Chat messages with `role` and `content`.
 * @returns {string} JSON array of `{ role, content }` objects.
 */
function serializeMessages(messages) {
    const slimmed = messages.map(({ role, content }) => ({
        role,
        content: truncate(content),
    }));
    return JSON.stringify(slimmed);
}
|
|
175
|
+
/**
 * Decide whether message content may be attached to spans.
 *
 * The OTel GenAI semconv treats `gen_ai.input.messages` and
 * `gen_ai.output.messages` as opt-in because they may carry PII. This reads
 * `OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT`: when the variable is
 * unset, capture stays enabled so the platform UI keeps working; when it is
 * set, only "true" (any letter case) or "1" enables capture.
 *
 * @returns {boolean} `true` when message content should be recorded.
 */
function captureMessageContent() {
    const raw = process.env.OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT;
    return raw === undefined || raw.toLowerCase() === "true" || raw === "1";
}
|
|
186
|
+
/**
 * Attach the LLM input messages to a span.
 *
 * The serialized messages are written to both `gen_ai.input.messages`
 * (OTel GenAI convention) and `input` (platform fallback), matching the
 * redteam module's dual-attribute pattern. Nothing is written when the span
 * is missing, the message list is empty, or content capture is disabled via
 * `OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=false`.
 *
 * @param {object} [span] - Span to annotate.
 * @param {Array<object>} messages - Chat messages sent to the model.
 */
export function recordLLMInput(span, messages) {
    const nothingToRecord = !span || messages.length === 0;
    if (nothingToRecord || !captureMessageContent()) {
        return;
    }
    const payload = serializeMessages(messages);
    for (const key of ["gen_ai.input.messages", "input"]) {
        span.setAttribute(key, payload);
    }
}
|
|
202
|
+
/**
 * Attach a single LLM output string to a span.
 *
 * The output is wrapped as one `assistant` message and written to both
 * `gen_ai.output.messages` and `output` (platform fallback). Nothing is
 * written when the span is missing, the output is empty, or content capture
 * is disabled via `OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=false`.
 *
 * @param {object} [span] - Span to annotate.
 * @param {string} output - Text produced by the model.
 */
export function recordLLMOutput(span, output) {
    if (!span || !output || !captureMessageContent()) {
        return;
    }
    const assistantMessage = { role: "assistant", content: output };
    const payload = serializeMessages([assistantMessage]);
    span.setAttribute("gen_ai.output.messages", payload);
    span.setAttribute("output", payload);
}
|
|
219
|
+
/**
 * Copy the interesting parts of an OpenAI-compatible response onto a span.
 *
 * In order: response id and model (when present), token usage (via
 * `recordTokenUsage`), the output messages under `gen_ai.output.messages`
 * and `output` (only when content capture is enabled and at least one choice
 * has content), and finally the non-empty finish reasons.
 *
 * @param {object} [span] - Span to annotate; nothing happens when falsy.
 * @param {object} response - Response with optional `id`, `model`, `usage`
 *   and `choices`.
 */
export function recordLLMResponse(span, response) {
    if (!span) {
        return;
    }
    const { id, model, usage } = response;
    if (id) {
        span.setAttribute("gen_ai.response.id", id);
    }
    if (model) {
        span.setAttribute("gen_ai.response.model", model);
    }
    if (usage) {
        recordTokenUsage(span, {
            promptTokens: usage.prompt_tokens,
            completionTokens: usage.completion_tokens,
            totalTokens: usage.total_tokens,
            cacheReadInputTokens: usage.prompt_tokens_details?.cached_tokens,
        });
    }
    // Record output content (dual-attribute pattern). Opt-in per GenAI semconv.
    if (captureMessageContent()) {
        const withContent = response.choices?.filter((choice) => choice.message?.content);
        const outputMessages = withContent?.map((choice) => ({
            role: choice.message?.role ?? "assistant",
            content: choice.message?.content ?? "",
        }));
        if (outputMessages && outputMessages.length > 0) {
            const payload = serializeMessages(outputMessages);
            span.setAttribute("gen_ai.output.messages", payload);
            span.setAttribute("output", payload);
        }
    }
    const allReasons = response.choices?.map((choice) => choice.finish_reason);
    const finishReasons = allReasons?.filter((reason) => Boolean(reason));
    if (finishReasons && finishReasons.length > 0) {
        span.setAttribute("gen_ai.response.finish_reasons", finishReasons);
    }
}
|
|
263
|
+
// ---------------------------------------------------------------------------
|
|
264
|
+
// Attribute helpers
|
|
265
|
+
// ---------------------------------------------------------------------------
|
|
266
|
+
/**
 * Set several attributes on a span in one call.
 *
 * Entries whose value is `undefined` are skipped; every other value
 * (including `null`) is forwarded to `span.setAttribute`.
 *
 * @param {object} [span] - Span to annotate; nothing happens when falsy.
 * @param {object} attrs - Attribute name/value pairs.
 */
export function setSpanAttrs(span, attrs) {
    if (!span) {
        return;
    }
    Object.entries(attrs)
        .filter(([, value]) => value !== undefined)
        .forEach(([key, value]) => {
            span.setAttribute(key, value);
        });
}
|
|
278
|
+
/**
 * Build trace-context propagation headers for the currently active context.
 *
 * Loads `@opentelemetry/api` on demand and asks its propagation API to inject
 * the active context into a fresh object. Any failure (module missing,
 * injection error) yields an empty object instead of an exception.
 *
 * Used to propagate trace context into outgoing HTTP requests so the
 * router can create child spans under the current simulation span.
 *
 * @returns {Promise<object>} Header name/value pairs, possibly empty.
 */
export async function getTraceContextHeaders() {
    const carrier = {};
    try {
        const otelApi = await import("@opentelemetry/api");
        otelApi.propagation.inject(otelApi.context.active(), carrier);
    }
    catch {
        // Best effort: never hand back a partially filled carrier.
        return {};
    }
    return carrier;
}
|
|
296
|
+
// ---------------------------------------------------------------------------
|
|
297
|
+
// Helpers
|
|
298
|
+
// ---------------------------------------------------------------------------
|
|
299
|
+
// OTel GenAI semconv `gen_ai.system` enum values. The router uses prefixes
// like "azure/" that don't map 1:1 to the spec — translate the known ones.
// The table has a null prototype so that prefixes such as "constructor" or
// "toString" cannot resolve to inherited Object.prototype members.
const PROVIDER_ALIASES = Object.freeze(Object.assign(Object.create(null), {
    azure: "azure.ai.openai",
}));
/**
 * Derive the provider name from a model identifier.
 *
 * For a `"prefix/model"` identifier, the text before the first slash is the
 * provider, translated through PROVIDER_ALIASES when a mapping exists.
 * Identifiers without a slash are attributed to "openai".
 *
 * @param {string} model - Model identifier, optionally provider-prefixed.
 * @returns {string} Provider name for the `gen_ai.system` attribute.
 */
function deriveProvider(model) {
    if (model.includes("/")) {
        const prefix = model.split("/")[0];
        return PROVIDER_ALIASES[prefix] ?? prefix;
    }
    return "openai";
}
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Follows the same pattern as wrapAISdkAgent() and wrapLangChainAgent().
|
|
5
5
|
*/
|
|
6
|
-
import {
|
|
6
|
+
import { fromOrqAgent } from "./adapters.js";
|
|
7
7
|
import { toOpenResponses } from "./convert.js";
|
|
8
8
|
import { simulate } from "./simulation/index.js";
|
|
9
9
|
/**
|
|
@@ -51,7 +51,7 @@ export function wrapSimulationAgent(options) {
|
|
|
51
51
|
// Resolve the target callback
|
|
52
52
|
let resolvedCallback = targetCallback;
|
|
53
53
|
if (!resolvedCallback && agentKey) {
|
|
54
|
-
resolvedCallback =
|
|
54
|
+
resolvedCallback = fromOrqAgent(agentKey);
|
|
55
55
|
}
|
|
56
56
|
if (!resolvedCallback) {
|
|
57
57
|
throw new Error("wrapSimulationAgent requires either targetCallback or agentKey");
|