@orq-ai/evaluatorq 1.3.1 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/integrations/simulation/adapters.d.ts +28 -5
- package/dist/lib/integrations/simulation/adapters.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/adapters.js +113 -7
- package/dist/lib/integrations/simulation/agents/base.d.ts +3 -0
- package/dist/lib/integrations/simulation/agents/base.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/agents/base.js +104 -82
- package/dist/lib/integrations/simulation/agents/judge.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/agents/judge.js +1 -0
- package/dist/lib/integrations/simulation/agents/user-simulator.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/agents/user-simulator.js +4 -1
- package/dist/lib/integrations/simulation/generators/first-message-generator.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/generators/first-message-generator.js +51 -28
- package/dist/lib/integrations/simulation/generators/persona-generator.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/generators/persona-generator.js +144 -102
- package/dist/lib/integrations/simulation/generators/scenario-generator.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/generators/scenario-generator.js +274 -169
- package/dist/lib/integrations/simulation/index.d.ts +1 -1
- package/dist/lib/integrations/simulation/index.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/index.js +1 -1
- package/dist/lib/integrations/simulation/runner/simulation.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/runner/simulation.js +147 -85
- package/dist/lib/integrations/simulation/simulation/index.d.ts.map +1 -1
- package/dist/lib/integrations/simulation/simulation/index.js +81 -27
- package/dist/lib/integrations/simulation/tracing.d.ts +111 -0
- package/dist/lib/integrations/simulation/tracing.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/tracing.js +310 -0
- package/dist/lib/integrations/simulation/wrap-agent.js +2 -2
- package/dist/tsconfig.lib.tsbuildinfo +1 -1
- package/package.json +1 -1
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
* Generates test scenarios from agent descriptions and optional context.
|
|
5
5
|
*/
|
|
6
6
|
import OpenAI from "openai";
|
|
7
|
+
import { getTraceContextHeaders, recordLLMInput, recordLLMResponse, withLLMSpan, withSimulationSpan, } from "../tracing.js";
|
|
7
8
|
import { extractJsonFromResponse } from "../utils/extract-json.js";
|
|
8
9
|
import { delimit } from "../utils/sanitize.js";
|
|
9
10
|
// Temperature settings for different generation modes
|
|
@@ -225,9 +226,13 @@ export class ScenarioGenerator {
|
|
|
225
226
|
* Generate scenarios for agent testing.
|
|
226
227
|
*/
|
|
227
228
|
async generate(params) {
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
229
|
+
return withSimulationSpan("orq.simulation.scenario_generation", {
|
|
230
|
+
"orq.simulation.num_scenarios": params.numScenarios ?? 10,
|
|
231
|
+
"orq.simulation.model": this.model,
|
|
232
|
+
}, async (_span) => {
|
|
233
|
+
const { agentDescription, context = "", numScenarios = 10, edgeCasePercentage = 0.3, } = params;
|
|
234
|
+
const numEdgeCases = Math.floor(numScenarios * edgeCasePercentage);
|
|
235
|
+
const userPrompt = `Agent Description: ${delimit(agentDescription)}
|
|
231
236
|
|
|
232
237
|
Additional Context: ${delimit(context || "None provided")}
|
|
233
238
|
|
|
@@ -238,52 +243,73 @@ Generate ${numScenarios} diverse test scenarios for this agent.
|
|
|
238
243
|
- Each scenario should have clear success/failure criteria
|
|
239
244
|
|
|
240
245
|
Return ONLY a JSON array, no other text.`;
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
model: this.model,
|
|
244
|
-
messages: [
|
|
246
|
+
try {
|
|
247
|
+
const genMessages = [
|
|
245
248
|
{ role: "system", content: SCENARIO_GENERATOR_PROMPT },
|
|
246
249
|
{ role: "user", content: userPrompt },
|
|
247
|
-
]
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
250
|
+
];
|
|
251
|
+
const response = await withLLMSpan({
|
|
252
|
+
model: this.model,
|
|
253
|
+
temperature: TEMPERATURE_CREATIVE,
|
|
254
|
+
maxTokens: 6000,
|
|
255
|
+
purpose: "scenario_generation",
|
|
256
|
+
}, async (llmSpan) => {
|
|
257
|
+
recordLLMInput(llmSpan, [
|
|
258
|
+
{ role: "system", content: SCENARIO_GENERATOR_PROMPT },
|
|
259
|
+
{ role: "user", content: userPrompt },
|
|
260
|
+
]);
|
|
261
|
+
const traceHeaders = await getTraceContextHeaders();
|
|
262
|
+
const res = await this.client.chat.completions.create({
|
|
263
|
+
model: this.model,
|
|
264
|
+
messages: genMessages,
|
|
265
|
+
temperature: TEMPERATURE_CREATIVE,
|
|
266
|
+
max_tokens: 6000,
|
|
267
|
+
}, { headers: traceHeaders });
|
|
268
|
+
recordLLMResponse(llmSpan, res);
|
|
269
|
+
return res;
|
|
270
|
+
});
|
|
271
|
+
const content = response.choices[0]?.message.content ?? "[]";
|
|
272
|
+
const extracted = extractJsonFromResponse(content);
|
|
273
|
+
const scenarioDicts = parseJsonArray(extracted);
|
|
274
|
+
const scenarios = parseScenarios(scenarioDicts);
|
|
275
|
+
if (scenarios.length < numScenarios) {
|
|
276
|
+
console.warn(`ScenarioGenerator: requested ${numScenarios} scenarios but only ${scenarios.length} were successfully parsed`);
|
|
277
|
+
}
|
|
278
|
+
return scenarios;
|
|
257
279
|
}
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
280
|
+
catch (e) {
|
|
281
|
+
if (e instanceof SyntaxError) {
|
|
282
|
+
console.warn(`ScenarioGenerator: requested ${numScenarios} scenarios but LLM response was not valid JSON — returning empty array`);
|
|
283
|
+
return [];
|
|
284
|
+
}
|
|
285
|
+
throw e;
|
|
264
286
|
}
|
|
265
|
-
|
|
266
|
-
}
|
|
287
|
+
});
|
|
267
288
|
}
|
|
268
289
|
/**
|
|
269
290
|
* Generate scenarios with guaranteed emotion and criteria coverage.
|
|
270
291
|
*/
|
|
271
292
|
async generateWithCoverage(params) {
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
"
|
|
275
|
-
"
|
|
276
|
-
|
|
277
|
-
"
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
293
|
+
return withSimulationSpan("orq.simulation.scenario_generation", {
|
|
294
|
+
"orq.simulation.num_scenarios": params.numScenarios ?? 6,
|
|
295
|
+
"orq.simulation.mode": "coverage",
|
|
296
|
+
"orq.simulation.model": this.model,
|
|
297
|
+
}, async (_span) => {
|
|
298
|
+
const { agentDescription, context = "", numScenarios = 6, edgeCasePercentage = 0.3, } = params;
|
|
299
|
+
const emotions = [
|
|
300
|
+
"neutral",
|
|
301
|
+
"frustrated",
|
|
302
|
+
"confused",
|
|
303
|
+
"happy",
|
|
304
|
+
"urgent",
|
|
305
|
+
];
|
|
306
|
+
const numEdgeCases = Math.floor(numScenarios * edgeCasePercentage);
|
|
307
|
+
const coverageInstructions = Array.from({ length: numScenarios }, (_, i) => {
|
|
308
|
+
const emotion = emotions[i % emotions.length];
|
|
309
|
+
const edgeLabel = i < numEdgeCases ? " (edge case)" : "";
|
|
310
|
+
return `- Scenario ${i + 1}: starting_emotion='${emotion}'${edgeLabel}`;
|
|
311
|
+
}).join("\n");
|
|
312
|
+
const userPrompt = `Agent Description: ${delimit(agentDescription)}
|
|
287
313
|
|
|
288
314
|
Additional Context: ${delimit(context || "None provided")}
|
|
289
315
|
|
|
@@ -298,39 +324,55 @@ Additional requirements:
|
|
|
298
324
|
- Cover different types of user requests
|
|
299
325
|
|
|
300
326
|
Return ONLY a JSON array, no other text.`;
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
model: this.model,
|
|
304
|
-
messages: [
|
|
327
|
+
try {
|
|
328
|
+
const covMessages = [
|
|
305
329
|
{ role: "system", content: SCENARIO_GENERATOR_PROMPT },
|
|
306
330
|
{ role: "user", content: userPrompt },
|
|
307
|
-
]
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
331
|
+
];
|
|
332
|
+
const response = await withLLMSpan({
|
|
333
|
+
model: this.model,
|
|
334
|
+
temperature: TEMPERATURE_BALANCED,
|
|
335
|
+
maxTokens: 6000,
|
|
336
|
+
purpose: "scenario_generation_coverage",
|
|
337
|
+
}, async (llmSpan) => {
|
|
338
|
+
recordLLMInput(llmSpan, [
|
|
339
|
+
{ role: "system", content: SCENARIO_GENERATOR_PROMPT },
|
|
340
|
+
{ role: "user", content: userPrompt },
|
|
341
|
+
]);
|
|
342
|
+
const traceHeaders = await getTraceContextHeaders();
|
|
343
|
+
const res = await this.client.chat.completions.create({
|
|
344
|
+
model: this.model,
|
|
345
|
+
messages: covMessages,
|
|
346
|
+
temperature: TEMPERATURE_BALANCED,
|
|
347
|
+
max_tokens: 6000,
|
|
348
|
+
}, { headers: traceHeaders });
|
|
349
|
+
recordLLMResponse(llmSpan, res);
|
|
350
|
+
return res;
|
|
351
|
+
});
|
|
352
|
+
const content = response.choices[0]?.message.content ?? "[]";
|
|
353
|
+
const extracted = extractJsonFromResponse(content);
|
|
354
|
+
const scenarioDicts = parseJsonArray(extracted);
|
|
355
|
+
let scenarios = parseScenarios(scenarioDicts);
|
|
356
|
+
// Validate coverage and fill gaps
|
|
357
|
+
scenarios = this.ensureEmotionCoverage(scenarios, emotions);
|
|
358
|
+
scenarios = this.ensureCriteriaCoverage(scenarios);
|
|
359
|
+
// Trim to requested count (coverage adjustments may have kept extras)
|
|
360
|
+
if (scenarios.length > numScenarios) {
|
|
361
|
+
scenarios = scenarios.slice(0, numScenarios);
|
|
362
|
+
}
|
|
363
|
+
if (scenarios.length < numScenarios) {
|
|
364
|
+
console.warn(`ScenarioGenerator: requested ${numScenarios} scenarios (with coverage) but only ${scenarios.length} were successfully parsed`);
|
|
365
|
+
}
|
|
366
|
+
return scenarios;
|
|
324
367
|
}
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
368
|
+
catch (e) {
|
|
369
|
+
if (e instanceof SyntaxError) {
|
|
370
|
+
console.warn(`ScenarioGenerator: requested ${numScenarios} scenarios but LLM response was not valid JSON — returning empty array`);
|
|
371
|
+
return [];
|
|
372
|
+
}
|
|
373
|
+
throw e;
|
|
331
374
|
}
|
|
332
|
-
|
|
333
|
-
}
|
|
375
|
+
});
|
|
334
376
|
}
|
|
335
377
|
/**
|
|
336
378
|
* Ensure all starting emotions are covered.
|
|
@@ -381,11 +423,16 @@ Return ONLY a JSON array, no other text.`;
|
|
|
381
423
|
* Generate edge case scenarios specifically.
|
|
382
424
|
*/
|
|
383
425
|
async generateEdgeCases(params) {
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
:
|
|
388
|
-
|
|
426
|
+
return withSimulationSpan("orq.simulation.scenario_generation", {
|
|
427
|
+
"orq.simulation.num_scenarios": params.numEdgeCases ?? 5,
|
|
428
|
+
"orq.simulation.mode": "edge_cases",
|
|
429
|
+
"orq.simulation.model": this.model,
|
|
430
|
+
}, async (_span) => {
|
|
431
|
+
const { agentDescription, existingScenarios, numEdgeCases = 5, } = params;
|
|
432
|
+
const existingNames = existingScenarios
|
|
433
|
+
? existingScenarios.map((s) => s.name)
|
|
434
|
+
: [];
|
|
435
|
+
const userPrompt = `Agent Description: ${delimit(agentDescription)}
|
|
389
436
|
|
|
390
437
|
Existing scenarios (avoid duplicating these):
|
|
391
438
|
${delimit(JSON.stringify(existingNames, null, 2))}
|
|
@@ -399,43 +446,64 @@ Generate ${numEdgeCases} EDGE CASE scenarios that:
|
|
|
399
446
|
Each scenario MUST have is_edge_case: true
|
|
400
447
|
|
|
401
448
|
Return ONLY a JSON array, no other text.`;
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
model: this.model,
|
|
405
|
-
messages: [
|
|
449
|
+
try {
|
|
450
|
+
const edgeMessages = [
|
|
406
451
|
{ role: "system", content: SCENARIO_GENERATOR_PROMPT },
|
|
407
452
|
{ role: "user", content: userPrompt },
|
|
408
|
-
]
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
453
|
+
];
|
|
454
|
+
const response = await withLLMSpan({
|
|
455
|
+
model: this.model,
|
|
456
|
+
temperature: TEMPERATURE_EDGE_CASE,
|
|
457
|
+
maxTokens: 4000,
|
|
458
|
+
purpose: "scenario_edge_cases",
|
|
459
|
+
}, async (llmSpan) => {
|
|
460
|
+
recordLLMInput(llmSpan, [
|
|
461
|
+
{ role: "system", content: SCENARIO_GENERATOR_PROMPT },
|
|
462
|
+
{ role: "user", content: userPrompt },
|
|
463
|
+
]);
|
|
464
|
+
const traceHeaders = await getTraceContextHeaders();
|
|
465
|
+
const res = await this.client.chat.completions.create({
|
|
466
|
+
model: this.model,
|
|
467
|
+
messages: edgeMessages,
|
|
468
|
+
temperature: TEMPERATURE_EDGE_CASE,
|
|
469
|
+
max_tokens: 4000,
|
|
470
|
+
}, { headers: traceHeaders });
|
|
471
|
+
recordLLMResponse(llmSpan, res);
|
|
472
|
+
return res;
|
|
473
|
+
});
|
|
474
|
+
const content = response.choices[0]?.message.content ?? "[]";
|
|
475
|
+
const extracted = extractJsonFromResponse(content);
|
|
476
|
+
const scenarioDicts = parseJsonArray(extracted);
|
|
477
|
+
// Force edge case flag
|
|
478
|
+
for (const sDict of scenarioDicts) {
|
|
479
|
+
sDict.is_edge_case = true;
|
|
480
|
+
}
|
|
481
|
+
const scenarios = parseScenarios(scenarioDicts);
|
|
482
|
+
if (scenarios.length < numEdgeCases) {
|
|
483
|
+
console.warn(`ScenarioGenerator: requested ${numEdgeCases} edge cases but only ${scenarios.length} were successfully parsed`);
|
|
484
|
+
}
|
|
485
|
+
return scenarios;
|
|
422
486
|
}
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
487
|
+
catch (e) {
|
|
488
|
+
if (e instanceof SyntaxError) {
|
|
489
|
+
console.warn(`ScenarioGenerator: requested ${numEdgeCases} edge cases but LLM response was not valid JSON — returning empty array`);
|
|
490
|
+
return [];
|
|
491
|
+
}
|
|
492
|
+
throw e;
|
|
429
493
|
}
|
|
430
|
-
|
|
431
|
-
}
|
|
494
|
+
});
|
|
432
495
|
}
|
|
433
496
|
/**
|
|
434
497
|
* Generate boundary/out-of-scope test scenarios.
|
|
435
498
|
*/
|
|
436
499
|
async generateBoundaryScenarios(params) {
|
|
437
|
-
|
|
438
|
-
|
|
500
|
+
return withSimulationSpan("orq.simulation.scenario_generation", {
|
|
501
|
+
"orq.simulation.num_scenarios": params.numScenarios ?? 5,
|
|
502
|
+
"orq.simulation.mode": "boundary",
|
|
503
|
+
"orq.simulation.model": this.model,
|
|
504
|
+
}, async (_span) => {
|
|
505
|
+
const { agentDescription, numScenarios = 5 } = params;
|
|
506
|
+
const userPrompt = `Agent Description: ${delimit(agentDescription)}
|
|
439
507
|
|
|
440
508
|
Generate ${numScenarios} BOUNDARY TEST scenarios that probe the limits of this agent's scope.
|
|
441
509
|
|
|
@@ -448,56 +516,77 @@ Include a mix of:
|
|
|
448
516
|
Each scenario MUST have is_edge_case: true
|
|
449
517
|
|
|
450
518
|
Return ONLY a JSON array, no other text.`;
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
model: this.model,
|
|
454
|
-
messages: [
|
|
519
|
+
try {
|
|
520
|
+
const bndMessages = [
|
|
455
521
|
{ role: "system", content: BOUNDARY_SCENARIO_PROMPT },
|
|
456
522
|
{ role: "user", content: userPrompt },
|
|
457
|
-
]
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
523
|
+
];
|
|
524
|
+
const response = await withLLMSpan({
|
|
525
|
+
model: this.model,
|
|
526
|
+
temperature: TEMPERATURE_EDGE_CASE,
|
|
527
|
+
maxTokens: 4000,
|
|
528
|
+
purpose: "scenario_boundary",
|
|
529
|
+
}, async (llmSpan) => {
|
|
530
|
+
recordLLMInput(llmSpan, [
|
|
531
|
+
{ role: "system", content: BOUNDARY_SCENARIO_PROMPT },
|
|
532
|
+
{ role: "user", content: userPrompt },
|
|
533
|
+
]);
|
|
534
|
+
const traceHeaders = await getTraceContextHeaders();
|
|
535
|
+
const res = await this.client.chat.completions.create({
|
|
536
|
+
model: this.model,
|
|
537
|
+
messages: bndMessages,
|
|
538
|
+
temperature: TEMPERATURE_EDGE_CASE,
|
|
539
|
+
max_tokens: 4000,
|
|
540
|
+
}, { headers: traceHeaders });
|
|
541
|
+
recordLLMResponse(llmSpan, res);
|
|
542
|
+
return res;
|
|
543
|
+
});
|
|
544
|
+
const content = response.choices[0]?.message.content ?? "[]";
|
|
545
|
+
const extracted = extractJsonFromResponse(content);
|
|
546
|
+
const scenarioDicts = parseJsonArray(extracted);
|
|
547
|
+
// Force edge case flag
|
|
548
|
+
for (const sDict of scenarioDicts) {
|
|
549
|
+
sDict.is_edge_case = true;
|
|
550
|
+
}
|
|
551
|
+
const scenarios = parseScenarios(scenarioDicts);
|
|
552
|
+
if (scenarios.length < numScenarios) {
|
|
553
|
+
console.warn(`ScenarioGenerator: requested ${numScenarios} boundary scenarios but only ${scenarios.length} were successfully parsed`);
|
|
554
|
+
}
|
|
555
|
+
return scenarios;
|
|
471
556
|
}
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
557
|
+
catch (e) {
|
|
558
|
+
if (e instanceof SyntaxError) {
|
|
559
|
+
console.warn(`ScenarioGenerator: requested ${numScenarios} boundary scenarios but LLM response was not valid JSON — returning empty array`);
|
|
560
|
+
return [];
|
|
561
|
+
}
|
|
562
|
+
throw e;
|
|
478
563
|
}
|
|
479
|
-
|
|
480
|
-
}
|
|
564
|
+
});
|
|
481
565
|
}
|
|
482
566
|
/**
|
|
483
567
|
* Generate security test scenarios inspired by OWASP attack patterns.
|
|
484
568
|
*/
|
|
485
569
|
async generateSecurityScenarios(params) {
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
570
|
+
return withSimulationSpan("orq.simulation.scenario_generation", {
|
|
571
|
+
"orq.simulation.num_scenarios": params.numScenarios ?? 10,
|
|
572
|
+
"orq.simulation.mode": "security",
|
|
573
|
+
"orq.simulation.model": this.model,
|
|
574
|
+
}, async (_span) => {
|
|
575
|
+
const { agentDescription, seedExamples, categories, numScenarios = 10, } = params;
|
|
576
|
+
let categoryFocus = "";
|
|
577
|
+
if (categories && categories.length > 0) {
|
|
578
|
+
const catNames = categories.map((cat) => {
|
|
579
|
+
const normalized = cat.toUpperCase().replace("OWASP-", "");
|
|
580
|
+
return `OWASP-${normalized}`;
|
|
581
|
+
});
|
|
582
|
+
categoryFocus = `\nFocus on these OWASP categories: ${delimit(catNames.join(", "))}`;
|
|
583
|
+
}
|
|
584
|
+
let seedText = "";
|
|
585
|
+
if (seedExamples && seedExamples.length > 0) {
|
|
586
|
+
const examplesToShow = seedExamples.slice(0, 5);
|
|
587
|
+
seedText = `\n\nUse these attack patterns as INSPIRATION (generate NOVEL variations, not copies):\n${delimit(JSON.stringify(examplesToShow, null, 2))}`;
|
|
588
|
+
}
|
|
589
|
+
const userPrompt = `Agent Description: ${delimit(agentDescription)}
|
|
501
590
|
${categoryFocus}
|
|
502
591
|
${seedText}
|
|
503
592
|
|
|
@@ -511,35 +600,51 @@ Requirements:
|
|
|
511
600
|
- Include conversation_strategy matching the attack style
|
|
512
601
|
|
|
513
602
|
Return ONLY a JSON array, no other text.`;
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
model: this.model,
|
|
517
|
-
messages: [
|
|
603
|
+
try {
|
|
604
|
+
const secMessages = [
|
|
518
605
|
{ role: "system", content: SECURITY_SCENARIO_PROMPT },
|
|
519
606
|
{ role: "user", content: userPrompt },
|
|
520
|
-
]
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
607
|
+
];
|
|
608
|
+
const response = await withLLMSpan({
|
|
609
|
+
model: this.model,
|
|
610
|
+
temperature: TEMPERATURE_EDGE_CASE,
|
|
611
|
+
maxTokens: 6000,
|
|
612
|
+
purpose: "scenario_security",
|
|
613
|
+
}, async (llmSpan) => {
|
|
614
|
+
recordLLMInput(llmSpan, [
|
|
615
|
+
{ role: "system", content: SECURITY_SCENARIO_PROMPT },
|
|
616
|
+
{ role: "user", content: userPrompt },
|
|
617
|
+
]);
|
|
618
|
+
const traceHeaders = await getTraceContextHeaders();
|
|
619
|
+
const res = await this.client.chat.completions.create({
|
|
620
|
+
model: this.model,
|
|
621
|
+
messages: secMessages,
|
|
622
|
+
temperature: TEMPERATURE_EDGE_CASE,
|
|
623
|
+
max_tokens: 6000,
|
|
624
|
+
}, { headers: traceHeaders });
|
|
625
|
+
recordLLMResponse(llmSpan, res);
|
|
626
|
+
return res;
|
|
627
|
+
});
|
|
628
|
+
const content = response.choices[0]?.message.content ?? "[]";
|
|
629
|
+
const extracted = extractJsonFromResponse(content);
|
|
630
|
+
const scenarioDicts = parseJsonArray(extracted);
|
|
631
|
+
// Force edge case flag
|
|
632
|
+
for (const sDict of scenarioDicts) {
|
|
633
|
+
sDict.is_edge_case = true;
|
|
634
|
+
}
|
|
635
|
+
const scenarios = parseScenarios(scenarioDicts);
|
|
636
|
+
if (scenarios.length < numScenarios) {
|
|
637
|
+
console.warn(`ScenarioGenerator: requested ${numScenarios} security scenarios but only ${scenarios.length} were successfully parsed`);
|
|
638
|
+
}
|
|
639
|
+
return scenarios;
|
|
534
640
|
}
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
641
|
+
catch (e) {
|
|
642
|
+
if (e instanceof SyntaxError) {
|
|
643
|
+
console.warn(`ScenarioGenerator: requested ${numScenarios} security scenarios but LLM response was not valid JSON — returning empty array`);
|
|
644
|
+
return [];
|
|
645
|
+
}
|
|
646
|
+
throw e;
|
|
541
647
|
}
|
|
542
|
-
|
|
543
|
-
}
|
|
648
|
+
});
|
|
544
649
|
}
|
|
545
650
|
}
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* import { simulate, wrapSimulationAgent, toOpenResponses } from "@orq-ai/evaluatorq/simulation";
|
|
11
11
|
* ```
|
|
12
12
|
*/
|
|
13
|
-
export { fromChatCompletions, fromOrqDeployment } from "./adapters.js";
|
|
13
|
+
export { fromChatCompletions, fromOrqAgent, fromOrqDeployment, } from "./adapters.js";
|
|
14
14
|
export type { AgentConfig } from "./agents/base.js";
|
|
15
15
|
export { BaseAgent } from "./agents/base.js";
|
|
16
16
|
export { JudgeAgent } from "./agents/judge.js";
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/lib/integrations/simulation/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAGH,OAAO,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/lib/integrations/simulation/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAGH,OAAO,EACL,mBAAmB,EACnB,YAAY,EACZ,iBAAiB,GAClB,MAAM,eAAe,CAAC;AACvB,YAAY,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAEpD,OAAO,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC7C,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAC/C,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC;AAEhE,OAAO,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAC/C,YAAY,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAE9D,OAAO,EACL,gBAAgB,EAChB,YAAY,EACZ,qBAAqB,GACtB,MAAM,uBAAuB,CAAC;AAE/B,OAAO,EACL,kBAAkB,EAClB,qBAAqB,EACrB,gBAAgB,EAChB,iBAAiB,GAClB,MAAM,uBAAuB,CAAC;AAC/B,YAAY,EAAE,gBAAgB,EAAE,MAAM,mCAAmC,CAAC;AAE1E,OAAO,EACL,iBAAiB,EACjB,uBAAuB,EACvB,uBAAuB,GACxB,MAAM,mCAAmC,CAAC;AAC3C,YAAY,EACV,cAAc,EACd,SAAS,EACT,sBAAsB,EACtB,WAAW,GACZ,MAAM,wBAAwB,CAAC;AAEhC,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAI1D,YAAY,EACV,yBAAyB,EACzB,cAAc,GACf,MAAM,uBAAuB,CAAC;AAE/B,OAAO,EAAE,mBAAmB,EAAE,QAAQ,EAAE,MAAM,uBAAuB,CAAC;AAEtE,YAAY,EACV,WAAW,EACX,kBAAkB,EAClB,oBAAoB,EACpB,SAAS,EACT,eAAe,EACf,SAAS,EACT,YAAY,EACZ,WAAW,EACX,QAAQ,EACR,OAAO,IAAI,iBAAiB,EAC5B,OAAO,EACP,QAAQ,EACR,gBAAgB,EAChB,eAAe,EACf,YAAY,EACZ,UAAU,EACV,WAAW,GACZ,MAAM,YAAY,CAAC;AAEpB,OAAO,EACL,uBAAuB,EACvB,oBAAoB,EACpB,uBAAuB,EACvB,cAAc,GACf,MAAM,2BAA2B,CAAC;AACnC,OAAO,EACL,0BAA0B,EAC1B,wBAAwB,EACxB,wBAAwB,EACxB,iBAAiB,GAClB,MAAM,4BAA4B,CAAC;AACpC,YAAY,EAAE,oBAAoB,EAAE,MAAM,iBAAiB,CAAC;AAE5D,OAAO,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAC"}
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
* ```
|
|
12
12
|
*/
|
|
13
13
|
// --- Adapters ---
|
|
14
|
-
export { fromChatCompletions, fromOrqDeployment } from "./adapters.js";
|
|
14
|
+
export { fromChatCompletions, fromOrqAgent, fromOrqDeployment, } from "./adapters.js";
|
|
15
15
|
// --- Agents (advanced usage) ---
|
|
16
16
|
export { BaseAgent } from "./agents/base.js";
|
|
17
17
|
export { JudgeAgent } from "./agents/judge.js";
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"simulation.d.ts","sourceRoot":"","sources":["../../../../../src/lib/integrations/simulation/runner/simulation.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;
|
|
1
|
+
{"version":3,"file":"simulation.d.ts","sourceRoot":"","sources":["../../../../../src/lib/integrations/simulation/runner/simulation.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAaH,OAAO,KAAK,EACV,WAAW,EACX,SAAS,EAGT,OAAO,EACP,QAAQ,EACR,gBAAgB,EAGjB,MAAM,aAAa,CAAC;AAOrB,+CAA+C;AAC/C,MAAM,WAAW,WAAW;IAC1B,OAAO,CAAC,QAAQ,EAAE,WAAW,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;CACnD;AAMD,MAAM,WAAW,sBAAsB;IACrC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,cAAc,CAAC,EAAE,CAAC,QAAQ,EAAE,WAAW,EAAE,KAAK,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IACvE,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,SAAS;IACxB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,SAAS,CAAC,EAAE,SAAS,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,uDAAuD;IACvD,MAAM,CAAC,EAAE,WAAW,CAAC;CACtB;AAED,MAAM,WAAW,cAAc;IAC7B,UAAU,EAAE,SAAS,EAAE,CAAC;IACxB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,wEAAwE;IACxE,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAC9B,mDAAmD;IACnD,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AA8DD,qBAAa,gBAAgB;IAC3B,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAc;IAC3C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAC,CAEF;IAC9B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAS;IAC/B,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;IAClC,OAAO,CAAC,YAAY,CAAuB;gBAE/B,MAAM,EAAE,sBAAsB;IAmB1C,OAAO,CAAC,eAAe;IAgBvB,0FAA0F;IACpF,GAAG,CAAC,MAAM,EAAE,SAAS,GAAG,OAAO,CAAC,gBAAgB,CAAC;IAwRvD,4DAA4D;IACtD,QAAQ,CAAC,MAAM,EAAE,cAAc,GAAG,OAAO,CAAC,gBAAgB,EAAE,CAAC;IA2DnE,4CAA4C;IACtC,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;YAUd,iBAAiB;IAU/B;;;OAGG;IACH,OAAO,CAAC,oBAAoB;YAiBd,cAAc;CAyC7B"}
|