@orq-ai/evaluatorq 1.2.2 → 1.2.3-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/integrations/ai-sdk/index.d.ts +2 -0
- package/dist/lib/integrations/ai-sdk/index.d.ts.map +1 -1
- package/dist/lib/integrations/ai-sdk/index.js +1 -0
- package/dist/lib/integrations/ai-sdk/simulation-adapter.d.ts +47 -0
- package/dist/lib/integrations/ai-sdk/simulation-adapter.d.ts.map +1 -0
- package/dist/lib/integrations/ai-sdk/simulation-adapter.js +58 -0
- package/dist/lib/integrations/langchain/index.d.ts +2 -0
- package/dist/lib/integrations/langchain/index.d.ts.map +1 -1
- package/dist/lib/integrations/langchain/index.js +1 -0
- package/dist/lib/integrations/langchain/simulation-adapter.d.ts +49 -0
- package/dist/lib/integrations/langchain/simulation-adapter.d.ts.map +1 -0
- package/dist/lib/integrations/langchain/simulation-adapter.js +110 -0
- package/dist/lib/integrations/simulation/adapters.d.ts +57 -0
- package/dist/lib/integrations/simulation/adapters.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/adapters.js +64 -0
- package/dist/lib/integrations/simulation/agents/base.d.ts +90 -0
- package/dist/lib/integrations/simulation/agents/base.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/agents/base.js +227 -0
- package/dist/lib/integrations/simulation/agents/index.d.ts +10 -0
- package/dist/lib/integrations/simulation/agents/index.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/agents/index.js +6 -0
- package/dist/lib/integrations/simulation/agents/judge.d.ts +50 -0
- package/dist/lib/integrations/simulation/agents/judge.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/agents/judge.js +313 -0
- package/dist/lib/integrations/simulation/agents/user-simulator.d.ts +41 -0
- package/dist/lib/integrations/simulation/agents/user-simulator.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/agents/user-simulator.js +82 -0
- package/dist/lib/integrations/simulation/convert.d.ts +22 -0
- package/dist/lib/integrations/simulation/convert.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/convert.js +124 -0
- package/dist/lib/integrations/simulation/evaluators/index.d.ts +50 -0
- package/dist/lib/integrations/simulation/evaluators/index.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/evaluators/index.js +100 -0
- package/dist/lib/integrations/simulation/generators/datapoint-generator.d.ts +60 -0
- package/dist/lib/integrations/simulation/generators/datapoint-generator.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/generators/datapoint-generator.js +223 -0
- package/dist/lib/integrations/simulation/generators/first-message-generator.d.ts +38 -0
- package/dist/lib/integrations/simulation/generators/first-message-generator.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/generators/first-message-generator.js +131 -0
- package/dist/lib/integrations/simulation/generators/index.d.ts +15 -0
- package/dist/lib/integrations/simulation/generators/index.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/generators/index.js +10 -0
- package/dist/lib/integrations/simulation/generators/persona-generator.d.ts +60 -0
- package/dist/lib/integrations/simulation/generators/persona-generator.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/generators/persona-generator.js +333 -0
- package/dist/lib/integrations/simulation/generators/scenario-generator.d.ts +77 -0
- package/dist/lib/integrations/simulation/generators/scenario-generator.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/generators/scenario-generator.js +545 -0
- package/dist/lib/integrations/simulation/index.d.ts +33 -0
- package/dist/lib/integrations/simulation/index.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/index.js +35 -0
- package/dist/lib/integrations/simulation/quality/index.d.ts +5 -0
- package/dist/lib/integrations/simulation/quality/index.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/quality/index.js +4 -0
- package/dist/lib/integrations/simulation/quality/message-perturbation.d.ts +25 -0
- package/dist/lib/integrations/simulation/quality/message-perturbation.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/quality/message-perturbation.js +150 -0
- package/dist/lib/integrations/simulation/runner/index.d.ts +5 -0
- package/dist/lib/integrations/simulation/runner/index.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/runner/index.js +4 -0
- package/dist/lib/integrations/simulation/runner/simulation.d.ts +57 -0
- package/dist/lib/integrations/simulation/runner/simulation.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/runner/simulation.js +336 -0
- package/dist/lib/integrations/simulation/schemas.d.ts +104 -0
- package/dist/lib/integrations/simulation/schemas.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/schemas.js +76 -0
- package/dist/lib/integrations/simulation/simulation/index.d.ts +49 -0
- package/dist/lib/integrations/simulation/simulation/index.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/simulation/index.js +159 -0
- package/dist/lib/integrations/simulation/types.d.ts +101 -0
- package/dist/lib/integrations/simulation/types.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/types.js +90 -0
- package/dist/lib/integrations/simulation/utils/dataset-export.d.ts +31 -0
- package/dist/lib/integrations/simulation/utils/dataset-export.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/utils/dataset-export.js +146 -0
- package/dist/lib/integrations/simulation/utils/extract-json.d.ts +17 -0
- package/dist/lib/integrations/simulation/utils/extract-json.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/utils/extract-json.js +106 -0
- package/dist/lib/integrations/simulation/utils/prompt-builders.d.ts +34 -0
- package/dist/lib/integrations/simulation/utils/prompt-builders.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/utils/prompt-builders.js +147 -0
- package/dist/lib/integrations/simulation/utils/sanitize.d.ts +15 -0
- package/dist/lib/integrations/simulation/utils/sanitize.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/utils/sanitize.js +20 -0
- package/dist/lib/integrations/simulation/wrap-agent.d.ts +65 -0
- package/dist/lib/integrations/simulation/wrap-agent.d.ts.map +1 -0
- package/dist/lib/integrations/simulation/wrap-agent.js +140 -0
- package/dist/lib/send-results.d.ts.map +1 -1
- package/dist/lib/send-results.js +17 -2
- package/dist/lib/types.d.ts +2 -2
- package/dist/lib/types.d.ts.map +1 -1
- package/dist/tsconfig.lib.tsbuildinfo +1 -1
- package/package.json +24 -2
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Zod schemas for optional runtime validation of simulation types.
|
|
3
|
+
*
|
|
4
|
+
* Separated from types.ts so that importing the simulation module
|
|
5
|
+
* does not require zod to be installed. Only users who explicitly
|
|
6
|
+
* import these schemas need zod as a dependency.
|
|
7
|
+
*/
|
|
8
|
+
import { z } from "zod";
|
|
9
|
+
/**
 * Builds the validator shared by the four numeric persona traits:
 * a number between 0 and 1 (inclusive) that falls back to 0.5 when omitted.
 */
const personaTraitScore = () => z.number().min(0).max(1).default(0.5);
/**
 * Runtime validator for a simulated persona.
 *
 * Only `name` has no fallback; the numeric traits default to 0.5,
 * `communication_style` to "casual" and `background` to an empty string.
 * `emotional_arc` and `cultural_context` may be left out entirely.
 */
export const PersonaSchema = z.object({
    name: z.string(),
    patience: personaTraitScore(),
    assertiveness: personaTraitScore(),
    politeness: personaTraitScore(),
    technical_level: personaTraitScore(),
    communication_style: z.enum(["formal", "casual", "terse", "verbose"]).default("casual"),
    background: z.string().default(""),
    emotional_arc: z
        .enum(["stable", "escalating", "de_escalating", "volatile", "manipulative", "hostile"])
        .optional(),
    cultural_context: z
        .enum(["neutral", "direct", "indirect", "high_context", "low_context", "hierarchical"])
        .optional(),
});
|
|
40
|
+
/**
 * Runtime validator for a single scenario criterion.
 *
 * `type` states whether the described behaviour must or must not occur;
 * `evaluator` may be a string, null, or absent.
 */
const criterionShape = {
    description: z.string(),
    type: z.enum(["must_happen", "must_not_happen"]),
    evaluator: z.string().nullable().optional(),
};
export const CriterionSchema = z.object(criterionShape);
|
|
45
|
+
// Allowed literal values for the enum-typed scenario fields.
const SCENARIO_STARTING_EMOTIONS = ["neutral", "frustrated", "confused", "happy", "urgent"];
const SCENARIO_CONVERSATION_STRATEGIES = [
    "cooperative",
    "topic_switching",
    "contradictory",
    "multi_intent",
    "evasive",
    "repetitive",
    "ambiguous",
];
const SCENARIO_INPUT_FORMATS = [
    "plain_text",
    "with_url",
    "with_attachment",
    "form_data",
    "code_block",
    "mixed_media",
];
/**
 * Runtime validator for a simulation scenario.
 *
 * `name` and `goal` are the only required fields; every other field is
 * optional and, when present, must match the type or enum listed here.
 */
export const ScenarioSchema = z.object({
    name: z.string(),
    goal: z.string(),
    context: z.string().optional(),
    starting_emotion: z.enum(SCENARIO_STARTING_EMOTIONS).optional(),
    criteria: z.array(CriterionSchema).optional(),
    is_edge_case: z.boolean().optional(),
    conversation_strategy: z.enum(SCENARIO_CONVERSATION_STRATEGIES).optional(),
    ground_truth: z.string().optional(),
    input_format: z.enum(SCENARIO_INPUT_FORMATS).optional(),
});
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* evaluatorq integration for agent simulation.
|
|
3
|
+
*
|
|
4
|
+
* Provides high-level functions to run simulations,
|
|
5
|
+
* either standalone or within the evaluatorq framework.
|
|
6
|
+
*/
|
|
7
|
+
import type { ChatMessage, Datapoint, Persona, Scenario, SimulationResult } from "../types.js";
|
|
8
|
+
/**
 * Options accepted by `simulate()`.
 *
 * Supply either `datapoints`, or both `personas` and `scenarios`.
 * Supply either `targetCallback` or `agentKey` as the agent under test.
 */
export interface SimulateParams {
    /** Name of the evaluation run. */
    evaluationName: string;
    /** Orq deployment key; used to build the target when `targetCallback` is absent. */
    agentKey?: string;
    /** Agent under test: receives the conversation so far and returns its reply. */
    targetCallback?: (messages: Array<ChatMessage>) => string | Promise<string>;
    /** Personas to combine with `scenarios` when `datapoints` is not given. */
    personas?: Array<Persona>;
    /** Scenarios to combine with `personas` when `datapoints` is not given. */
    scenarios?: Array<Scenario>;
    /** Pre-built datapoints; when present, `personas`/`scenarios` are not used. */
    datapoints?: Array<Datapoint>;
    /** Maximum number of turns per simulation (implementation default: 10). */
    maxTurns?: number;
    /** Model identifier (implementation default: "azure/gpt-4o-mini"). */
    model?: string;
    /** Evaluator names to apply (implementation default: "goal_achieved", "criteria_met"). */
    evaluators?: Array<string>;
    /** Maximum number of simulations run concurrently (implementation default: 5). */
    parallelism?: number;
}
|
|
20
|
+
/**
 * Runs agent simulations and resolves with the simulation results.
 *
 * Responsibilities:
 * - combines personas and scenarios into datapoints when none are supplied
 * - generates a first message for every persona x scenario combination
 * - runs the simulations in parallel
 * - applies the selected evaluators to each result
 *
 * @param params Simulation options; see `SimulateParams`.
 */
export declare function simulate(params: SimulateParams): Promise<Array<SimulationResult>>;
|
|
30
|
+
/**
 * Options accepted by `generateAndSimulate()`.
 *
 * Supply either `targetCallback` or `agentKey` as the agent under test.
 */
export interface GenerateAndSimulateParams {
    /** Name of the evaluation run; forwarded to `simulate()`. */
    evaluationName: string;
    /** Orq deployment key; used to build the target when `targetCallback` is absent. */
    agentKey?: string;
    /** Description of the agent, passed to the persona and scenario generators. */
    agentDescription: string;
    /** Agent under test: receives the conversation so far and returns its reply. */
    targetCallback?: (messages: Array<ChatMessage>) => string | Promise<string>;
    /** Number of personas to generate (implementation default: 5). */
    numPersonas?: number;
    /** Number of scenarios to generate (implementation default: 5). */
    numScenarios?: number;
    /** Maximum number of turns per simulation (implementation default: 10). */
    maxTurns?: number;
    /** Model identifier (implementation default: "azure/gpt-4o-mini"). */
    model?: string;
    /** Evaluator names forwarded to `simulate()`. */
    evaluators?: Array<string>;
    /** Maximum number of simulations run concurrently (implementation default: 5). */
    parallelism?: number;
}
|
|
42
|
+
/**
 * Generates personas and scenarios, then runs simulations with them.
 *
 * A convenience wrapper that chains generation and `simulate()`.
 *
 * @param params Generation and simulation options; see `GenerateAndSimulateParams`.
 */
export declare function generateAndSimulate(params: GenerateAndSimulateParams): Promise<Array<SimulationResult>>;
|
|
48
|
+
export { getAllEvaluators, getEvaluator, SIMULATION_EVALUATORS, } from "../evaluators/index.js";
|
|
49
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../../src/lib/integrations/simulation/simulation/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAQH,OAAO,KAAK,EACV,WAAW,EACX,SAAS,EACT,OAAO,EACP,QAAQ,EACR,gBAAgB,EACjB,MAAM,aAAa,CAAC;AAOrB,MAAM,WAAW,cAAc;IAC7B,cAAc,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,cAAc,CAAC,EAAE,CAAC,QAAQ,EAAE,WAAW,EAAE,KAAK,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IACvE,QAAQ,CAAC,EAAE,OAAO,EAAE,CAAC;IACrB,SAAS,CAAC,EAAE,QAAQ,EAAE,CAAC;IACvB,UAAU,CAAC,EAAE,SAAS,EAAE,CAAC;IACzB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;;;;;;;GAQG;AACH,wBAAsB,QAAQ,CAC5B,MAAM,EAAE,cAAc,GACrB,OAAO,CAAC,gBAAgB,EAAE,CAAC,CAkH7B;AAMD,MAAM,WAAW,yBAAyB;IACxC,cAAc,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,gBAAgB,EAAE,MAAM,CAAC;IACzB,cAAc,CAAC,EAAE,CAAC,QAAQ,EAAE,WAAW,EAAE,KAAK,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IACvE,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;;;GAIG;AACH,wBAAsB,mBAAmB,CACvC,MAAM,EAAE,yBAAyB,GAChC,OAAO,CAAC,gBAAgB,EAAE,CAAC,CAgF7B;AAGD,OAAO,EACL,gBAAgB,EAChB,YAAY,EACZ,qBAAqB,GACtB,MAAM,wBAAwB,CAAC"}
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* evaluatorq integration for agent simulation.
|
|
3
|
+
*
|
|
4
|
+
* Provides high-level functions to run simulations,
|
|
5
|
+
* either standalone or within the evaluatorq framework.
|
|
6
|
+
*/
|
|
7
|
+
import OpenAI from "openai";
|
|
8
|
+
import { fromOrqDeployment } from "../adapters.js";
|
|
9
|
+
import { getEvaluator } from "../evaluators/index.js";
|
|
10
|
+
import { FirstMessageGenerator } from "../generators/first-message-generator.js";
|
|
11
|
+
import { SimulationRunner } from "../runner/simulation.js";
|
|
12
|
+
import { generateDatapoint } from "../utils/prompt-builders.js";
|
|
13
|
+
/**
 * High-level function to run agent simulations.
 *
 * Handles:
 * - Resolving the evaluators to apply (defaults: "goal_achieved", "criteria_met")
 * - Creating persona x scenario combinations when no datapoints are supplied
 * - Generating first messages for each combination (five pairs at a time)
 * - Running simulations in parallel through a SimulationRunner
 * - Storing evaluator output under `result.metadata.evaluator_scores`
 *
 * `params.evaluationName` is accepted but not read by this function.
 *
 * @param {object} params - Simulation options (see `SimulateParams`).
 * @returns {Promise<Array>} The results returned by the runner, each with
 *   `metadata.evaluator_scores` filled in.
 * @throws {Error} When neither `datapoints` nor both `personas` and
 *   `scenarios` are given, when either array is empty, when ORQ_API_KEY is
 *   unset while datapoints must be generated, when there are no datapoints to
 *   run, or when neither `targetCallback` nor `agentKey` is provided.
 */
export async function simulate(params) {
    const { targetCallback, personas, scenarios, maxTurns = 10, model = "azure/gpt-4o-mini", evaluators: evaluatorNames, parallelism = 5, } = params;
    let { datapoints } = params;
    // Resolve evaluators before any other work is done.
    // NOTE(review): this only rejects unknown names early if getEvaluator()
    // throws for them — confirm against ../evaluators/index.js.
    const resolvedEvaluatorNames = evaluatorNames ?? [
        "goal_achieved",
        "criteria_met",
    ];
    const scorers = resolvedEvaluatorNames.map((name) => ({
        name,
        fn: getEvaluator(name),
    }));
    // Build datapoints from personas x scenarios if not provided
    if (!datapoints) {
        if (!personas || !scenarios) {
            throw new Error("Either provide 'datapoints' or both 'personas' and 'scenarios'");
        }
        if (personas.length === 0 || scenarios.length === 0) {
            throw new Error("'personas' and 'scenarios' arrays must both be non-empty");
        }
        const apiKey = process.env.ORQ_API_KEY;
        if (!apiKey) {
            throw new Error("ORQ_API_KEY environment variable is not set. Set it before calling simulate().");
        }
        // Fail fast: the loop below calls the first-message generator once per
        // persona x scenario pair. Without this guard a missing target would
        // only be reported after all of that generation work had completed.
        if (!targetCallback && !params.agentKey) {
            throw new Error("Either targetCallback or agentKey is required");
        }
        // Create a shared HTTP client so the generator doesn't leak its own pool
        const sharedClient = new OpenAI({
            apiKey,
            baseURL: process.env.ROUTER_BASE_URL ?? "https://api.orq.ai/v2/router",
        });
        // Generate first messages for each combination (with bounded concurrency)
        const firstMsgGen = new FirstMessageGenerator({
            model,
            client: sharedClient,
        });
        const pairs = personas.flatMap((persona) => scenarios.map((scenario) => ({ persona, scenario })));
        const FIRST_MSG_CONCURRENCY = 5;
        const generatedDatapoints = [];
        // Process the pairs in fixed-size batches; each batch is awaited as a
        // whole before the next one starts.
        for (let i = 0; i < pairs.length; i += FIRST_MSG_CONCURRENCY) {
            const batch = pairs.slice(i, i + FIRST_MSG_CONCURRENCY);
            const batchResults = await Promise.all(batch.map(async ({ persona, scenario }) => {
                const firstMessage = await firstMsgGen.generate(persona, scenario);
                return generateDatapoint(persona, scenario, firstMessage);
            }));
            generatedDatapoints.push(...batchResults);
        }
        datapoints = generatedDatapoints;
    }
    if (!datapoints || datapoints.length === 0) {
        throw new Error("No datapoints to simulate — persona or scenario generation may have failed");
    }
    // Wrap agentKey with fromOrqDeployment() if no callback is provided
    let resolvedCallback = targetCallback;
    if (!resolvedCallback && params.agentKey) {
        resolvedCallback = fromOrqDeployment(params.agentKey);
    }
    if (!resolvedCallback) {
        throw new Error("Either targetCallback or agentKey is required");
    }
    // Create simulation runner
    const runner = new SimulationRunner({
        targetCallback: resolvedCallback,
        model,
        maxTurns,
    });
    try {
        // Run simulations
        const results = await runner.runBatch({
            datapoints,
            maxTurns,
            maxConcurrency: parallelism,
        });
        // Apply evaluators to results
        for (const result of results) {
            const scores = {};
            for (const { name, fn } of scorers) {
                scores[name] = fn(result);
            }
            result.metadata.evaluator_scores = scores;
        }
        return results;
    }
    finally {
        // Always release the runner, whether the batch succeeded or threw.
        await runner.close();
    }
}
|
|
107
|
+
/**
 * Convenience wrapper that generates personas and scenarios from an agent
 * description and then hands them to simulate().
 *
 * @param {object} params - Options (see `GenerateAndSimulateParams`).
 * @returns {Promise<Array>} Whatever simulate() resolves with.
 * @throws {Error} When neither `targetCallback` nor `agentKey` is provided,
 *   or when the generators module cannot be imported (the import failure is
 *   attached as `cause`).
 */
export async function generateAndSimulate(params) {
    const {
        evaluationName,
        agentDescription,
        targetCallback,
        numPersonas = 5,
        numScenarios = 5,
        maxTurns = 10,
        model = "azure/gpt-4o-mini",
        evaluators,
        parallelism = 5,
    } = params;
    // Prefer the explicit callback; otherwise wrap the deployment key, if any.
    const target = targetCallback || (params.agentKey ? fromOrqDeployment(params.agentKey) : null);
    if (!target) {
        throw new Error("Either targetCallback or agentKey is required for generateAndSimulate");
    }
    // Load the generators lazily so this module has no static dependency on them.
    const generatorsModule = await import("../generators/index.js").catch((err) => {
        throw new Error("Generators module not available. Install generators or provide pre-built datapoints using simulate() instead.", { cause: err });
    });
    const { PersonaGenerator, ScenarioGenerator } = generatorsModule;
    // Construct both generators first, then start both generations together.
    const personaGen = new PersonaGenerator({ model });
    const scenarioGen = new ScenarioGenerator({ model });
    const personasTask = personaGen.generate({ agentDescription, numPersonas });
    const scenariosTask = scenarioGen.generate({ agentDescription, numScenarios });
    const [personas, scenarios] = await Promise.all([personasTask, scenariosTask]);
    // Delegate the actual run to simulate().
    return simulate({
        evaluationName,
        targetCallback: target,
        personas,
        scenarios,
        maxTurns,
        model,
        evaluators,
        parallelism,
    });
}
|
|
158
|
+
// Re-export evaluator utilities for convenience
|
|
159
|
+
export { getAllEvaluators, getEvaluator, SIMULATION_EVALUATORS, } from "../evaluators/index.js";
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core type definitions for the agent simulation framework.
|
|
3
|
+
*
|
|
4
|
+
* Uses plain interfaces for maximum compatibility with generators/runner/agents.
|
|
5
|
+
* Zod schemas provided for optional runtime validation.
|
|
6
|
+
*/
|
|
7
|
+
/** Allowed values for `Persona.communication_style`. */
export type CommunicationStyle =
    | "formal"
    | "casual"
    | "terse"
    | "verbose";
/** Allowed values for `Scenario.starting_emotion`. */
export type StartingEmotion =
    | "neutral"
    | "frustrated"
    | "confused"
    | "happy"
    | "urgent";
/** Allowed values for `Persona.emotional_arc`. */
export type EmotionalArc =
    | "stable"
    | "escalating"
    | "de_escalating"
    | "volatile"
    | "manipulative"
    | "hostile";
/** Allowed values for `Persona.cultural_context`. */
export type CulturalContext =
    | "neutral"
    | "direct"
    | "indirect"
    | "high_context"
    | "low_context"
    | "hierarchical";
/** Allowed values for `Scenario.conversation_strategy`. */
export type ConversationStrategy =
    | "cooperative"
    | "topic_switching"
    | "contradictory"
    | "multi_intent"
    | "evasive"
    | "repetitive"
    | "ambiguous";
/** Allowed values for `Scenario.input_format`. */
export type InputFormat =
    | "plain_text"
    | "with_url"
    | "with_attachment"
    | "form_data"
    | "code_block"
    | "mixed_media";
|
|
13
|
+
/** Instruction text for each `EmotionalArc` value. */
export declare const EMOTIONAL_ARC_INSTRUCTIONS: Record<EmotionalArc, string>;
/** Instruction text for each `CulturalContext` value. */
export declare const CULTURAL_CONTEXT_INSTRUCTIONS: Record<CulturalContext, string>;
/** Instruction text for each `ConversationStrategy` value. */
export declare const STRATEGY_INSTRUCTIONS: Record<ConversationStrategy, string>;
/** Instruction text for each `InputFormat` value. */
export declare const INPUT_FORMAT_INSTRUCTIONS: Record<InputFormat, string>;
|
|
17
|
+
/** One chat message, in the shape handed to a target callback. */
export interface ChatMessage {
    /** Who authored the message. */
    role: "user" | "assistant" | "system";
    /** Message text. */
    content: string;
}
|
|
21
|
+
/** Token counters reported for a turn or for a whole simulation. */
export interface TokenUsage {
    /** Tokens counted for the prompt. */
    prompt_tokens: number;
    /** Tokens counted for the completion. */
    completion_tokens: number;
    /** Total token count. */
    total_tokens: number;
}
|
|
26
|
+
/**
 * Description of a simulated user.
 *
 * The companion Zod schema (`PersonaSchema`) restricts the four numeric
 * traits to the range 0..1.
 */
export interface Persona {
    /** Persona name. */
    name: string;
    /** Numeric patience trait. */
    patience: number;
    /** Numeric assertiveness trait. */
    assertiveness: number;
    /** Numeric politeness trait. */
    politeness: number;
    /** Numeric technical-level trait. */
    technical_level: number;
    /** How the persona communicates. */
    communication_style: CommunicationStyle;
    /** Free-text background. */
    background: string;
    /** Optional emotional arc. */
    emotional_arc?: EmotionalArc;
    /** Optional cultural context. */
    cultural_context?: CulturalContext;
}
|
|
37
|
+
/** A single success or failure condition attached to a scenario. */
export interface Criterion {
    /** What the criterion is about. */
    description: string;
    /** Whether the described behaviour must occur or must not occur. */
    type: "must_happen" | "must_not_happen";
    /** Optional evaluator identifier; may also be null. */
    evaluator?: string | null;
}
|
|
42
|
+
/** Description of a situation to simulate; only `name` and `goal` are required. */
export interface Scenario {
    /** Scenario name. */
    name: string;
    /** Goal of the scenario. */
    goal: string;
    /** Optional additional context. */
    context?: string;
    /** Optional emotion at the start of the conversation. */
    starting_emotion?: StartingEmotion;
    /** Optional list of criteria for this scenario. */
    criteria?: Array<Criterion>;
    /** Optional flag marking the scenario as an edge case. */
    is_edge_case?: boolean;
    /** Optional conversation strategy. */
    conversation_strategy?: ConversationStrategy;
    /** Optional ground-truth text. */
    ground_truth?: string;
    /** Optional input format. */
    input_format?: InputFormat;
}
|
|
53
|
+
/** Verdict about a conversation; the four quality scores are optional and nullable. */
export interface Judgment {
    /** Whether the conversation should end. */
    should_terminate: boolean;
    /** Explanation for the verdict. */
    reason: string;
    /** Whether the goal was achieved. */
    goal_achieved: boolean;
    /** Rules reported as broken. */
    rules_broken: Array<string>;
    /** Numeric goal-completion score. */
    goal_completion_score: number;
    response_quality?: number | null;
    hallucination_risk?: number | null;
    tone_appropriateness?: number | null;
    factual_accuracy?: number | null;
}
|
|
64
|
+
/** One message of a simulated conversation, as stored in `SimulationResult.messages`. */
export interface Message {
    /** Who authored the message. */
    role: "user" | "assistant" | "system";
    /** Message text. */
    content: string;
}
|
|
68
|
+
/** Metrics recorded for a single turn; the four quality scores are optional and nullable. */
export interface TurnMetrics {
    /** Turn number these metrics belong to. */
    turn_number: number;
    /** Token counters for the turn. */
    token_usage: TokenUsage;
    response_quality?: number | null;
    hallucination_risk?: number | null;
    tone_appropriateness?: number | null;
    factual_accuracy?: number | null;
    /** Reason text from the judge. */
    judge_reason: string;
}
|
|
77
|
+
/** What ended a simulation. */
export type TerminatedBy =
    | "judge"
    | "max_turns"
    | "error"
    | "timeout";
/** Outcome of one simulated conversation. */
export interface SimulationResult {
    /** Messages of the conversation. */
    messages: Array<Message>;
    /** What ended the simulation. */
    terminated_by: TerminatedBy;
    /** Reason text for the outcome. */
    reason: string;
    /** Whether the goal was achieved. */
    goal_achieved: boolean;
    /** Numeric goal-completion score. */
    goal_completion_score: number;
    /** Rules reported as broken. */
    rules_broken: Array<string>;
    /** Number of turns. */
    turn_count: number;
    /** Token counters for the simulation. */
    token_usage: TokenUsage;
    /** Metrics for the individual turns. */
    turn_metrics: Array<TurnMetrics>;
    /** Free-form metadata; `simulate()` writes `evaluator_scores` into it. */
    metadata: Record<string, unknown>;
    /** Convenience fields for evaluatorq integration */
    criteria_results?: Record<string, boolean>;
    total_turns?: number;
}
|
|
93
|
+
/** One simulation input: a persona paired with a scenario, plus the opening text. */
export interface Datapoint {
    /** Identifier of the datapoint. */
    id: string;
    /** Persona used for this datapoint. */
    persona: Persona;
    /** Scenario used for this datapoint. */
    scenario: Scenario;
    /** System prompt for the simulated user. */
    user_system_prompt: string;
    /** First message of the conversation. */
    first_message: string;
}
|
|
100
|
+
export type { DataPoint, Job, Output } from "../../types.js";
|
|
101
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../../../src/lib/integrations/simulation/types.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAMH,MAAM,MAAM,kBAAkB,GAAG,QAAQ,GAAG,QAAQ,GAAG,OAAO,GAAG,SAAS,CAAC;AAE3E,MAAM,MAAM,eAAe,GACvB,SAAS,GACT,YAAY,GACZ,UAAU,GACV,OAAO,GACP,QAAQ,CAAC;AAEb,MAAM,MAAM,YAAY,GACpB,QAAQ,GACR,YAAY,GACZ,eAAe,GACf,UAAU,GACV,cAAc,GACd,SAAS,CAAC;AAEd,MAAM,MAAM,eAAe,GACvB,SAAS,GACT,QAAQ,GACR,UAAU,GACV,cAAc,GACd,aAAa,GACb,cAAc,CAAC;AAEnB,MAAM,MAAM,oBAAoB,GAC5B,aAAa,GACb,iBAAiB,GACjB,eAAe,GACf,cAAc,GACd,SAAS,GACT,YAAY,GACZ,WAAW,CAAC;AAEhB,MAAM,MAAM,WAAW,GACnB,YAAY,GACZ,UAAU,GACV,iBAAiB,GACjB,WAAW,GACX,YAAY,GACZ,aAAa,CAAC;AAMlB,eAAO,MAAM,0BAA0B,EAAE,MAAM,CAAC,YAAY,EAAE,MAAM,CA4BnE,CAAC;AAEF,eAAO,MAAM,6BAA6B,EAAE,MAAM,CAAC,eAAe,EAAE,MAAM,CA0BzE,CAAC;AAEF,eAAO,MAAM,qBAAqB,EAAE,MAAM,CAAC,oBAAoB,EAAE,MAAM,CA2BtE,CAAC;AAEF,eAAO,MAAM,yBAAyB,EAAE,MAAM,CAAC,WAAW,EAAE,MAAM,CAiBjE,CAAC;AAMF,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,MAAM,GAAG,WAAW,GAAG,QAAQ,CAAC;IACtC,OAAO,EAAE,MAAM,CAAC;CACjB;AAMD,MAAM,WAAW,UAAU;IACzB,aAAa,EAAE,MAAM,CAAC;IACtB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,YAAY,EAAE,MAAM,CAAC;CACtB;AAMD,MAAM,WAAW,OAAO;IACtB,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,MAAM,CAAC;IACjB,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;IACnB,eAAe,EAAE,MAAM,CAAC;IACxB,mBAAmB,EAAE,kBAAkB,CAAC;IACxC,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,CAAC,EAAE,YAAY,CAAC;IAC7B,gBAAgB,CAAC,EAAE,eAAe,CAAC;CACpC;AAMD,MAAM,WAAW,SAAS;IACxB,WAAW,EAAE,MAAM,CAAC;IACpB,IAAI,EAAE,aAAa,GAAG,iBAAiB,CAAC;IACxC,SAAS,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3B;AAMD,MAAM,WAAW,QAAQ;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,gBAAgB,CAAC,EAAE,eAAe,CAAC;IACnC,QAAQ,CAAC,EAAE,SAAS,EAAE,CAAC;IACvB,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,qBAAqB,CAAC,EAAE,oBAAoB,CAAC;IAC7C,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,YAAY,CAAC,EAAE,WAAW,CAAC;CAC5B;AAMD,MAAM,WAAW,QAAQ;IACvB,gBAAgB,EAAE,OAAO,CAAC;IAC1B,MAAM,EAAE,MAAM,CAAC;IACf,aAAa,EAAE,OAAO,CAAC;IACvB,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,qBAAqB,EAAE,MAAM,CAAC;IAC9B,gB
AAgB,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,kBAAkB,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IACnC,oBAAoB,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IACrC,gBAAgB,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CAClC;AAMD,MAAM,WAAW,OAAO;IACtB,IAAI,EAAE,MAAM,GAAG,WAAW,GAAG,QAAQ,CAAC;IACtC,OAAO,EAAE,MAAM,CAAC;CACjB;AAMD,MAAM,WAAW,WAAW;IAC1B,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,UAAU,CAAC;IACxB,gBAAgB,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,kBAAkB,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IACnC,oBAAoB,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IACrC,gBAAgB,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,YAAY,EAAE,MAAM,CAAC;CACtB;AAMD,MAAM,MAAM,YAAY,GAAG,OAAO,GAAG,WAAW,GAAG,OAAO,GAAG,SAAS,CAAC;AAEvE,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,EAAE,OAAO,EAAE,CAAC;IACpB,aAAa,EAAE,YAAY,CAAC;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,aAAa,EAAE,OAAO,CAAC;IACvB,qBAAqB,EAAE,MAAM,CAAC;IAC9B,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,EAAE,UAAU,CAAC;IACxB,YAAY,EAAE,WAAW,EAAE,CAAC;IAC5B,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAClC,oDAAoD;IACpD,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC3C,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAMD,MAAM,WAAW,SAAS;IACxB,EAAE,EAAE,MAAM,CAAC;IACX,OAAO,EAAE,OAAO,CAAC;IACjB,QAAQ,EAAE,QAAQ,CAAC;IACnB,kBAAkB,EAAE,MAAM,CAAC;IAC3B,aAAa,EAAE,MAAM,CAAC;CACvB;AAMD,YAAY,EAAE,SAAS,EAAE,GAAG,EAAE,MAAM,EAAE,MAAM,gBAAgB,CAAC"}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core type definitions for the agent simulation framework.
|
|
3
|
+
*
|
|
4
|
+
* Uses plain interfaces for maximum compatibility with generators/runner/agents.
|
|
5
|
+
* Zod schemas provided for optional runtime validation.
|
|
6
|
+
*/
|
|
7
|
+
// ---------------------------------------------------------------------------
|
|
8
|
+
// Constant instruction maps (ported from Python)
|
|
9
|
+
// ---------------------------------------------------------------------------
|
|
10
|
+
/**
 * Instruction text keyed by emotional arc. "stable" maps to an empty string;
 * every other entry is assembled from sentence fragments joined without a
 * separator.
 */
export const EMOTIONAL_ARC_INSTRUCTIONS = {
    stable: "",
    escalating: [
        "Your emotional intensity escalates over the course of the conversation. ",
        "Start relatively calm but become progressively more frustrated, impatient, or upset ",
        "with each turn, especially if the agent is slow to resolve your issue. ",
        "By the end, you should be noticeably more emotional than at the start.",
    ].join(""),
    de_escalating: [
        "You start the conversation very upset, frustrated, or angry. As the agent responds ",
        "and shows willingness to help, you gradually calm down. Your tone softens over time, ",
        "and by the end you may even express gratitude. However, any misstep by the agent ",
        "can temporarily reverse this trend.",
    ].join(""),
    volatile: [
        "Your emotional state is volatile and unpredictable. Swing between being calm, ",
        "frustrated, grateful, and confused within the same conversation. A small positive ",
        "response from the agent might suddenly improve your mood, while a minor setback ",
        "could trigger strong frustration. Your reactions should feel disproportionate.",
    ].join(""),
    manipulative: [
        "You use emotional manipulation tactics to get what you want. Employ guilt-tripping ",
        "('I've been a loyal customer for years and this is how you treat me?'), flattery ",
        "('You seem so much more competent than the last agent'), threats to escalate ",
        "('I'll leave a bad review'), or appeals to sympathy ('This is ruining my day'). ",
        "Switch between these tactics based on what seems to be working.",
    ].join(""),
    hostile: [
        "You are genuinely hostile and confrontational. Use aggressive language, make ",
        "unreasonable demands, and challenge the agent's competence. Express contempt and ",
        "refuse to cooperate with standard procedures. This tests the agent's ability to ",
        "maintain professionalism and de-escalate under pressure.",
    ].join(""),
};
|
|
34
|
+
/**
 * Instruction text keyed by cultural context. "neutral" maps to an empty
 * string; every other entry is assembled from sentence fragments joined
 * without a separator.
 */
export const CULTURAL_CONTEXT_INSTRUCTIONS = {
    neutral: "",
    direct: [
        "You communicate in a very direct, low-context style typical of Northern European ",
        "or North American cultures. Say exactly what you mean without hints or implication. ",
        "Get straight to the point. 'No' means 'no' — you don't soften refusals.",
    ].join(""),
    indirect: [
        "You communicate indirectly, typical of many East Asian or Southeast Asian cultures. ",
        "Avoid saying 'no' directly — instead use phrases like 'that might be difficult' or ",
        "'I'll think about it.' Hint at problems rather than stating them outright. Use hedging ",
        "language ('perhaps', 'maybe', 'it seems'). Preserving harmony is important.",
    ].join(""),
    high_context: [
        "You rely heavily on context and implication rather than explicit statements. ",
        "You expect the agent to read between the lines and understand unstated needs. ",
        "You may reference shared knowledge without explaining it. Use fewer words but ",
        "expect more understanding. Silence can be meaningful.",
    ].join(""),
    low_context: [
        "You spell everything out explicitly and leave nothing to interpretation. ",
        "Provide full context with every message. Repeat important details. ",
        "Don't assume the agent remembers previous context. ",
        "Be thorough and detailed in every message.",
    ].join(""),
    hierarchical: [
        "You approach the interaction with a strong sense of hierarchy, typical of many ",
        "Middle Eastern, East Asian, or South Asian cultures. You may expect formal address, ",
        "defer to authority ('can I speak to a manager?'), and be uncomfortable challenging ",
        "the agent's statements directly. Status and titles matter to you.",
    ].join(""),
};
|
|
56
|
+
/**
 * Second-person instruction text keyed by conversation strategy.
 *
 * `cooperative` maps to the empty string (no extra instruction). Every other
 * entry is a single paragraph assembled from fragments joined by one space.
 */
export const STRATEGY_INSTRUCTIONS = {
    cooperative: "",
    topic_switching: [
        "You frequently switch topics mid-conversation. After a few exchanges about your main goal,",
        "bring up unrelated questions or concerns before returning to your original topic.",
        "This tests the agent's ability to handle context switching.",
    ].join(" "),
    contradictory: [
        "You contradict yourself during the conversation. Say one thing, then later say the opposite",
        "or change your requirements. For example, first ask for a refund, then say you actually want",
        "a replacement. This tests the agent's ability to handle inconsistent user input.",
    ].join(" "),
    multi_intent: [
        "You have multiple goals packed into each message. Combine questions, requests, and complaints",
        "in single messages. For example, ask about your order status while also requesting a password",
        "reset and complaining about a previous experience.",
    ].join(" "),
    evasive: [
        "You are evasive and avoid directly answering the agent's questions. Give vague or incomplete",
        "responses when asked for details. The agent needs to work harder to extract the information",
        "it needs to help you.",
    ].join(" "),
    repetitive: [
        "You repeat your requests and questions even after the agent has addressed them. Ask the same",
        "thing in slightly different ways, as if you didn't understand or weren't satisfied with",
        "the response. This tests the agent's patience and ability to rephrase explanations.",
    ].join(" "),
    ambiguous: [
        "You are deliberately vague and unclear in your requests. Use imprecise language,",
        "avoid giving specific details, and make the agent work to understand what you actually need.",
        "When the agent asks clarifying questions, give partial or still-ambiguous answers.",
        "For example, say 'the thing isn't working' instead of specifying which product or error.",
    ].join(" "),
};
|
|
78
|
+
/**
 * Second-person instruction text keyed by message input format.
 *
 * `plain_text` maps to the empty string (no extra instruction). Every other
 * entry is a single paragraph assembled from fragments joined by one space.
 */
export const INPUT_FORMAT_INSTRUCTIONS = {
    plain_text: "",
    with_url: [
        "Include relevant URLs in your messages. Reference links to products, order pages,",
        "screenshots, or documentation.",
    ].join(" "),
    with_attachment: [
        "Reference file attachments in your messages as if you're uploading them.",
        "Mention screenshots, receipts, photos, or documents.",
    ].join(" "),
    form_data: [
        "Structure your messages like filled-out forms or structured data. Include labeled fields,",
        "order details in a structured format, or table-like information.",
    ].join(" "),
    code_block: [
        "Include code snippets, error logs, stack traces, or technical output in your messages.",
        "Wrap technical content in code blocks.",
    ].join(" "),
    mixed_media: [
        "Mix different input types in your messages. Combine plain text with URLs,",
        "attachment references, structured data, or code blocks.",
    ].join(" "),
};
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Dataset export/import utilities for JSONL format.
|
|
3
|
+
*/
|
|
4
|
+
import type { Datapoint, SimulationResult } from "../types.js";
|
|
5
|
+
/**
 * Write datapoints to a JSONL file for orq.ai datasets.
 *
 * @param datapoints - Datapoints to serialize, one JSON object per line.
 * @param outputPath - Destination file path; its parent directory is created
 *   if it does not exist.
 */
export declare function exportDatapointsToJsonl(
    datapoints: Array<Datapoint>,
    outputPath: string,
): void;
|
|
9
|
+
/**
 * Write simulation results to a JSONL file.
 *
 * @param results - Results to serialize, one JSON document per line.
 * @param outputPath - Destination file path; its parent directory is created
 *   if it does not exist.
 */
export declare function exportResultsToJsonl(
    results: Array<SimulationResult>,
    outputPath: string,
): void;
|
|
13
|
+
/**
 * Read datapoints back from a JSONL file.
 *
 * Two record layouts are accepted: the current one, where `inputs` carries
 * full persona/scenario objects, and a legacy one, where `inputs` carries
 * flat fields.
 *
 * @param inputPath - Path of the JSONL file to read.
 * @returns The datapoints reconstructed from the file.
 */
export declare function loadDatapointsFromJsonl(
    inputPath: string,
): Array<Datapoint>;
|
|
20
|
+
/**
 * Serialize datapoint/result pairs to a JSONL string for dataset export.
 *
 * @param results - Pairs of a datapoint and the simulation result produced for it.
 * @returns One JSON document per pair, separated by newlines.
 */
export declare function resultsToJsonl(
    results: Array<{
        datapoint: Datapoint;
        result: SimulationResult;
    }>,
): string;
|
|
27
|
+
/**
 * Parse a JSONL string into an array of values.
 *
 * @typeParam T - Type each parsed line is assumed to have; it is not checked at runtime.
 * @param content - JSONL text, one JSON document per line.
 * @returns The parsed values in line order.
 */
export declare function parseJsonl<T = Record<string, unknown>>(
    content: string,
): Array<T>;
|
|
31
|
+
//# sourceMappingURL=dataset-export.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dataset-export.d.ts","sourceRoot":"","sources":["../../../../../src/lib/integrations/simulation/utils/dataset-export.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,KAAK,EAEV,SAAS,EAGT,gBAAgB,EACjB,MAAM,aAAa,CAAC;AAMrB;;GAEG;AACH,wBAAgB,uBAAuB,CACrC,UAAU,EAAE,SAAS,EAAE,EACvB,UAAU,EAAE,MAAM,GACjB,IAAI,CAiBN;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAClC,OAAO,EAAE,gBAAgB,EAAE,EAC3B,UAAU,EAAE,MAAM,GACjB,IAAI,CAMN;AAMD;;;;;GAKG;AACH,wBAAgB,uBAAuB,CAAC,SAAS,EAAE,MAAM,GAAG,SAAS,EAAE,CAgEtE;AAMD;;GAEG;AACH,wBAAgB,cAAc,CAC5B,OAAO,EAAE;IAAE,SAAS,EAAE,SAAS,CAAC;IAAC,MAAM,EAAE,gBAAgB,CAAA;CAAE,EAAE,GAC5D,MAAM,CAoBR;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAAE,OAAO,EAAE,MAAM,GAAG,CAAC,EAAE,CAc5E"}
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Dataset export/import utilities for JSONL format.
|
|
3
|
+
*/
|
|
4
|
+
import fs from "node:fs";
|
|
5
|
+
import path from "node:path";
|
|
6
|
+
// ---------------------------------------------------------------------------
|
|
7
|
+
// Export
|
|
8
|
+
// ---------------------------------------------------------------------------
|
|
9
|
+
/**
 * Export datapoints to JSONL format for orq.ai datasets.
 *
 * Each datapoint becomes one line of the form
 * `{ inputs: { category, first_message, user_system_prompt, persona, scenario }, expected_output: null }`,
 * where `category` is `"<persona name> - <scenario name>"`.
 *
 * The parent directory of `outputPath` is created (recursively) when missing.
 * An empty `datapoints` array produces an empty file.
 *
 * @param {Array<object>} datapoints - Datapoints to serialize.
 * @param {string} outputPath - Destination file path.
 * @returns {void}
 */
export function exportDatapointsToJsonl(datapoints, outputPath) {
    fs.mkdirSync(path.dirname(outputPath), { recursive: true });
    const lines = datapoints.map((dp) => JSON.stringify({
        inputs: {
            category: `${dp.persona.name} - ${dp.scenario.name}`,
            first_message: dp.first_message,
            user_system_prompt: dp.user_system_prompt,
            persona: dp.persona,
            scenario: dp.scenario,
        },
        expected_output: null,
    }));
    // With no records, writing "\n" would leave a single blank (non-JSON)
    // line in the file; an empty dataset is an empty file instead.
    const payload = lines.length > 0 ? `${lines.join("\n")}\n` : "";
    fs.writeFileSync(outputPath, payload, "utf-8");
}
|
|
27
|
+
/**
 * Export simulation results to JSONL format.
 *
 * Each result is serialized as-is with `JSON.stringify`, one per line.
 * The parent directory of `outputPath` is created (recursively) when missing.
 * An empty `results` array produces an empty file.
 *
 * @param {Array<object>} results - Simulation results to serialize.
 * @param {string} outputPath - Destination file path.
 * @returns {void}
 */
export function exportResultsToJsonl(results, outputPath) {
    fs.mkdirSync(path.dirname(outputPath), { recursive: true });
    const lines = results.map((r) => JSON.stringify(r));
    // With no records, writing "\n" would leave a single blank (non-JSON)
    // line in the file; an empty result set is an empty file instead.
    const payload = lines.length > 0 ? `${lines.join("\n")}\n` : "";
    fs.writeFileSync(outputPath, payload, "utf-8");
}
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
// Import
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
/**
|
|
40
|
+
* Load datapoints from a JSONL file.
|
|
41
|
+
*
|
|
42
|
+
* Supports both the current format (with full persona/scenario objects) and a
|
|
43
|
+
* legacy format (with flat fields).
|
|
44
|
+
*/
|
|
45
|
+
export function loadDatapointsFromJsonl(inputPath) {
|
|
46
|
+
const content = fs.readFileSync(inputPath, "utf-8");
|
|
47
|
+
const datapoints = [];
|
|
48
|
+
for (const line of content.split("\n")) {
|
|
49
|
+
const trimmed = line.trim();
|
|
50
|
+
if (!trimmed)
|
|
51
|
+
continue;
|
|
52
|
+
let data;
|
|
53
|
+
try {
|
|
54
|
+
data = JSON.parse(trimmed);
|
|
55
|
+
}
|
|
56
|
+
catch {
|
|
57
|
+
console.warn(`loadDatapointsFromJsonl: skipping malformed line: ${trimmed.slice(0, 80)}`);
|
|
58
|
+
continue;
|
|
59
|
+
}
|
|
60
|
+
const inputs = data.inputs ?? {};
|
|
61
|
+
// Reconstruct persona
|
|
62
|
+
let persona;
|
|
63
|
+
if (inputs.persona && typeof inputs.persona === "object") {
|
|
64
|
+
persona = inputs.persona;
|
|
65
|
+
}
|
|
66
|
+
else {
|
|
67
|
+
persona = {
|
|
68
|
+
name: inputs.persona_name ?? "Unknown",
|
|
69
|
+
patience: 0.5,
|
|
70
|
+
assertiveness: 0.5,
|
|
71
|
+
politeness: 0.5,
|
|
72
|
+
technical_level: 0.5,
|
|
73
|
+
communication_style: "casual",
|
|
74
|
+
background: inputs.context ?? "",
|
|
75
|
+
};
|
|
76
|
+
}
|
|
77
|
+
// Reconstruct scenario
|
|
78
|
+
let scenario;
|
|
79
|
+
if (inputs.scenario && typeof inputs.scenario === "object") {
|
|
80
|
+
const raw = inputs.scenario;
|
|
81
|
+
const criteriaRaw = raw.criteria;
|
|
82
|
+
const criteria = Array.isArray(criteriaRaw)
|
|
83
|
+
? criteriaRaw
|
|
84
|
+
: [];
|
|
85
|
+
scenario = { ...raw, criteria };
|
|
86
|
+
}
|
|
87
|
+
else {
|
|
88
|
+
scenario = {
|
|
89
|
+
name: inputs.scenario_name ?? "Unknown",
|
|
90
|
+
goal: inputs.goal ?? "",
|
|
91
|
+
context: inputs.context ?? "",
|
|
92
|
+
};
|
|
93
|
+
}
|
|
94
|
+
datapoints.push({
|
|
95
|
+
id: `dp_${crypto.randomUUID().replace(/-/g, "").slice(0, 12)}`,
|
|
96
|
+
persona,
|
|
97
|
+
scenario,
|
|
98
|
+
user_system_prompt: inputs.user_system_prompt ?? "",
|
|
99
|
+
first_message: inputs.first_message ?? "",
|
|
100
|
+
});
|
|
101
|
+
}
|
|
102
|
+
return datapoints;
|
|
103
|
+
}
|
|
104
|
+
// ---------------------------------------------------------------------------
|
|
105
|
+
// String helpers
|
|
106
|
+
// ---------------------------------------------------------------------------
|
|
107
|
+
/**
 * Serialize datapoint/result pairs to a JSONL string for dataset export.
 *
 * Each pair is flattened into one JSON object holding the datapoint's id,
 * persona name, scenario name and first message, followed by selected fields
 * of the result. Lines are joined with "\n"; no trailing newline is appended.
 *
 * @param {Array<{datapoint: object, result: object}>} results - Pairs to serialize.
 * @returns {string} The JSONL text (empty string for an empty array).
 */
export function resultsToJsonl(results) {
    const rows = [];
    for (const { datapoint, result } of results) {
        const flattened = {
            id: datapoint.id,
            persona: datapoint.persona.name,
            scenario: datapoint.scenario.name,
            first_message: datapoint.first_message,
            goal_achieved: result.goal_achieved,
            goal_completion_score: result.goal_completion_score,
            terminated_by: result.terminated_by,
            turn_count: result.turn_count,
            messages: result.messages,
            rules_broken: result.rules_broken,
            token_usage: result.token_usage,
            turn_metrics: result.turn_metrics,
            metadata: result.metadata,
        };
        rows.push(JSON.stringify(flattened));
    }
    return rows.join("\n");
}
|
|
129
|
+
/**
 * Parse a JSONL string into an array of values.
 *
 * The text is split on "\n" and each line is trimmed; blank lines are
 * ignored. A line that is not valid JSON is reported through `console.warn`
 * (first 80 characters only) and skipped, so parsing never throws on bad input.
 *
 * @param {string} content - JSONL text.
 * @returns {Array<unknown>} Parsed values in line order.
 */
export function parseJsonl(content) {
    const parsed = [];
    const candidates = content
        .split("\n")
        .map((rawLine) => rawLine.trim())
        .filter((entry) => entry.length > 0);
    for (const entry of candidates) {
        try {
            parsed.push(JSON.parse(entry));
        }
        catch {
            console.warn(`parseJsonl: skipping malformed line: ${entry.slice(0, 80)}`);
        }
    }
    return parsed;
}
|