@evalstudio/core 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/connector.d.ts +101 -0
- package/dist/connector.d.ts.map +1 -0
- package/dist/connector.js +477 -0
- package/dist/connector.js.map +1 -0
- package/dist/eval.d.ts +66 -0
- package/dist/eval.d.ts.map +1 -0
- package/dist/eval.js +188 -0
- package/dist/eval.js.map +1 -0
- package/dist/evaluator.d.ts +37 -0
- package/dist/evaluator.d.ts.map +1 -0
- package/dist/evaluator.js +121 -0
- package/dist/evaluator.js.map +1 -0
- package/dist/execution.d.ts +29 -0
- package/dist/execution.d.ts.map +1 -0
- package/dist/execution.js +94 -0
- package/dist/execution.js.map +1 -0
- package/dist/index.d.ts +17 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +16 -0
- package/dist/index.js.map +1 -0
- package/dist/llm-client.d.ts +31 -0
- package/dist/llm-client.d.ts.map +1 -0
- package/dist/llm-client.js +121 -0
- package/dist/llm-client.js.map +1 -0
- package/dist/llm-provider.d.ts +46 -0
- package/dist/llm-provider.d.ts.map +1 -0
- package/dist/llm-provider.js +199 -0
- package/dist/llm-provider.js.map +1 -0
- package/dist/persona-generator.d.ts +34 -0
- package/dist/persona-generator.d.ts.map +1 -0
- package/dist/persona-generator.js +99 -0
- package/dist/persona-generator.js.map +1 -0
- package/dist/persona.d.ts +28 -0
- package/dist/persona.d.ts.map +1 -0
- package/dist/persona.js +100 -0
- package/dist/persona.js.map +1 -0
- package/dist/project.d.ts +43 -0
- package/dist/project.d.ts.map +1 -0
- package/dist/project.js +114 -0
- package/dist/project.js.map +1 -0
- package/dist/prompt.d.ts +31 -0
- package/dist/prompt.d.ts.map +1 -0
- package/dist/prompt.js +73 -0
- package/dist/prompt.js.map +1 -0
- package/dist/run-processor.d.ts +127 -0
- package/dist/run-processor.d.ts.map +1 -0
- package/dist/run-processor.js +495 -0
- package/dist/run-processor.js.map +1 -0
- package/dist/run.d.ts +101 -0
- package/dist/run.d.ts.map +1 -0
- package/dist/run.js +279 -0
- package/dist/run.js.map +1 -0
- package/dist/scenario.d.ts +66 -0
- package/dist/scenario.d.ts.map +1 -0
- package/dist/scenario.js +110 -0
- package/dist/scenario.js.map +1 -0
- package/dist/status.d.ts +10 -0
- package/dist/status.d.ts.map +1 -0
- package/dist/status.js +15 -0
- package/dist/status.js.map +1 -0
- package/dist/storage.d.ts +11 -0
- package/dist/storage.d.ts.map +1 -0
- package/dist/storage.js +57 -0
- package/dist/storage.js.map +1 -0
- package/dist/types.d.ts +46 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +26 -0
- package/dist/types.js.map +1 -0
- package/package.json +51 -0
|
@@ -0,0 +1,495 @@
|
|
|
1
|
+
import { getEval } from "./eval.js";
|
|
2
|
+
import { invokeConnector, } from "./connector.js";
|
|
3
|
+
import { getLLMProvider } from "./llm-provider.js";
|
|
4
|
+
import { getPersona } from "./persona.js";
|
|
5
|
+
import { getProject } from "./project.js";
|
|
6
|
+
import { getScenario } from "./scenario.js";
|
|
7
|
+
import { getRun, listRuns, updateRun, } from "./run.js";
|
|
8
|
+
import { buildTestAgentSystemPrompt } from "./prompt.js";
|
|
9
|
+
import { evaluateCriteria } from "./evaluator.js";
|
|
10
|
+
import { generatePersonaMessage } from "./persona-generator.js";
|
|
11
|
+
/**
 * Background processor for executing queued evaluation runs.
 *
 * The RunProcessor polls for runs with status "queued" and executes them
 * via the configured connector. It supports concurrent execution and provides
 * callbacks for monitoring status changes.
 *
 * Works from both CLI and API contexts - the same processor logic can be used
 * with different status update mechanisms (terminal output vs WebSocket).
 *
 * @example
 * ```typescript
 * const processor = new RunProcessor({
 *   pollIntervalMs: 5000,
 *   maxConcurrent: 3,
 *   onStatusChange: (runId, status, run) => {
 *     console.log(`Run ${runId} is now ${status}`);
 *   },
 * });
 *
 * processor.start();
 *
 * // Later: graceful shutdown
 * await processor.stop();
 * ```
 */
export class RunProcessor {
    /** True between start() and stop(). */
    running = false;
    /** Handle returned by setInterval for the polling loop, or null when not polling. */
    intervalId = null;
    /** In-flight runs keyed by run ID; each value is the promise of that run's execution. */
    activeRuns = new Map();
    /** Options with defaults applied (see constructor). */
    options;
    /**
     * @param {object} [options]
     * @param {number} [options.pollIntervalMs=5000] Delay between polling ticks.
     * @param {number} [options.maxConcurrent=3] Maximum number of runs executed at the same time.
     * @param {string} [options.projectId] When set, passed to listRuns as a project filter.
     * @param {Function} [options.onStatusChange] Called as (runId, status, run) on "running", "completed" and "error".
     * @param {Function} [options.onRunStart] Called as (run) when execution of a claimed run begins.
     * @param {Function} [options.onRunComplete] Called as (run, lastConnectorResult) when a run completes.
     * @param {Function} [options.onRunError] Called as (run, error) when a run ends in error.
     */
    constructor(options = {}) {
        this.options = {
            pollIntervalMs: options.pollIntervalMs ?? 5000,
            maxConcurrent: options.maxConcurrent ?? 3,
            projectId: options.projectId,
            onStatusChange: options.onStatusChange,
            onRunStart: options.onRunStart,
            onRunComplete: options.onRunComplete,
            onRunError: options.onRunError,
        };
    }
    /**
     * Starts the processor loop.
     * Call this on server/CLI startup. Calling it while already running is a no-op.
     */
    start() {
        if (this.running) {
            return;
        }
        // Reset any "running" runs to "queued" (recovery from crash).
        // Done before flipping `running` so that a failure here leaves the
        // processor in a state from which start() can be retried.
        this.recoverStuckRuns();
        this.running = true;
        // Start polling loop
        this.intervalId = setInterval(() => this.runBackgroundTick(), this.options.pollIntervalMs);
        // Immediate first tick
        this.runBackgroundTick();
    }
    /**
     * Stops the processor gracefully.
     * Waits for every active run to settle, even if some of them reject.
     */
    async stop() {
        this.running = false;
        if (this.intervalId) {
            clearInterval(this.intervalId);
            this.intervalId = null;
        }
        // allSettled (not all): a single rejected run must not cut the wait short.
        await Promise.allSettled(this.activeRuns.values());
    }
    /**
     * Process a single tick (useful for testing or one-shot processing).
     * Returns the number of runs started and waits for them to complete.
     */
    async processOnce() {
        return this.tick(true);
    }
    /**
     * Returns true if the processor is currently running.
     */
    isRunning() {
        return this.running;
    }
    /**
     * Returns the number of currently active runs.
     */
    getActiveRunCount() {
        return this.activeRuns.size;
    }
    /**
     * Runs one polling tick in the background.
     * Nothing awaits the tick here, so a rejection is caught and logged
     * instead of surfacing as an unhandled promise rejection.
     */
    runBackgroundTick() {
        this.tick().catch((error) => {
            console.error("RunProcessor: polling tick failed", error);
        });
    }
    /**
     * Main processing tick - picks up queued runs and executes them.
     * Returns the number of runs started.
     * @param oneShot If true, waits for runs to complete before returning
     *   (and rethrows the first rejection, after all started runs have settled).
     */
    async tick(oneShot = false) {
        // A stopped processor must not claim new work (one-shot mode is exempt).
        if (!oneShot && !this.running) {
            return 0;
        }
        // Calculate available slots
        const availableSlots = this.options.maxConcurrent - this.activeRuns.size;
        if (availableSlots <= 0) {
            return 0;
        }
        // Get queued runs (filtered by project if specified)
        const queuedRuns = listRuns({
            status: "queued",
            projectId: this.options.projectId,
            limit: availableSlots,
        });
        let started = 0;
        const pending = [];
        for (const run of queuedRuns) {
            if (this.activeRuns.has(run.id)) {
                continue;
            }
            if (!this.claimRun(run.id)) {
                continue; // Already claimed by another processor
            }
            // Track the promise that already includes the cleanup step, so every
            // consumer (stop(), one-shot wait) observes the same settled state.
            const execution = this.executeRun(run).finally(() => this.activeRuns.delete(run.id));
            this.activeRuns.set(run.id, execution);
            started++;
            if (oneShot) {
                pending.push(execution);
            }
            else {
                // Background mode: nobody awaits this promise, so report failures here.
                execution.catch((error) => {
                    console.error(`RunProcessor: run ${run.id} failed unexpectedly`, error);
                });
            }
        }
        // In one-shot mode, wait for all runs to complete
        if (pending.length > 0) {
            const outcomes = await Promise.allSettled(pending);
            const failure = outcomes.find((outcome) => outcome.status === "rejected");
            if (failure) {
                throw failure.reason;
            }
        }
        return started;
    }
    /**
     * Claims a run for processing by moving it from "queued" to "running"
     * and stamping startedAt.
     * Returns true if successful, false if the run is missing or not queued.
     */
    claimRun(runId) {
        const run = getRun(runId);
        if (!run || run.status !== "queued") {
            return false; // Already claimed or doesn't exist
        }
        const updated = updateRun(runId, {
            status: "running",
            startedAt: new Date().toISOString(),
        });
        return updated !== undefined;
    }
    /**
     * Executes a single run with the evaluation loop.
     *
     * The loop:
     * 1. Sends conversation to tested agent
     * 2. Evaluates agent response against success/failure criteria
     * 3. If success criteria met or failure criteria met, finish the run
     * 4. If max messages reached, finish the run
     * 5. Otherwise, generate a new persona message and continue
     *
     * Any error thrown during setup or the loop is recorded on the run
     * (status "error") and reported through onStatusChange / onRunError.
     */
    async executeRun(run) {
        // Re-fetch run to get updated state after claim
        let currentRun = getRun(run.id);
        if (!currentRun) {
            return;
        }
        try {
            // Notify start
            this.options.onStatusChange?.(currentRun.id, "running", currentRun);
            this.options.onRunStart?.(currentRun);
            // Determine connector ID (and eval input messages for eval-based runs)
            let connectorId;
            let evalInput;
            if (currentRun.evalId) {
                // Eval-based run
                const evalItem = getEval(currentRun.evalId);
                if (!evalItem) {
                    throw new Error(`Eval not found: ${currentRun.evalId}`);
                }
                if (!evalItem.connectorId) {
                    throw new Error("Eval has no connector assigned");
                }
                connectorId = evalItem.connectorId;
                evalInput = evalItem.input;
            }
            else {
                // Playground run - connector stored on run
                if (!currentRun.connectorId) {
                    throw new Error("Playground run has no connector assigned");
                }
                connectorId = currentRun.connectorId;
            }
            // Get project for LLM settings
            const project = getProject(currentRun.projectId);
            if (!project) {
                throw new Error(`Project not found: ${currentRun.projectId}`);
            }
            // Resolve LLM configuration from project settings
            const projectSettings = project.llmSettings;
            const evaluationProviderId = projectSettings?.evaluation?.providerId;
            const evaluationModel = projectSettings?.evaluation?.model;
            if (!evaluationProviderId) {
                throw new Error("LLM Provider for evaluation is required. Configure in project Settings > LLM Defaults.");
            }
            // Validate evaluation provider exists
            if (!getLLMProvider(evaluationProviderId)) {
                throw new Error(`LLM Provider "${evaluationProviderId}" not found`);
            }
            // Persona: use project persona settings, fallback to evaluation settings
            const personaProviderId = projectSettings?.persona?.providerId || evaluationProviderId;
            const personaModel = projectSettings?.persona?.model || evaluationModel;
            // Validate persona provider exists
            if (!getLLMProvider(personaProviderId)) {
                throw new Error(`LLM Provider "${personaProviderId}" not found`);
            }
            const llmConfig = {
                evaluationProviderId,
                evaluationModel,
                personaProviderId,
                personaModel,
            };
            // Get scenario and persona from stored IDs
            const scenario = getScenario(currentRun.scenarioId);
            if (!scenario) {
                throw new Error(`Scenario not found: ${currentRun.scenarioId}`);
            }
            const persona = currentRun.personaId
                ? getPersona(currentRun.personaId)
                : undefined;
            // Build all messages including system prompt
            const allMessages = this.buildAllMessages(scenario, persona, evalInput);
            // Store all initial messages in the run (so they're visible in UI)
            const runWithMessages = updateRun(currentRun.id, {
                messages: allMessages,
            });
            if (runWithMessages) {
                currentRun = runWithMessages;
            }
            // Determine max messages (default to 10 if not specified)
            const maxMessages = scenario.maxMessages ?? 10;
            // Check if we have criteria to evaluate against
            if (!scenario.successCriteria && !scenario.failureCriteria) {
                throw new Error("Scenario must have success or failure criteria defined");
            }
            // Run the evaluation loop
            await this.executeEvaluationLoop(currentRun, connectorId, llmConfig, scenario, persona, maxMessages);
        }
        catch (error) {
            const err = error instanceof Error ? error : new Error("Unknown error");
            const updatedRun = updateRun(currentRun.id, {
                status: "error",
                error: err.message,
                completedAt: new Date().toISOString(),
            });
            if (updatedRun) {
                this.options.onStatusChange?.(currentRun.id, "error", updatedRun);
                this.options.onRunError?.(updatedRun, err);
            }
        }
    }
    /**
     * Gets the thread ID for LangGraph.
     * Uses the stored threadId if available (set on retry), otherwise uses run.id.
     */
    getThreadId(run) {
        return run.threadId || run.id;
    }
    /**
     * Gets the role of the last non-system message, or undefined when there is none.
     * Used to determine if we need to generate a persona message before invoking the connector.
     */
    getLastMessageRole(messages) {
        const nonSystemMessages = messages.filter((m) => m.role !== "system");
        if (nonSystemMessages.length === 0) {
            return undefined;
        }
        return nonSystemMessages[nonSystemMessages.length - 1].role;
    }
    /**
     * Asks the persona generator for the next user turn and wraps it as a
     * `{ role: "user", content }` message.
     */
    async generateUserMessage(messages, persona, scenario, llmConfig) {
        const personaResponse = await generatePersonaMessage({
            messages,
            persona,
            scenario,
            llmProviderId: llmConfig.personaProviderId,
            model: llmConfig.personaModel,
        });
        return {
            role: "user",
            content: personaResponse.content,
        };
    }
    /**
     * Copies the fields of an evaluation that are stored in the run output.
     */
    summarizeEvaluation(evaluation) {
        return {
            successMet: evaluation.successMet,
            failureMet: evaluation.failureMet,
            confidence: evaluation.confidence,
            reasoning: evaluation.reasoning,
        };
    }
    /**
     * Marks a run as completed and notifies listeners.
     * onStatusChange fires whenever the update succeeds; onRunComplete fires
     * only when a connector result exists (i.e. the connector was invoked).
     * @param runId ID of the run to complete
     * @param fields Run fields to store alongside the status (messages, result, output)
     * @param lastResult Result of the last connector invocation, if any
     */
    finishRun(runId, fields, lastResult) {
        const updatedRun = updateRun(runId, {
            status: "completed",
            ...fields,
            completedAt: new Date().toISOString(),
        });
        if (!updatedRun) {
            return;
        }
        this.options.onStatusChange?.(runId, "completed", updatedRun);
        if (lastResult) {
            this.options.onRunComplete?.(updatedRun, lastResult);
        }
    }
    /**
     * Executes the main evaluation loop.
     * Both success and failure criteria are evaluated at every turn.
     * The loop stops when:
     * - Success criteria is met (run succeeds)
     * - Failure criteria is met AND failureCriteriaMode is "every_turn" (run fails early)
     * - Max messages reached without success (run fails; default mode "on_max_messages")
     *
     * Throws when the connector reports failure or returns no messages; the
     * caller (executeRun) turns that into an "error" run.
     */
    async executeEvaluationLoop(currentRun, connectorId, llmConfig, scenario, persona, maxMessages) {
        let messages = [...currentRun.messages];
        let totalLatencyMs = 0;
        let connectorCallCount = 0;
        let lastResult;
        let finalEvaluation;
        // Track messages already in the LangGraph thread (for continuation calls)
        let threadMessageCount = 0;
        // Count only user and assistant messages for the limit (exclude system)
        const countConversationMessages = () => messages.filter((m) => m.role === "user" || m.role === "assistant").length;
        const averageLatencyMs = () => connectorCallCount > 0 ? Math.round(totalLatencyMs / connectorCallCount) : 0;
        // Failure criteria check mode (default: "on_max_messages"); constant for the whole run
        const failureMode = scenario.failureCriteriaMode ?? "on_max_messages";
        // Generate an initial user message if there are no conversation messages
        // or the last seed message is from the assistant
        const lastRole = this.getLastMessageRole(messages);
        if (!lastRole || lastRole === "assistant") {
            const userMessage = await this.generateUserMessage(messages, persona, scenario, llmConfig);
            messages = [...messages, userMessage];
            // Update run with the generated persona message
            updateRun(currentRun.id, { messages });
        }
        while (countConversationMessages() < maxMessages) {
            // Build conversation messages (without system prompt) for connector
            const conversationMessages = messages.filter((m) => m.role !== "system");
            // Invoke the connector (send to tested agent, pass thread ID for LangGraph)
            // threadMessageCount tells LangGraph strategy how many messages are already in the thread
            const result = await invokeConnector(connectorId, {
                messages: conversationMessages,
                runId: this.getThreadId(currentRun),
                threadMessageCount,
            });
            lastResult = result;
            if (!result.success || !result.messages || result.messages.length === 0) {
                throw new Error(result.error || "No response messages from connector");
            }
            // Add all agent response messages (may include tool calls, tool results, and final response)
            messages = [...messages, ...result.messages];
            // Update thread message count for next iteration (thread now has all conversation messages)
            threadMessageCount = messages.filter((m) => m.role !== "system").length;
            totalLatencyMs += result.latencyMs;
            connectorCallCount++;
            // Update run with current messages
            updateRun(currentRun.id, { messages });
            // Evaluate the conversation against criteria
            const evaluation = await evaluateCriteria({
                messages,
                successCriteria: scenario.successCriteria,
                failureCriteria: scenario.failureCriteria,
                llmProviderId: llmConfig.evaluationProviderId,
                model: llmConfig.evaluationModel,
            });
            finalEvaluation = evaluation;
            // Stop the loop on success, or on failure when failureCriteriaMode is "every_turn"
            if (evaluation.successMet || (evaluation.failureMet && failureMode === "every_turn")) {
                this.finishRun(currentRun.id, {
                    messages,
                    result: {
                        success: evaluation.successMet,
                        score: evaluation.confidence,
                        reason: evaluation.successMet
                            ? evaluation.reasoning
                            : `Failure criteria was triggered. ${evaluation.reasoning}`,
                    },
                    output: {
                        avgLatencyMs: averageLatencyMs(),
                        totalLatencyMs,
                        messageCount: countConversationMessages(),
                        evaluation: this.summarizeEvaluation(evaluation),
                    },
                }, lastResult);
                return;
            }
            // Check if we've reached max messages
            if (countConversationMessages() >= maxMessages) {
                break;
            }
            // Generate a new user message to continue the conversation
            // If persona is provided, the message will be personalized; otherwise generic
            const userMessage = await this.generateUserMessage(messages, persona, scenario, llmConfig);
            messages = [...messages, userMessage];
            // Update run with new user message
            updateRun(currentRun.id, { messages });
        }
        // Max messages reached without meeting success criteria.
        // finalEvaluation is undefined when the seed conversation already filled
        // the message budget and the connector was never invoked.
        const failureTriggered = finalEvaluation?.failureMet ?? false;
        this.finishRun(currentRun.id, {
            messages,
            result: {
                success: false,
                score: finalEvaluation?.confidence ?? 0,
                reason: failureTriggered
                    ? `Failure criteria was triggered. ${finalEvaluation?.reasoning || ""}`
                    : `Max messages (${maxMessages}) reached without meeting success criteria. ${finalEvaluation?.reasoning || ""}`,
            },
            output: {
                avgLatencyMs: averageLatencyMs(),
                totalLatencyMs,
                messageCount: countConversationMessages(),
                maxMessagesReached: true,
                evaluation: finalEvaluation
                    ? this.summarizeEvaluation(finalEvaluation)
                    : undefined,
            },
        }, lastResult);
    }
    /**
     * Builds all messages including system prompt for a run.
     * These messages are stored in the run for visibility in the UI.
     * Order: system prompt (if non-blank), scenario seed messages, eval input messages.
     */
    buildAllMessages(scenario, persona, evalInput) {
        const messages = [];
        // Add system prompt from persona/scenario
        const systemPrompt = buildTestAgentSystemPrompt({
            persona: persona
                ? {
                    name: persona.name,
                    description: persona.description,
                    systemPrompt: persona.systemPrompt,
                }
                : undefined,
            scenario: {
                name: scenario.name,
                instructions: scenario.instructions,
                messages: scenario.messages,
            },
        });
        if (systemPrompt.trim()) {
            messages.push({ role: "system", content: systemPrompt });
        }
        // Add scenario seed messages if present
        if (scenario.messages) {
            messages.push(...scenario.messages);
        }
        // Add eval input messages (only for eval-based runs)
        if (Array.isArray(evalInput)) {
            messages.push(...evalInput);
        }
        return messages;
    }
    /**
     * Recovers runs that were interrupted by server crash by moving them
     * from "running" back to "queued".
     * Only recovers runs for this processor's project filter, and leaves alone
     * runs this instance is still executing (e.g. start() called again before
     * a previous stop() finished draining).
     */
    recoverStuckRuns() {
        const stuckRuns = listRuns({
            status: "running",
            projectId: this.options.projectId,
        });
        for (const run of stuckRuns) {
            if (this.activeRuns.has(run.id)) {
                continue;
            }
            updateRun(run.id, { status: "queued" });
        }
    }
}
|
|
495
|
+
//# sourceMappingURL=run-processor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"run-processor.js","sourceRoot":"","sources":["../src/run-processor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EACL,eAAe,GAEhB,MAAM,gBAAgB,CAAC;AACxB,OAAO,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AACnD,OAAO,EAAE,UAAU,EAAgB,MAAM,cAAc,CAAC;AACxD,OAAO,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC;AAC1C,OAAO,EAAE,WAAW,EAAiB,MAAM,eAAe,CAAC;AAC3D,OAAO,EACL,MAAM,EACN,QAAQ,EACR,SAAS,GAIV,MAAM,UAAU,CAAC;AAElB,OAAO,EAAE,0BAA0B,EAAE,MAAM,aAAa,CAAC;AACzD,OAAO,EAAE,gBAAgB,EAAiC,MAAM,gBAAgB,CAAC;AACjF,OAAO,EAAE,sBAAsB,EAAE,MAAM,wBAAwB,CAAC;AAyChE;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,MAAM,OAAO,YAAY;IACf,OAAO,GAAG,KAAK,CAAC;IAChB,UAAU,GAA0B,IAAI,CAAC;IACzC,UAAU,GAAG,IAAI,GAAG,EAAyB,CAAC;IAC9C,OAAO,CAAkB;IAEjC,YAAY,UAA+B,EAAE;QAC3C,IAAI,CAAC,OAAO,GAAG;YACb,cAAc,EAAE,OAAO,CAAC,cAAc,IAAI,IAAI;YAC9C,aAAa,EAAE,OAAO,CAAC,aAAa,IAAI,CAAC;YACzC,SAAS,EAAE,OAAO,CAAC,SAAS;YAC5B,cAAc,EAAE,OAAO,CAAC,cAAc;YACtC,UAAU,EAAE,OAAO,CAAC,UAAU;YAC9B,aAAa,EAAE,OAAO,CAAC,aAAa;YACpC,UAAU,EAAE,OAAO,CAAC,UAAU;SAC/B,CAAC;IACJ,CAAC;IAED;;;OAGG;IACH,KAAK;QACH,IAAI,IAAI,CAAC,OAAO;YAAE,OAAO;QAEzB,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QAEpB,6DAA6D;QAC7D,IAAI,CAAC,gBAAgB,EAAE,CAAC;QAExB,qBAAqB;QACrB,IAAI,CAAC,UAAU,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,IAAI,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC;QAE9E,uBAAuB;QACvB,IAAI,CAAC,IAAI,EAAE,CAAC;IACd,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,IAAI;QACR,IAAI,CAAC,OAAO,GAAG,KAAK,CAAC;QAErB,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACpB,aAAa,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YAC/B,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC;QACzB,CAAC;QAED,mCAAmC;QACnC,MAAM,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,EAAE,CAAC,CAAC;IAC9C,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,WAAW;QACf,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACzB,CAAC;IAED;;OAEG;IACH,SAAS;QACP,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;IAED;;OAEG;IACH,iBAAiB;QACf,OAAO,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC;IAC9B,CAAC;IAED;;;;OAIG;IACK,KAAK,CAAC,IAAI,CAAC,OAAO,GAAG,KAAK;QAChC,gEAAgE;QAChE,IAAI,CAAC,OAAO,IAAI,CAAC,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,K
AAK,CAAC;YAAE,OAAO,CAAC,CAAC;QAEtE,4BAA4B;QAC5B,MAAM,cAAc,GAAG,IAAI,CAAC,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC;QACzE,IAAI,cAAc,IAAI,CAAC;YAAE,OAAO,CAAC,CAAC;QAElC,qDAAqD;QACrD,MAAM,UAAU,GAAG,QAAQ,CAAC;YAC1B,MAAM,EAAE,QAAQ;YAChB,SAAS,EAAE,IAAI,CAAC,OAAO,CAAC,SAAS;YACjC,KAAK,EAAE,cAAc;SACtB,CAAC,CAAC;QAEH,IAAI,OAAO,GAAG,CAAC,CAAC;QAChB,MAAM,QAAQ,GAAoB,EAAE,CAAC;QAErC,KAAK,MAAM,GAAG,IAAI,UAAU,EAAE,CAAC;YAC7B,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;gBAAE,SAAS;YAE1C,sCAAsC;YACtC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC;gBAC3B,SAAS,CAAC,uCAAuC;YACnD,CAAC;YAED,kBAAkB;YAClB,MAAM,OAAO,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC;YACrC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,EAAE,OAAO,CAAC,CAAC;YACrC,OAAO,EAAE,CAAC;YAEV,IAAI,OAAO,EAAE,CAAC;gBACZ,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACzB,CAAC;YAED,qBAAqB;YACrB,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;QACxD,CAAC;QAED,kDAAkD;QAClD,IAAI,OAAO,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACnC,MAAM,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QAC9B,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAED;;;OAGG;IACK,QAAQ,CAAC,KAAa;QAC5B,MAAM,GAAG,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;QAC1B,IAAI,CAAC,GAAG,IAAI,GAAG,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;YACpC,OAAO,KAAK,CAAC,CAAC,mCAAmC;QACnD,CAAC;QAED,2BAA2B;QAC3B,MAAM,OAAO,GAAG,SAAS,CAAC,KAAK,EAAE;YAC/B,MAAM,EAAE,SAAS;YACjB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC,CAAC,CAAC;QAEH,OAAO,OAAO,KAAK,SAAS,CAAC;IAC/B,CAAC;IAED;;;;;;;;;OASG;IACK,KAAK,CAAC,UAAU,CAAC,GAAQ;QAC/B,gDAAgD;QAChD,IAAI,UAAU,GAAG,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QAChC,IAAI,CAAC,UAAU,EAAE,CAAC;YAChB,OAAO;QACT,CAAC;QAED,IAAI,CAAC;YACH,eAAe;YACf,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,CAAC,UAAU,CAAC,EAAE,EAAE,SAAS,EAAE,UAAU,CAAC,CAAC;YACpE,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC,UAAU,CAAC,CAAC;YAEtC,yBAAyB;YACzB,IAAI,WAAmB,CAAC;YAExB,IAAI,UAAU,CAAC,MAAM,EAAE,CAAC;gBACtB,iBAAiB;gBACjB,MAAM,QAAQ,GAAG,OAAO,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;gBAC5C,IAAI,CAAC,QAAQ,EAAE,CAAC;oBACd,MAAM,IA
AI,KAAK,CAAC,mBAAmB,UAAU,CAAC,MAAM,EAAE,CAAC,CAAC;gBAC1D,CAAC;gBACD,IAAI,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC;oBAC1B,MAAM,IAAI,KAAK,CAAC,gCAAgC,CAAC,CAAC;gBACpD,CAAC;gBACD,WAAW,GAAG,QAAQ,CAAC,WAAW,CAAC;YACrC,CAAC;iBAAM,CAAC;gBACN,2CAA2C;gBAC3C,IAAI,CAAC,UAAU,CAAC,WAAW,EAAE,CAAC;oBAC5B,MAAM,IAAI,KAAK,CAAC,0CAA0C,CAAC,CAAC;gBAC9D,CAAC;gBACD,WAAW,GAAG,UAAU,CAAC,WAAW,CAAC;YACvC,CAAC;YAED,+BAA+B;YAC/B,MAAM,OAAO,GAAG,UAAU,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC;YACjD,IAAI,CAAC,OAAO,EAAE,CAAC;gBACb,MAAM,IAAI,KAAK,CAAC,sBAAsB,UAAU,CAAC,SAAS,EAAE,CAAC,CAAC;YAChE,CAAC;YAED,kDAAkD;YAClD,MAAM,eAAe,GAAG,OAAO,CAAC,WAAW,CAAC;YAC5C,MAAM,oBAAoB,GAAG,eAAe,EAAE,UAAU,EAAE,UAAU,CAAC;YACrE,MAAM,eAAe,GAAG,eAAe,EAAE,UAAU,EAAE,KAAK,CAAC;YAE3D,IAAI,CAAC,oBAAoB,EAAE,CAAC;gBAC1B,MAAM,IAAI,KAAK,CACb,wFAAwF,CACzF,CAAC;YACJ,CAAC;YAED,sCAAsC;YACtC,MAAM,kBAAkB,GAAG,cAAc,CAAC,oBAAoB,CAAC,CAAC;YAChE,IAAI,CAAC,kBAAkB,EAAE,CAAC;gBACxB,MAAM,IAAI,KAAK,CAAC,iBAAiB,oBAAoB,aAAa,CAAC,CAAC;YACtE,CAAC;YAED,yEAAyE;YACzE,MAAM,iBAAiB,GACrB,eAAe,EAAE,OAAO,EAAE,UAAU,IAAI,oBAAoB,CAAC;YAC/D,MAAM,YAAY,GAChB,eAAe,EAAE,OAAO,EAAE,KAAK,IAAI,eAAe,CAAC;YAErD,mCAAmC;YACnC,MAAM,eAAe,GAAG,cAAc,CAAC,iBAAiB,CAAC,CAAC;YAC1D,IAAI,CAAC,eAAe,EAAE,CAAC;gBACrB,MAAM,IAAI,KAAK,CAAC,iBAAiB,iBAAiB,aAAa,CAAC,CAAC;YACnE,CAAC;YAED,MAAM,SAAS,GAAsB;gBACnC,oBAAoB;gBACpB,eAAe;gBACf,iBAAiB;gBACjB,YAAY;aACb,CAAC;YAEF,2CAA2C;YAC3C,MAAM,QAAQ,GAAG,WAAW,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC;YACpD,IAAI,CAAC,QAAQ,EAAE,CAAC;gBACd,MAAM,IAAI,KAAK,CAAC,uBAAuB,UAAU,CAAC,UAAU,EAAE,CAAC,CAAC;YAClE,CAAC;YAED,MAAM,OAAO,GAAG,UAAU,CAAC,SAAS;gBAClC,CAAC,CAAC,UAAU,CAAC,UAAU,CAAC,SAAS,CAAC;gBAClC,CAAC,CAAC,SAAS,CAAC;YAEd,uDAAuD;YACvD,MAAM,SAAS,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,KAAK,CAAC,CAAC,CAAC,SAAS,CAAC;YAEpF,6CAA6C;YAC7C,MAAM,WAAW,GAAG,IAAI,CAAC,gBAAgB,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;YAExE,mEAAmE;YACnE,MAAM,eAAe,GAAG,SAAS,CAAC,UAAU,CAAC,EAAE,EAAE;gBAC/C,QAAQ,EAAE,WAAW;aACtB,CAAC,CAAC;YACH,IAAI,eAAe,EAAE,CAAC;gBACpB,UAAU,GAAG,eAAe,CAAC;YAC/B,CAA
C;YAED,0DAA0D;YAC1D,MAAM,WAAW,GAAG,QAAQ,CAAC,WAAW,IAAI,EAAE,CAAC;YAE/C,gDAAgD;YAChD,IAAI,CAAC,QAAQ,CAAC,eAAe,IAAI,CAAC,QAAQ,CAAC,eAAe,EAAE,CAAC;gBAC3D,MAAM,IAAI,KAAK,CAAC,wDAAwD,CAAC,CAAC;YAC5E,CAAC;YAED,0BAA0B;YAC1B,MAAM,IAAI,CAAC,qBAAqB,CAC9B,UAAU,EACV,WAAW,EACX,SAAS,EACT,QAAQ,EACR,OAAO,EACP,WAAW,CACZ,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,GAAG,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,eAAe,CAAC,CAAC;YACxE,MAAM,UAAU,GAAG,SAAS,CAAC,UAAU,CAAC,EAAE,EAAE;gBAC1C,MAAM,EAAE,OAAO;gBACf,KAAK,EAAE,GAAG,CAAC,OAAO;gBAClB,WAAW,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;aACtC,CAAC,CAAC;YAEH,IAAI,UAAU,EAAE,CAAC;gBACf,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,CAAC,UAAU,CAAC,EAAE,EAAE,OAAO,EAAE,UAAU,CAAC,CAAC;gBAClE,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC,UAAU,EAAE,GAAG,CAAC,CAAC;YAC7C,CAAC;QACH,CAAC;IACH,CAAC;IAED;;;OAGG;IACK,WAAW,CAAC,GAAQ;QAC1B,OAAO,GAAG,CAAC,QAAQ,IAAI,GAAG,CAAC,EAAE,CAAC;IAChC,CAAC;IAED;;;OAGG;IACK,kBAAkB,CAAC,QAAmB;QAC5C,MAAM,iBAAiB,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC;QACtE,IAAI,iBAAiB,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,SAAS,CAAC;QACrD,OAAO,iBAAiB,CAAC,iBAAiB,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC;IAC9D,CAAC;IAED;;;;;;;OAOG;IACK,KAAK,CAAC,qBAAqB,CACjC,UAAe,EACf,WAAmB,EACnB,SAA4B,EAC5B,QAAkB,EAClB,OAA4B,EAC5B,WAAmB;QAEnB,IAAI,QAAQ,GAAG,CAAC,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC;QACxC,IAAI,cAAc,GAAG,CAAC,CAAC;QACvB,IAAI,kBAAkB,GAAG,CAAC,CAAC;QAC3B,IAAI,UAA6C,CAAC;QAClD,IAAI,eAAqD,CAAC;QAE1D,0EAA0E;QAC1E,IAAI,kBAAkB,GAAG,CAAC,CAAC;QAE3B,wEAAwE;QACxE,MAAM,yBAAyB,GAAG,GAAG,EAAE,CACrC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,MAAM,IAAI,CAAC,CAAC,IAAI,KAAK,WAAW,CAAC,CAAC,MAAM,CAAC;QAE7E,yEAAyE;QACzE,iDAAiD;QACjD,MAAM,QAAQ,GAAG,IAAI,CAAC,kBAAkB,CAAC,QAAQ,CAAC,CAAC;QACnD,IAAI,CAAC,QAAQ,IAAI,QAAQ,KAAK,WAAW,EAAE,CAAC;YAC1C,MAAM,eAAe,GAAG,MAAM,sBAAsB,CAAC;gBACnD,QAAQ;gBACR,OAAO;gBACP,QAAQ;gBACR,aAAa,EAAE,SAAS,CAAC,iBAAiB;gBAC1C,KAAK,EAAE,SAAS,CAAC,YAAY;aAC9B,CAAC,CAAC;YAEH,MAAM,WAAW,GAAY;gBAC3B,IAA
I,EAAE,MAAM;gBACZ,OAAO,EAAE,eAAe,CAAC,OAAO;aACjC,CAAC;YACF,QAAQ,GAAG,CAAC,GAAG,QAAQ,EAAE,WAAW,CAAC,CAAC;YAEtC,gDAAgD;YAChD,SAAS,CAAC,UAAU,CAAC,EAAE,EAAE,EAAE,QAAQ,EAAE,CAAC,CAAC;QACzC,CAAC;QAED,OAAO,yBAAyB,EAAE,GAAG,WAAW,EAAE,CAAC;YACjD,oEAAoE;YACpE,MAAM,oBAAoB,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC;YAEzE,4EAA4E;YAC5E,0FAA0F;YAC1F,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,WAAW,EAAE;gBAChD,QAAQ,EAAE,oBAAoB;gBAC9B,KAAK,EAAE,IAAI,CAAC,WAAW,CAAC,UAAU,CAAC;gBACnC,kBAAkB;aACnB,CAAC,CAAC;YACH,UAAU,GAAG,MAAM,CAAC;YAEpB,IAAI,CAAC,MAAM,CAAC,OAAO,IAAI,CAAC,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACxE,MAAM,IAAI,KAAK,CAAC,MAAM,CAAC,KAAK,IAAI,qCAAqC,CAAC,CAAC;YACzE,CAAC;YAED,6FAA6F;YAC7F,QAAQ,GAAG,CAAC,GAAG,QAAQ,EAAE,GAAG,MAAM,CAAC,QAAQ,CAAC,CAAC;YAE7C,4FAA4F;YAC5F,kBAAkB,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC,MAAM,CAAC;YACxE,cAAc,IAAI,MAAM,CAAC,SAAS,CAAC;YACnC,kBAAkB,EAAE,CAAC;YAErB,mCAAmC;YACnC,SAAS,CAAC,UAAU,CAAC,EAAE,EAAE,EAAE,QAAQ,EAAE,CAAC,CAAC;YAEvC,6CAA6C;YAC7C,MAAM,UAAU,GAAG,MAAM,gBAAgB,CAAC;gBACxC,QAAQ;gBACR,eAAe,EAAE,QAAQ,CAAC,eAAe;gBACzC,eAAe,EAAE,QAAQ,CAAC,eAAe;gBACzC,aAAa,EAAE,SAAS,CAAC,oBAAoB;gBAC7C,KAAK,EAAE,SAAS,CAAC,eAAe;aACjC,CAAC,CAAC;YACH,eAAe,GAAG,UAAU,CAAC;YAE7B,gEAAgE;YAChE,MAAM,WAAW,GAAG,QAAQ,CAAC,mBAAmB,IAAI,iBAAiB,CAAC;YAEtE,mFAAmF;YACnF,IAAI,UAAU,CAAC,UAAU,IAAI,CAAC,UAAU,CAAC,UAAU,IAAI,WAAW,KAAK,YAAY,CAAC,EAAE,CAAC;gBACrF,MAAM,SAAS,GAAc;oBAC3B,OAAO,EAAE,UAAU,CAAC,UAAU;oBAC9B,KAAK,EAAE,UAAU,CAAC,UAAU;oBAC5B,MAAM,EAAE,UAAU,CAAC,UAAU;wBAC3B,CAAC,CAAC,UAAU,CAAC,SAAS;wBACtB,CAAC,CAAC,mCAAmC,UAAU,CAAC,SAAS,EAAE;iBAC9D,CAAC;gBAEF,MAAM,UAAU,GAAG,SAAS,CAAC,UAAU,CAAC,EAAE,EAAE;oBAC1C,MAAM,EAAE,WAAW;oBACnB,QAAQ;oBACR,MAAM,EAAE,SAAS;oBACjB,MAAM,EAAE;wBACN,YAAY,EAAE,kBAAkB,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,cAAc,GAAG,kBAAkB,CAAC,CAAC,CAAC,CAAC,CAAC;wBAC1F,cAAc;wBACd,YAAY,EAAE,yBAAyB,EAAE;wBACzC,UAAU,EAAE;4BACV,UAAU,EAAE,UAAU,CAAC,UAAU;4BACjC,UAAU,EAA
E,UAAU,CAAC,UAAU;4BACjC,UAAU,EAAE,UAAU,CAAC,UAAU;4BACjC,SAAS,EAAE,UAAU,CAAC,SAAS;yBAChC;qBACF;oBACD,WAAW,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;iBACtC,CAAC,CAAC;gBAEH,IAAI,UAAU,IAAI,UAAU,EAAE,CAAC;oBAC7B,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,CAAC,UAAU,CAAC,EAAE,EAAE,WAAW,EAAE,UAAU,CAAC,CAAC;oBACtE,IAAI,CAAC,OAAO,CAAC,aAAa,EAAE,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;gBACvD,CAAC;gBACD,OAAO;YACT,CAAC;YAED,sCAAsC;YACtC,IAAI,yBAAyB,EAAE,IAAI,WAAW,EAAE,CAAC;gBAC/C,MAAM;YACR,CAAC;YAED,2DAA2D;YAC3D,8EAA8E;YAC9E,MAAM,eAAe,GAAG,MAAM,sBAAsB,CAAC;gBACnD,QAAQ;gBACR,OAAO;gBACP,QAAQ;gBACR,aAAa,EAAE,SAAS,CAAC,iBAAiB;gBAC1C,KAAK,EAAE,SAAS,CAAC,YAAY;aAC9B,CAAC,CAAC;YAEH,sCAAsC;YACtC,MAAM,WAAW,GAAY;gBAC3B,IAAI,EAAE,MAAM;gBACZ,OAAO,EAAE,eAAe,CAAC,OAAO;aACjC,CAAC;YACF,QAAQ,GAAG,CAAC,GAAG,QAAQ,EAAE,WAAW,CAAC,CAAC;YAEtC,mCAAmC;YACnC,SAAS,CAAC,UAAU,CAAC,EAAE,EAAE,EAAE,QAAQ,EAAE,CAAC,CAAC;QACzC,CAAC;QAED,wDAAwD;QACxD,MAAM,gBAAgB,GAAG,eAAe,EAAE,UAAU,IAAI,KAAK,CAAC;QAC9D,MAAM,SAAS,GAAc;YAC3B,OAAO,EAAE,KAAK;YACd,KAAK,EAAE,eAAe,EAAE,UAAU,IAAI,CAAC;YACvC,MAAM,EAAE,gBAAgB;gBACtB,CAAC,CAAC,mCAAmC,eAAe,EAAE,SAAS,IAAI,EAAE,EAAE;gBACvE,CAAC,CAAC,iBAAiB,WAAW,+CAA+C,eAAe,EAAE,SAAS,IAAI,EAAE,EAAE;SAClH,CAAC;QAEF,MAAM,UAAU,GAAG,SAAS,CAAC,UAAU,CAAC,EAAE,EAAE;YAC1C,MAAM,EAAE,WAAW;YACnB,QAAQ;YACR,MAAM,EAAE,SAAS;YACjB,MAAM,EAAE;gBACN,YAAY,EAAE,kBAAkB,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,cAAc,GAAG,kBAAkB,CAAC,CAAC,CAAC,CAAC,CAAC;gBAC1F,cAAc;gBACd,YAAY,EAAE,yBAAyB,EAAE;gBACzC,kBAAkB,EAAE,IAAI;gBACxB,UAAU,EAAE,eAAe;oBACzB,CAAC,CAAC;wBACE,UAAU,EAAE,eAAe,CAAC,UAAU;wBACtC,UAAU,EAAE,eAAe,CAAC,UAAU;wBACtC,UAAU,EAAE,eAAe,CAAC,UAAU;wBACtC,SAAS,EAAE,eAAe,CAAC,SAAS;qBACrC;oBACH,CAAC,CAAC,SAAS;aACd;YACD,WAAW,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACtC,CAAC,CAAC;QAEH,IAAI,UAAU,IAAI,UAAU,EAAE,CAAC;YAC7B,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,CAAC,UAAU,CAAC,EAAE,EAAE,WAAW,EAAE,UAAU,CAAC,CAAC;YACtE,IAAI,CAAC,OAAO,CAAC,aAAa,EAAE,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;QACvD,CAAC;IACH,CAAC;IAED;;;OAGG;IACK,gBAAgB,CACtB,QAAkB,EAClB,OAA4B,EAC5B,SAA+C;QAE/C,MAAM,Q
AAQ,GAAc,EAAE,CAAC;QAE/B,0CAA0C;QAC1C,MAAM,YAAY,GAAG,0BAA0B,CAAC;YAC9C,OAAO,EAAE,OAAO;gBACd,CAAC,CAAC;oBACE,IAAI,EAAE,OAAO,CAAC,IAAI;oBAClB,WAAW,EAAE,OAAO,CAAC,WAAW;oBAChC,YAAY,EAAE,OAAO,CAAC,YAAY;iBACnC;gBACH,CAAC,CAAC,SAAS;YACb,QAAQ,EAAE;gBACR,IAAI,EAAE,QAAQ,CAAC,IAAI;gBACnB,YAAY,EAAE,QAAQ,CAAC,YAAY;gBACnC,QAAQ,EAAE,QAAQ,CAAC,QAAQ;aAC5B;SACF,CAAC,CAAC;QACH,IAAI,YAAY,CAAC,IAAI,EAAE,EAAE,CAAC;YACxB,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,EAAE,CAAC,CAAC;QAC3D,CAAC;QAED,wCAAwC;QACxC,IAAI,QAAQ,CAAC,QAAQ,EAAE,CAAC;YACtB,QAAQ,CAAC,IAAI,CAAC,GAAG,QAAQ,CAAC,QAAQ,CAAC,CAAC;QACtC,CAAC;QAED,qDAAqD;QACrD,IAAI,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,CAAC;YAC7B,QAAQ,CAAC,IAAI,CAAC,GAAI,SAAuB,CAAC,CAAC;QAC7C,CAAC;QAED,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED;;;OAGG;IACK,gBAAgB;QACtB,MAAM,SAAS,GAAG,QAAQ,CAAC;YACzB,MAAM,EAAE,SAAS;YACjB,SAAS,EAAE,IAAI,CAAC,OAAO,CAAC,SAAS;SAClC,CAAC,CAAC;QAEH,KAAK,MAAM,GAAG,IAAI,SAAS,EAAE,CAAC;YAC5B,SAAS,CAAC,GAAG,CAAC,EAAE,EAAE,EAAE,MAAM,EAAE,QAAQ,EAAE,CAAC,CAAC;QAC1C,CAAC;IACH,CAAC;CACF"}
|
package/dist/run.d.ts
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import { type Message } from "./eval.js";
|
|
2
|
+
/**
|
|
3
|
+
* Run status types:
|
|
4
|
+
* - queued: Waiting to be processed
|
|
5
|
+
* - pending: Reserved for future use
|
|
6
|
+
* - running: Currently executing
|
|
7
|
+
* - completed: Finished (check result.success for pass/fail)
|
|
8
|
+
* - error: System error occurred (retryable)
|
|
9
|
+
*/
|
|
10
|
+
export type RunStatus = "queued" | "pending" | "running" | "completed" | "error";
|
|
11
|
+
export interface RunResult {
|
|
12
|
+
success: boolean;
|
|
13
|
+
score?: number;
|
|
14
|
+
reason?: string;
|
|
15
|
+
}
|
|
16
|
+
export interface RunMetadata {
|
|
17
|
+
latencyMs?: number;
|
|
18
|
+
tokenUsage?: {
|
|
19
|
+
input: number;
|
|
20
|
+
output: number;
|
|
21
|
+
};
|
|
22
|
+
[key: string]: unknown;
|
|
23
|
+
}
|
|
24
|
+
export interface Run {
|
|
25
|
+
id: string;
|
|
26
|
+
/** Eval ID (optional for playground runs) */
|
|
27
|
+
evalId?: string;
|
|
28
|
+
projectId: string;
|
|
29
|
+
personaId?: string;
|
|
30
|
+
scenarioId: string;
|
|
31
|
+
/** Connector ID (for playground runs without eval) */
|
|
32
|
+
connectorId?: string;
|
|
33
|
+
/** Execution ID - groups runs created together in a single execution */
|
|
34
|
+
executionId?: number;
|
|
35
|
+
status: RunStatus;
|
|
36
|
+
startedAt?: string;
|
|
37
|
+
completedAt?: string;
|
|
38
|
+
messages: Message[];
|
|
39
|
+
output?: Record<string, unknown>;
|
|
40
|
+
result?: RunResult;
|
|
41
|
+
error?: string;
|
|
42
|
+
metadata?: RunMetadata;
|
|
43
|
+
/** Thread ID for LangGraph (regenerated on retry to start fresh thread) */
|
|
44
|
+
threadId?: string;
|
|
45
|
+
createdAt: string;
|
|
46
|
+
updatedAt: string;
|
|
47
|
+
}
|
|
48
|
+
export interface CreateRunInput {
|
|
49
|
+
evalId: string;
|
|
50
|
+
}
|
|
51
|
+
export interface CreatePlaygroundRunInput {
|
|
52
|
+
scenarioId: string;
|
|
53
|
+
connectorId: string;
|
|
54
|
+
personaId?: string;
|
|
55
|
+
}
|
|
56
|
+
export interface UpdateRunInput {
|
|
57
|
+
status?: RunStatus;
|
|
58
|
+
startedAt?: string;
|
|
59
|
+
completedAt?: string;
|
|
60
|
+
messages?: Message[];
|
|
61
|
+
output?: Record<string, unknown>;
|
|
62
|
+
result?: RunResult;
|
|
63
|
+
error?: string;
|
|
64
|
+
metadata?: RunMetadata;
|
|
65
|
+
threadId?: string;
|
|
66
|
+
}
|
|
67
|
+
export declare function createRuns(input: CreateRunInput): Run[];
|
|
68
|
+
export declare function createRun(input: CreateRunInput): Run;
|
|
69
|
+
/**
|
|
70
|
+
* Creates a playground run for testing scenarios without an eval.
|
|
71
|
+
* Used by the Scenario Playground to create runs directly.
|
|
72
|
+
*/
|
|
73
|
+
export declare function createPlaygroundRun(input: CreatePlaygroundRunInput): Run;
|
|
74
|
+
export declare function getRun(id: string): Run | undefined;
|
|
75
|
+
export interface ListRunsOptions {
|
|
76
|
+
evalId?: string;
|
|
77
|
+
projectId?: string;
|
|
78
|
+
scenarioId?: string;
|
|
79
|
+
status?: RunStatus;
|
|
80
|
+
limit?: number;
|
|
81
|
+
}
|
|
82
|
+
export declare function listRuns(options?: ListRunsOptions): Run[];
|
|
83
|
+
export declare function listRuns(evalId?: string, projectId?: string): Run[];
|
|
84
|
+
export declare function listRunsByEval(evalId: string): Run[];
|
|
85
|
+
export declare function listRunsByScenario(scenarioId: string): Run[];
|
|
86
|
+
export declare function listRunsByPersona(personaId: string): Run[];
|
|
87
|
+
export declare function updateRun(id: string, input: UpdateRunInput): Run | undefined;
|
|
88
|
+
export declare function deleteRun(id: string): boolean;
|
|
89
|
+
export declare function deleteRunsByEval(evalId: string): number;
|
|
90
|
+
export declare function deleteRunsByProject(projectId: string): number;
|
|
91
|
+
/**
|
|
92
|
+
* Retries a failed run by resetting it to queued status.
|
|
93
|
+
* Clears error, timing info, messages, and output.
|
|
94
|
+
* Increments retryCount to ensure a fresh LangGraph thread is used.
|
|
95
|
+
*
|
|
96
|
+
* @param id - The run ID to retry
|
|
97
|
+
* @returns The updated run, or undefined if not found
|
|
98
|
+
* @throws Error if the run status is not "error"
|
|
99
|
+
*/
|
|
100
|
+
export declare function retryRun(id: string): Run | undefined;
|
|
101
|
+
//# sourceMappingURL=run.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../src/run.ts"],"names":[],"mappings":"AAIA,OAAO,EAAW,KAAK,OAAO,EAAE,MAAM,WAAW,CAAC;AAMlD;;;;;;;GAOG;AACH,MAAM,MAAM,SAAS,GAAG,QAAQ,GAAG,SAAS,GAAG,SAAS,GAAG,WAAW,GAAG,OAAO,CAAC;AAEjF,MAAM,WAAW,SAAS;IACxB,OAAO,EAAE,OAAO,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,WAAW;IAC1B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE;QACX,KAAK,EAAE,MAAM,CAAC;QACd,MAAM,EAAE,MAAM,CAAC;KAChB,CAAC;IACF,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;CACxB;AAED,MAAM,WAAW,GAAG;IAClB,EAAE,EAAE,MAAM,CAAC;IACX,6CAA6C;IAC7C,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,sDAAsD;IACtD,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,wEAAwE;IACxE,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,EAAE,SAAS,CAAC;IAClB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,EAAE,OAAO,EAAE,CAAC;IACpB,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACjC,MAAM,CAAC,EAAE,SAAS,CAAC;IACnB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,WAAW,CAAC;IACvB,2EAA2E;IAC3E,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,wBAAwB;IACvC,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,cAAc;IAC7B,MAAM,CAAC,EAAE,SAAS,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,OAAO,EAAE,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACjC,MAAM,CAAC,EAAE,SAAS,CAAC;IACnB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,WAAW,CAAC;IACvB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAoBD,wBAAgB,UAAU,CAAC,KAAK,EAAE,cAAc,GAAG,GAAG,EAAE,CA2EvD;AAED,wBAAgB,SAAS,CAAC,KAAK,EAAE,cAAc,GAAG,GAAG,CAGpD;AAED;;;GAGG;AACH,wBAAgB,mBAAmB,CAAC,KAAK,EAAE,wBAAwB,GAAG,GAAG,CA2CxE;AAED,wBAAgB,MAAM,CAAC,EAAE,EAAE,MAAM,GAAG,GAAG,GAAG,SAAS,CAGlD;AAED,MAAM,WAAW,eAAe;IAC9B,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EA
AE,MAAM,CAAC;IACpB,MAAM,CAAC,EAAE,SAAS,CAAC;IACnB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,wBAAgB,QAAQ,CAAC,OAAO,CAAC,EAAE,eAAe,GAAG,GAAG,EAAE,CAAC;AAC3D,wBAAgB,QAAQ,CAAC,MAAM,CAAC,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,GAAG,EAAE,CAAC;AAwDrE,wBAAgB,cAAc,CAAC,MAAM,EAAE,MAAM,GAAG,GAAG,EAAE,CAQpD;AAED,wBAAgB,kBAAkB,CAAC,UAAU,EAAE,MAAM,GAAG,GAAG,EAAE,CAQ5D;AAED,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,GAAG,EAAE,CAQ1D;AAED,wBAAgB,SAAS,CAAC,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,cAAc,GAAG,GAAG,GAAG,SAAS,CA6B5E;AAED,wBAAgB,SAAS,CAAC,EAAE,EAAE,MAAM,GAAG,OAAO,CAY7C;AAED,wBAAgB,gBAAgB,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CAUvD;AAED,wBAAgB,mBAAmB,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAU7D;AAED;;;;;;;;GAQG;AACH,wBAAgB,QAAQ,CAAC,EAAE,EAAE,MAAM,GAAG,GAAG,GAAG,SAAS,CA2BpD"}
|