@axiom-lattice/agent-eval 2.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs ADDED
@@ -0,0 +1,1055 @@
1
+ // src/LatticeEval.ts
2
+ import { getAgentClient } from "@axiom-lattice/core";
3
+ import { HumanMessage } from "@langchain/core/messages";
4
+ import { v4 } from "uuid";
5
var LatticeEval = class {
  /**
   * Create a new LatticeEval instance.
   * @param config Optional server configuration. When omitted (or when it has
   *   no `base_url`) the evaluator targets http://localhost:3203. `verbose`
   *   defaults to true.
   */
  constructor(config) {
    this.inMemoryLogs = [];
    this.lastDurationMs = 0;
    this.config = config ?? { base_url: "http://localhost:3203" };
    this.baseUrl = this.config.base_url ?? "http://localhost:3203";
    this.verbose = this.config.verbose ?? true;
  }
  /**
   * Metadata captured during the most recent `evaluateCase` call.
   * Fields that were not reached before a failure are `undefined`.
   */
  getLastRunMeta() {
    return {
      duration_ms: this.lastDurationMs,
      thread_id: this.lastThreadId,
      judge_thread_id: this.lastJudgeThreadId,
      test_prompt: this.lastTestPrompt,
      final_output: this.lastFinalOutput
    };
  }
  /**
   * Shallow copy of the log events collected for the most recent case.
   */
  getInMemoryLogs() {
    return [...this.inMemoryLogs];
  }
  /**
   * Append an event to the in-memory log and, in verbose mode, echo errors
   * and case start/completion milestones to the console.
   * @param level Log level ("info", "error", ...)
   * @param message Human-readable message
   * @param data Optional structured payload stored with the event
   */
  record(level, message, data) {
    this.inMemoryLogs.push({
      ts: (/* @__PURE__ */ new Date()).toISOString(),
      level,
      message,
      data
    });
    if (!this.verbose) return;
    const isError = level === "error";
    const isMilestone = message.includes("Starting case") || message.includes("Case evaluation completed");
    // Everything else is kept in memory only to keep console output short.
    if (!isError && !isMilestone) return;
    const keyInfo = this.getKeyInfo(data);
    const suffix = keyInfo ? ` ${keyInfo}` : "";
    console.log(isError ? ` \u2717 ${message}${suffix}` : ` ${message}${suffix}`);
  }
  /**
   * Record an info-level event.
   */
  log(message, data) {
    this.record("info", message, data);
  }
  /**
   * Build the short "(case=... pass=... score=... error=...)" suffix used in
   * console output. Returns "" when none of those fields are present.
   */
  getKeyInfo(data) {
    if (!data) return "";
    const parts = [];
    if (data.case_id) parts.push(`case=${data.case_id}`);
    if (data.pass !== void 0) parts.push(`pass=${data.pass}`);
    if (data.final_score !== void 0) parts.push(`score=${data.final_score}`);
    if (data.error) parts.push(`error=${data.error}`);
    return parts.length > 0 ? `(${parts.join(" ")})` : "";
  }
  /**
   * Normalize message content to plain text. Strings pass through unchanged;
   * arrays of content parts are concatenated using string parts and the
   * `text` field of object parts; null/undefined become "".
   */
  contentToText(content) {
    if (typeof content === "string") return content;
    if (content == null) return "";
    if (Array.isArray(content)) {
      return content.map((part) => {
        if (typeof part === "string") return part;
        return typeof part?.text === "string" ? part.text : "";
      }).join("");
    }
    return typeof content === "object" ? JSON.stringify(content) : String(content);
  }
  /**
   * Execute a single agent step and return the thread ID and response data.
   * @param step Step definition (`agent_id`, optional `override_input_message`)
   * @param threadId Thread the run is posted to
   * @param inputMessage Case-level message, used unless the step overrides it
   * @param files Optional map of file name to file text
   * @throws Error when the server reports an error, answers with a non-2xx
   *   status, or returns a body that is not JSON
   */
  async executeAgentStep(step, threadId, inputMessage, files) {
    const message = step.override_input_message || inputMessage;
    const inputFiles = files || {};
    const fileNames = Object.keys(inputFiles);
    this.log("Executing agent step", {
      agent_id: step.agent_id,
      thread_id: threadId,
      has_override_input_message: Boolean(step.override_input_message),
      input_message_length: typeof message === "string" ? message.length : 0,
      files_count: fileNames.length
    });
    const now = (/* @__PURE__ */ new Date()).toISOString();
    const filesPayload = Object.fromEntries(
      fileNames.map((name) => [
        name,
        { content: inputFiles[name].split("\n"), created_at: now, modified_at: now }
      ])
    );
    const response = await fetch(`${this.baseUrl}/api/runs`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json"
      },
      body: JSON.stringify({
        assistant_id: step.agent_id,
        thread_id: threadId,
        files: filesPayload,
        message
      })
    });
    let responseData;
    let failure;
    try {
      responseData = await response.json();
      // A reported error wins; otherwise a non-2xx status is still a failure.
      failure = responseData?.error || (response.ok ? void 0 : `HTTP ${response.status} ${response.statusText}`);
    } catch (parseError) {
      failure = `invalid JSON response (HTTP ${response.status})`;
    }
    if (failure) {
      this.log("Agent step failed", {
        agent_id: step.agent_id,
        thread_id: threadId,
        error: failure
      });
      const detail = typeof failure === "string" ? failure : JSON.stringify(failure);
      throw new Error(
        `Failed to run agent ${step.agent_id}: ${detail}`
      );
    }
    this.log("Agent step completed", {
      agent_id: step.agent_id,
      thread_id: threadId,
      response_keys: responseData ? Object.keys(responseData) : []
    });
    return { threadId, responseData };
  }
  /**
   * Extract output content based on OutputType.
   * For "file_content" the file is read from the assistant's thread state;
   * otherwise the content of the last message of the run response is used.
   * @throws Error when the state cannot be fetched, the file is missing, or
   *   the run response carries no messages
   */
  async extractOutput(outputType, agentId, threadId, runResponseData) {
    if (outputType.type === "file_content") {
      return this.extractFileOutput(outputType.file_path, agentId, threadId);
    }
    const messages = runResponseData?.messages;
    if (messages && messages.length > 0) {
      const content = messages[messages.length - 1]?.content || "";
      this.log("Message output extracted", {
        agent_id: agentId,
        thread_id: threadId,
        output_length: typeof content === "string" ? content.length : void 0
      });
      return content;
    }
    this.log("No message content found in run response", {
      agent_id: agentId,
      thread_id: threadId
    });
    throw new Error("No message content found in run response");
  }
  /**
   * Read one file from the assistant state of a thread. Content stored as an
   * array of lines is joined with "\n".
   */
  async extractFileOutput(filePath, agentId, threadId) {
    this.log("Extracting file output", {
      agent_id: agentId,
      thread_id: threadId,
      file_path: filePath
    });
    const responseState = await fetch(
      `${this.baseUrl}/api/assistants/${agentId}/${threadId}/state`,
      {
        method: "GET"
      }
    );
    if (!responseState.ok) {
      this.log("Failed to fetch assistant state", {
        agent_id: agentId,
        thread_id: threadId,
        status: responseState.status,
        statusText: responseState.statusText
      });
      throw new Error(`Failed to get state: ${responseState.statusText}`);
    }
    const state = await responseState.json();
    // A state without `values` or `files` is reported as a missing file
    // rather than surfacing as a TypeError.
    const fileContent = state?.values?.files?.[filePath]?.content;
    if (!fileContent) {
      this.log("File output not found in state", {
        agent_id: agentId,
        thread_id: threadId,
        file_path: filePath
      });
      throw new Error(
        `File not found in output: ${filePath}`
      );
    }
    const content = Array.isArray(fileContent) ? fileContent.join("\n") : fileContent;
    this.log("File output extracted", {
      agent_id: agentId,
      thread_id: threadId,
      file_path: filePath,
      output_length: typeof content === "string" ? content.length : void 0
    });
    return content;
  }
  /**
   * Build the prompt sent to the judge agent.
   * @param evalCase The evaluation case (input, output and eval description)
   * @param finalOutput Text produced by the agent under test
   * @param evalRubrics Rubrics (dimension, weight, description) to list
   */
  buildJudgePrompt(evalCase, finalOutput, evalRubrics) {
    const inputFiles = evalCase.input.files || {};
    const testCaseFilesContent = Object.keys(inputFiles).map(
      (key) => `File name: ${key}\nFile content: ${inputFiles[key]}`
    ).join("\n\n");
    const outputTypeDescription = evalCase.output.type === "file_content" ? `\u865A\u62DF\u4EA7\u7269\uFF08\u6587\u4EF6\uFF1A${evalCase.output.file_path}\uFF09` : "\u6D88\u606F\u5185\u5BB9";
    const rubricLines = evalRubrics.map(
      (r) => `- **${r.dimension}**\uFF08\u6743\u91CD\uFF1A${r.weight}\uFF09\uFF1A${r.description}`
    ).join("\n");
    const rubricsSection = `\n## \u8BC4\u4F30\u6307\u6807\uFF08Evaluation Rubrics\uFF09\n${rubricLines}`;
    return [
      `# \u89D2\u8272`,
      `\u4F60\u662F\u4E00\u540D\u8D44\u6DF1\u7684 AI Agent \u8BC4\u4F30\u4E13\u5BB6\uFF0C\u8D1F\u8D23\u6839\u636E\u9884\u8BBE\u7684\u6307\u6807\uFF08Rubrics\uFF09\u5BF9 Agent \u7684\u6267\u884C\u7ED3\u679C\u8FDB\u884C"\u9ED1\u76D2\u6D4B\u8BD5"\u5224\u5B9A\u3002`,
      ``,
      `# \u8F93\u5165\u4FE1\u606F`,
      `\u6D4B\u8BD5\u6846\u67B6\u5C06\u4E3A\u4F60\u63D0\u4F9B\u4EE5\u4E0B\u56DB\u4E2A\u6838\u5FC3\u4E0A\u4E0B\u6587\uFF1A`,
      ``,
      `1. **\u7528\u6237\u610F\u56FE\uFF08User Intent\uFF09**\uFF1A${evalCase.input.message}`,
      ``,
      `2. **\u8F93\u5165\u6587\u4EF6\uFF08Input Files\uFF09**\uFF1A${testCaseFilesContent || "\u65E0"}`,
      ``,
      `3. **\u5B9E\u9645\u8F93\u51FA\uFF08Actual Output\uFF0C${outputTypeDescription}\uFF09**\uFF1A`,
      `${finalOutput}`,
      ``,
      `4. **\u671F\u671B\u8F93\u51FA\u63CF\u8FF0\uFF08Expected Output Description\uFF09**\uFF1A${evalCase.eval.content_assertion}`,
      `${rubricsSection}`,
      ``,
      `# \u4EFB\u52A1`,
      `\u4F60\u5FC5\u987B\u4E25\u683C\u5BF9\u7167"\u8BC4\u4F30\u6307\u6807\uFF08Evaluation Rubrics\uFF09"\u4E2D\u7684\u6BCF\u4E00\u9879\u6307\u6807\uFF0C\u5206\u6790"\u5B9E\u9645\u8F93\u51FA\uFF08Actual Output\uFF09"\u662F\u5426\u8FBE\u6807\u3002`,
      ``,
      `# \u89C4\u5219`,
      `1. **\u5BA2\u89C2\u6027**\uFF1A\u4EC5\u6839\u636E\u63D0\u4F9B\u7684\u4E0A\u4E0B\u6587\u5224\u5B9A\u3002\u5982\u679C\u6807\u51C6\u8981\u6C42"\u5305\u542B\u6570\u5B57"\uFF0C\u4F46\u8F93\u51FA\u53EA\u6709\u6587\u5B57\uFF0C\u5373\u4F7F\u8BED\u6C14\u518D\u597D\u4E5F\u5FC5\u987B\u6263\u5206\u3002`,
      `2. **\u7ED3\u679C\u6821\u9A8C**\uFF1A\u5982\u679C"\u5B9E\u9645\u8F93\u51FA"\u4E2D\u7F3A\u5931\u9884\u671F\u7684\u5185\u5BB9\uFF0C\u6216\u5185\u5BB9\u4E0D\u7B26\u5408"\u8BC4\u4F30\u6307\u6807"\u4E2D\u7684\u6807\u51C6\uFF0C\u5BF9\u5E94\u7684\u6307\u6807\u5E94\u5224\u5B9A\u4E3A\u5931\u8D25\u3002`,
      `3. **\u8BC1\u636E\u5BFC\u5411**\uFF1A\u5728\u7ED9\u51FA\u539F\u56E0\uFF08reason\uFF09\u65F6\uFF0C\u5FC5\u987B\u5F15\u7528\u8F93\u51FA\u4E2D\u7684\u539F\u6587\u6216\u865A\u62DF\u4EA7\u7269\u4E2D\u7684\u5177\u4F53\u6570\u636E\u7247\u6BB5\u3002`,
      `4. **\u52A0\u6743\u8BA1\u7B97**\uFF1A\u6700\u7EC8\u5206\u6570\u4E3A\u5404\u9879\u6307\u6807\u5F97\u5206\u4E0E\u5176\u6743\u91CD\u7684\u4E58\u79EF\u4E4B\u548C\uFF080-100\u5206\u5236\uFF09\u3002`,
      ``,
      `# \u8F93\u51FA\u683C\u5F0F\uFF08\u4EC5JSON\uFF09`,
      `\u4F60\u5FC5\u987B\u4EC5\u4EE5 JSON \u683C\u5F0F\u56DE\u590D\uFF0C\u7ED3\u6784\u5982\u4E0B\uFF1A`,
      `{`,
      `"pass": true | false,`,
      `"final_score": number,`,
      `"dimension_results": [`,
      `{`,
      `"name": "\u6307\u6807\u540D\u79F0",`,
      `"score": number,`,
      `"reason": "\u5177\u4F53\u7684\u6263\u5206\u6216\u7ED9\u5206\u7406\u7531\uFF0C\u9700\u5F15\u7528\u8BC1\u636E"`,
      `}`,
      `],`,
      `"summary": "\u5BF9 Agent \u8868\u73B0\u7684\u6574\u4F53\u8BC4\u4EF7"`,
      `}`,
      ``,
      `\u6CE8\u610F\uFF1A\u5982\u679C final_score >= 80 \u4E14\u6CA1\u6709\u81F4\u547D\u6027\u9519\u8BEF\uFF0Cpass \u5E94\u4E3A true\uFF1B\u5426\u5219\u4E3A false\u3002`
    ].join("\n");
  }
  /**
   * Pull the JSON verdict out of the judge's reply. A fenced code block is
   * preferred; otherwise the first "{" through the last "}" is used.
   * Returns {} when nothing parseable is found, so callers fall back to
   * keyword-based detection.
   */
  parseJudgeJson(text, caseId) {
    try {
      const jsonMatch = text.match(/```(?:json)?\s*(\{[\s\S]*\})\s*```/) || text.match(/\{[\s\S]*\}/);
      if (!jsonMatch) {
        this.log("No JSON detected in judge output; will fallback", {
          case_id: caseId
        });
        return {};
      }
      const parsed = JSON.parse(jsonMatch[1] || jsonMatch[0]);
      this.log("Parsed judge JSON successfully", {
        case_id: caseId,
        parsed_keys: Object.keys(parsed || {})
      });
      return parsed || {};
    } catch (error) {
      console.warn("Failed to parse JSON from judge agent response, falling back to keyword-based parsing:", error);
      this.log("Failed to parse judge JSON; falling back", {
        case_id: caseId,
        error: error instanceof Error ? error.message : String(error)
      });
      return {};
    }
  }
  /**
   * Evaluate a single Lattice evaluation case
   * @param evalCase The evaluation case to run
   * @returns Evaluation result with pass/fail status and scores
   * @throws Error when an agent step, output extraction or the judge call
   *   fails; the elapsed time is still available via `getLastRunMeta()`
   */
  async evaluateCase(evalCase) {
    const startedAt = Date.now();
    const threadId = `${evalCase.caseId}||${v4()}`;
    this.inMemoryLogs = [];
    this.lastThreadId = threadId;
    this.lastJudgeThreadId = void 0;
    this.lastTestPrompt = void 0;
    this.lastFinalOutput = void 0;
    this.lastDurationMs = 0;
    try {
      const result = await this.runEvaluation(evalCase, threadId);
      this.lastDurationMs = Date.now() - startedAt;
      this.record("info", "Case duration recorded", {
        case_id: evalCase.caseId,
        duration_ms: this.lastDurationMs
      });
      return result;
    } catch (error) {
      // Keep the elapsed time for failed runs too, so reports do not show 0.
      this.lastDurationMs = Date.now() - startedAt;
      throw error;
    }
  }
  /**
   * Run the agent steps, extract the output, ask the judge agent for a
   * verdict and turn the reply into pass/score/dimension results.
   */
  async runEvaluation(evalCase, threadId) {
    const caseId = evalCase.caseId;
    const steps = evalCase.steps ?? [];
    this.log("Starting case evaluation", {
      case_id: caseId,
      thread_id: threadId,
      steps_count: evalCase.steps?.length,
      output_type: evalCase.output?.type
    });
    let currentThreadId = threadId;
    let lastResponseData = null;
    for (const step of steps) {
      const stepResult = await this.executeAgentStep(
        step,
        currentThreadId,
        evalCase.input.message,
        evalCase.input.files || {}
      );
      currentThreadId = stepResult.threadId;
      lastResponseData = stepResult.responseData;
    }
    const finalAgentId = steps[steps.length - 1]?.agent_id || "";
    this.log("All agent steps completed", {
      case_id: caseId,
      final_agent_id: finalAgentId,
      final_thread_id: currentThreadId
    });
    // Message content may be an array of parts; the prompt and the reports
    // need plain text.
    const finalOutput = this.contentToText(
      await this.extractOutput(
        evalCase.output,
        finalAgentId,
        currentThreadId,
        lastResponseData
      )
    );
    this.lastFinalOutput = finalOutput;
    this.log("Final output extracted", {
      case_id: caseId,
      output_type: evalCase.output.type,
      output_length: finalOutput.length
    });
    const defaultRubrics = [
      {
        dimension: "correctness",
        weight: 100,
        description: "\u6574\u4F53\u6B63\u786E\u6027\uFF0C\u662F\u5426\u7B26\u5408\u9884\u671F\u8F93\u51FA\u63CF\u8FF0\u3002"
      }
    ];
    const evalRubrics = evalCase.eval.eval_rubrics && evalCase.eval.eval_rubrics.length > 0 ? evalCase.eval.eval_rubrics : defaultRubrics;
    this.log("Prepared evaluation rubrics", {
      case_id: caseId,
      rubrics_count: evalRubrics.length,
      rubric_dimensions: evalRubrics.map((r) => r.dimension)
    });
    const testPrompt = this.buildJudgePrompt(evalCase, finalOutput, evalRubrics);
    this.lastTestPrompt = testPrompt;
    const judgeThreadId = v4();
    this.lastJudgeThreadId = judgeThreadId;
    this.log("Invoking judge agent", { agent_key: "LatticeTest", case_id: caseId });
    const testResponse = await getAgentClient("LatticeTest").invoke(
      {
        messages: [new HumanMessage(testPrompt)]
      },
      {
        configurable: {
          thread_id: judgeThreadId
        }
      }
    );
    this.log("Judge agent responded", {
      case_id: caseId,
      messages_count: testResponse?.messages?.length
    });
    const judgeMessages = testResponse?.messages ?? [];
    const testResultContent = this.contentToText(
      judgeMessages[judgeMessages.length - 1]?.content
    );
    this.log("Judge raw output received", {
      case_id: caseId,
      output_length: testResultContent.length
    });
    const parsedResult = this.parseJudgeJson(testResultContent, caseId);
    let pass;
    if (parsedResult.pass !== void 0) {
      pass = parsedResult.pass;
      this.log("Pass determined from parsedResult.pass", { case_id: caseId, pass });
    } else if (parsedResult.final_score !== void 0) {
      pass = parsedResult.final_score >= 80;
      this.log("Pass determined from parsedResult.final_score", {
        case_id: caseId,
        final_score: parsedResult.final_score,
        pass
      });
    } else {
      const lowered = testResultContent.toLowerCase();
      pass = ["pass", "success", "\u901A\u8FC7", "\u7B26\u5408"].some((keyword) => lowered.includes(keyword));
      this.log("Pass determined from keyword fallback", { case_id: caseId, pass });
    }
    let dimensionResults = [];
    if (parsedResult.dimension_results && parsedResult.dimension_results.length > 0) {
      dimensionResults = parsedResult.dimension_results;
      this.log("Using parsed dimension_results from judge", {
        case_id: caseId,
        dimensions_count: dimensionResults.length
      });
    } else if (evalRubrics.length > 0) {
      dimensionResults = evalRubrics.map((rubric) => ({
        name: rubric.dimension,
        score: 0,
        reason: ""
      }));
      this.log("No dimension_results parsed; using rubric skeleton", {
        case_id: caseId,
        dimensions_count: dimensionResults.length
      });
    }
    let finalScore;
    if (parsedResult.final_score !== void 0) {
      finalScore = parsedResult.final_score;
      this.log("Final score taken from parsedResult.final_score", {
        case_id: caseId,
        final_score: finalScore
      });
    } else if (dimensionResults.length > 0 && evalRubrics.length > 0) {
      const rubricMap = new Map(evalRubrics.map((r) => [r.dimension, r.weight]));
      const totalWeight = Array.from(rubricMap.values()).reduce((sum, w) => sum + w, 0);
      if (totalWeight > 0) {
        // Weighted average; dimensions the rubrics do not know count with weight 1.
        finalScore = dimensionResults.reduce((sum, result) => {
          const weight = rubricMap.get(result.name) || 1;
          return sum + result.score * weight;
        }, 0) / totalWeight;
      } else {
        finalScore = dimensionResults.reduce((sum, result) => sum + result.score, 0) / dimensionResults.length;
      }
      this.log("Final score computed from dimension_results", {
        case_id: caseId,
        final_score: finalScore,
        total_weight: totalWeight
      });
    } else {
      finalScore = pass ? 100 : 0;
      this.log("Final score fallback (pass-based)", {
        case_id: caseId,
        final_score: finalScore,
        pass
      });
    }
    this.log("Case evaluation completed", {
      case_id: caseId,
      pass,
      final_score: finalScore
    });
    return {
      pass,
      final_score: finalScore,
      dimension_results: dimensionResults,
      summary: parsedResult.summary || testResultContent
    };
  }
};
406
/**
 * Convenience wrapper: evaluate one case with a throwaway LatticeEval.
 * @param evalCase The evaluation case to run
 * @param config Optional evaluator configuration; a falsy value selects the
 *   local default server (http://localhost:3203)
 * @returns The evaluation result produced by `LatticeEval.evaluateCase`
 */
async function evaluateLatticeCase(evalCase, config) {
  const effectiveConfig = config ? config : { base_url: "http://localhost:3203" };
  return new LatticeEval(effectiveConfig).evaluateCase(evalCase);
}
413
/**
 * Evaluate one case and return the outcome together with run metadata and
 * the collected log events. Never rejects because of an evaluation failure:
 * errors are reported through the `error` / `error_stack` fields instead of
 * `result`.
 * @param evalCase The evaluation case to run
 * @param config Optional evaluator configuration; a falsy value selects the
 *   local default server (http://localhost:3203)
 * @returns `{ caseId, result | error + error_stack, duration_ms, thread_id,
 *   judge_thread_id, test_prompt, final_output, logs }`
 */
async function evaluateLatticeCaseWithLogs(evalCase, config) {
  const evaluator = new LatticeEval(config || { base_url: "http://localhost:3203" });
  const startedAt = Date.now();
  let outcome;
  let failed = false;
  try {
    outcome = { result: await evaluator.evaluateCase(evalCase) };
  } catch (error) {
    failed = true;
    const errorMessage = error instanceof Error ? error.message : String(error);
    evaluator.record("error", "Case evaluation failed", {
      case_id: evalCase.caseId,
      error: errorMessage
    });
    outcome = {
      error: errorMessage,
      error_stack: error instanceof Error ? error.stack : void 0
    };
  }
  const meta = evaluator.getLastRunMeta();
  // The evaluator may not have recorded a duration for a run that threw;
  // fall back to the time measured here so failures do not report 0 ms.
  const durationMs = failed && !meta.duration_ms ? Date.now() - startedAt : meta.duration_ms;
  return {
    caseId: evalCase.caseId,
    ...outcome,
    duration_ms: durationMs,
    thread_id: meta.thread_id,
    judge_thread_id: meta.judge_thread_id,
    test_prompt: meta.test_prompt,
    final_output: meta.final_output,
    logs: evaluator.getInMemoryLogs()
  };
}
452
+
453
+ // src/LatticeEvalSuite.ts
454
/**
 * Run async task factories with at most `concurrency` of them in flight.
 * Failures are isolated: a task that throws or rejects yields
 * `{ success: false, error }` and does not affect the other tasks.
 * @param tasks Array of zero-argument functions returning a value or promise
 * @param concurrency Maximum number of tasks running at once. Values that are
 *   missing, non-numeric or below 1 are treated as 1 (the previous loop never
 *   terminated for such values).
 * @returns Results in the same order as `tasks`, each either
 *   `{ success: true, result }` or `{ success: false, error }`
 */
async function limitConcurrency(tasks, concurrency) {
  const requested = Number(concurrency);
  const limit = requested >= 1 ? Math.floor(requested) : 1;
  const results = new Array(tasks.length);
  let nextIndex = 0;
  // Each worker keeps claiming the next unclaimed index until none are left.
  // Claiming is synchronous, so two workers never take the same task.
  const worker = async () => {
    while (nextIndex < tasks.length) {
      const taskIndex = nextIndex++;
      try {
        const result = await tasks[taskIndex]();
        results[taskIndex] = { success: true, result };
      } catch (error) {
        results[taskIndex] = {
          success: false,
          error: error instanceof Error ? error.message : String(error)
        };
      }
    }
  };
  const workerCount = Math.min(limit, tasks.length);
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
490
/**
 * Expand a template-based case into a full evaluation case.
 * The case's own values win; anything it leaves out is taken from the
 * template's `default_case`. Input files are merged by name, with the case's
 * files overriding the template's. Steps always come from the template.
 * @param templateCase Case referencing a template through `templateId`;
 *   `input`, `output` and `eval` may be partially or entirely omitted
 * @param templates Map of template id to template definition
 * @returns The resolved case (`caseId`, `input`, `steps`, `output`, `eval`)
 * @throws Error when `templateId` is not present in `templates`
 */
function resolveTemplateCase(templateCase, templates) {
  const template = templates.get(templateCase.templateId);
  if (!template) {
    throw new Error(`Template not found: ${templateCase.templateId}`);
  }
  const defaults = template.default_case ?? {};
  const defaultInput = defaults.input ?? {};
  // A case that only overrides the assertion may omit `input` altogether.
  const overrideInput = templateCase.input ?? {};
  const overrideEval = templateCase.eval ?? {};
  return {
    caseId: templateCase.caseId,
    input: {
      message: overrideInput.message ?? defaultInput.message,
      files: {
        ...defaultInput.files,
        ...overrideInput.files
      }
    },
    steps: defaults.steps,
    output: templateCase.output || defaults.output,
    eval: {
      content_assertion: overrideEval.content_assertion ?? defaults.eval?.content_assertion,
      eval_rubrics: overrideEval.eval_rubrics || defaults.eval?.eval_rubrics
    }
  };
}
513
/**
 * Tell template-based cases apart from fully specified ones.
 * @param case_ A suite case object
 * @returns true when the object has a `templateId` property (own or inherited)
 */
function isTemplateCase(case_) {
  return Reflect.has(case_, "templateId");
}
516
var LatticeEvalSuite = class {
  /**
   * @param suite Suite definition (`suiteName`, `version`, `cases`)
   * @param projectConfig Project-level settings; `lattice_server_config` and
   *   `concurrency` are read when cases are run
   * @param templates Map of template id to template, used to resolve
   *   template-based cases
   */
  constructor(suite, projectConfig, templates = /* @__PURE__ */ new Map()) {
    this.suite = suite;
    this.projectConfig = projectConfig;
    this.templates = templates;
  }
  /**
   * Get resolved configuration from project
   */
  getResolvedConfig() {
    return this.projectConfig;
  }
  /**
   * Get suite name
   */
  getSuiteName() {
    return this.suite.suiteName;
  }
  /**
   * Get suite version
   */
  getVersion() {
    return this.suite.version;
  }
  /**
   * Get all cases in this suite (resolved from templates if needed)
   * @throws Error when a case references an unknown template
   */
  getCases() {
    return this.suite.cases.map((case_) => this.resolveCase(case_));
  }
  /**
   * Get a specific case by ID (resolved from template if needed)
   * @returns The resolved case, or undefined when no case has that ID
   */
  getCase(caseId) {
    const case_ = this.suite.cases.find((c) => c.caseId === caseId);
    return case_ ? this.resolveCase(case_) : void 0;
  }
  /**
   * Run a single case in this suite with error handling
   * @param caseId The case ID to run
   * @returns Case run result with error handling
   */
  async runCase(caseId) {
    try {
      const evalCase = this.getCase(caseId);
      if (!evalCase) {
        return this.failedRun(caseId, `Case not found: ${caseId}`);
      }
      return await this.runResolvedCase(evalCase, caseId);
    } catch (error) {
      return this.failedRun(caseId, error);
    }
  }
  /**
   * Run all cases in this suite with concurrency control and error isolation
   * @param concurrency Optional concurrency limit (overrides project config).
   *   Missing or invalid values fall back to 1.
   * @returns Array of case run results with error handling
   */
  async runAllCases(concurrency) {
    const cases = this.suite.cases;
    if (cases.length === 0) {
      return [];
    }
    const requested = Number(concurrency ?? this.getResolvedConfig().concurrency);
    // Never hand an unusable limit to the scheduler.
    const maxConcurrency = requested >= 1 ? Math.floor(requested) : 1;
    const tasks = cases.map((case_) => async () => {
      try {
        const evalCase = this.resolveCase(case_);
        return await this.runResolvedCase(evalCase, evalCase.caseId);
      } catch (error) {
        return this.failedRun(case_.caseId, error);
      }
    });
    const taskResults = await limitConcurrency(tasks, maxConcurrency);
    return cases.map((case_, index) => {
      const taskResult = taskResults[index];
      if (taskResult?.success && taskResult.result) {
        return taskResult.result;
      }
      return this.failedRun(case_.caseId, taskResult?.error || "Unknown error");
    });
  }
  /**
   * Internal: resolve a suite case against the templates when it is
   * template-based; fully specified cases are returned as-is.
   */
  resolveCase(case_) {
    return isTemplateCase(case_) ? resolveTemplateCase(case_, this.templates) : case_;
  }
  /**
   * Internal: evaluate an already-resolved case against the configured
   * Lattice server and shape the outcome as a case run result.
   */
  async runResolvedCase(evalCase, caseId) {
    const serverConfig = this.getResolvedConfig().lattice_server_config;
    const run = await evaluateLatticeCaseWithLogs(evalCase, {
      base_url: serverConfig.base_url,
      api_key: serverConfig.api_key
    });
    return {
      caseId,
      result: run.result,
      error: run.error,
      error_stack: run.error_stack,
      duration_ms: run.duration_ms,
      thread_id: run.thread_id,
      judge_thread_id: run.judge_thread_id,
      test_prompt: run.test_prompt,
      final_output: run.final_output,
      logs: run.logs
    };
  }
  /**
   * Internal: build the result object for a case that could not be run.
   * @param caseId ID of the affected case
   * @param reason Error instance or message describing the failure
   */
  failedRun(caseId, reason) {
    return {
      caseId,
      error: reason instanceof Error ? reason.message : String(reason),
      logs: []
    };
  }
};
654
+
655
+ // src/LatticeEvalProject.ts
656
+ import {
657
+ registerModelLattice,
658
+ registerAgentLattice,
659
+ AgentType
660
+ } from "@axiom-lattice/core";
661
+ import { mkdir, writeFile } from "fs/promises";
662
+ import path from "path";
663
+ var LatticeEvalProject = class {
664
+ constructor(project) {
665
+ this.suites = /* @__PURE__ */ new Map();
666
+ this.project = project;
667
+ this.reportConfig = project.report_config;
668
+ const judgeModelKey = `${this.project.projectName}_judge_model`;
669
+ registerModelLattice(judgeModelKey, this.project.judge_agent_config.model);
670
+ const judgeAgentKey = "LatticeTest";
671
+ const judgeAgentConfig = {
672
+ key: judgeAgentKey,
673
+ name: "Lattice Test Judge Agent",
674
+ description: "Judge agent for evaluating Lattice test cases",
675
+ type: AgentType.REACT,
676
+ prompt: "",
677
+ // No prompt as requested
678
+ modelKey: judgeModelKey
679
+ };
680
+ registerAgentLattice(judgeAgentConfig);
681
+ const projectConfig = {
682
+ lattice_server_config: {
683
+ base_url: this.project.lattice_server_config.base_url,
684
+ api_key: this.project.lattice_server_config.api_key
685
+ },
686
+ judge_agent_config: this.project.judge_agent_config,
687
+ concurrency: this.project.concurrency ?? 1
688
+ };
689
+ const templatesMap = /* @__PURE__ */ new Map();
690
+ if (this.project.templates) {
691
+ for (const template of this.project.templates) {
692
+ templatesMap.set(template.templateId, template);
693
+ }
694
+ }
695
+ for (const suite of this.project.suites) {
696
+ this.suites.set(
697
+ suite.suiteName,
698
+ new LatticeEvalSuite(suite, projectConfig, templatesMap)
699
+ );
700
+ }
701
+ }
702
+ /**
703
+ * Get project name
704
+ */
705
+ getProjectName() {
706
+ return this.project.projectName;
707
+ }
708
+ /**
709
+ * Get project version
710
+ */
711
+ getVersion() {
712
+ return this.project.version;
713
+ }
714
+ /**
715
+ * Get project description
716
+ */
717
+ getDescription() {
718
+ return this.project.description;
719
+ }
720
+ /**
721
+ * Get all suite names
722
+ */
723
+ getSuiteNames() {
724
+ return Array.from(this.suites.keys());
725
+ }
726
+ /**
727
+ * Get a specific suite by name
728
+ */
729
+ getSuite(suiteName) {
730
+ return this.suites.get(suiteName);
731
+ }
732
+ /**
733
+ * Run a specific case in a specific suite
734
+ * @param suiteName The suite name
735
+ * @param caseId The case ID to run
736
+ * @returns Case run result with error handling
737
+ */
738
+ async runCase(suiteName, caseId) {
739
+ const suite = this.getSuite(suiteName);
740
+ if (!suite) {
741
+ return {
742
+ caseId,
743
+ error: `Suite not found: ${suiteName}`,
744
+ logs: []
745
+ };
746
+ }
747
+ return suite.runCase(caseId);
748
+ }
749
+ /**
750
+ * Run all cases in a specific suite with concurrency control and error isolation
751
+ * @param suiteName The suite name
752
+ * @param concurrency Optional concurrency limit (overrides project config)
753
+ * @returns Array of case run results with error handling
754
+ */
755
+ async runSuite(suiteName, concurrency) {
756
+ const suite = this.getSuite(suiteName);
757
+ if (!suite) {
758
+ throw new Error(`Suite not found: ${suiteName}`);
759
+ }
760
+ return suite.runAllCases(concurrency);
761
+ }
762
+ /**
763
+ * Run all cases in all suites with concurrency control and error isolation
764
+ * @param concurrency Optional concurrency limit (overrides project config)
765
+ * @returns Map of suite names to their case run results
766
+ */
767
+ async runAllSuites(concurrency) {
768
+ const results = /* @__PURE__ */ new Map();
769
+ for (const suiteName of this.getSuiteNames()) {
770
+ try {
771
+ const suiteResults = await this.runSuite(suiteName, concurrency);
772
+ results.set(suiteName, suiteResults);
773
+ } catch (error) {
774
+ const suite = this.getSuite(suiteName);
775
+ if (suite) {
776
+ const errorResults = suite.getCases().map((c) => ({
777
+ caseId: c.caseId,
778
+ error: error instanceof Error ? error.message : String(error),
779
+ logs: []
780
+ }));
781
+ results.set(suiteName, errorResults);
782
+ }
783
+ }
784
+ }
785
+ return results;
786
+ }
787
+ /**
788
+ * Run all suites as a "batch", build a report, and optionally write it to disk.
789
+ */
790
+ async runAllSuitesBatch(concurrency) {
791
+ const started_at = (/* @__PURE__ */ new Date()).toISOString();
792
+ const batch_id = this.reportConfig?.batch_id || `${Date.now()}`;
793
+ console.log(`
794
+ Running batch: ${this.project.projectName} (${this.getSuiteNames().length} suites)`);
795
+ const results = await this.runAllSuites(concurrency);
796
+ let total_cases = 0;
797
+ let passed_cases = 0;
798
+ let failed_cases = 0;
799
+ const suites = [];
800
+ const durations = [];
801
+ for (const [suiteName, caseResults] of results.entries()) {
802
+ const suiteTotal = caseResults.length;
803
+ const suitePassed = caseResults.filter((r) => r.result?.pass).length;
804
+ const suiteFailed = suiteTotal - suitePassed;
805
+ total_cases += suiteTotal;
806
+ passed_cases += suitePassed;
807
+ failed_cases += suiteFailed;
808
+ suites.push({
809
+ suiteName,
810
+ total_cases: suiteTotal,
811
+ passed_cases: suitePassed,
812
+ failed_cases: suiteFailed,
813
+ cases: caseResults.map((r) => ({
814
+ caseId: r.caseId,
815
+ pass: r.result?.pass,
816
+ final_score: r.result?.final_score,
817
+ error: r.error
818
+ }))
819
+ });
820
+ for (const r of caseResults) {
821
+ if (typeof r.duration_ms === "number") durations.push(r.duration_ms);
822
+ }
823
+ }
824
+ const finished_at = (/* @__PURE__ */ new Date()).toISOString();
825
+ const report = {
826
+ batch_id,
827
+ started_at,
828
+ finished_at,
829
+ project: {
830
+ projectName: this.project.projectName,
831
+ version: this.project.version,
832
+ description: this.project.description
833
+ },
834
+ summary: {
835
+ total_cases,
836
+ passed_cases,
837
+ failed_cases,
838
+ pass_rate: total_cases > 0 ? passed_cases / total_cases : 0
839
+ },
840
+ suites
841
+ };
842
+ const batch_dir = await this.maybeWriteBatchArtifacts(
843
+ batch_id,
844
+ report,
845
+ results
846
+ );
847
+ console.log(`
848
+ === Summary ===`);
849
+ console.log(`Total: ${report.summary.total_cases} | Passed: ${report.summary.passed_cases} | Failed: ${report.summary.failed_cases} | Pass Rate: ${(report.summary.pass_rate * 100).toFixed(2)}%`);
850
+ if (batch_dir) {
851
+ console.log(`
852
+ Results saved to: ${batch_dir}`);
853
+ }
854
+ return { batch_id, batch_dir, results, report };
855
+ }
856
+ generateCaseMarkdown(index, suiteName, caseResult, payload) {
857
+ const lines = [];
858
+ const status = caseResult.result?.pass ? "\u2705 PASS" : "\u274C FAIL";
859
+ lines.push(`# Test ${index}: ${status}`);
860
+ lines.push(``);
861
+ lines.push(`- **Suite**: ${suiteName}`);
862
+ lines.push(`- **Case ID**: ${caseResult.caseId}`);
863
+ lines.push(`- **Status**: ${caseResult.result?.pass ? "PASS" : "FAIL"}`);
864
+ if (typeof payload.duration === "number") {
865
+ lines.push(`- **Duration**: ${(payload.duration / 1e3).toFixed(2)}s`);
866
+ }
867
+ if (payload.threadId) {
868
+ lines.push(`- **Thread ID**: ${payload.threadId}`);
869
+ }
870
+ if (payload.judgeThreadId) {
871
+ lines.push(`- **Judge Thread ID**: ${payload.judgeThreadId}`);
872
+ }
873
+ lines.push(``);
874
+ if (caseResult.result) {
875
+ lines.push(`## Result`);
876
+ lines.push(``);
877
+ lines.push(`- **Final Score**: ${caseResult.result.final_score}`);
878
+ lines.push(`- **Summary**: ${caseResult.result.summary || "N/A"}`);
879
+ lines.push(``);
880
+ if (caseResult.result.dimension_results && caseResult.result.dimension_results.length > 0) {
881
+ lines.push(`## Dimension Results`);
882
+ lines.push(``);
883
+ for (const dim of caseResult.result.dimension_results) {
884
+ lines.push(`### ${dim.name}`);
885
+ lines.push(`- **Score**: ${dim.score}`);
886
+ lines.push(`- **Reason**: ${dim.reason}`);
887
+ lines.push(``);
888
+ }
889
+ }
890
+ }
891
+ if (caseResult.error) {
892
+ lines.push(`## Error`);
893
+ lines.push(``);
894
+ lines.push(`\`\`\``);
895
+ lines.push(caseResult.error);
896
+ if (caseResult.error_stack) {
897
+ lines.push(``);
898
+ lines.push(caseResult.error_stack);
899
+ }
900
+ lines.push(`\`\`\``);
901
+ lines.push(``);
902
+ }
903
+ if (payload.finalOutput) {
904
+ lines.push(`## Final Output`);
905
+ lines.push(``);
906
+ lines.push(`\`\`\``);
907
+ const output = payload.finalOutput.length > 5e3 ? payload.finalOutput.substring(0, 5e3) + "\n\n... (truncated, see JSON for full output)" : payload.finalOutput;
908
+ lines.push(output);
909
+ lines.push(`\`\`\``);
910
+ lines.push(``);
911
+ }
912
+ if (payload.testPrompt) {
913
+ lines.push(`## Test Prompt`);
914
+ lines.push(``);
915
+ lines.push(`\`\`\``);
916
+ const prompt = payload.testPrompt.length > 5e3 ? payload.testPrompt.substring(0, 5e3) + "\n\n... (truncated, see JSON for full prompt)" : payload.testPrompt;
917
+ lines.push(prompt);
918
+ lines.push(`\`\`\``);
919
+ lines.push(``);
920
+ }
921
+ return lines.join("\n");
922
+ }
923
+ generateMarkdownSummary(batch_id, report, results) {
924
+ const lines = [];
925
+ lines.push(`# Lattice Eval Batch Summary`);
926
+ lines.push(``);
927
+ lines.push(`- **Project**: ${report.project.projectName}`);
928
+ if (report.project.version) lines.push(`- **Version**: ${report.project.version}`);
929
+ if (report.project.description) lines.push(`- **Description**: ${report.project.description}`);
930
+ lines.push(`- **Batch ID**: ${batch_id}`);
931
+ lines.push(`- **Started**: ${report.started_at}`);
932
+ lines.push(`- **Finished**: ${report.finished_at}`);
933
+ lines.push(``);
934
+ lines.push(`## Overview`);
935
+ lines.push(``);
936
+ lines.push(`| Metric | Value |`);
937
+ lines.push(`|---|---:|`);
938
+ lines.push(`| Total cases | ${report.summary.total_cases} |`);
939
+ lines.push(`| Passed | ${report.summary.passed_cases} |`);
940
+ lines.push(`| Failed | ${report.summary.failed_cases} |`);
941
+ lines.push(`| Pass rate | ${(report.summary.pass_rate * 100).toFixed(2)}% |`);
942
+ lines.push(``);
943
+ lines.push(`## Suites`);
944
+ lines.push(``);
945
+ for (const suite of report.suites) {
946
+ lines.push(`### ${suite.suiteName}`);
947
+ lines.push(``);
948
+ lines.push(`| Case | Status | Score | Duration (ms) | Thread |`);
949
+ lines.push(`|---|---|---:|---:|---|`);
950
+ const suiteResults = results.get(suite.suiteName) || [];
951
+ for (const r of suiteResults) {
952
+ const status = r.result?.pass ? "PASS" : "FAIL";
953
+ const score = r.result?.final_score ?? "";
954
+ const dur = typeof r.duration_ms === "number" ? r.duration_ms : "";
955
+ const thread = r.thread_id ?? "";
956
+ lines.push(`| ${r.caseId} | ${status} | ${score} | ${dur} | ${thread} |`);
957
+ }
958
+ lines.push(``);
959
+ }
960
+ return lines.join("\n");
961
+ }
962
+ async maybeWriteBatchArtifacts(batch_id, report, results) {
963
+ const config = this.reportConfig;
964
+ if (!config?.output_dir) return void 0;
965
+ const batchDir = path.join(config.output_dir, batch_id);
966
+ await mkdir(batchDir, { recursive: true });
967
+ const writeReportJson = config.write_report_json ?? true;
968
+ const writeCaseLogs = config.write_case_logs ?? true;
969
+ if (writeReportJson) {
970
+ await writeFile(
971
+ path.join(batchDir, "report.json"),
972
+ JSON.stringify(report, null, 2),
973
+ "utf-8"
974
+ );
975
+ }
976
+ const resultsJsonPath = path.join(batchDir, "results.json");
977
+ const resultsJson = {
978
+ executionTimestamp: batch_id,
979
+ summary: report.summary,
980
+ report,
981
+ results: Array.from(results.entries()).map(([suiteName, caseResults]) => ({
982
+ suiteName,
983
+ cases: caseResults.map((r) => ({
984
+ caseId: r.caseId,
985
+ passed: r.result?.pass === true,
986
+ message: r.result?.summary || r.error || "",
987
+ error: r.error ? {
988
+ message: r.error,
989
+ stack: r.error_stack
990
+ } : void 0,
991
+ duration: r.duration_ms,
992
+ testPrompt: r.test_prompt,
993
+ finalOutput: r.final_output,
994
+ threadId: r.thread_id,
995
+ judgeThreadId: r.judge_thread_id
996
+ }))
997
+ }))
998
+ };
999
+ await writeFile(resultsJsonPath, JSON.stringify(resultsJson, null, 2), "utf-8");
1000
+ const summaryMdPath = path.join(batchDir, "summary.md");
1001
+ const summaryMd = this.generateMarkdownSummary(batch_id, report, results);
1002
+ await writeFile(summaryMdPath, summaryMd, "utf-8");
1003
+ const individualDir = path.join(batchDir, "individual");
1004
+ await mkdir(individualDir, { recursive: true });
1005
+ let index = 1;
1006
+ for (const [suiteName, caseResults] of results.entries()) {
1007
+ for (const r of caseResults) {
1008
+ const status = r.result?.pass ? "PASS" : "FAIL";
1009
+ const baseFilename = `test-${index}-${suiteName}-${r.caseId}-${status}`.replace(/[\/\\]/g, "_");
1010
+ const jsonPath = path.join(individualDir, `${baseFilename}.json`);
1011
+ const payload = {
1012
+ index,
1013
+ suiteName,
1014
+ caseId: r.caseId,
1015
+ passed: r.result?.pass === true,
1016
+ result: r.result,
1017
+ message: r.result?.summary || r.error || "",
1018
+ error: r.error ? { message: r.error, stack: r.error_stack } : void 0,
1019
+ duration: r.duration_ms,
1020
+ threadId: r.thread_id,
1021
+ judgeThreadId: r.judge_thread_id,
1022
+ finalOutput: r.final_output,
1023
+ testPrompt: r.test_prompt
1024
+ };
1025
+ await writeFile(jsonPath, JSON.stringify(payload, null, 2), "utf-8");
1026
+ const mdPath = path.join(individualDir, `${baseFilename}.md`);
1027
+ const mdContent = this.generateCaseMarkdown(index, suiteName, r, payload);
1028
+ await writeFile(mdPath, mdContent, "utf-8");
1029
+ index += 1;
1030
+ }
1031
+ }
1032
+ if (writeCaseLogs) {
1033
+ for (const [suiteName, caseResults] of results.entries()) {
1034
+ const suiteDir = path.join(batchDir, "cases", suiteName);
1035
+ await mkdir(suiteDir, { recursive: true });
1036
+ for (const r of caseResults) {
1037
+ await writeFile(
1038
+ path.join(suiteDir, `${r.caseId}.logs.json`),
1039
+ JSON.stringify(r.logs || [], null, 2),
1040
+ "utf-8"
1041
+ );
1042
+ }
1043
+ }
1044
+ }
1045
+ return batchDir;
1046
+ }
1047
+ };
1048
+ export {
1049
+ LatticeEval,
1050
+ LatticeEvalProject,
1051
+ LatticeEvalSuite,
1052
+ evaluateLatticeCase,
1053
+ evaluateLatticeCaseWithLogs
1054
+ };
1055
+ //# sourceMappingURL=index.mjs.map