@axiom-lattice/agent-eval 2.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,496 @@
1
+ import type {
2
+ LatticeEvalBatchReport,
3
+ LatticeEvalProjectType,
4
+ LatticeEvalReportConfig,
5
+ LatticeEvalTemplate,
6
+ } from "./types";
7
+ import { LatticeEvalSuite, ResolvedConfig, CaseRunResult } from "./LatticeEvalSuite";
8
+ import {
9
+ registerModelLattice,
10
+ registerAgentLattice,
11
+ AgentType,
12
+ AgentConfig,
13
+ } from "@axiom-lattice/core";
14
+ import { mkdir, writeFile } from "fs/promises";
15
+ import path from "path";
16
+
17
/**
 * LatticeEvalProject manages a project made of multiple evaluation suites that
 * share project-level configuration.
 *
 * On construction it registers the judge model and the judge agent, builds the
 * resolved configuration passed to every suite, and instantiates one
 * {@link LatticeEvalSuite} per configured suite. It can then run single cases,
 * single suites, or every suite as a batch, and optionally writes batch
 * artifacts (JSON + markdown) to disk when a report output directory is set.
 */
export class LatticeEvalProject {
  /** Maximum number of characters of a prompt/output embedded in per-case markdown. */
  private static readonly MARKDOWN_TEXT_LIMIT = 5000;

  private readonly project: LatticeEvalProjectType;
  private readonly suites: Map<string, LatticeEvalSuite> = new Map();
  private readonly reportConfig?: LatticeEvalReportConfig;

  /**
   * @param project Project definition: name, server config, judge agent config,
   *   optional templates / report config / concurrency, and the suites to run.
   */
  constructor(project: LatticeEvalProjectType) {
    this.project = project;
    this.reportConfig = project.report_config;

    // Register judge model under a key derived from the project name.
    const judgeModelKey = `${this.project.projectName}_judge_model`;
    registerModelLattice(judgeModelKey, this.project.judge_agent_config.model);

    // Register judge agent.
    // NOTE(review): the agent key is a fixed literal, so every project instance
    // registers under the same key — presumably the suite looks the judge up by
    // this key; confirm before changing it.
    const judgeAgentKey = "LatticeTest";
    const judgeAgentConfig: AgentConfig = {
      key: judgeAgentKey,
      name: "Lattice Test Judge Agent",
      description: "Judge agent for evaluating Lattice test cases",
      type: AgentType.REACT,
      prompt: "", // No prompt as requested
      modelKey: judgeModelKey,
    };
    registerAgentLattice(judgeAgentConfig);

    // Build the resolved config shared by all suites; concurrency defaults to 1.
    const projectConfig: ResolvedConfig = {
      lattice_server_config: {
        base_url: this.project.lattice_server_config.base_url,
        api_key: this.project.lattice_server_config.api_key,
      },
      judge_agent_config: this.project.judge_agent_config,
      concurrency: this.project.concurrency ?? 1,
    };

    // Index templates by id (a later template with the same id replaces an earlier one).
    const templatesMap = new Map<string, LatticeEvalTemplate>();
    if (this.project.templates) {
      for (const template of this.project.templates) {
        templatesMap.set(template.templateId, template);
      }
    }

    // Initialize all suites with project config and templates.
    // Suites are keyed by name, so a duplicate suite name replaces the earlier suite.
    for (const suite of this.project.suites) {
      this.suites.set(
        suite.suiteName,
        new LatticeEvalSuite(suite, projectConfig, templatesMap)
      );
    }
  }

  /**
   * Get project name
   */
  getProjectName(): string {
    return this.project.projectName;
  }

  /**
   * Get project version
   */
  getVersion(): string | undefined {
    return this.project.version;
  }

  /**
   * Get project description
   */
  getDescription(): string | undefined {
    return this.project.description;
  }

  /**
   * Get all suite names, in the order the suites were registered
   */
  getSuiteNames(): string[] {
    return Array.from(this.suites.keys());
  }

  /**
   * Get a specific suite by name
   * @returns The suite, or `undefined` when no suite has that name
   */
  getSuite(suiteName: string): LatticeEvalSuite | undefined {
    return this.suites.get(suiteName);
  }

  /**
   * Run a specific case in a specific suite
   * @param suiteName The suite name
   * @param caseId The case ID to run
   * @returns Case run result; an unknown suite is reported as an error result
   *   (with empty logs) rather than thrown
   */
  async runCase(suiteName: string, caseId: string): Promise<CaseRunResult> {
    const suite = this.getSuite(suiteName);
    if (!suite) {
      return {
        caseId,
        error: `Suite not found: ${suiteName}`,
        logs: [],
      };
    }
    return suite.runCase(caseId);
  }

  /**
   * Run all cases in a specific suite
   * @param suiteName The suite name
   * @param concurrency Optional concurrency limit, forwarded to the suite
   * @returns Array of case run results
   * @throws Error when no suite has the given name
   */
  async runSuite(suiteName: string, concurrency?: number): Promise<CaseRunResult[]> {
    const suite = this.getSuite(suiteName);
    if (!suite) {
      throw new Error(`Suite not found: ${suiteName}`);
    }
    return suite.runAllCases(concurrency);
  }

  /**
   * Run all cases in all suites, one suite after another, isolating suite failures
   * @param concurrency Optional concurrency limit, forwarded to each suite
   * @returns Map of suite names to their case run results
   */
  async runAllSuites(concurrency?: number): Promise<Map<string, CaseRunResult[]>> {
    const results = new Map<string, CaseRunResult[]>();

    for (const suiteName of this.getSuiteNames()) {
      try {
        const suiteResults = await this.runSuite(suiteName, concurrency);
        results.set(suiteName, suiteResults);
      } catch (error: unknown) {
        // If suite execution fails, create error results for all cases so the
        // remaining suites still run and the failure shows up in the report.
        const suite = this.getSuite(suiteName);
        if (suite) {
          const message = error instanceof Error ? error.message : String(error);
          const errorResults: CaseRunResult[] = suite.getCases().map((c) => ({
            caseId: c.caseId,
            error: message,
            logs: [],
          }));
          results.set(suiteName, errorResults);
        }
      }
    }

    return results;
  }

  /**
   * Run all suites as a "batch", build a report, and optionally write it to disk.
   *
   * The batch id is the configured `batch_id` when it is non-empty, otherwise the
   * current epoch time in milliseconds. A case counts as passed only when its
   * result has a truthy `pass`; everything else (including errored cases) counts
   * as failed. Progress and a summary line are printed to the console.
   *
   * @param concurrency Optional concurrency limit, forwarded to each suite
   * @returns The batch id, the directory artifacts were written to (only when an
   *   output directory is configured), the raw per-suite results, and the report
   */
  async runAllSuitesBatch(concurrency?: number): Promise<{
    batch_id: string;
    batch_dir?: string;
    results: Map<string, CaseRunResult[]>;
    report: LatticeEvalBatchReport;
  }> {
    const started_at = new Date().toISOString();
    const batch_id =
      this.reportConfig?.batch_id ||
      `${Date.now()}`;

    console.log(`\nRunning batch: ${this.project.projectName} (${this.getSuiteNames().length} suites)`);

    const results = await this.runAllSuites(concurrency);

    let total_cases = 0;
    let passed_cases = 0;
    let failed_cases = 0;

    const suites: LatticeEvalBatchReport["suites"] = [];
    for (const [suiteName, caseResults] of results.entries()) {
      const suiteTotal = caseResults.length;
      const suitePassed = caseResults.filter((r) => r.result?.pass).length;
      const suiteFailed = suiteTotal - suitePassed;

      total_cases += suiteTotal;
      passed_cases += suitePassed;
      failed_cases += suiteFailed;

      suites.push({
        suiteName,
        total_cases: suiteTotal,
        passed_cases: suitePassed,
        failed_cases: suiteFailed,
        cases: caseResults.map((r) => ({
          caseId: r.caseId,
          pass: r.result?.pass,
          final_score: r.result?.final_score,
          error: r.error,
        })),
      });
    }

    const finished_at = new Date().toISOString();
    const report: LatticeEvalBatchReport = {
      batch_id,
      started_at,
      finished_at,
      project: {
        projectName: this.project.projectName,
        version: this.project.version,
        description: this.project.description,
      },
      summary: {
        total_cases,
        passed_cases,
        failed_cases,
        // Avoid dividing by zero when no case ran at all.
        pass_rate: total_cases > 0 ? passed_cases / total_cases : 0,
      },
      suites,
    };

    const batch_dir = await this.maybeWriteBatchArtifacts(
      batch_id,
      report,
      results
    );

    console.log(`\n=== Summary ===`);
    console.log(`Total: ${report.summary.total_cases} | Passed: ${report.summary.passed_cases} | Failed: ${report.summary.failed_cases} | Pass Rate: ${(report.summary.pass_rate * 100).toFixed(2)}%`);
    if (batch_dir) {
      console.log(`\nResults saved to: ${batch_dir}`);
    }

    return { batch_id, batch_dir, results, report };
  }

  /**
   * Shorten overly long text for markdown readability.
   * @param text Text to embed in the markdown document
   * @param noun What the text is ("output", "prompt"); used in the truncation notice
   * @returns `text` unchanged when within the limit, otherwise its first
   *   MARKDOWN_TEXT_LIMIT characters followed by a truncation notice
   */
  private static truncateForMarkdown(text: string, noun: string): string {
    const limit = LatticeEvalProject.MARKDOWN_TEXT_LIMIT;
    return text.length > limit
      ? text.substring(0, limit) + `\n\n... (truncated, see JSON for full ${noun})`
      : text;
  }

  /**
   * Make a name safe to use as a single path segment.
   * Path separators become "_" so the name cannot introduce sub-directories,
   * and the special segments "." / ".." are neutralised so the name cannot
   * point at the current or parent directory.
   */
  private static toSafePathSegment(name: string): string {
    const flattened = String(name).replace(/[\/\\]/g, "_");
    return flattened === "." || flattened === ".."
      ? flattened.replace(/\./g, "_")
      : flattened;
  }

  /**
   * Render the detailed markdown document for a single case.
   * @param index 1-based position of the case within the batch
   * @param suiteName Name of the suite the case belongs to
   * @param caseResult The case run result (result, error, error stack)
   * @param payload The per-case JSON payload built by maybeWriteBatchArtifacts;
   *   `duration`, `threadId`, `judgeThreadId`, `finalOutput` and `testPrompt`
   *   are read from it. Kept loosely typed because its field types mirror
   *   CaseRunResult fields.
   * @returns Markdown text with sections emitted only for data that is present
   */
  private generateCaseMarkdown(
    index: number,
    suiteName: string,
    caseResult: CaseRunResult,
    payload: any
  ): string {
    const lines: string[] = [];
    const passed = Boolean(caseResult.result?.pass);
    const status = passed ? "✅ PASS" : "❌ FAIL";

    lines.push(`# Test ${index}: ${status}`);
    lines.push(``);
    lines.push(`- **Suite**: ${suiteName}`);
    lines.push(`- **Case ID**: ${caseResult.caseId}`);
    lines.push(`- **Status**: ${passed ? "PASS" : "FAIL"}`);
    if (typeof payload.duration === "number") {
      // Duration is stored in milliseconds; show seconds.
      lines.push(`- **Duration**: ${(payload.duration / 1000).toFixed(2)}s`);
    }
    if (payload.threadId) {
      lines.push(`- **Thread ID**: ${payload.threadId}`);
    }
    if (payload.judgeThreadId) {
      lines.push(`- **Judge Thread ID**: ${payload.judgeThreadId}`);
    }
    lines.push(``);

    if (caseResult.result) {
      lines.push(`## Result`);
      lines.push(``);
      lines.push(`- **Final Score**: ${caseResult.result.final_score}`);
      lines.push(`- **Summary**: ${caseResult.result.summary || "N/A"}`);
      lines.push(``);

      if (caseResult.result.dimension_results && caseResult.result.dimension_results.length > 0) {
        lines.push(`## Dimension Results`);
        lines.push(``);
        for (const dim of caseResult.result.dimension_results) {
          lines.push(`### ${dim.name}`);
          lines.push(`- **Score**: ${dim.score}`);
          lines.push(`- **Reason**: ${dim.reason}`);
          lines.push(``);
        }
      }
    }

    if (caseResult.error) {
      lines.push(`## Error`);
      lines.push(``);
      lines.push(`\`\`\``);
      lines.push(caseResult.error);
      if (caseResult.error_stack) {
        lines.push(``);
        lines.push(caseResult.error_stack);
      }
      lines.push(`\`\`\``);
      lines.push(``);
    }

    if (payload.finalOutput) {
      lines.push(`## Final Output`);
      lines.push(``);
      lines.push(`\`\`\``);
      // Truncate very long outputs in markdown for readability
      lines.push(LatticeEvalProject.truncateForMarkdown(payload.finalOutput, "output"));
      lines.push(`\`\`\``);
      lines.push(``);
    }

    if (payload.testPrompt) {
      lines.push(`## Test Prompt`);
      lines.push(``);
      lines.push(`\`\`\``);
      // Truncate very long prompts in markdown for readability
      lines.push(LatticeEvalProject.truncateForMarkdown(payload.testPrompt, "prompt"));
      lines.push(`\`\`\``);
      lines.push(``);
    }

    return lines.join("\n");
  }

  /**
   * Render the batch-level markdown summary: project header, overview table,
   * and one table per suite listing each case's status, score, duration and thread.
   * @param batch_id Identifier of the batch
   * @param report The batch report (project info, summary counters, suite list)
   * @param results Raw per-suite case results, used for duration and thread id
   */
  private generateMarkdownSummary(
    batch_id: string,
    report: LatticeEvalBatchReport,
    results: Map<string, CaseRunResult[]>
  ): string {
    const lines: string[] = [];
    lines.push(`# Lattice Eval Batch Summary`);
    lines.push(``);
    lines.push(`- **Project**: ${report.project.projectName}`);
    if (report.project.version) lines.push(`- **Version**: ${report.project.version}`);
    if (report.project.description) lines.push(`- **Description**: ${report.project.description}`);
    lines.push(`- **Batch ID**: ${batch_id}`);
    lines.push(`- **Started**: ${report.started_at}`);
    lines.push(`- **Finished**: ${report.finished_at}`);
    lines.push(``);

    lines.push(`## Overview`);
    lines.push(``);
    lines.push(`| Metric | Value |`);
    lines.push(`|---|---:|`);
    lines.push(`| Total cases | ${report.summary.total_cases} |`);
    lines.push(`| Passed | ${report.summary.passed_cases} |`);
    lines.push(`| Failed | ${report.summary.failed_cases} |`);
    lines.push(`| Pass rate | ${(report.summary.pass_rate * 100).toFixed(2)}% |`);
    lines.push(``);

    lines.push(`## Suites`);
    lines.push(``);
    for (const suite of report.suites) {
      lines.push(`### ${suite.suiteName}`);
      lines.push(``);
      lines.push(`| Case | Status | Score | Duration (ms) | Thread |`);
      lines.push(`|---|---|---:|---:|---|`);
      const suiteResults = results.get(suite.suiteName) || [];
      for (const r of suiteResults) {
        const status = r.result?.pass ? "PASS" : "FAIL";
        // Missing values render as empty cells.
        const score = r.result?.final_score ?? "";
        const dur = typeof r.duration_ms === "number" ? r.duration_ms : "";
        const thread = r.thread_id ?? "";
        lines.push(`| ${r.caseId} | ${status} | ${score} | ${dur} | ${thread} |`);
      }
      lines.push(``);
    }

    return lines.join("\n");
  }

  /**
   * Write the batch artifacts to `<output_dir>/<batch_id>` when an output
   * directory is configured; otherwise do nothing.
   *
   * Written files:
   * - `report.json` (unless `write_report_json` is false)
   * - `results.json` and `summary.md`
   * - `individual/test-<n>-<suite>-<case>-<PASS|FAIL>.{json,md}` per case
   * - `cases/<suite>/<case>.logs.json` per case (unless `write_case_logs` is false)
   *
   * Suite names and case ids are sanitized before being used as path segments so
   * that names containing path separators cannot create unexpected directories
   * or make the write fail.
   *
   * @returns The batch directory, or `undefined` when nothing was written
   */
  private async maybeWriteBatchArtifacts(
    batch_id: string,
    report: LatticeEvalBatchReport,
    results: Map<string, CaseRunResult[]>
  ): Promise<string | undefined> {
    const config = this.reportConfig;
    if (!config?.output_dir) return undefined;

    const batchDir = path.join(config.output_dir, batch_id);
    await mkdir(batchDir, { recursive: true });

    // Both switches default to enabled.
    const writeReportJson = config.write_report_json ?? true;
    const writeCaseLogs = config.write_case_logs ?? true;

    if (writeReportJson) {
      await writeFile(
        path.join(batchDir, "report.json"),
        JSON.stringify(report, null, 2),
        "utf-8"
      );
    }

    // Write richer results.json (similar to TestRunner)
    const resultsJsonPath = path.join(batchDir, "results.json");
    const resultsJson = {
      executionTimestamp: batch_id,
      summary: report.summary,
      report,
      results: Array.from(results.entries()).map(([suiteName, caseResults]) => ({
        suiteName,
        cases: caseResults.map((r) => ({
          caseId: r.caseId,
          passed: r.result?.pass === true,
          message: r.result?.summary || r.error || "",
          error: r.error
            ? {
                message: r.error,
                stack: r.error_stack,
              }
            : undefined,
          duration: r.duration_ms,
          testPrompt: r.test_prompt,
          finalOutput: r.final_output,
          threadId: r.thread_id,
          judgeThreadId: r.judge_thread_id,
        })),
      })),
    };
    await writeFile(resultsJsonPath, JSON.stringify(resultsJson, null, 2), "utf-8");

    // Write summary.md
    const summaryMdPath = path.join(batchDir, "summary.md");
    const summaryMd = this.generateMarkdownSummary(batch_id, report, results);
    await writeFile(summaryMdPath, summaryMd, "utf-8");

    // Write per-case detailed json and markdown
    const individualDir = path.join(batchDir, "individual");
    await mkdir(individualDir, { recursive: true });
    let index = 1; // running 1-based index across all suites
    for (const [suiteName, caseResults] of results.entries()) {
      for (const r of caseResults) {
        const status = r.result?.pass ? "PASS" : "FAIL";
        const baseFilename = LatticeEvalProject.toSafePathSegment(
          `test-${index}-${suiteName}-${r.caseId}-${status}`
        );

        // Write JSON
        const jsonPath = path.join(individualDir, `${baseFilename}.json`);
        const payload = {
          index,
          suiteName,
          caseId: r.caseId,
          passed: r.result?.pass === true,
          result: r.result,
          message: r.result?.summary || r.error || "",
          error: r.error
            ? { message: r.error, stack: r.error_stack }
            : undefined,
          duration: r.duration_ms,
          threadId: r.thread_id,
          judgeThreadId: r.judge_thread_id,
          finalOutput: r.final_output,
          testPrompt: r.test_prompt,
        };
        await writeFile(jsonPath, JSON.stringify(payload, null, 2), "utf-8");

        // Write Markdown
        const mdPath = path.join(individualDir, `${baseFilename}.md`);
        const mdContent = this.generateCaseMarkdown(index, suiteName, r, payload);
        await writeFile(mdPath, mdContent, "utf-8");

        index += 1;
      }
    }

    if (writeCaseLogs) {
      for (const [suiteName, caseResults] of results.entries()) {
        // Sanitize like the individual files above: an unsanitized separator in
        // a case id would target a directory that was never created.
        const suiteDir = path.join(
          batchDir,
          "cases",
          LatticeEvalProject.toSafePathSegment(suiteName)
        );
        await mkdir(suiteDir, { recursive: true });
        for (const r of caseResults) {
          const logFilename = LatticeEvalProject.toSafePathSegment(`${r.caseId}.logs.json`);
          await writeFile(
            path.join(suiteDir, logFilename),
            JSON.stringify(r.logs || [], null, 2),
            "utf-8"
          );
        }
      }
    }

    return batchDir;
  }
}