@axiom-lattice/agent-eval 2.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env +29 -0
- package/.turbo/turbo-build.log +20 -0
- package/CHANGELOG.md +10 -0
- package/LICENSE +201 -0
- package/dist/index.d.mts +366 -0
- package/dist/index.d.ts +366 -0
- package/dist/index.js +1092 -0
- package/dist/index.js.map +1 -0
- package/dist/index.mjs +1055 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +51 -0
- package/src/LatticeEval.ts +615 -0
- package/src/LatticeEvalProject.ts +496 -0
- package/src/LatticeEvalSuite.ts +321 -0
- package/src/index.ts +4 -0
- package/src/test.ts +23 -0
- package/src/types.ts +160 -0
- package/tsconfig.json +33 -0
|
@@ -0,0 +1,496 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
LatticeEvalBatchReport,
|
|
3
|
+
LatticeEvalProjectType,
|
|
4
|
+
LatticeEvalReportConfig,
|
|
5
|
+
LatticeEvalTemplate,
|
|
6
|
+
} from "./types";
|
|
7
|
+
import { LatticeEvalSuite, ResolvedConfig, CaseRunResult } from "./LatticeEvalSuite";
|
|
8
|
+
import {
|
|
9
|
+
registerModelLattice,
|
|
10
|
+
registerAgentLattice,
|
|
11
|
+
AgentType,
|
|
12
|
+
AgentConfig,
|
|
13
|
+
} from "@axiom-lattice/core";
|
|
14
|
+
import { mkdir, writeFile } from "fs/promises";
|
|
15
|
+
import path from "path";
|
|
16
|
+
|
|
17
|
+
/**
 * Subset of the per-case artifact payload that the markdown renderer reads.
 * Fields are `unknown` because they are copied from run results and are
 * narrowed at runtime before being formatted.
 */
interface CaseMarkdownPayload {
  duration?: unknown;
  threadId?: unknown;
  judgeThreadId?: unknown;
  finalOutput?: unknown;
  testPrompt?: unknown;
}

/**
 * LatticeEvalProject class manages a project with multiple evaluation suites
 * with project-level configuration.
 *
 * Constructing an instance registers the judge model and the judge agent and
 * creates one LatticeEvalSuite per suite definition in the project.
 */
export class LatticeEvalProject {
  /** Maximum number of characters of prompt/output text embedded in a case markdown file. */
  private static readonly MARKDOWN_TEXT_LIMIT = 5000;

  private project: LatticeEvalProjectType;
  private suites: Map<string, LatticeEvalSuite> = new Map();
  private reportConfig?: LatticeEvalReportConfig;

  /**
   * @param project Project definition: server config, judge agent config,
   *   optional templates, optional report config, and the suites to run.
   */
  constructor(project: LatticeEvalProjectType) {
    this.project = project;
    this.reportConfig = project.report_config;

    // Register judge model under a project-specific key.
    const judgeModelKey = `${this.project.projectName}_judge_model`;
    registerModelLattice(judgeModelKey, this.project.judge_agent_config.model);

    // Register judge agent.
    // NOTE(review): the agent key is a fixed literal, so every project
    // instance registers under the same key while the model key is
    // per-project — confirm how registerAgentLattice treats repeated keys.
    const judgeAgentKey = "LatticeTest";
    const judgeAgentConfig: AgentConfig = {
      key: judgeAgentKey,
      name: "Lattice Test Judge Agent",
      description: "Judge agent for evaluating Lattice test cases",
      type: AgentType.REACT,
      prompt: "", // No prompt as requested
      modelKey: judgeModelKey,
    };
    registerAgentLattice(judgeAgentConfig);

    // Build resolved config shared by every suite; concurrency defaults to 1.
    const projectConfig: ResolvedConfig = {
      lattice_server_config: {
        base_url: this.project.lattice_server_config.base_url,
        api_key: this.project.lattice_server_config.api_key,
      },
      judge_agent_config: this.project.judge_agent_config,
      concurrency: this.project.concurrency ?? 1,
    };

    // Build templates map keyed by templateId (a later duplicate id wins).
    const templatesMap = new Map<string, LatticeEvalTemplate>();
    if (this.project.templates) {
      for (const template of this.project.templates) {
        templatesMap.set(template.templateId, template);
      }
    }

    // Initialize all suites with project config and templates.
    for (const suite of this.project.suites) {
      this.suites.set(
        suite.suiteName,
        new LatticeEvalSuite(suite, projectConfig, templatesMap)
      );
    }
  }

  /**
   * Get project name
   */
  getProjectName(): string {
    return this.project.projectName;
  }

  /**
   * Get project version
   */
  getVersion(): string | undefined {
    return this.project.version;
  }

  /**
   * Get project description
   */
  getDescription(): string | undefined {
    return this.project.description;
  }

  /**
   * Get all suite names, in the order the suites were registered.
   */
  getSuiteNames(): string[] {
    return Array.from(this.suites.keys());
  }

  /**
   * Get a specific suite by name
   * @returns The suite, or undefined when no suite has that name
   */
  getSuite(suiteName: string): LatticeEvalSuite | undefined {
    return this.suites.get(suiteName);
  }

  /**
   * Run a specific case in a specific suite
   * @param suiteName The suite name
   * @param caseId The case ID to run
   * @returns Case run result; an unknown suite is reported as an error
   *   result rather than thrown
   */
  async runCase(suiteName: string, caseId: string): Promise<CaseRunResult> {
    const suite = this.getSuite(suiteName);
    if (!suite) {
      return {
        caseId,
        error: `Suite not found: ${suiteName}`,
        logs: [],
      };
    }
    return suite.runCase(caseId);
  }

  /**
   * Run all cases in a specific suite with concurrency control and error isolation
   * @param suiteName The suite name
   * @param concurrency Optional concurrency limit (overrides project config)
   * @returns Array of case run results with error handling
   * @throws Error when no suite has the given name
   */
  async runSuite(suiteName: string, concurrency?: number): Promise<CaseRunResult[]> {
    const suite = this.getSuite(suiteName);
    if (!suite) {
      throw new Error(`Suite not found: ${suiteName}`);
    }
    return suite.runAllCases(concurrency);
  }

  /**
   * Run all cases in all suites with concurrency control and error isolation.
   * Suites run one after another; the concurrency value is forwarded to each suite.
   * @param concurrency Optional concurrency limit (overrides project config)
   * @returns Map of suite names to their case run results
   */
  async runAllSuites(concurrency?: number): Promise<Map<string, CaseRunResult[]>> {
    const results = new Map<string, CaseRunResult[]>();

    for (const suiteName of this.getSuiteNames()) {
      try {
        const suiteResults = await this.runSuite(suiteName, concurrency);
        results.set(suiteName, suiteResults);
      } catch (error) {
        // If suite execution fails, create error results for all cases so
        // the failure is still visible per case in the final report.
        const suite = this.getSuite(suiteName);
        if (suite) {
          const message = error instanceof Error ? error.message : String(error);
          const errorResults: CaseRunResult[] = suite.getCases().map((c) => ({
            caseId: c.caseId,
            error: message,
            logs: [],
          }));
          results.set(suiteName, errorResults);
        }
      }
    }

    return results;
  }

  /**
   * Run all suites as a "batch", build a report, and optionally write it to disk.
   * @param concurrency Optional concurrency limit forwarded to every suite
   * @returns The batch id, the directory artifacts were written to (undefined
   *   when no output directory is configured), the raw results, and the report
   */
  async runAllSuitesBatch(concurrency?: number): Promise<{
    batch_id: string;
    batch_dir?: string;
    results: Map<string, CaseRunResult[]>;
    report: LatticeEvalBatchReport;
  }> {
    const started_at = new Date().toISOString();
    // `||` (not `??`) so an empty configured batch_id also falls back to a timestamp.
    const batch_id =
      this.reportConfig?.batch_id ||
      `${Date.now()}`;

    console.log(`\nRunning batch: ${this.project.projectName} (${this.getSuiteNames().length} suites)`);

    const results = await this.runAllSuites(concurrency);

    let total_cases = 0;
    let passed_cases = 0;
    let failed_cases = 0;

    const suites: LatticeEvalBatchReport["suites"] = [];
    for (const [suiteName, caseResults] of results.entries()) {
      const suiteTotal = caseResults.length;
      const suitePassed = caseResults.filter((r) => r.result?.pass).length;
      // Anything that did not pass (including errored cases) counts as failed.
      const suiteFailed = suiteTotal - suitePassed;

      total_cases += suiteTotal;
      passed_cases += suitePassed;
      failed_cases += suiteFailed;

      suites.push({
        suiteName,
        total_cases: suiteTotal,
        passed_cases: suitePassed,
        failed_cases: suiteFailed,
        cases: caseResults.map((r) => ({
          caseId: r.caseId,
          pass: r.result?.pass,
          final_score: r.result?.final_score,
          error: r.error,
        })),
      });
    }

    const finished_at = new Date().toISOString();
    const report: LatticeEvalBatchReport = {
      batch_id,
      started_at,
      finished_at,
      project: {
        projectName: this.project.projectName,
        version: this.project.version,
        description: this.project.description,
      },
      summary: {
        total_cases,
        passed_cases,
        failed_cases,
        pass_rate: total_cases > 0 ? passed_cases / total_cases : 0,
      },
      suites,
    };

    const batch_dir = await this.maybeWriteBatchArtifacts(
      batch_id,
      report,
      results
    );

    console.log(`\n=== Summary ===`);
    console.log(`Total: ${report.summary.total_cases} | Passed: ${report.summary.passed_cases} | Failed: ${report.summary.failed_cases} | Pass Rate: ${(report.summary.pass_rate * 100).toFixed(2)}%`);
    if (batch_dir) {
      console.log(`\nResults saved to: ${batch_dir}`);
    }

    return { batch_id, batch_dir, results, report };
  }

  /**
   * Turn an arbitrary name into a single safe path segment: path separators
   * become "_", and the special segments "." / ".." are neutralised so the
   * result can never point at a parent or sibling directory.
   */
  private static toSafePathSegment(segment: string): string {
    const flattened = segment.replace(/[\/\\]/g, "_");
    if (flattened === "." || flattened === "..") {
      return flattened.replace(/\./g, "_");
    }
    return flattened;
  }

  /**
   * Wrap text in a markdown code fence that the text itself cannot close:
   * the fence is at least three backticks and always longer than the longest
   * backtick run found in the content.
   */
  private static fencedBlock(content: string): string[] {
    const runs = content.match(/`+/g) ?? [];
    const longestRun = runs.reduce((max, run) => Math.max(max, run.length), 0);
    const fence = "`".repeat(Math.max(3, longestRun + 1));
    return [fence, content, fence];
  }

  /** Render a payload value as text; non-strings are serialised as JSON when possible. */
  private static toText(value: unknown): string {
    if (typeof value === "string") return value;
    try {
      return JSON.stringify(value, null, 2) ?? String(value);
    } catch {
      return String(value);
    }
  }

  /**
   * Limit text to MARKDOWN_TEXT_LIMIT characters, appending a truncation
   * notice. The cut never lands in the middle of a UTF-16 surrogate pair.
   * @param what Noun used in the notice ("output" or "prompt")
   */
  private static truncateForMarkdown(text: string, what: string): string {
    const limit = LatticeEvalProject.MARKDOWN_TEXT_LIMIT;
    if (text.length <= limit) return text;
    let end = limit;
    const lastUnit = text.charCodeAt(end - 1);
    // Drop a dangling high surrogate so the output stays valid text.
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
    return `${text.substring(0, end)}\n\n... (truncated, see JSON for full ${what})`;
  }

  /** Escape a value for use inside a markdown table cell (pipes and line breaks). */
  private static escapeTableCell(value: unknown): string {
    return String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
  }

  /**
   * Build the markdown document for a single case.
   * @param index 1-based running number of the case within the batch
   * @param suiteName Name of the suite the case belongs to
   * @param caseResult Run result (pass flag, scores, error details)
   * @param payload Extra per-case data: duration, thread ids, prompt, output
   * @returns Markdown text
   */
  private generateCaseMarkdown(
    index: number,
    suiteName: string,
    caseResult: CaseRunResult,
    payload: CaseMarkdownPayload
  ): string {
    const lines: string[] = [];
    const passed = Boolean(caseResult.result?.pass);
    const status = passed ? "✅ PASS" : "❌ FAIL";

    lines.push(`# Test ${index}: ${status}`);
    lines.push(``);
    lines.push(`- **Suite**: ${suiteName}`);
    lines.push(`- **Case ID**: ${caseResult.caseId}`);
    lines.push(`- **Status**: ${passed ? "PASS" : "FAIL"}`);
    if (typeof payload.duration === "number") {
      lines.push(`- **Duration**: ${(payload.duration / 1000).toFixed(2)}s`);
    }
    if (payload.threadId) {
      lines.push(`- **Thread ID**: ${String(payload.threadId)}`);
    }
    if (payload.judgeThreadId) {
      lines.push(`- **Judge Thread ID**: ${String(payload.judgeThreadId)}`);
    }
    lines.push(``);

    const result = caseResult.result;
    if (result) {
      lines.push(`## Result`);
      lines.push(``);
      lines.push(`- **Final Score**: ${result.final_score}`);
      lines.push(`- **Summary**: ${result.summary || "N/A"}`);
      lines.push(``);

      if (result.dimension_results && result.dimension_results.length > 0) {
        lines.push(`## Dimension Results`);
        lines.push(``);
        for (const dim of result.dimension_results) {
          lines.push(`### ${dim.name}`);
          lines.push(`- **Score**: ${dim.score}`);
          lines.push(`- **Reason**: ${dim.reason}`);
          lines.push(``);
        }
      }
    }

    if (caseResult.error) {
      // Message first, then (if present) a blank line and the stack trace.
      const errorText = caseResult.error_stack
        ? `${caseResult.error}\n\n${caseResult.error_stack}`
        : String(caseResult.error);
      lines.push(`## Error`);
      lines.push(``);
      lines.push(...LatticeEvalProject.fencedBlock(errorText));
      lines.push(``);
    }

    if (payload.finalOutput) {
      // Truncate very long outputs in markdown for readability.
      const output = LatticeEvalProject.truncateForMarkdown(
        LatticeEvalProject.toText(payload.finalOutput),
        "output"
      );
      lines.push(`## Final Output`);
      lines.push(``);
      lines.push(...LatticeEvalProject.fencedBlock(output));
      lines.push(``);
    }

    if (payload.testPrompt) {
      // Truncate very long prompts in markdown for readability.
      const prompt = LatticeEvalProject.truncateForMarkdown(
        LatticeEvalProject.toText(payload.testPrompt),
        "prompt"
      );
      lines.push(`## Test Prompt`);
      lines.push(``);
      lines.push(...LatticeEvalProject.fencedBlock(prompt));
      lines.push(``);
    }

    return lines.join("\n");
  }

  /**
   * Build the batch-level markdown summary: project header, overview table,
   * and one result table per suite.
   * @param batch_id Identifier of the batch
   * @param report Aggregated batch report
   * @param results Raw case results keyed by suite name
   * @returns Markdown text
   */
  private generateMarkdownSummary(
    batch_id: string,
    report: LatticeEvalBatchReport,
    results: Map<string, CaseRunResult[]>
  ): string {
    const cell = LatticeEvalProject.escapeTableCell;
    const lines: string[] = [];
    lines.push(`# Lattice Eval Batch Summary`);
    lines.push(``);
    lines.push(`- **Project**: ${report.project.projectName}`);
    if (report.project.version) lines.push(`- **Version**: ${report.project.version}`);
    if (report.project.description) lines.push(`- **Description**: ${report.project.description}`);
    lines.push(`- **Batch ID**: ${batch_id}`);
    lines.push(`- **Started**: ${report.started_at}`);
    lines.push(`- **Finished**: ${report.finished_at}`);
    lines.push(``);

    lines.push(`## Overview`);
    lines.push(``);
    lines.push(`| Metric | Value |`);
    lines.push(`|---|---:|`);
    lines.push(`| Total cases | ${report.summary.total_cases} |`);
    lines.push(`| Passed | ${report.summary.passed_cases} |`);
    lines.push(`| Failed | ${report.summary.failed_cases} |`);
    lines.push(`| Pass rate | ${(report.summary.pass_rate * 100).toFixed(2)}% |`);
    lines.push(``);

    lines.push(`## Suites`);
    lines.push(``);
    for (const suite of report.suites) {
      lines.push(`### ${suite.suiteName}`);
      lines.push(``);
      lines.push(`| Case | Status | Score | Duration (ms) | Thread |`);
      lines.push(`|---|---|---:|---:|---|`);
      const suiteResults = results.get(suite.suiteName) || [];
      for (const r of suiteResults) {
        const status = r.result?.pass ? "PASS" : "FAIL";
        const score = r.result?.final_score ?? "";
        const dur = typeof r.duration_ms === "number" ? r.duration_ms : "";
        const thread = r.thread_id ?? "";
        // Cells are escaped so a "|" or newline in an id cannot break the row.
        lines.push(`| ${cell(r.caseId)} | ${status} | ${cell(score)} | ${cell(dur)} | ${cell(thread)} |`);
      }
      lines.push(``);
    }

    return lines.join("\n");
  }

  /**
   * Write batch artifacts below `<output_dir>/<batch_id>` when an output
   * directory is configured.
   *
   * Always written: results.json, summary.md, and individual/<case>.json|.md.
   * Written unless disabled in the report config: report.json
   * (`write_report_json`) and cases/<suite>/<case>.logs.json (`write_case_logs`).
   *
   * @returns The batch directory, or undefined when no output directory is configured
   */
  private async maybeWriteBatchArtifacts(
    batch_id: string,
    report: LatticeEvalBatchReport,
    results: Map<string, CaseRunResult[]>
  ): Promise<string | undefined> {
    const config = this.reportConfig;
    if (!config?.output_dir) return undefined;

    const batchDir = path.join(config.output_dir, batch_id);
    await mkdir(batchDir, { recursive: true });

    const writeReportJson = config.write_report_json ?? true;
    const writeCaseLogs = config.write_case_logs ?? true;

    if (writeReportJson) {
      await writeFile(
        path.join(batchDir, "report.json"),
        JSON.stringify(report, null, 2),
        "utf-8"
      );
    }

    // Write richer results.json (similar to TestRunner)
    const resultsJsonPath = path.join(batchDir, "results.json");
    const resultsJson = {
      executionTimestamp: batch_id,
      summary: report.summary,
      report,
      results: Array.from(results.entries()).map(([suiteName, caseResults]) => ({
        suiteName,
        cases: caseResults.map((r) => ({
          caseId: r.caseId,
          passed: r.result?.pass === true,
          message: r.result?.summary || r.error || "",
          error: r.error
            ? {
                message: r.error,
                stack: r.error_stack,
              }
            : undefined,
          duration: r.duration_ms,
          testPrompt: r.test_prompt,
          finalOutput: r.final_output,
          threadId: r.thread_id,
          judgeThreadId: r.judge_thread_id,
        })),
      })),
    };
    await writeFile(resultsJsonPath, JSON.stringify(resultsJson, null, 2), "utf-8");

    // Write summary.md
    const summaryMdPath = path.join(batchDir, "summary.md");
    const summaryMd = this.generateMarkdownSummary(batch_id, report, results);
    await writeFile(summaryMdPath, summaryMd, "utf-8");

    // Write per-case detailed json and markdown
    const individualDir = path.join(batchDir, "individual");
    await mkdir(individualDir, { recursive: true });
    let index = 1;
    for (const [suiteName, caseResults] of results.entries()) {
      for (const r of caseResults) {
        const status = r.result?.pass ? "PASS" : "FAIL";
        const baseFilename = LatticeEvalProject.toSafePathSegment(
          `test-${index}-${suiteName}-${r.caseId}-${status}`
        );

        // Write JSON
        const jsonPath = path.join(individualDir, `${baseFilename}.json`);
        const payload = {
          index,
          suiteName,
          caseId: r.caseId,
          passed: r.result?.pass === true,
          result: r.result,
          message: r.result?.summary || r.error || "",
          error: r.error
            ? { message: r.error, stack: r.error_stack }
            : undefined,
          duration: r.duration_ms,
          threadId: r.thread_id,
          judgeThreadId: r.judge_thread_id,
          finalOutput: r.final_output,
          testPrompt: r.test_prompt,
        };
        await writeFile(jsonPath, JSON.stringify(payload, null, 2), "utf-8");

        // Write Markdown
        const mdPath = path.join(individualDir, `${baseFilename}.md`);
        const mdContent = this.generateCaseMarkdown(index, suiteName, r, payload);
        await writeFile(mdPath, mdContent, "utf-8");

        index += 1;
      }
    }

    if (writeCaseLogs) {
      for (const [suiteName, caseResults] of results.entries()) {
        // Suite and case names are sanitised the same way as the individual
        // filenames, so a "/" or ".." in a name cannot make the write fail or
        // land outside the batch directory.
        const suiteDir = path.join(
          batchDir,
          "cases",
          LatticeEvalProject.toSafePathSegment(suiteName)
        );
        await mkdir(suiteDir, { recursive: true });
        for (const r of caseResults) {
          await writeFile(
            path.join(suiteDir, `${LatticeEvalProject.toSafePathSegment(r.caseId)}.logs.json`),
            JSON.stringify(r.logs || [], null, 2),
            "utf-8"
          );
        }
      }
    }

    return batchDir;
  }
}
|