@axiom-lattice/agent-eval 2.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env +29 -0
- package/.turbo/turbo-build.log +20 -0
- package/CHANGELOG.md +10 -0
- package/LICENSE +201 -0
- package/dist/index.d.mts +366 -0
- package/dist/index.d.ts +366 -0
- package/dist/index.js +1092 -0
- package/dist/index.js.map +1 -0
- package/dist/index.mjs +1055 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +51 -0
- package/src/LatticeEval.ts +615 -0
- package/src/LatticeEvalProject.ts +496 -0
- package/src/LatticeEvalSuite.ts +321 -0
- package/src/index.ts +4 -0
- package/src/test.ts +23 -0
- package/src/types.ts +160 -0
- package/tsconfig.json +33 -0
package/dist/index.mjs
ADDED
|
@@ -0,0 +1,1055 @@
|
|
|
1
|
+
// src/LatticeEval.ts
|
|
2
|
+
import { getAgentClient } from "@axiom-lattice/core";
|
|
3
|
+
import { HumanMessage } from "@langchain/core/messages";
|
|
4
|
+
import { v4 } from "uuid";
|
|
5
|
+
var LatticeEval = class {
  /**
   * Create a new LatticeEval instance.
   * @param config Optional server configuration. When omitted (or when it
   *   carries no `base_url`) requests are sent to http://localhost:3203.
   *   `config.verbose` defaults to true.
   */
  constructor(config) {
    this.inMemoryLogs = [];
    this.lastDurationMs = 0;
    // The configuration is documented as optional, so never dereference undefined.
    this.config = config ?? { base_url: "http://localhost:3203" };
    this.baseUrl = this.config.base_url ?? "http://localhost:3203";
    this.verbose = this.config.verbose ?? true;
  }
  /**
   * Metadata captured during the most recent evaluateCase() call.
   * @returns Duration, thread ids, judge prompt and final output of the last run
   */
  getLastRunMeta() {
    return {
      duration_ms: this.lastDurationMs,
      thread_id: this.lastThreadId,
      judge_thread_id: this.lastJudgeThreadId,
      test_prompt: this.lastTestPrompt,
      final_output: this.lastFinalOutput
    };
  }
  /**
   * @returns A shallow copy of the log events recorded for the current/last run
   */
  getInMemoryLogs() {
    return [...this.inMemoryLogs];
  }
  /**
   * Store a log event in memory and, in verbose mode, echo errors and the
   * case start/completion milestones to the console.
   * @param level Log level ("info", "error", ...)
   * @param message Human readable message
   * @param data Optional structured payload
   */
  record(level, message, data) {
    this.inMemoryLogs.push({
      ts: (/* @__PURE__ */ new Date()).toISOString(),
      level,
      message,
      data
    });
    if (!this.verbose) return;
    if (level === "error") {
      const keyInfo = this.getKeyInfo(data);
      console.log(` \u2717 ${message}${keyInfo ? ` ${keyInfo}` : ""}`);
    } else if (message.includes("Starting case") || message.includes("Case evaluation completed")) {
      const keyInfo = this.getKeyInfo(data);
      console.log(` ${message}${keyInfo ? ` ${keyInfo}` : ""}`);
    }
  }
  /**
   * Record an "info" level event.
   */
  log(message, data) {
    this.record("info", message, data);
  }
  /**
   * Build the short "(case=... pass=... score=... error=...)" console suffix.
   * @returns The suffix, or "" when `data` carries none of those fields
   */
  getKeyInfo(data) {
    if (!data) return "";
    const parts = [];
    if (data.case_id) parts.push(`case=${data.case_id}`);
    if (data.pass !== void 0) parts.push(`pass=${data.pass}`);
    if (data.final_score !== void 0) parts.push(`score=${data.final_score}`);
    if (data.error) parts.push(`error=${data.error}`);
    return parts.length > 0 ? `(${parts.join(" ")})` : "";
  }
  /**
   * Flatten message content to plain text. Strings are returned unchanged;
   * arrays of content parts contribute their string items / `text` fields;
   * null/undefined become "".
   */
  contentToText(content) {
    if (typeof content === "string") return content;
    if (Array.isArray(content)) {
      return content.map((part) => {
        if (typeof part === "string") return part;
        return typeof part?.text === "string" ? part.text : "";
      }).join("");
    }
    return content == null ? "" : String(content);
  }
  /**
   * Execute a single agent step and return the thread ID and response data
   * @param step Step definition (`agent_id`, optional `override_input_message`)
   * @param threadId Thread the run is posted to
   * @param inputMessage Case input message, used unless the step overrides it
   * @param files Optional map of file name -> file text sent with the run
   * @throws Error when the server reports an error, answers with a non-2xx
   *   status, or returns a body that is not JSON
   */
  async executeAgentStep(step, threadId, inputMessage, files) {
    // Normalise once so logging and the request body agree on a missing map.
    const inputFiles = files ?? {};
    const message = step.override_input_message || inputMessage;
    this.log("Executing agent step", {
      agent_id: step.agent_id,
      thread_id: threadId,
      has_override_input_message: Boolean(step.override_input_message),
      input_message_length: message?.length,
      files_count: Object.keys(inputFiles).length
    });
    const payloadFiles = Object.fromEntries(
      Object.entries(inputFiles).map(([name, text]) => [
        name,
        {
          content: text.split("\n"),
          created_at: (/* @__PURE__ */ new Date()).toISOString(),
          modified_at: (/* @__PURE__ */ new Date()).toISOString()
        }
      ])
    );
    const response = await fetch(`${this.baseUrl}/api/runs`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json"
      },
      body: JSON.stringify({
        assistant_id: step.agent_id,
        thread_id: threadId,
        files: payloadFiles,
        message
      })
    });
    let responseData;
    let failure;
    try {
      responseData = await response.json();
    } catch (parseError) {
      // e.g. an HTML error page from a proxy: report the status, not a SyntaxError.
      failure = `invalid JSON response (HTTP ${response.status})`;
    }
    if (!failure) {
      // A non-2xx answer without an explicit error field is still a failure.
      failure = responseData?.error || (response.ok ? void 0 : `HTTP ${response.status} ${response.statusText}`);
    }
    if (failure) {
      this.log("Agent step failed", {
        agent_id: step.agent_id,
        thread_id: threadId,
        error: failure
      });
      throw new Error(
        `Failed to run agent ${step.agent_id}: ${failure}`
      );
    }
    this.log("Agent step completed", {
      agent_id: step.agent_id,
      thread_id: threadId,
      response_keys: responseData ? Object.keys(responseData) : []
    });
    return { threadId, responseData };
  }
  /**
   * Extract output content based on OutputType
   * @param outputType `{ type: "file_content", file_path }` reads the file from
   *   the thread state; any other type takes the last message of the run
   * @param agentId Agent whose thread state is queried
   * @param threadId Thread whose state is queried
   * @param runResponseData Response of the last agent step
   * @throws Error when the state request fails, the file is absent/empty, or
   *   the run response carries no messages
   */
  async extractOutput(outputType, agentId, threadId, runResponseData) {
    if (outputType.type !== "file_content") {
      const messages = runResponseData?.messages;
      if (messages && messages.length > 0) {
        const content = messages[messages.length - 1]?.content || "";
        this.log("Message output extracted", {
          agent_id: agentId,
          thread_id: threadId,
          output_length: typeof content === "string" ? content.length : void 0
        });
        return content;
      }
      this.log("No message content found in run response", {
        agent_id: agentId,
        thread_id: threadId
      });
      throw new Error("No message content found in run response");
    }
    this.log("Extracting file output", {
      agent_id: agentId,
      thread_id: threadId,
      file_path: outputType.file_path
    });
    const responseState = await fetch(
      `${this.baseUrl}/api/assistants/${agentId}/${threadId}/state`,
      {
        method: "GET"
      }
    );
    if (!responseState.ok) {
      this.log("Failed to fetch assistant state", {
        agent_id: agentId,
        thread_id: threadId,
        status: responseState.status,
        statusText: responseState.statusText
      });
      throw new Error(`Failed to get state: ${responseState.statusText}`);
    }
    const state = await responseState.json();
    // A state without `values`/`files` means "file not found", not a TypeError.
    const fileContent = state?.values?.files?.[outputType.file_path]?.content;
    if (!fileContent) {
      this.log("File output not found in state", {
        agent_id: agentId,
        thread_id: threadId,
        file_path: outputType.file_path
      });
      throw new Error(
        `File not found in output: ${outputType.file_path}`
      );
    }
    const content = Array.isArray(fileContent) ? fileContent.join("\n") : fileContent;
    this.log("File output extracted", {
      agent_id: agentId,
      thread_id: threadId,
      file_path: outputType.file_path,
      output_length: typeof content === "string" ? content.length : void 0
    });
    return content;
  }
  /**
   * Build the prompt sent to the judge agent.
   * @param evalCase The evaluation case
   * @param finalOutput Output extracted from the agent run
   * @param evalRubrics Rubrics the judge has to score against
   */
  buildJudgePrompt(evalCase, finalOutput, evalRubrics) {
    const testCaseFilesContent = evalCase.input.files ? Object.keys(evalCase.input.files).map(
      (key) => `File name: ${key}
File content: ${evalCase.input.files[key]}`
    ).join("\n\n") : "";
    const outputTypeDescription = evalCase.output.type === "file_content" ? `\u865A\u62DF\u4EA7\u7269\uFF08\u6587\u4EF6\uFF1A${evalCase.output.file_path}\uFF09` : "\u6D88\u606F\u5185\u5BB9";
    const rubricsSection = `
## \u8BC4\u4F30\u6307\u6807\uFF08Evaluation Rubrics\uFF09
${evalRubrics.map(
      (r) => `- **${r.dimension}**\uFF08\u6743\u91CD\uFF1A${r.weight}\uFF09\uFF1A${r.description}`
    ).join("\n")}`;
    // Flatten structured message content so it is not rendered as "[object Object]".
    const actualOutput = this.contentToText(finalOutput);
    return `# \u89D2\u8272
\u4F60\u662F\u4E00\u540D\u8D44\u6DF1\u7684 AI Agent \u8BC4\u4F30\u4E13\u5BB6\uFF0C\u8D1F\u8D23\u6839\u636E\u9884\u8BBE\u7684\u6307\u6807\uFF08Rubrics\uFF09\u5BF9 Agent \u7684\u6267\u884C\u7ED3\u679C\u8FDB\u884C"\u9ED1\u76D2\u6D4B\u8BD5"\u5224\u5B9A\u3002

# \u8F93\u5165\u4FE1\u606F
\u6D4B\u8BD5\u6846\u67B6\u5C06\u4E3A\u4F60\u63D0\u4F9B\u4EE5\u4E0B\u56DB\u4E2A\u6838\u5FC3\u4E0A\u4E0B\u6587\uFF1A

1. **\u7528\u6237\u610F\u56FE\uFF08User Intent\uFF09**\uFF1A${evalCase.input.message}

2. **\u8F93\u5165\u6587\u4EF6\uFF08Input Files\uFF09**\uFF1A${testCaseFilesContent || "\u65E0"}

3. **\u5B9E\u9645\u8F93\u51FA\uFF08Actual Output\uFF0C${outputTypeDescription}\uFF09**\uFF1A
${actualOutput}

4. **\u671F\u671B\u8F93\u51FA\u63CF\u8FF0\uFF08Expected Output Description\uFF09**\uFF1A${evalCase.eval.content_assertion}
${rubricsSection}

# \u4EFB\u52A1
\u4F60\u5FC5\u987B\u4E25\u683C\u5BF9\u7167"\u8BC4\u4F30\u6307\u6807\uFF08Evaluation Rubrics\uFF09"\u4E2D\u7684\u6BCF\u4E00\u9879\u6307\u6807\uFF0C\u5206\u6790"\u5B9E\u9645\u8F93\u51FA\uFF08Actual Output\uFF09"\u662F\u5426\u8FBE\u6807\u3002

# \u89C4\u5219
1. **\u5BA2\u89C2\u6027**\uFF1A\u4EC5\u6839\u636E\u63D0\u4F9B\u7684\u4E0A\u4E0B\u6587\u5224\u5B9A\u3002\u5982\u679C\u6807\u51C6\u8981\u6C42"\u5305\u542B\u6570\u5B57"\uFF0C\u4F46\u8F93\u51FA\u53EA\u6709\u6587\u5B57\uFF0C\u5373\u4F7F\u8BED\u6C14\u518D\u597D\u4E5F\u5FC5\u987B\u6263\u5206\u3002
2. **\u7ED3\u679C\u6821\u9A8C**\uFF1A\u5982\u679C"\u5B9E\u9645\u8F93\u51FA"\u4E2D\u7F3A\u5931\u9884\u671F\u7684\u5185\u5BB9\uFF0C\u6216\u5185\u5BB9\u4E0D\u7B26\u5408"\u8BC4\u4F30\u6307\u6807"\u4E2D\u7684\u6807\u51C6\uFF0C\u5BF9\u5E94\u7684\u6307\u6807\u5E94\u5224\u5B9A\u4E3A\u5931\u8D25\u3002
3. **\u8BC1\u636E\u5BFC\u5411**\uFF1A\u5728\u7ED9\u51FA\u539F\u56E0\uFF08reason\uFF09\u65F6\uFF0C\u5FC5\u987B\u5F15\u7528\u8F93\u51FA\u4E2D\u7684\u539F\u6587\u6216\u865A\u62DF\u4EA7\u7269\u4E2D\u7684\u5177\u4F53\u6570\u636E\u7247\u6BB5\u3002
4. **\u52A0\u6743\u8BA1\u7B97**\uFF1A\u6700\u7EC8\u5206\u6570\u4E3A\u5404\u9879\u6307\u6807\u5F97\u5206\u4E0E\u5176\u6743\u91CD\u7684\u4E58\u79EF\u4E4B\u548C\uFF080-100\u5206\u5236\uFF09\u3002

# \u8F93\u51FA\u683C\u5F0F\uFF08\u4EC5JSON\uFF09
\u4F60\u5FC5\u987B\u4EC5\u4EE5 JSON \u683C\u5F0F\u56DE\u590D\uFF0C\u7ED3\u6784\u5982\u4E0B\uFF1A
{
"pass": true | false,
"final_score": number,
"dimension_results": [
{
"name": "\u6307\u6807\u540D\u79F0",
"score": number,
"reason": "\u5177\u4F53\u7684\u6263\u5206\u6216\u7ED9\u5206\u7406\u7531\uFF0C\u9700\u5F15\u7528\u8BC1\u636E"
}
],
"summary": "\u5BF9 Agent \u8868\u73B0\u7684\u6574\u4F53\u8BC4\u4EF7"
}

\u6CE8\u610F\uFF1A\u5982\u679C final_score >= 80 \u4E14\u6CA1\u6709\u81F4\u547D\u6027\u9519\u8BEF\uFF0Cpass \u5E94\u4E3A true\uFF1B\u5426\u5219\u4E3A false\u3002`;
  }
  /**
   * Parse the judge's JSON verdict from its textual answer. A fenced code
   * block is preferred; otherwise the widest `{...}` span is tried.
   * @returns The parsed object, or `{}` when no JSON is found or parsing fails
   */
  parseJudgeOutput(text, caseId) {
    try {
      const jsonMatch = text.match(/```(?:json)?\s*(\{[\s\S]*\})\s*```/) || text.match(/\{[\s\S]*\}/);
      if (jsonMatch) {
        const parsed = JSON.parse(jsonMatch[1] || jsonMatch[0]);
        this.log("Parsed judge JSON successfully", {
          case_id: caseId,
          parsed_keys: Object.keys(parsed || {})
        });
        return parsed;
      }
      this.log("No JSON detected in judge output; will fallback", {
        case_id: caseId
      });
    } catch (error) {
      console.warn("Failed to parse JSON from judge agent response, falling back to keyword-based parsing:", error);
      this.log("Failed to parse judge JSON; falling back", {
        case_id: caseId,
        error: error instanceof Error ? error.message : String(error)
      });
    }
    return {};
  }
  /**
   * Evaluate a single Lattice evaluation case
   * @param evalCase The evaluation case to run
   * @returns Evaluation result with pass/fail status and scores
   * @throws Error when an agent step, the output extraction or the judge call fails
   */
  async evaluateCase(evalCase) {
    const startedAt = Date.now();
    const threadId = `${evalCase.caseId}||${v4()}`;
    this.inMemoryLogs = [];
    this.lastThreadId = threadId;
    this.lastJudgeThreadId = void 0;
    this.lastTestPrompt = void 0;
    this.lastFinalOutput = void 0;
    this.lastDurationMs = 0;
    try {
      return await this.runCaseEvaluation(evalCase, threadId);
    } finally {
      // Record the duration for failed runs too, so error reports do not show 0 ms.
      this.lastDurationMs = Date.now() - startedAt;
      this.record("info", "Case duration recorded", {
        case_id: evalCase.caseId,
        duration_ms: this.lastDurationMs
      });
    }
  }
  /**
   * Body of evaluateCase(): run the agent steps, ask the judge agent and turn
   * its answer into a pass flag, a final score and per-dimension results.
   */
  async runCaseEvaluation(evalCase, threadId) {
    const caseId = evalCase.caseId;
    this.log("Starting case evaluation", {
      case_id: caseId,
      thread_id: threadId,
      steps_count: evalCase.steps?.length,
      output_type: evalCase.output?.type
    });
    let currentThreadId = threadId;
    let lastResponseData = null;
    for (const step of evalCase.steps) {
      const stepResult = await this.executeAgentStep(
        step,
        currentThreadId,
        evalCase.input.message,
        evalCase.input.files || {}
      );
      currentThreadId = stepResult.threadId;
      lastResponseData = stepResult.responseData;
    }
    const finalAgentId = evalCase.steps[evalCase.steps.length - 1]?.agent_id || "";
    this.log("All agent steps completed", {
      case_id: caseId,
      final_agent_id: finalAgentId,
      final_thread_id: currentThreadId
    });
    const finalOutput = await this.extractOutput(
      evalCase.output,
      finalAgentId,
      currentThreadId,
      lastResponseData
    );
    this.lastFinalOutput = finalOutput;
    this.log("Final output extracted", {
      case_id: caseId,
      output_type: evalCase.output.type,
      output_length: finalOutput.length
    });
    const defaultRubrics = [
      {
        dimension: "correctness",
        weight: 100,
        description: "\u6574\u4F53\u6B63\u786E\u6027\uFF0C\u662F\u5426\u7B26\u5408\u9884\u671F\u8F93\u51FA\u63CF\u8FF0\u3002"
      }
    ];
    const evalRubrics = evalCase.eval.eval_rubrics && evalCase.eval.eval_rubrics.length > 0 ? evalCase.eval.eval_rubrics : defaultRubrics;
    this.log("Prepared evaluation rubrics", {
      case_id: caseId,
      rubrics_count: evalRubrics.length,
      rubric_dimensions: evalRubrics.map((r) => r.dimension)
    });
    const testPrompt = this.buildJudgePrompt(evalCase, finalOutput, evalRubrics);
    this.lastTestPrompt = testPrompt;
    const judgeThreadId = v4();
    this.lastJudgeThreadId = judgeThreadId;
    this.log("Invoking judge agent", { agent_key: "LatticeTest", case_id: caseId });
    const testResponse = await getAgentClient("LatticeTest").invoke(
      {
        messages: [new HumanMessage(testPrompt)]
      },
      {
        configurable: {
          thread_id: judgeThreadId
        }
      }
    );
    this.log("Judge agent responded", {
      case_id: caseId,
      messages_count: testResponse?.messages?.length
    });
    // Message content may be an array of parts; flatten it before any string
    // operation so the keyword fallback below cannot crash.
    const testResultContent = this.contentToText(
      testResponse.messages[testResponse.messages.length - 1]?.content || ""
    );
    this.log("Judge raw output received", {
      case_id: caseId,
      output_length: testResultContent.length
    });
    const parsedResult = this.parseJudgeOutput(testResultContent, caseId);
    let pass;
    if (parsedResult.pass !== void 0) {
      pass = parsedResult.pass;
      this.log("Pass determined from parsedResult.pass", { case_id: caseId, pass });
    } else if (parsedResult.final_score !== void 0) {
      pass = parsedResult.final_score >= 80;
      this.log("Pass determined from parsedResult.final_score", {
        case_id: caseId,
        final_score: parsedResult.final_score,
        pass
      });
    } else {
      const lowered = testResultContent.toLowerCase();
      pass = ["pass", "success", "\u901A\u8FC7", "\u7B26\u5408"].some((keyword) => lowered.includes(keyword));
      this.log("Pass determined from keyword fallback", { case_id: caseId, pass });
    }
    let dimensionResults = [];
    if (Array.isArray(parsedResult.dimension_results) && parsedResult.dimension_results.length > 0) {
      dimensionResults = parsedResult.dimension_results;
      this.log("Using parsed dimension_results from judge", {
        case_id: caseId,
        dimensions_count: dimensionResults.length
      });
    } else if (evalRubrics.length > 0) {
      dimensionResults = evalRubrics.map((rubric) => ({
        name: rubric.dimension,
        score: 0,
        reason: ""
      }));
      this.log("No dimension_results parsed; using rubric skeleton", {
        case_id: caseId,
        dimensions_count: dimensionResults.length
      });
    }
    let finalScore;
    if (parsedResult.final_score !== void 0) {
      finalScore = parsedResult.final_score;
      this.log("Final score taken from parsedResult.final_score", {
        case_id: caseId,
        final_score: finalScore
      });
    } else if (dimensionResults.length > 0 && evalRubrics.length > 0) {
      const rubricMap = new Map(evalRubrics.map((r) => [r.dimension, r.weight]));
      const totalWeight = Array.from(rubricMap.values()).reduce((sum, w) => sum + w, 0);
      if (totalWeight > 0) {
        finalScore = dimensionResults.reduce((sum, result) => {
          // `??` keeps an explicit weight of 0; only unknown dimensions default to 1.
          const weight = rubricMap.get(result.name) ?? 1;
          return sum + result.score * weight;
        }, 0) / totalWeight;
      } else {
        finalScore = dimensionResults.reduce((sum, result) => sum + result.score, 0) / dimensionResults.length;
      }
      this.log("Final score computed from dimension_results", {
        case_id: caseId,
        final_score: finalScore,
        total_weight: totalWeight
      });
    } else {
      finalScore = pass ? 100 : 0;
      this.log("Final score fallback (pass-based)", {
        case_id: caseId,
        final_score: finalScore,
        pass
      });
    }
    this.log("Case evaluation completed", {
      case_id: caseId,
      pass,
      final_score: finalScore
    });
    return {
      pass,
      final_score: finalScore,
      dimension_results: dimensionResults,
      summary: parsedResult.summary || testResultContent
    };
  }
};
|
|
406
|
+
/**
 * Evaluate one case with a throw-away LatticeEval instance.
 * @param evalCase The evaluation case to run
 * @param config Optional server configuration; a falsy value selects
 *   `{ base_url: "http://localhost:3203" }`
 * @returns The promise returned by LatticeEval#evaluateCase
 */
async function evaluateLatticeCase(evalCase, config) {
  const effectiveConfig = config ? config : { base_url: "http://localhost:3203" };
  return new LatticeEval(effectiveConfig).evaluateCase(evalCase);
}
|
|
413
|
+
/**
 * Evaluate one case and return the outcome together with run metadata and
 * the in-memory logs. Failures never reject: they are recorded as an
 * "error" log event and reported through `error` / `error_stack`.
 * @param evalCase The evaluation case to run
 * @param config Optional server configuration; a falsy value selects
 *   `{ base_url: "http://localhost:3203" }`
 * @returns `{ caseId, result, ...meta, logs }` on success or
 *   `{ caseId, error, error_stack, ...meta, logs }` on failure
 */
async function evaluateLatticeCaseWithLogs(evalCase, config) {
  const evaluator = new LatticeEval(config || { base_url: "http://localhost:3203" });
  // Snapshot of the evaluator's run metadata plus a copy of its logs.
  const collectRunInfo = () => {
    const {
      duration_ms,
      thread_id,
      judge_thread_id,
      test_prompt,
      final_output
    } = evaluator.getLastRunMeta();
    return {
      duration_ms,
      thread_id,
      judge_thread_id,
      test_prompt,
      final_output,
      logs: evaluator.getInMemoryLogs()
    };
  };
  try {
    const result = await evaluator.evaluateCase(evalCase);
    return { caseId: evalCase.caseId, result, ...collectRunInfo() };
  } catch (error) {
    const isErrorObject = error instanceof Error;
    const errorMessage = isErrorObject ? error.message : String(error);
    // Record first so the failure event is part of the returned logs.
    evaluator.record("error", "Case evaluation failed", {
      case_id: evalCase.caseId,
      error: errorMessage
    });
    return {
      caseId: evalCase.caseId,
      error: errorMessage,
      error_stack: isErrorObject ? error.stack : void 0,
      ...collectRunInfo()
    };
  }
}
|
|
452
|
+
|
|
453
|
+
// src/LatticeEvalSuite.ts
|
|
454
|
+
/**
 * Run async task factories with at most `concurrency` of them in flight.
 * A failing task never rejects the returned promise; its error message is
 * captured in the corresponding result slot instead.
 * @param tasks Array of zero-argument functions returning a value or promise
 * @param concurrency Maximum number of tasks in flight. Values that are not
 *   a positive number (0, negative, NaN, undefined) are treated as 1, because
 *   otherwise no task could ever start and the call would never finish.
 * @returns Array aligned with `tasks`; each entry is
 *   `{ success: true, result }` or `{ success: false, error }`
 */
async function limitConcurrency(tasks, concurrency) {
  const limit = concurrency > 0 ? Math.ceil(concurrency) : 1;
  const results = new Array(tasks.length);
  let nextIndex = 0;
  // Each worker keeps claiming the next unstarted task until none are left.
  // Claiming is synchronous, so two workers can never take the same index.
  const worker = async () => {
    while (nextIndex < tasks.length) {
      const taskIndex = nextIndex++;
      try {
        const result = await tasks[taskIndex]();
        results[taskIndex] = { success: true, result };
      } catch (error) {
        results[taskIndex] = {
          success: false,
          error: error instanceof Error ? error.message : String(error)
        };
      }
    }
  };
  const workerCount = Math.min(limit, tasks.length);
  await Promise.all(Array.from({ length: workerCount }, worker));
  return results;
}
|
|
490
|
+
/**
 * Expand a template-based case into a full evaluation case.
 * The message falls back to the template default only when it is
 * null/undefined, files are merged (case files win), steps always come from
 * the template, and output / rubrics fall back to the template when falsy.
 * @param templateCase Case referencing a template through `templateId`
 * @param templates Map of templateId -> template
 * @returns The resolved evaluation case
 * @throws Error when the referenced template is not in `templates`
 */
function resolveTemplateCase(templateCase, templates) {
  const template = templates.get(templateCase.templateId);
  if (!template) {
    throw new Error(`Template not found: ${templateCase.templateId}`);
  }
  const defaults = template.default_case;
  const mergedFiles = { ...defaults.input.files, ...templateCase.input.files };
  const input = {
    message: templateCase.input.message ?? defaults.input.message,
    files: mergedFiles
  };
  const evalSection = {
    content_assertion: templateCase.eval.content_assertion,
    eval_rubrics: templateCase.eval.eval_rubrics || defaults.eval?.eval_rubrics
  };
  return {
    caseId: templateCase.caseId,
    input,
    steps: defaults.steps,
    output: templateCase.output || defaults.output,
    eval: evalSection
  };
}
|
|
513
|
+
/**
 * Tell template-based cases apart from fully specified ones.
 * @param case_ A suite case object
 * @returns true when the object has a `templateId` property (own or
 *   inherited), regardless of its value
 */
function isTemplateCase(case_) {
  return Reflect.has(case_, "templateId");
}
|
|
516
|
+
var LatticeEvalSuite = class {
  /**
   * @param suite Suite definition (`suiteName`, `version`, `cases`)
   * @param projectConfig Project-level configuration shared by all suites
   * @param templates Map of templateId -> template used to resolve template cases
   */
  constructor(suite, projectConfig, templates = /* @__PURE__ */ new Map()) {
    this.suite = suite;
    this.projectConfig = projectConfig;
    this.templates = templates;
  }
  /**
   * Get resolved configuration from project
   */
  getResolvedConfig() {
    return this.projectConfig;
  }
  /**
   * Get suite name
   */
  getSuiteName() {
    return this.suite.suiteName;
  }
  /**
   * Get suite version
   */
  getVersion() {
    return this.suite.version;
  }
  /**
   * Return the case unchanged, or expanded from its template when it is a
   * template case.
   */
  resolveCase(entry) {
    return isTemplateCase(entry) ? resolveTemplateCase(entry, this.templates) : entry;
  }
  /**
   * Get all cases in this suite (resolved from templates if needed)
   */
  getCases() {
    return this.suite.cases.map((entry) => this.resolveCase(entry));
  }
  /**
   * Get a specific case by ID (resolved from template if needed)
   * @returns The resolved case, or undefined when no case has that ID
   */
  getCase(caseId) {
    const entry = this.suite.cases.find((candidate) => candidate.caseId === caseId);
    return entry ? this.resolveCase(entry) : void 0;
  }
  /**
   * Evaluate an already resolved case against the configured server and shape
   * the outcome as a case run result.
   * @param evalCase Resolved evaluation case
   * @param reportedCaseId Case ID to put on the result
   * @param config Project configuration providing `lattice_server_config`
   */
  async evaluateResolvedCase(evalCase, reportedCaseId, config) {
    const { base_url, api_key } = config.lattice_server_config;
    const run = await evaluateLatticeCaseWithLogs(evalCase, { base_url, api_key });
    return {
      caseId: reportedCaseId,
      result: run.result,
      error: run.error,
      error_stack: run.error_stack,
      duration_ms: run.duration_ms,
      thread_id: run.thread_id,
      judge_thread_id: run.judge_thread_id,
      test_prompt: run.test_prompt,
      final_output: run.final_output,
      logs: run.logs
    };
  }
  /**
   * Run a single case in this suite with error handling
   * @param caseId The case ID to run
   * @returns Case run result with error handling
   */
  async runCase(caseId) {
    const failure = (message) => ({ caseId, error: message, logs: [] });
    try {
      const evalCase = this.getCase(caseId);
      if (!evalCase) {
        return failure(`Case not found: ${caseId}`);
      }
      // Awaited inside the try so that any rejection is converted below.
      return await this.evaluateResolvedCase(evalCase, caseId, this.getResolvedConfig());
    } catch (error) {
      return failure(error instanceof Error ? error.message : String(error));
    }
  }
  /**
   * Run all cases in this suite with concurrency control and error isolation
   * @param concurrency Optional concurrency limit (overrides project config)
   * @returns Array of case run results with error handling
   */
  async runAllCases(concurrency) {
    const config = this.getResolvedConfig();
    const maxConcurrency = concurrency ?? config.concurrency;
    const toTask = (entry) => async () => {
      try {
        const evalCase = this.resolveCase(entry);
        return await this.evaluateResolvedCase(evalCase, evalCase.caseId, config);
      } catch (error) {
        return {
          caseId: entry.caseId,
          error: error instanceof Error ? error.message : String(error),
          logs: []
        };
      }
    };
    const taskResults = await limitConcurrency(this.suite.cases.map(toTask), maxConcurrency);
    return taskResults.map((taskResult, position) => {
      const succeeded = taskResult.success && taskResult.result;
      if (succeeded) {
        return taskResult.result;
      }
      return {
        caseId: this.suite.cases[position].caseId,
        error: taskResult.error || "Unknown error",
        logs: []
      };
    });
  }
};
|
|
654
|
+
|
|
655
|
+
// src/LatticeEvalProject.ts
|
|
656
|
+
import {
|
|
657
|
+
registerModelLattice,
|
|
658
|
+
registerAgentLattice,
|
|
659
|
+
AgentType
|
|
660
|
+
} from "@axiom-lattice/core";
|
|
661
|
+
import { mkdir, writeFile } from "fs/promises";
|
|
662
|
+
import path from "path";
|
|
663
|
+
var LatticeEvalProject = class {
|
|
664
|
+
constructor(project) {
|
|
665
|
+
this.suites = /* @__PURE__ */ new Map();
|
|
666
|
+
this.project = project;
|
|
667
|
+
this.reportConfig = project.report_config;
|
|
668
|
+
const judgeModelKey = `${this.project.projectName}_judge_model`;
|
|
669
|
+
registerModelLattice(judgeModelKey, this.project.judge_agent_config.model);
|
|
670
|
+
const judgeAgentKey = "LatticeTest";
|
|
671
|
+
const judgeAgentConfig = {
|
|
672
|
+
key: judgeAgentKey,
|
|
673
|
+
name: "Lattice Test Judge Agent",
|
|
674
|
+
description: "Judge agent for evaluating Lattice test cases",
|
|
675
|
+
type: AgentType.REACT,
|
|
676
|
+
prompt: "",
|
|
677
|
+
// No prompt as requested
|
|
678
|
+
modelKey: judgeModelKey
|
|
679
|
+
};
|
|
680
|
+
registerAgentLattice(judgeAgentConfig);
|
|
681
|
+
const projectConfig = {
|
|
682
|
+
lattice_server_config: {
|
|
683
|
+
base_url: this.project.lattice_server_config.base_url,
|
|
684
|
+
api_key: this.project.lattice_server_config.api_key
|
|
685
|
+
},
|
|
686
|
+
judge_agent_config: this.project.judge_agent_config,
|
|
687
|
+
concurrency: this.project.concurrency ?? 1
|
|
688
|
+
};
|
|
689
|
+
const templatesMap = /* @__PURE__ */ new Map();
|
|
690
|
+
if (this.project.templates) {
|
|
691
|
+
for (const template of this.project.templates) {
|
|
692
|
+
templatesMap.set(template.templateId, template);
|
|
693
|
+
}
|
|
694
|
+
}
|
|
695
|
+
for (const suite of this.project.suites) {
|
|
696
|
+
this.suites.set(
|
|
697
|
+
suite.suiteName,
|
|
698
|
+
new LatticeEvalSuite(suite, projectConfig, templatesMap)
|
|
699
|
+
);
|
|
700
|
+
}
|
|
701
|
+
}
|
|
702
|
+
/**
|
|
703
|
+
* Get project name
|
|
704
|
+
*/
|
|
705
|
+
getProjectName() {
|
|
706
|
+
return this.project.projectName;
|
|
707
|
+
}
|
|
708
|
+
/**
|
|
709
|
+
* Get project version
|
|
710
|
+
*/
|
|
711
|
+
getVersion() {
|
|
712
|
+
return this.project.version;
|
|
713
|
+
}
|
|
714
|
+
/**
|
|
715
|
+
* Get project description
|
|
716
|
+
*/
|
|
717
|
+
getDescription() {
|
|
718
|
+
return this.project.description;
|
|
719
|
+
}
|
|
720
|
+
/**
|
|
721
|
+
* Get all suite names
|
|
722
|
+
*/
|
|
723
|
+
getSuiteNames() {
|
|
724
|
+
return Array.from(this.suites.keys());
|
|
725
|
+
}
|
|
726
|
+
/**
|
|
727
|
+
* Get a specific suite by name
|
|
728
|
+
*/
|
|
729
|
+
getSuite(suiteName) {
|
|
730
|
+
return this.suites.get(suiteName);
|
|
731
|
+
}
|
|
732
|
+
/**
|
|
733
|
+
* Run a specific case in a specific suite
|
|
734
|
+
* @param suiteName The suite name
|
|
735
|
+
* @param caseId The case ID to run
|
|
736
|
+
* @returns Case run result with error handling
|
|
737
|
+
*/
|
|
738
|
+
async runCase(suiteName, caseId) {
|
|
739
|
+
const suite = this.getSuite(suiteName);
|
|
740
|
+
if (!suite) {
|
|
741
|
+
return {
|
|
742
|
+
caseId,
|
|
743
|
+
error: `Suite not found: ${suiteName}`,
|
|
744
|
+
logs: []
|
|
745
|
+
};
|
|
746
|
+
}
|
|
747
|
+
return suite.runCase(caseId);
|
|
748
|
+
}
|
|
749
|
+
/**
|
|
750
|
+
* Run all cases in a specific suite with concurrency control and error isolation
|
|
751
|
+
* @param suiteName The suite name
|
|
752
|
+
* @param concurrency Optional concurrency limit (overrides project config)
|
|
753
|
+
* @returns Array of case run results with error handling
|
|
754
|
+
*/
|
|
755
|
+
async runSuite(suiteName, concurrency) {
|
|
756
|
+
const suite = this.getSuite(suiteName);
|
|
757
|
+
if (!suite) {
|
|
758
|
+
throw new Error(`Suite not found: ${suiteName}`);
|
|
759
|
+
}
|
|
760
|
+
return suite.runAllCases(concurrency);
|
|
761
|
+
}
|
|
762
|
+
/**
|
|
763
|
+
* Run all cases in all suites with concurrency control and error isolation
|
|
764
|
+
* @param concurrency Optional concurrency limit (overrides project config)
|
|
765
|
+
* @returns Map of suite names to their case run results
|
|
766
|
+
*/
|
|
767
|
+
async runAllSuites(concurrency) {
|
|
768
|
+
const results = /* @__PURE__ */ new Map();
|
|
769
|
+
for (const suiteName of this.getSuiteNames()) {
|
|
770
|
+
try {
|
|
771
|
+
const suiteResults = await this.runSuite(suiteName, concurrency);
|
|
772
|
+
results.set(suiteName, suiteResults);
|
|
773
|
+
} catch (error) {
|
|
774
|
+
const suite = this.getSuite(suiteName);
|
|
775
|
+
if (suite) {
|
|
776
|
+
const errorResults = suite.getCases().map((c) => ({
|
|
777
|
+
caseId: c.caseId,
|
|
778
|
+
error: error instanceof Error ? error.message : String(error),
|
|
779
|
+
logs: []
|
|
780
|
+
}));
|
|
781
|
+
results.set(suiteName, errorResults);
|
|
782
|
+
}
|
|
783
|
+
}
|
|
784
|
+
}
|
|
785
|
+
return results;
|
|
786
|
+
}
|
|
787
|
+
/**
|
|
788
|
+
* Run all suites as a "batch", build a report, and optionally write it to disk.
|
|
789
|
+
*/
|
|
790
|
+
async runAllSuitesBatch(concurrency) {
|
|
791
|
+
const started_at = (/* @__PURE__ */ new Date()).toISOString();
|
|
792
|
+
const batch_id = this.reportConfig?.batch_id || `${Date.now()}`;
|
|
793
|
+
console.log(`
|
|
794
|
+
Running batch: ${this.project.projectName} (${this.getSuiteNames().length} suites)`);
|
|
795
|
+
const results = await this.runAllSuites(concurrency);
|
|
796
|
+
let total_cases = 0;
|
|
797
|
+
let passed_cases = 0;
|
|
798
|
+
let failed_cases = 0;
|
|
799
|
+
const suites = [];
|
|
800
|
+
const durations = [];
|
|
801
|
+
for (const [suiteName, caseResults] of results.entries()) {
|
|
802
|
+
const suiteTotal = caseResults.length;
|
|
803
|
+
const suitePassed = caseResults.filter((r) => r.result?.pass).length;
|
|
804
|
+
const suiteFailed = suiteTotal - suitePassed;
|
|
805
|
+
total_cases += suiteTotal;
|
|
806
|
+
passed_cases += suitePassed;
|
|
807
|
+
failed_cases += suiteFailed;
|
|
808
|
+
suites.push({
|
|
809
|
+
suiteName,
|
|
810
|
+
total_cases: suiteTotal,
|
|
811
|
+
passed_cases: suitePassed,
|
|
812
|
+
failed_cases: suiteFailed,
|
|
813
|
+
cases: caseResults.map((r) => ({
|
|
814
|
+
caseId: r.caseId,
|
|
815
|
+
pass: r.result?.pass,
|
|
816
|
+
final_score: r.result?.final_score,
|
|
817
|
+
error: r.error
|
|
818
|
+
}))
|
|
819
|
+
});
|
|
820
|
+
for (const r of caseResults) {
|
|
821
|
+
if (typeof r.duration_ms === "number") durations.push(r.duration_ms);
|
|
822
|
+
}
|
|
823
|
+
}
|
|
824
|
+
const finished_at = (/* @__PURE__ */ new Date()).toISOString();
|
|
825
|
+
const report = {
|
|
826
|
+
batch_id,
|
|
827
|
+
started_at,
|
|
828
|
+
finished_at,
|
|
829
|
+
project: {
|
|
830
|
+
projectName: this.project.projectName,
|
|
831
|
+
version: this.project.version,
|
|
832
|
+
description: this.project.description
|
|
833
|
+
},
|
|
834
|
+
summary: {
|
|
835
|
+
total_cases,
|
|
836
|
+
passed_cases,
|
|
837
|
+
failed_cases,
|
|
838
|
+
pass_rate: total_cases > 0 ? passed_cases / total_cases : 0
|
|
839
|
+
},
|
|
840
|
+
suites
|
|
841
|
+
};
|
|
842
|
+
const batch_dir = await this.maybeWriteBatchArtifacts(
|
|
843
|
+
batch_id,
|
|
844
|
+
report,
|
|
845
|
+
results
|
|
846
|
+
);
|
|
847
|
+
console.log(`
|
|
848
|
+
=== Summary ===`);
|
|
849
|
+
console.log(`Total: ${report.summary.total_cases} | Passed: ${report.summary.passed_cases} | Failed: ${report.summary.failed_cases} | Pass Rate: ${(report.summary.pass_rate * 100).toFixed(2)}%`);
|
|
850
|
+
if (batch_dir) {
|
|
851
|
+
console.log(`
|
|
852
|
+
Results saved to: ${batch_dir}`);
|
|
853
|
+
}
|
|
854
|
+
return { batch_id, batch_dir, results, report };
|
|
855
|
+
}
|
|
856
|
+
generateCaseMarkdown(index, suiteName, caseResult, payload) {
|
|
857
|
+
const lines = [];
|
|
858
|
+
const status = caseResult.result?.pass ? "\u2705 PASS" : "\u274C FAIL";
|
|
859
|
+
lines.push(`# Test ${index}: ${status}`);
|
|
860
|
+
lines.push(``);
|
|
861
|
+
lines.push(`- **Suite**: ${suiteName}`);
|
|
862
|
+
lines.push(`- **Case ID**: ${caseResult.caseId}`);
|
|
863
|
+
lines.push(`- **Status**: ${caseResult.result?.pass ? "PASS" : "FAIL"}`);
|
|
864
|
+
if (typeof payload.duration === "number") {
|
|
865
|
+
lines.push(`- **Duration**: ${(payload.duration / 1e3).toFixed(2)}s`);
|
|
866
|
+
}
|
|
867
|
+
if (payload.threadId) {
|
|
868
|
+
lines.push(`- **Thread ID**: ${payload.threadId}`);
|
|
869
|
+
}
|
|
870
|
+
if (payload.judgeThreadId) {
|
|
871
|
+
lines.push(`- **Judge Thread ID**: ${payload.judgeThreadId}`);
|
|
872
|
+
}
|
|
873
|
+
lines.push(``);
|
|
874
|
+
if (caseResult.result) {
|
|
875
|
+
lines.push(`## Result`);
|
|
876
|
+
lines.push(``);
|
|
877
|
+
lines.push(`- **Final Score**: ${caseResult.result.final_score}`);
|
|
878
|
+
lines.push(`- **Summary**: ${caseResult.result.summary || "N/A"}`);
|
|
879
|
+
lines.push(``);
|
|
880
|
+
if (caseResult.result.dimension_results && caseResult.result.dimension_results.length > 0) {
|
|
881
|
+
lines.push(`## Dimension Results`);
|
|
882
|
+
lines.push(``);
|
|
883
|
+
for (const dim of caseResult.result.dimension_results) {
|
|
884
|
+
lines.push(`### ${dim.name}`);
|
|
885
|
+
lines.push(`- **Score**: ${dim.score}`);
|
|
886
|
+
lines.push(`- **Reason**: ${dim.reason}`);
|
|
887
|
+
lines.push(``);
|
|
888
|
+
}
|
|
889
|
+
}
|
|
890
|
+
}
|
|
891
|
+
if (caseResult.error) {
|
|
892
|
+
lines.push(`## Error`);
|
|
893
|
+
lines.push(``);
|
|
894
|
+
lines.push(`\`\`\``);
|
|
895
|
+
lines.push(caseResult.error);
|
|
896
|
+
if (caseResult.error_stack) {
|
|
897
|
+
lines.push(``);
|
|
898
|
+
lines.push(caseResult.error_stack);
|
|
899
|
+
}
|
|
900
|
+
lines.push(`\`\`\``);
|
|
901
|
+
lines.push(``);
|
|
902
|
+
}
|
|
903
|
+
if (payload.finalOutput) {
|
|
904
|
+
lines.push(`## Final Output`);
|
|
905
|
+
lines.push(``);
|
|
906
|
+
lines.push(`\`\`\``);
|
|
907
|
+
const output = payload.finalOutput.length > 5e3 ? payload.finalOutput.substring(0, 5e3) + "\n\n... (truncated, see JSON for full output)" : payload.finalOutput;
|
|
908
|
+
lines.push(output);
|
|
909
|
+
lines.push(`\`\`\``);
|
|
910
|
+
lines.push(``);
|
|
911
|
+
}
|
|
912
|
+
if (payload.testPrompt) {
|
|
913
|
+
lines.push(`## Test Prompt`);
|
|
914
|
+
lines.push(``);
|
|
915
|
+
lines.push(`\`\`\``);
|
|
916
|
+
const prompt = payload.testPrompt.length > 5e3 ? payload.testPrompt.substring(0, 5e3) + "\n\n... (truncated, see JSON for full prompt)" : payload.testPrompt;
|
|
917
|
+
lines.push(prompt);
|
|
918
|
+
lines.push(`\`\`\``);
|
|
919
|
+
lines.push(``);
|
|
920
|
+
}
|
|
921
|
+
return lines.join("\n");
|
|
922
|
+
}
|
|
923
|
+
generateMarkdownSummary(batch_id, report, results) {
|
|
924
|
+
const lines = [];
|
|
925
|
+
lines.push(`# Lattice Eval Batch Summary`);
|
|
926
|
+
lines.push(``);
|
|
927
|
+
lines.push(`- **Project**: ${report.project.projectName}`);
|
|
928
|
+
if (report.project.version) lines.push(`- **Version**: ${report.project.version}`);
|
|
929
|
+
if (report.project.description) lines.push(`- **Description**: ${report.project.description}`);
|
|
930
|
+
lines.push(`- **Batch ID**: ${batch_id}`);
|
|
931
|
+
lines.push(`- **Started**: ${report.started_at}`);
|
|
932
|
+
lines.push(`- **Finished**: ${report.finished_at}`);
|
|
933
|
+
lines.push(``);
|
|
934
|
+
lines.push(`## Overview`);
|
|
935
|
+
lines.push(``);
|
|
936
|
+
lines.push(`| Metric | Value |`);
|
|
937
|
+
lines.push(`|---|---:|`);
|
|
938
|
+
lines.push(`| Total cases | ${report.summary.total_cases} |`);
|
|
939
|
+
lines.push(`| Passed | ${report.summary.passed_cases} |`);
|
|
940
|
+
lines.push(`| Failed | ${report.summary.failed_cases} |`);
|
|
941
|
+
lines.push(`| Pass rate | ${(report.summary.pass_rate * 100).toFixed(2)}% |`);
|
|
942
|
+
lines.push(``);
|
|
943
|
+
lines.push(`## Suites`);
|
|
944
|
+
lines.push(``);
|
|
945
|
+
for (const suite of report.suites) {
|
|
946
|
+
lines.push(`### ${suite.suiteName}`);
|
|
947
|
+
lines.push(``);
|
|
948
|
+
lines.push(`| Case | Status | Score | Duration (ms) | Thread |`);
|
|
949
|
+
lines.push(`|---|---|---:|---:|---|`);
|
|
950
|
+
const suiteResults = results.get(suite.suiteName) || [];
|
|
951
|
+
for (const r of suiteResults) {
|
|
952
|
+
const status = r.result?.pass ? "PASS" : "FAIL";
|
|
953
|
+
const score = r.result?.final_score ?? "";
|
|
954
|
+
const dur = typeof r.duration_ms === "number" ? r.duration_ms : "";
|
|
955
|
+
const thread = r.thread_id ?? "";
|
|
956
|
+
lines.push(`| ${r.caseId} | ${status} | ${score} | ${dur} | ${thread} |`);
|
|
957
|
+
}
|
|
958
|
+
lines.push(``);
|
|
959
|
+
}
|
|
960
|
+
return lines.join("\n");
|
|
961
|
+
}
|
|
962
|
+
async maybeWriteBatchArtifacts(batch_id, report, results) {
|
|
963
|
+
const config = this.reportConfig;
|
|
964
|
+
if (!config?.output_dir) return void 0;
|
|
965
|
+
const batchDir = path.join(config.output_dir, batch_id);
|
|
966
|
+
await mkdir(batchDir, { recursive: true });
|
|
967
|
+
const writeReportJson = config.write_report_json ?? true;
|
|
968
|
+
const writeCaseLogs = config.write_case_logs ?? true;
|
|
969
|
+
if (writeReportJson) {
|
|
970
|
+
await writeFile(
|
|
971
|
+
path.join(batchDir, "report.json"),
|
|
972
|
+
JSON.stringify(report, null, 2),
|
|
973
|
+
"utf-8"
|
|
974
|
+
);
|
|
975
|
+
}
|
|
976
|
+
const resultsJsonPath = path.join(batchDir, "results.json");
|
|
977
|
+
const resultsJson = {
|
|
978
|
+
executionTimestamp: batch_id,
|
|
979
|
+
summary: report.summary,
|
|
980
|
+
report,
|
|
981
|
+
results: Array.from(results.entries()).map(([suiteName, caseResults]) => ({
|
|
982
|
+
suiteName,
|
|
983
|
+
cases: caseResults.map((r) => ({
|
|
984
|
+
caseId: r.caseId,
|
|
985
|
+
passed: r.result?.pass === true,
|
|
986
|
+
message: r.result?.summary || r.error || "",
|
|
987
|
+
error: r.error ? {
|
|
988
|
+
message: r.error,
|
|
989
|
+
stack: r.error_stack
|
|
990
|
+
} : void 0,
|
|
991
|
+
duration: r.duration_ms,
|
|
992
|
+
testPrompt: r.test_prompt,
|
|
993
|
+
finalOutput: r.final_output,
|
|
994
|
+
threadId: r.thread_id,
|
|
995
|
+
judgeThreadId: r.judge_thread_id
|
|
996
|
+
}))
|
|
997
|
+
}))
|
|
998
|
+
};
|
|
999
|
+
await writeFile(resultsJsonPath, JSON.stringify(resultsJson, null, 2), "utf-8");
|
|
1000
|
+
const summaryMdPath = path.join(batchDir, "summary.md");
|
|
1001
|
+
const summaryMd = this.generateMarkdownSummary(batch_id, report, results);
|
|
1002
|
+
await writeFile(summaryMdPath, summaryMd, "utf-8");
|
|
1003
|
+
const individualDir = path.join(batchDir, "individual");
|
|
1004
|
+
await mkdir(individualDir, { recursive: true });
|
|
1005
|
+
let index = 1;
|
|
1006
|
+
for (const [suiteName, caseResults] of results.entries()) {
|
|
1007
|
+
for (const r of caseResults) {
|
|
1008
|
+
const status = r.result?.pass ? "PASS" : "FAIL";
|
|
1009
|
+
const baseFilename = `test-${index}-${suiteName}-${r.caseId}-${status}`.replace(/[\/\\]/g, "_");
|
|
1010
|
+
const jsonPath = path.join(individualDir, `${baseFilename}.json`);
|
|
1011
|
+
const payload = {
|
|
1012
|
+
index,
|
|
1013
|
+
suiteName,
|
|
1014
|
+
caseId: r.caseId,
|
|
1015
|
+
passed: r.result?.pass === true,
|
|
1016
|
+
result: r.result,
|
|
1017
|
+
message: r.result?.summary || r.error || "",
|
|
1018
|
+
error: r.error ? { message: r.error, stack: r.error_stack } : void 0,
|
|
1019
|
+
duration: r.duration_ms,
|
|
1020
|
+
threadId: r.thread_id,
|
|
1021
|
+
judgeThreadId: r.judge_thread_id,
|
|
1022
|
+
finalOutput: r.final_output,
|
|
1023
|
+
testPrompt: r.test_prompt
|
|
1024
|
+
};
|
|
1025
|
+
await writeFile(jsonPath, JSON.stringify(payload, null, 2), "utf-8");
|
|
1026
|
+
const mdPath = path.join(individualDir, `${baseFilename}.md`);
|
|
1027
|
+
const mdContent = this.generateCaseMarkdown(index, suiteName, r, payload);
|
|
1028
|
+
await writeFile(mdPath, mdContent, "utf-8");
|
|
1029
|
+
index += 1;
|
|
1030
|
+
}
|
|
1031
|
+
}
|
|
1032
|
+
if (writeCaseLogs) {
|
|
1033
|
+
for (const [suiteName, caseResults] of results.entries()) {
|
|
1034
|
+
const suiteDir = path.join(batchDir, "cases", suiteName);
|
|
1035
|
+
await mkdir(suiteDir, { recursive: true });
|
|
1036
|
+
for (const r of caseResults) {
|
|
1037
|
+
await writeFile(
|
|
1038
|
+
path.join(suiteDir, `${r.caseId}.logs.json`),
|
|
1039
|
+
JSON.stringify(r.logs || [], null, 2),
|
|
1040
|
+
"utf-8"
|
|
1041
|
+
);
|
|
1042
|
+
}
|
|
1043
|
+
}
|
|
1044
|
+
}
|
|
1045
|
+
return batchDir;
|
|
1046
|
+
}
|
|
1047
|
+
};
|
|
1048
|
+
export {
|
|
1049
|
+
LatticeEval,
|
|
1050
|
+
LatticeEvalProject,
|
|
1051
|
+
LatticeEvalSuite,
|
|
1052
|
+
evaluateLatticeCase,
|
|
1053
|
+
evaluateLatticeCaseWithLogs
|
|
1054
|
+
};
|
|
1055
|
+
//# sourceMappingURL=index.mjs.map
|