@axiom-lattice/agent-eval 2.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env +29 -0
- package/.turbo/turbo-build.log +20 -0
- package/CHANGELOG.md +10 -0
- package/LICENSE +201 -0
- package/dist/index.d.mts +366 -0
- package/dist/index.d.ts +366 -0
- package/dist/index.js +1092 -0
- package/dist/index.js.map +1 -0
- package/dist/index.mjs +1055 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +51 -0
- package/src/LatticeEval.ts +615 -0
- package/src/LatticeEvalProject.ts +496 -0
- package/src/LatticeEvalSuite.ts +321 -0
- package/src/index.ts +4 -0
- package/src/test.ts +23 -0
- package/src/types.ts +160 -0
- package/tsconfig.json +33 -0
|
@@ -0,0 +1,615 @@
|
|
|
1
|
+
import { getAgentClient } from "@axiom-lattice/core";
|
|
2
|
+
import { HumanMessage } from "@langchain/core/messages";
|
|
3
|
+
import { v4 } from "uuid";
|
|
4
|
+
import type {
|
|
5
|
+
LatticeEvalCase,
|
|
6
|
+
LatticeEvalLogEvent,
|
|
7
|
+
LatticeEvalLogLevel,
|
|
8
|
+
LatticeEvalRubric,
|
|
9
|
+
LatticeEvalResult,
|
|
10
|
+
OutputType,
|
|
11
|
+
LatticeAgentStepConfig,
|
|
12
|
+
} from "./types";
|
|
13
|
+
|
|
14
|
+
/**
 * Configuration for the Lattice evaluation server.
 */
export interface LatticeEvalConfig {
  /**
   * Base URL of the Lattice server. Request paths such as `/api/runs` are
   * appended to this value.
   */
  base_url: string;
  /**
   * Optional API key.
   * NOTE(review): no code visible in this file reads this field — confirm
   * where (or whether) it is consumed before relying on it.
   */
  api_key?: string;
  /**
   * When true, prints detailed execution logs for each action.
   * Defaults to true.
   */
  verbose?: boolean;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Outcome of running one evaluation case, including the captured logs.
 *
 * On success `result` is set; on failure `error` (and, when available,
 * `error_stack`) is set instead.
 */
export interface LatticeEvalCaseRunResult {
  /** Identifier of the evaluated case. */
  caseId: string;
  /** Evaluation result; present only when the run completed. */
  result?: LatticeEvalResult;
  /** Error message; present only when the run failed. */
  error?: string;
  /** Stack trace of the failure, when the thrown value was an `Error`. */
  error_stack?: string;
  /** Duration of the run in milliseconds. */
  duration_ms: number;
  /** Thread id used for the agent steps. */
  thread_id?: string;
  /** Thread id used for the judge agent invocation. */
  judge_thread_id?: string;
  /** Prompt that was sent to the judge agent. */
  test_prompt?: string;
  /** Final output extracted from the agent run. */
  final_output?: string;
  /** Log events recorded during the run. */
  logs: LatticeEvalLogEvent[];
}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
/** Validated shape of the verdict object the judge agent is asked to return. */
type JudgeVerdict = {
  pass?: boolean;
  final_score?: number;
  dimension_results?: Array<{ name: string; score: number; reason: string }>;
  summary?: string;
};

/**
 * Runs Lattice evaluation cases: executes the case's agent steps against the
 * Lattice server, extracts the final output, asks the "LatticeTest" judge
 * agent to grade it against the rubrics, and turns the judge's answer into a
 * pass/score result.
 *
 * Per-run state (logs, thread ids, prompt, output, duration) is stored on the
 * instance and reset at the start of every `evaluateCase` call, so one
 * instance should not run several cases concurrently.
 */
export class LatticeEval {
  private config: LatticeEvalConfig;
  private baseUrl: string;
  private verbose: boolean;
  private inMemoryLogs: LatticeEvalLogEvent[] = [];
  private lastThreadId?: string;
  private lastJudgeThreadId?: string;
  private lastTestPrompt?: string;
  private lastFinalOutput?: string;
  private lastDurationMs: number = 0;

  /**
   * Metadata of the most recent `evaluateCase` call.
   * Fields that were not reached before a failure are `undefined`.
   */
  public getLastRunMeta() {
    return {
      duration_ms: this.lastDurationMs,
      thread_id: this.lastThreadId,
      judge_thread_id: this.lastJudgeThreadId,
      test_prompt: this.lastTestPrompt,
      final_output: this.lastFinalOutput,
    };
  }

  /**
   * Create a new LatticeEval instance.
   * @param config Server configuration (required). `verbose` defaults to true.
   */
  constructor(config: LatticeEvalConfig) {
    this.config = config;
    // Strip trailing slashes so `${baseUrl}/api/...` never contains `//`.
    this.baseUrl = (this.config.base_url ?? "").replace(/\/+$/, "");
    this.verbose = this.config.verbose ?? true;
  }

  /** Returns a shallow copy of the log events recorded for the current/last run. */
  public getInMemoryLogs(): LatticeEvalLogEvent[] {
    return [...this.inMemoryLogs];
  }

  /**
   * Records a log event in memory and, when verbose, echoes only the key
   * events (errors and case start/end) to the console.
   * @param level Log level of the event
   * @param message Human-readable message
   * @param data Optional structured payload stored with the event
   */
  public record(
    level: LatticeEvalLogLevel,
    message: string,
    data?: Record<string, unknown>
  ) {
    const event: LatticeEvalLogEvent = {
      ts: new Date().toISOString(),
      level,
      message,
      data,
    };
    this.inMemoryLogs.push(event);

    if (!this.verbose) return;
    // Simple console output - only show key info, no verbose details
    if (level === "error") {
      const keyInfo = this.getKeyInfo(data);
      console.log(` ✗ ${message}${keyInfo ? ` ${keyInfo}` : ""}`);
    } else if (message.includes("Starting case") || message.includes("Case evaluation completed")) {
      // Only show start/end of case evaluation
      const keyInfo = this.getKeyInfo(data);
      console.log(` ${message}${keyInfo ? ` ${keyInfo}` : ""}`);
    }
    // Other events are kept in memory only (available via getInMemoryLogs()).
  }

  /** Records an info-level event. */
  private log(message: string, data?: Record<string, unknown>) {
    this.record("info", message, data);
  }

  /** Formats the case id, pass flag, score and error of a payload as `(k=v ...)`. */
  private getKeyInfo(data?: Record<string, unknown>): string {
    if (!data) return "";
    const parts: string[] = [];
    // Only show case_id, pass, final_score, error - key info only
    if (data.case_id) parts.push(`case=${data.case_id}`);
    if (data.pass !== undefined) parts.push(`pass=${data.pass}`);
    if (data.final_score !== undefined) parts.push(`score=${data.final_score}`);
    if (data.error) parts.push(`error=${data.error}`);
    return parts.length > 0 ? `(${parts.join(" ")})` : "";
  }

  /**
   * Converts message content to plain text. Strings pass through; arrays of
   * content parts are concatenated (using each part's `text` when it is a
   * string, otherwise its JSON form); other values are JSON/string-converted.
   */
  private static contentToText(content: unknown): string {
    if (typeof content === "string") return content;
    if (content === undefined || content === null) return "";
    if (Array.isArray(content)) {
      return content
        .map((part) => {
          if (typeof part === "string") return part;
          if (
            part !== null &&
            typeof part === "object" &&
            typeof (part as { text?: unknown }).text === "string"
          ) {
            return (part as { text: string }).text;
          }
          return JSON.stringify(part);
        })
        .join("");
    }
    return typeof content === "object" ? JSON.stringify(content) : String(content);
  }

  /** Returns a finite number for numeric input (number or numeric string), else undefined. */
  private static toFiniteNumber(value: unknown): number | undefined {
    if (typeof value === "number") return Number.isFinite(value) ? value : undefined;
    if (typeof value === "string" && value.trim() !== "") {
      const parsed = Number(value);
      return Number.isFinite(parsed) ? parsed : undefined;
    }
    return undefined;
  }

  /** Keeps only the well-typed fields of a parsed judge answer. */
  private static normalizeVerdict(raw: unknown): JudgeVerdict {
    if (typeof raw !== "object" || raw === null) return {};
    const source = raw as Record<string, unknown>;
    const verdict: JudgeVerdict = {};

    if (typeof source.pass === "boolean") {
      verdict.pass = source.pass;
    } else if (source.pass === "true" || source.pass === "false") {
      verdict.pass = source.pass === "true";
    }

    const score = LatticeEval.toFiniteNumber(source.final_score);
    if (score !== undefined) verdict.final_score = score;

    if (Array.isArray(source.dimension_results)) {
      const entries: unknown[] = source.dimension_results;
      verdict.dimension_results = entries
        .filter(
          (entry): entry is Record<string, unknown> =>
            typeof entry === "object" && entry !== null
        )
        .map((entry) => ({
          name: String(entry.name ?? ""),
          score: LatticeEval.toFiniteNumber(entry.score) ?? 0,
          reason: typeof entry.reason === "string" ? entry.reason : "",
        }));
    }

    if (typeof source.summary === "string") verdict.summary = source.summary;
    return verdict;
  }

  /**
   * Keyword heuristic used when the judge returned no usable JSON.
   * Negative phrases are checked first because the positive keywords are
   * substrings of them ("不符合" contains "符合", "not pass" contains "pass").
   */
  private static inferPassFromKeywords(text: string): boolean {
    const lowered = text.toLowerCase();
    const negative = ["fail", "not pass", "unsuccess", "未通过", "不通过", "不符合"];
    if (negative.some((keyword) => lowered.includes(keyword))) return false;
    const positive = ["pass", "success", "通过", "符合"];
    return positive.some((keyword) => lowered.includes(keyword));
  }

  /**
   * Execute a single agent step and return the thread ID and response data.
   * @throws Error when the server responds with an error payload, a non-2xx
   *   status, or a body that is not valid JSON.
   */
  private async executeAgentStep(
    step: LatticeAgentStepConfig,
    threadId: string,
    inputMessage: string,
    files: Record<string, string>
  ): Promise<{ threadId: string; responseData: any }> {
    const message = step.override_input_message || inputMessage;
    const inputFiles = files || {};
    this.log("Executing agent step", {
      agent_id: step.agent_id,
      thread_id: threadId,
      has_override_input_message: Boolean(step.override_input_message),
      input_message_length: message.length,
      files_count: Object.keys(inputFiles).length,
    });

    // One timestamp for the whole request so created_at === modified_at.
    const timestamp = new Date().toISOString();
    const filesPayload: Record<
      string,
      { content: string[]; created_at: string; modified_at: string }
    > = {};
    for (const [path, text] of Object.entries(inputFiles)) {
      filesPayload[path] = {
        content: text.split("\n"),
        created_at: timestamp,
        modified_at: timestamp,
      };
    }

    const response = await fetch(`${this.baseUrl}/api/runs`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        assistant_id: step.agent_id,
        thread_id: threadId,
        files: filesPayload,
        message,
      }),
    });

    let responseData: any;
    try {
      responseData = await response.json();
    } catch (parseError) {
      const detail = parseError instanceof Error ? parseError.message : String(parseError);
      this.log("Agent step failed", {
        agent_id: step.agent_id,
        thread_id: threadId,
        status: response.status,
        error: `Invalid JSON response: ${detail}`,
      });
      throw new Error(
        `Failed to run agent ${step.agent_id}: HTTP ${response.status} returned a non-JSON body (${detail})`
      );
    }

    // An explicit error payload wins; otherwise a non-2xx status is a failure too.
    const failure =
      responseData?.error ||
      (response.ok ? undefined : `HTTP ${response.status} ${response.statusText}`);
    if (failure) {
      this.log("Agent step failed", {
        agent_id: step.agent_id,
        thread_id: threadId,
        error: failure,
      });
      throw new Error(
        `Failed to run agent ${step.agent_id}: ${
          typeof failure === "string" ? failure : JSON.stringify(failure)
        }`
      );
    }

    this.log("Agent step completed", {
      agent_id: step.agent_id,
      thread_id: threadId,
      response_keys: responseData ? Object.keys(responseData) : [],
    });
    return { threadId, responseData };
  }

  /**
   * Extract output content based on OutputType.
   *
   * For `file_content` the file is read from the thread state fetched from the
   * server; for any other type the content of the last message in the run
   * response is used.
   * @throws Error when the state request fails, the file is missing, or the
   *   run response contains no messages.
   */
  private async extractOutput(
    outputType: OutputType,
    agentId: string,
    threadId: string,
    runResponseData?: any
  ): Promise<string> {
    if (outputType.type === "file_content") {
      this.log("Extracting file output", {
        agent_id: agentId,
        thread_id: threadId,
        file_path: outputType.file_path,
      });
      // NOTE(review): ids are interpolated into the path unescaped, as before;
      // ids containing "/", "?" or "#" would change the request target.
      const responseState = await fetch(
        `${this.baseUrl}/api/assistants/${agentId}/${threadId}/state`,
        {
          method: "GET",
        }
      );

      if (!responseState.ok) {
        this.log("Failed to fetch assistant state", {
          agent_id: agentId,
          thread_id: threadId,
          status: responseState.status,
          statusText: responseState.statusText,
        });
        throw new Error(`Failed to get state: ${responseState.statusText}`);
      }

      const state: any = await responseState.json();
      // Optional chaining: a state without `values.files` means "file missing",
      // not a TypeError.
      const fileContent = state?.values?.files?.[outputType.file_path]?.content;

      if (fileContent === undefined || fileContent === null) {
        this.log("File output not found in state", {
          agent_id: agentId,
          thread_id: threadId,
          file_path: outputType.file_path,
        });
        throw new Error(
          `File not found in output: ${outputType.file_path}`
        );
      }
      // Files are sent as arrays of lines (see executeAgentStep), so join them back.
      const content = Array.isArray(fileContent)
        ? fileContent.join("\n")
        : LatticeEval.contentToText(fileContent);
      this.log("File output extracted", {
        agent_id: agentId,
        thread_id: threadId,
        file_path: outputType.file_path,
        output_length: content.length,
      });
      return content;
    }

    // For message_content type, get the last message from the run response
    const messages = runResponseData?.messages;
    if (Array.isArray(messages) && messages.length > 0) {
      const content = LatticeEval.contentToText(messages[messages.length - 1]?.content);
      this.log("Message output extracted", {
        agent_id: agentId,
        thread_id: threadId,
        output_length: content.length,
      });
      return content;
    }
    this.log("No message content found in run response", {
      agent_id: agentId,
      thread_id: threadId,
    });
    throw new Error("No message content found in run response");
  }

  /**
   * Builds the judge prompt. The template text is sent to the judge verbatim,
   * which is why its lines are not indented with the surrounding code.
   */
  private buildJudgePrompt(
    evalCase: LatticeEvalCase,
    finalOutput: string,
    evalRubrics: LatticeEvalRubric[]
  ): string {
    const inputFiles = evalCase.input.files;
    const testCaseFilesContent = inputFiles
      ? Object.keys(inputFiles)
          .map((key: string) => `File name: ${key}\nFile content: ${inputFiles[key]}`)
          .join("\n\n")
      : "";

    const outputTypeDescription =
      evalCase.output.type === "file_content"
        ? `虚拟产物(文件:${evalCase.output.file_path})`
        : "消息内容";

    const rubricsSection = `\n## 评估指标(Evaluation Rubrics)\n${evalRubrics
      .map((r) => `- **${r.dimension}**(权重:${r.weight}):${r.description}`)
      .join("\n")}`;

    return `# 角色
你是一名资深的 AI Agent 评估专家,负责根据预设的指标(Rubrics)对 Agent 的执行结果进行"黑盒测试"判定。

# 输入信息
测试框架将为你提供以下四个核心上下文:

1. **用户意图(User Intent)**:${evalCase.input.message}

2. **输入文件(Input Files)**:${testCaseFilesContent || "无"}

3. **实际输出(Actual Output,${outputTypeDescription})**:
${finalOutput}

4. **期望输出描述(Expected Output Description)**:${evalCase.eval.content_assertion}
${rubricsSection}

# 任务
你必须严格对照"评估指标(Evaluation Rubrics)"中的每一项指标,分析"实际输出(Actual Output)"是否达标。

# 规则
1. **客观性**:仅根据提供的上下文判定。如果标准要求"包含数字",但输出只有文字,即使语气再好也必须扣分。
2. **结果校验**:如果"实际输出"中缺失预期的内容,或内容不符合"评估指标"中的标准,对应的指标应判定为失败。
3. **证据导向**:在给出原因(reason)时,必须引用输出中的原文或虚拟产物中的具体数据片段。
4. **加权计算**:最终分数为各项指标得分与其权重的乘积之和(0-100分制)。

# 输出格式(仅JSON)
你必须仅以 JSON 格式回复,结构如下:
{
"pass": true | false,
"final_score": number,
"dimension_results": [
{
"name": "指标名称",
"score": number,
"reason": "具体的扣分或给分理由,需引用证据"
}
],
"summary": "对 Agent 表现的整体评价"
}

注意:如果 final_score >= 80 且没有致命性错误,pass 应为 true;否则为 false。`;
  }

  /**
   * Extracts and validates the JSON verdict from the judge's text.
   * Returns an empty verdict when no JSON is found or it cannot be parsed,
   * which makes the caller fall back to keyword-based parsing.
   */
  private parseJudgeVerdict(judgeOutput: string, caseId: string): JudgeVerdict {
    try {
      // Prefer a fenced ```json block; otherwise take the outermost {...} span.
      const jsonMatch =
        judgeOutput.match(/```(?:json)?\s*(\{[\s\S]*\})\s*```/) ||
        judgeOutput.match(/\{[\s\S]*\}/);
      if (!jsonMatch) {
        this.log("No JSON detected in judge output; will fallback", {
          case_id: caseId,
        });
        return {};
      }
      const parsed: unknown = JSON.parse(jsonMatch[1] || jsonMatch[0]);
      this.log("Parsed judge JSON successfully", {
        case_id: caseId,
        parsed_keys: parsed !== null && typeof parsed === "object" ? Object.keys(parsed) : [],
      });
      return LatticeEval.normalizeVerdict(parsed);
    } catch (error) {
      // If JSON parsing fails, fall back to keyword-based parsing
      if (this.verbose) {
        console.warn("Failed to parse JSON from judge agent response, falling back to keyword-based parsing:", error);
      }
      this.log("Failed to parse judge JSON; falling back", {
        case_id: caseId,
        error: error instanceof Error ? error.message : String(error),
      });
      return {};
    }
  }

  /**
   * Evaluate a single Lattice evaluation case
   * @param evalCase The evaluation case to run
   * @returns Evaluation result with pass/fail status and scores
   * @throws Error when an agent step, output extraction or the judge call fails.
   *   The run duration is recorded in both the success and the failure case.
   */
  async evaluateCase(evalCase: LatticeEvalCase): Promise<LatticeEvalResult> {
    const startedAt = Date.now();
    const threadId = `${evalCase.caseId}||${v4()}`;
    this.inMemoryLogs = [];
    this.lastThreadId = threadId;
    this.lastJudgeThreadId = undefined;
    this.lastTestPrompt = undefined;
    this.lastFinalOutput = undefined;
    this.lastDurationMs = 0;

    try {
      return await this.runCase(evalCase, threadId);
    } finally {
      // In `finally` so that failed runs report their real duration, not 0.
      this.lastDurationMs = Date.now() - startedAt;
      this.record("info", "Case duration recorded", {
        case_id: evalCase.caseId,
        duration_ms: this.lastDurationMs,
      });
    }
  }

  /** Runs the agent steps, the judge, and the scoring for one case. */
  private async runCase(
    evalCase: LatticeEvalCase,
    threadId: string
  ): Promise<LatticeEvalResult> {
    this.log("Starting case evaluation", {
      case_id: evalCase.caseId,
      thread_id: threadId,
      steps_count: evalCase.steps?.length,
      output_type: evalCase.output?.type,
    });

    if (!Array.isArray(evalCase.steps) || evalCase.steps.length === 0) {
      throw new Error(
        `Evaluation case ${evalCase.caseId} has no agent steps to execute`
      );
    }

    // Execute all agent steps sequentially
    let currentThreadId = threadId;
    let lastResponseData: any = null;
    for (const step of evalCase.steps) {
      const stepResult = await this.executeAgentStep(
        step,
        currentThreadId,
        evalCase.input.message,
        evalCase.input.files || {}
      );
      currentThreadId = stepResult.threadId;
      lastResponseData = stepResult.responseData;
    }

    // Get the final agent ID from the last step
    const finalAgentId =
      evalCase.steps[evalCase.steps.length - 1]?.agent_id || "";
    this.log("All agent steps completed", {
      case_id: evalCase.caseId,
      final_agent_id: finalAgentId,
      final_thread_id: currentThreadId,
    });

    // Extract output based on output type
    const finalOutput = await this.extractOutput(
      evalCase.output,
      finalAgentId,
      currentThreadId,
      lastResponseData
    );
    this.lastFinalOutput = finalOutput;
    this.log("Final output extracted", {
      case_id: evalCase.caseId,
      output_type: evalCase.output.type,
      output_length: finalOutput.length,
    });

    // Use the case's rubrics, or a single "correctness" rubric when none are given.
    const defaultRubrics: LatticeEvalRubric[] = [
      {
        dimension: "correctness",
        weight: 100,
        description: "整体正确性,是否符合预期输出描述。",
      },
    ];
    const evalRubrics =
      evalCase.eval.eval_rubrics && evalCase.eval.eval_rubrics.length > 0
        ? evalCase.eval.eval_rubrics
        : defaultRubrics;
    this.log("Prepared evaluation rubrics", {
      case_id: evalCase.caseId,
      rubrics_count: evalRubrics.length,
      rubric_dimensions: evalRubrics.map((r) => r.dimension),
    });

    const testPrompt = this.buildJudgePrompt(evalCase, finalOutput, evalRubrics);
    this.lastTestPrompt = testPrompt;

    // Invoke judge agent
    const judgeThreadId = v4();
    this.lastJudgeThreadId = judgeThreadId;
    this.log("Invoking judge agent", { agent_key: "LatticeTest", case_id: evalCase.caseId });
    const testResponse = await getAgentClient("LatticeTest").invoke(
      {
        messages: [new HumanMessage(testPrompt)],
      },
      {
        configurable: {
          thread_id: judgeThreadId,
        },
      }
    );
    const judgeMessages = testResponse?.messages ?? [];
    this.log("Judge agent responded", {
      case_id: evalCase.caseId,
      messages_count: testResponse?.messages?.length,
    });

    // Normalise to text: content may be a string or an array of content parts.
    const testResultContent = LatticeEval.contentToText(
      judgeMessages[judgeMessages.length - 1]?.content
    );
    this.log("Judge raw output received", {
      case_id: evalCase.caseId,
      output_length: testResultContent.length,
    });

    // Parse JSON response from judge agent
    const parsedResult = this.parseJudgeVerdict(testResultContent, evalCase.caseId);

    // Determine pass status
    let pass: boolean;
    if (parsedResult.pass !== undefined) {
      pass = parsedResult.pass;
      this.log("Pass determined from parsedResult.pass", { case_id: evalCase.caseId, pass });
    } else if (parsedResult.final_score !== undefined) {
      pass = parsedResult.final_score >= 80;
      this.log("Pass determined from parsedResult.final_score", {
        case_id: evalCase.caseId,
        final_score: parsedResult.final_score,
        pass,
      });
    } else {
      // Fallback to keyword-based parsing
      pass = LatticeEval.inferPassFromKeywords(testResultContent);
      this.log("Pass determined from keyword fallback", { case_id: evalCase.caseId, pass });
    }

    // Extract dimension results
    const judgedDimensions =
      parsedResult.dimension_results && parsedResult.dimension_results.length > 0
        ? parsedResult.dimension_results
        : undefined;
    let dimensionResults: Array<{ name: string; score: number; reason: string }>;
    if (judgedDimensions) {
      // Use parsed dimension results from JSON
      dimensionResults = judgedDimensions;
      this.log("Using parsed dimension_results from judge", {
        case_id: evalCase.caseId,
        dimensions_count: dimensionResults.length,
      });
    } else {
      // Fallback: placeholder entries (score 0) so callers still see every dimension
      dimensionResults = evalRubrics.map((rubric) => ({
        name: rubric.dimension,
        score: 0,
        reason: "",
      }));
      this.log("No dimension_results parsed; using rubric skeleton", {
        case_id: evalCase.caseId,
        dimensions_count: dimensionResults.length,
      });
    }

    // Calculate final score
    let finalScore: number;
    if (parsedResult.final_score !== undefined) {
      finalScore = parsedResult.final_score;
      this.log("Final score taken from parsedResult.final_score", {
        case_id: evalCase.caseId,
        final_score: finalScore,
      });
    } else if (judgedDimensions) {
      // Weighted average over the judge's scores. Only real judge scores are
      // used here; the placeholder skeleton must not turn a pass into score 0.
      const weightByDimension = new Map<string, number>();
      for (const rubric of evalRubrics) {
        weightByDimension.set(rubric.dimension, rubric.weight);
      }
      let totalWeight = 0;
      weightByDimension.forEach((weight) => {
        totalWeight += weight;
      });
      let weightedSum = 0;
      for (const dimension of judgedDimensions) {
        const known = weightByDimension.get(dimension.name);
        // Dimensions the rubrics do not list get weight 1, and that weight is
        // added to the denominator so the result stays a true weighted mean.
        const weight = known ?? 1;
        if (known === undefined) totalWeight += weight;
        weightedSum += dimension.score * weight;
      }
      if (totalWeight > 0) {
        finalScore = weightedSum / totalWeight;
      } else {
        // Fallback to simple average if no weights
        finalScore =
          judgedDimensions.reduce((sum, dimension) => sum + dimension.score, 0) /
          judgedDimensions.length;
      }
      this.log("Final score computed from dimension_results", {
        case_id: evalCase.caseId,
        final_score: finalScore,
        total_weight: totalWeight,
      });
    } else {
      finalScore = pass ? 100 : 0;
      this.log("Final score fallback (pass-based)", {
        case_id: evalCase.caseId,
        final_score: finalScore,
        pass,
      });
    }

    this.log("Case evaluation completed", {
      case_id: evalCase.caseId,
      pass,
      final_score: finalScore,
    });
    return {
      pass,
      final_score: finalScore,
      dimension_results: dimensionResults,
      summary: parsedResult.summary || testResultContent,
    };
  }
}
|
|
552
|
+
|
|
553
|
+
/**
 * Evaluate a single Lattice evaluation case (backward compatibility function).
 *
 * Creates a fresh `LatticeEval` for the call and delegates to its
 * `evaluateCase`; any error from the evaluation propagates to the caller.
 * @param evalCase The evaluation case to run
 * @param config Optional server configuration (defaults to `http://localhost:3203`)
 * @returns Evaluation result with pass/fail status and scores
 * @deprecated Use LatticeEval class instead
 */
export async function evaluateLatticeCase(
  evalCase: LatticeEvalCase,
  config?: LatticeEvalConfig
): Promise<LatticeEvalResult> {
  const effectiveConfig: LatticeEvalConfig = config ?? {
    base_url: "http://localhost:3203",
  };
  return new LatticeEval(effectiveConfig).evaluateCase(evalCase);
}
|
|
570
|
+
|
|
571
|
+
/**
 * Evaluate a single Lattice evaluation case and always return logs.
 *
 * Errors thrown by the evaluation are caught, recorded as an error-level log
 * event, and reported through the `error` / `error_stack` fields instead of
 * being rethrown.
 * @param evalCase The evaluation case to run
 * @param config Optional server configuration (defaults to `http://localhost:3203`)
 * @returns The evaluation result or the failure details, plus run metadata and logs
 */
export async function evaluateLatticeCaseWithLogs(
  evalCase: LatticeEvalCase,
  config?: LatticeEvalConfig
): Promise<LatticeEvalCaseRunResult> {
  const effectiveConfig: LatticeEvalConfig = config ?? {
    base_url: "http://localhost:3203",
  };
  const evaluator = new LatticeEval(effectiveConfig);
  const startedAt = Date.now();

  // Collects the fields shared by the success and failure results.
  const collectRunMeta = () => {
    const meta = evaluator.getLastRunMeta();
    return {
      // Fall back to the locally measured time when the evaluator reports no
      // duration, so a failed run does not show up as taking 0 ms.
      duration_ms: meta.duration_ms > 0 ? meta.duration_ms : Date.now() - startedAt,
      thread_id: meta.thread_id,
      judge_thread_id: meta.judge_thread_id,
      test_prompt: meta.test_prompt,
      final_output: meta.final_output,
      logs: evaluator.getInMemoryLogs(),
    };
  };

  try {
    const result = await evaluator.evaluateCase(evalCase);
    return {
      caseId: evalCase.caseId,
      result,
      ...collectRunMeta(),
    };
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    const errorStack = error instanceof Error ? error.stack : undefined;
    evaluator.record("error", "Case evaluation failed", {
      case_id: evalCase.caseId,
      error: errorMessage,
    });
    return {
      caseId: evalCase.caseId,
      error: errorMessage,
      error_stack: errorStack,
      ...collectRunMeta(),
    };
  }
}
|