@axiom-lattice/agent-eval 2.1.9 → 2.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +10 -10
- package/CHANGELOG.md +9 -0
- package/dist/index.d.mts +16 -1
- package/dist/index.d.ts +16 -1
- package/dist/index.js +78 -10
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +78 -10
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -3
- package/src/LatticeEval.ts +66 -2
- package/src/LatticeEvalProject.ts +19 -10
- package/src/LatticeEvalSuite.ts +3 -0
- package/src/types.ts +0 -1
package/.turbo/turbo-build.log
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
|
|
2
|
-
> @axiom-lattice/agent-eval@2.1.
|
|
2
|
+
> @axiom-lattice/agent-eval@2.1.10 build /home/runner/work/agentic/agentic/packages/agent-eval
|
|
3
3
|
> tsup src/index.ts --format cjs,esm --dts --sourcemap
|
|
4
4
|
|
|
5
5
|
[34mCLI[39m Building entry: src/index.ts
|
|
@@ -8,13 +8,13 @@
|
|
|
8
8
|
[34mCLI[39m Target: es2020
|
|
9
9
|
[34mCJS[39m Build start
|
|
10
10
|
[34mESM[39m Build start
|
|
11
|
-
[32mCJS[39m [1mdist/index.js [22m[
|
|
12
|
-
[32mCJS[39m [1mdist/index.js.map [22m[
|
|
13
|
-
[32mCJS[39m ⚡️ Build success in
|
|
14
|
-
[32mESM[39m [1mdist/index.mjs [22m[
|
|
15
|
-
[32mESM[39m [1mdist/index.mjs.map [22m[
|
|
16
|
-
[32mESM[39m ⚡️ Build success in
|
|
11
|
+
[32mCJS[39m [1mdist/index.js [22m[32m41.54 KB[39m
|
|
12
|
+
[32mCJS[39m [1mdist/index.js.map [22m[32m72.12 KB[39m
|
|
13
|
+
[32mCJS[39m ⚡️ Build success in 155ms
|
|
14
|
+
[32mESM[39m [1mdist/index.mjs [22m[32m39.35 KB[39m
|
|
15
|
+
[32mESM[39m [1mdist/index.mjs.map [22m[32m71.85 KB[39m
|
|
16
|
+
[32mESM[39m ⚡️ Build success in 159ms
|
|
17
17
|
[34mDTS[39m Build start
|
|
18
|
-
[32mDTS[39m ⚡️ Build success in
|
|
19
|
-
[32mDTS[39m [1mdist/index.d.ts [22m[32m10.
|
|
20
|
-
[32mDTS[39m [1mdist/index.d.mts [22m[32m10.
|
|
18
|
+
[32mDTS[39m ⚡️ Build success in 9956ms
|
|
19
|
+
[32mDTS[39m [1mdist/index.d.ts [22m[32m10.99 KB[39m
|
|
20
|
+
[32mDTS[39m [1mdist/index.d.mts [22m[32m10.99 KB[39m
|
package/CHANGELOG.md
CHANGED
package/dist/index.d.mts
CHANGED
|
@@ -10,7 +10,6 @@ type OutputFileContent = {
|
|
|
10
10
|
};
|
|
11
11
|
type OutputMessageContent = {
|
|
12
12
|
type: "message_content";
|
|
13
|
-
message: string;
|
|
14
13
|
};
|
|
15
14
|
type OutputType = OutputFileContent | OutputMessageContent;
|
|
16
15
|
interface LatticeEvalProjectType {
|
|
@@ -170,6 +169,11 @@ interface LatticeEvalCaseRunResult {
|
|
|
170
169
|
judge_thread_id?: string;
|
|
171
170
|
test_prompt?: string;
|
|
172
171
|
final_output?: string;
|
|
172
|
+
messages?: Array<{
|
|
173
|
+
role: string;
|
|
174
|
+
content: string;
|
|
175
|
+
id?: string;
|
|
176
|
+
}>;
|
|
173
177
|
logs: LatticeEvalLogEvent[];
|
|
174
178
|
}
|
|
175
179
|
/**
|
|
@@ -185,12 +189,18 @@ declare class LatticeEval {
|
|
|
185
189
|
private lastTestPrompt?;
|
|
186
190
|
private lastFinalOutput?;
|
|
187
191
|
private lastDurationMs;
|
|
192
|
+
private lastMessages;
|
|
188
193
|
getLastRunMeta(): {
|
|
189
194
|
duration_ms: number;
|
|
190
195
|
thread_id: string | undefined;
|
|
191
196
|
judge_thread_id: string | undefined;
|
|
192
197
|
test_prompt: string | undefined;
|
|
193
198
|
final_output: string | undefined;
|
|
199
|
+
messages: {
|
|
200
|
+
role: string;
|
|
201
|
+
content: string;
|
|
202
|
+
id?: string;
|
|
203
|
+
}[];
|
|
194
204
|
};
|
|
195
205
|
/**
|
|
196
206
|
* Create a new LatticeEval instance
|
|
@@ -255,6 +265,11 @@ interface CaseRunResult {
|
|
|
255
265
|
judge_thread_id?: string;
|
|
256
266
|
test_prompt?: string;
|
|
257
267
|
final_output?: string;
|
|
268
|
+
messages?: Array<{
|
|
269
|
+
role: string;
|
|
270
|
+
content: string;
|
|
271
|
+
id?: string;
|
|
272
|
+
}>;
|
|
258
273
|
error_stack?: string;
|
|
259
274
|
}
|
|
260
275
|
/**
|
package/dist/index.d.ts
CHANGED
|
@@ -10,7 +10,6 @@ type OutputFileContent = {
|
|
|
10
10
|
};
|
|
11
11
|
type OutputMessageContent = {
|
|
12
12
|
type: "message_content";
|
|
13
|
-
message: string;
|
|
14
13
|
};
|
|
15
14
|
type OutputType = OutputFileContent | OutputMessageContent;
|
|
16
15
|
interface LatticeEvalProjectType {
|
|
@@ -170,6 +169,11 @@ interface LatticeEvalCaseRunResult {
|
|
|
170
169
|
judge_thread_id?: string;
|
|
171
170
|
test_prompt?: string;
|
|
172
171
|
final_output?: string;
|
|
172
|
+
messages?: Array<{
|
|
173
|
+
role: string;
|
|
174
|
+
content: string;
|
|
175
|
+
id?: string;
|
|
176
|
+
}>;
|
|
173
177
|
logs: LatticeEvalLogEvent[];
|
|
174
178
|
}
|
|
175
179
|
/**
|
|
@@ -185,12 +189,18 @@ declare class LatticeEval {
|
|
|
185
189
|
private lastTestPrompt?;
|
|
186
190
|
private lastFinalOutput?;
|
|
187
191
|
private lastDurationMs;
|
|
192
|
+
private lastMessages;
|
|
188
193
|
getLastRunMeta(): {
|
|
189
194
|
duration_ms: number;
|
|
190
195
|
thread_id: string | undefined;
|
|
191
196
|
judge_thread_id: string | undefined;
|
|
192
197
|
test_prompt: string | undefined;
|
|
193
198
|
final_output: string | undefined;
|
|
199
|
+
messages: {
|
|
200
|
+
role: string;
|
|
201
|
+
content: string;
|
|
202
|
+
id?: string;
|
|
203
|
+
}[];
|
|
194
204
|
};
|
|
195
205
|
/**
|
|
196
206
|
* Create a new LatticeEval instance
|
|
@@ -255,6 +265,11 @@ interface CaseRunResult {
|
|
|
255
265
|
judge_thread_id?: string;
|
|
256
266
|
test_prompt?: string;
|
|
257
267
|
final_output?: string;
|
|
268
|
+
messages?: Array<{
|
|
269
|
+
role: string;
|
|
270
|
+
content: string;
|
|
271
|
+
id?: string;
|
|
272
|
+
}>;
|
|
258
273
|
error_stack?: string;
|
|
259
274
|
}
|
|
260
275
|
/**
|
package/dist/index.js
CHANGED
|
@@ -50,6 +50,7 @@ var LatticeEval = class {
|
|
|
50
50
|
constructor(config) {
|
|
51
51
|
this.inMemoryLogs = [];
|
|
52
52
|
this.lastDurationMs = 0;
|
|
53
|
+
this.lastMessages = [];
|
|
53
54
|
this.config = config;
|
|
54
55
|
this.baseUrl = this.config.base_url;
|
|
55
56
|
this.verbose = this.config.verbose ?? true;
|
|
@@ -60,7 +61,8 @@ var LatticeEval = class {
|
|
|
60
61
|
thread_id: this.lastThreadId,
|
|
61
62
|
judge_thread_id: this.lastJudgeThreadId,
|
|
62
63
|
test_prompt: this.lastTestPrompt,
|
|
63
|
-
final_output: this.lastFinalOutput
|
|
64
|
+
final_output: this.lastFinalOutput,
|
|
65
|
+
messages: this.lastMessages
|
|
64
66
|
};
|
|
65
67
|
}
|
|
66
68
|
getInMemoryLogs() {
|
|
@@ -78,9 +80,19 @@ var LatticeEval = class {
|
|
|
78
80
|
if (level === "error") {
|
|
79
81
|
const keyInfo = this.getKeyInfo(data);
|
|
80
82
|
console.log(` \u2717 ${message}${keyInfo ? ` ${keyInfo}` : ""}`);
|
|
81
|
-
} else if (message.
|
|
83
|
+
} else if (message.startsWith("Starting")) {
|
|
82
84
|
const keyInfo = this.getKeyInfo(data);
|
|
83
85
|
console.log(` ${message}${keyInfo ? ` ${keyInfo}` : ""}`);
|
|
86
|
+
} else if (message.includes("Case evaluation completed")) {
|
|
87
|
+
const keyInfo = this.getKeyInfo(data);
|
|
88
|
+
const pass = data?.pass;
|
|
89
|
+
const summary = data?.summary;
|
|
90
|
+
const status = pass ? "\u2713 PASS" : "\u2717 FAIL";
|
|
91
|
+
const reason = summary || (pass ? "Test passed" : "Test failed");
|
|
92
|
+
console.log(` ${status} ${message}${keyInfo ? ` ${keyInfo}` : ""}`);
|
|
93
|
+
if (reason) {
|
|
94
|
+
console.log(` Reason: ${reason}`);
|
|
95
|
+
}
|
|
84
96
|
}
|
|
85
97
|
}
|
|
86
98
|
log(message, data) {
|
|
@@ -216,11 +228,15 @@ var LatticeEval = class {
|
|
|
216
228
|
this.lastTestPrompt = void 0;
|
|
217
229
|
this.lastFinalOutput = void 0;
|
|
218
230
|
this.lastDurationMs = 0;
|
|
219
|
-
this.
|
|
231
|
+
this.lastMessages = [];
|
|
232
|
+
const contentAssertion = evalCase.eval?.content_assertion || "";
|
|
233
|
+
const message = contentAssertion ? `Starting: ${contentAssertion}` : "Starting";
|
|
234
|
+
this.log(message, {
|
|
220
235
|
case_id: evalCase.caseId,
|
|
221
236
|
thread_id: threadId,
|
|
222
237
|
steps_count: evalCase.steps?.length,
|
|
223
|
-
output_type: evalCase.output?.type
|
|
238
|
+
output_type: evalCase.output?.type,
|
|
239
|
+
content_assertion: contentAssertion
|
|
224
240
|
});
|
|
225
241
|
let currentThreadId = threadId;
|
|
226
242
|
let lastResponseData = null;
|
|
@@ -233,6 +249,38 @@ var LatticeEval = class {
|
|
|
233
249
|
);
|
|
234
250
|
currentThreadId = result.threadId;
|
|
235
251
|
lastResponseData = result.responseData;
|
|
252
|
+
const existingIds = new Set(this.lastMessages.map((m) => m.id).filter(Boolean));
|
|
253
|
+
if (result.responseData?.messages && Array.isArray(result.responseData.messages)) {
|
|
254
|
+
for (const msg of result.responseData.messages) {
|
|
255
|
+
if (msg && typeof msg === "object") {
|
|
256
|
+
const msgId = msg.id || msg.lc_id || msg._id;
|
|
257
|
+
if (msgId && existingIds.has(msgId)) {
|
|
258
|
+
continue;
|
|
259
|
+
}
|
|
260
|
+
const role = msg.role || msg.getType?.() || msg.type || "unknown";
|
|
261
|
+
let content = "";
|
|
262
|
+
if (typeof msg.content === "string") {
|
|
263
|
+
content = msg.content;
|
|
264
|
+
} else if (Array.isArray(msg.content)) {
|
|
265
|
+
content = msg.content.map(
|
|
266
|
+
(c) => typeof c === "string" ? c : c?.text || (typeof c === "object" ? JSON.stringify(c) : String(c))
|
|
267
|
+
).join("\n");
|
|
268
|
+
} else if (msg.content && typeof msg.content === "object") {
|
|
269
|
+
content = msg.content.text || JSON.stringify(msg.content);
|
|
270
|
+
} else {
|
|
271
|
+
content = String(msg.content || "");
|
|
272
|
+
}
|
|
273
|
+
this.lastMessages.push({
|
|
274
|
+
role,
|
|
275
|
+
content,
|
|
276
|
+
id: msgId
|
|
277
|
+
});
|
|
278
|
+
if (msgId) {
|
|
279
|
+
existingIds.add(msgId);
|
|
280
|
+
}
|
|
281
|
+
}
|
|
282
|
+
}
|
|
283
|
+
}
|
|
236
284
|
}
|
|
237
285
|
const finalAgentId = evalCase.steps[evalCase.steps.length - 1]?.agent_id || "";
|
|
238
286
|
this.log("All agent steps completed", {
|
|
@@ -424,10 +472,12 @@ ${rubricsSection}
|
|
|
424
472
|
pass
|
|
425
473
|
});
|
|
426
474
|
}
|
|
475
|
+
const summary = parsedResult.summary || testResultContent || "";
|
|
427
476
|
this.log("Case evaluation completed", {
|
|
428
477
|
case_id: evalCase.caseId,
|
|
429
478
|
pass,
|
|
430
|
-
final_score: finalScore
|
|
479
|
+
final_score: finalScore,
|
|
480
|
+
summary
|
|
431
481
|
});
|
|
432
482
|
const finishedAt = Date.now();
|
|
433
483
|
this.lastDurationMs = finishedAt - startedAt;
|
|
@@ -466,6 +516,7 @@ async function evaluateLatticeCaseWithLogs(evalCase, config) {
|
|
|
466
516
|
judge_thread_id: meta.judge_thread_id,
|
|
467
517
|
test_prompt: meta.test_prompt,
|
|
468
518
|
final_output: meta.final_output,
|
|
519
|
+
messages: meta.messages,
|
|
469
520
|
logs: evaluator.getInMemoryLogs()
|
|
470
521
|
};
|
|
471
522
|
} catch (error) {
|
|
@@ -485,6 +536,7 @@ async function evaluateLatticeCaseWithLogs(evalCase, config) {
|
|
|
485
536
|
judge_thread_id: meta.judge_thread_id,
|
|
486
537
|
test_prompt: meta.test_prompt,
|
|
487
538
|
final_output: meta.final_output,
|
|
539
|
+
messages: meta.messages,
|
|
488
540
|
logs: evaluator.getInMemoryLogs()
|
|
489
541
|
};
|
|
490
542
|
}
|
|
@@ -632,6 +684,7 @@ var LatticeEvalSuite = class {
|
|
|
632
684
|
judge_thread_id: run.judge_thread_id,
|
|
633
685
|
test_prompt: run.test_prompt,
|
|
634
686
|
final_output: run.final_output,
|
|
687
|
+
messages: run.messages,
|
|
635
688
|
logs: run.logs
|
|
636
689
|
};
|
|
637
690
|
} catch (error) {
|
|
@@ -668,6 +721,7 @@ var LatticeEvalSuite = class {
|
|
|
668
721
|
judge_thread_id: run.judge_thread_id,
|
|
669
722
|
test_prompt: run.test_prompt,
|
|
670
723
|
final_output: run.final_output,
|
|
724
|
+
messages: run.messages,
|
|
671
725
|
logs: run.logs
|
|
672
726
|
};
|
|
673
727
|
} catch (error) {
|
|
@@ -936,12 +990,26 @@ Results saved to: ${batch_dir}`);
|
|
|
936
990
|
lines.push(`\`\`\``);
|
|
937
991
|
lines.push(``);
|
|
938
992
|
}
|
|
993
|
+
if (payload.messages && Array.isArray(payload.messages) && payload.messages.length > 0) {
|
|
994
|
+
lines.push(`## Conversation Messages`);
|
|
995
|
+
lines.push(``);
|
|
996
|
+
for (let i = 0; i < payload.messages.length; i++) {
|
|
997
|
+
const msg = payload.messages[i];
|
|
998
|
+
const role = msg.role || "unknown";
|
|
999
|
+
const content = msg.content || "";
|
|
1000
|
+
lines.push(`### Message ${i + 1} (${role})`);
|
|
1001
|
+
lines.push(``);
|
|
1002
|
+
lines.push(`\`\`\``);
|
|
1003
|
+
lines.push(content);
|
|
1004
|
+
lines.push(`\`\`\``);
|
|
1005
|
+
lines.push(``);
|
|
1006
|
+
}
|
|
1007
|
+
}
|
|
939
1008
|
if (payload.finalOutput) {
|
|
940
1009
|
lines.push(`## Final Output`);
|
|
941
1010
|
lines.push(``);
|
|
942
1011
|
lines.push(`\`\`\``);
|
|
943
|
-
|
|
944
|
-
lines.push(output);
|
|
1012
|
+
lines.push(payload.finalOutput);
|
|
945
1013
|
lines.push(`\`\`\``);
|
|
946
1014
|
lines.push(``);
|
|
947
1015
|
}
|
|
@@ -949,8 +1017,7 @@ Results saved to: ${batch_dir}`);
|
|
|
949
1017
|
lines.push(`## Test Prompt`);
|
|
950
1018
|
lines.push(``);
|
|
951
1019
|
lines.push(`\`\`\``);
|
|
952
|
-
|
|
953
|
-
lines.push(prompt);
|
|
1020
|
+
lines.push(payload.testPrompt);
|
|
954
1021
|
lines.push(`\`\`\``);
|
|
955
1022
|
lines.push(``);
|
|
956
1023
|
}
|
|
@@ -1056,7 +1123,8 @@ Results saved to: ${batch_dir}`);
|
|
|
1056
1123
|
threadId: r.thread_id,
|
|
1057
1124
|
judgeThreadId: r.judge_thread_id,
|
|
1058
1125
|
finalOutput: r.final_output,
|
|
1059
|
-
testPrompt: r.test_prompt
|
|
1126
|
+
testPrompt: r.test_prompt,
|
|
1127
|
+
messages: r.messages
|
|
1060
1128
|
};
|
|
1061
1129
|
await (0, import_promises.writeFile)(jsonPath, JSON.stringify(payload, null, 2), "utf-8");
|
|
1062
1130
|
const mdPath = import_path.default.join(individualDir, `${baseFilename}.md`);
|