@axiom-lattice/agent-eval 2.1.9 → 2.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- > @axiom-lattice/agent-eval@2.1.9 build /home/runner/work/agentic/agentic/packages/agent-eval
2
+ > @axiom-lattice/agent-eval@2.1.11 build /home/runner/work/agentic/agentic/packages/agent-eval
3
3
  > tsup src/index.ts --format cjs,esm --dts --sourcemap
4
4
 
5
5
  CLI Building entry: src/index.ts
@@ -8,13 +8,13 @@
8
8
  CLI Target: es2020
9
9
  CJS Build start
10
10
  ESM Build start
11
- CJS dist/index.js 39.11 KB
12
- CJS dist/index.js.map 67.54 KB
13
- CJS ⚡️ Build success in 186ms
14
- ESM dist/index.mjs 36.92 KB
15
- ESM dist/index.mjs.map 67.28 KB
16
- ESM ⚡️ Build success in 191ms
11
+ CJS dist/index.js 41.54 KB
12
+ CJS dist/index.js.map 72.12 KB
13
+ CJS ⚡️ Build success in 182ms
14
+ ESM dist/index.mjs 39.35 KB
15
+ ESM dist/index.mjs.map 71.85 KB
16
+ ESM ⚡️ Build success in 184ms
17
17
  DTS Build start
18
- DTS ⚡️ Build success in 8789ms
19
- DTS dist/index.d.ts 10.68 KB
20
- DTS dist/index.d.mts 10.68 KB
18
+ DTS ⚡️ Build success in 11062ms
19
+ DTS dist/index.d.ts 10.99 KB
20
+ DTS dist/index.d.mts 10.99 KB
package/CHANGELOG.md CHANGED
@@ -1,5 +1,22 @@
1
1
  # @axiom-lattice/agent-eval
2
2
 
3
+ ## 2.1.11
4
+
5
+ ### Patch Changes
6
+
7
+ - Updated dependencies [2422cbf]
8
+ - @axiom-lattice/protocols@2.1.11
9
+ - @axiom-lattice/core@2.1.17
10
+
11
+ ## 2.1.10
12
+
13
+ ### Patch Changes
14
+
15
+ - 773c03f: add skills
16
+ - Updated dependencies [773c03f]
17
+ - @axiom-lattice/protocols@2.1.10
18
+ - @axiom-lattice/core@2.1.16
19
+
3
20
  ## 2.1.9
4
21
 
5
22
  ### Patch Changes
package/dist/index.d.mts CHANGED
@@ -10,7 +10,6 @@ type OutputFileContent = {
10
10
  };
11
11
  type OutputMessageContent = {
12
12
  type: "message_content";
13
- message: string;
14
13
  };
15
14
  type OutputType = OutputFileContent | OutputMessageContent;
16
15
  interface LatticeEvalProjectType {
@@ -170,6 +169,11 @@ interface LatticeEvalCaseRunResult {
170
169
  judge_thread_id?: string;
171
170
  test_prompt?: string;
172
171
  final_output?: string;
172
+ messages?: Array<{
173
+ role: string;
174
+ content: string;
175
+ id?: string;
176
+ }>;
173
177
  logs: LatticeEvalLogEvent[];
174
178
  }
175
179
  /**
@@ -185,12 +189,18 @@ declare class LatticeEval {
185
189
  private lastTestPrompt?;
186
190
  private lastFinalOutput?;
187
191
  private lastDurationMs;
192
+ private lastMessages;
188
193
  getLastRunMeta(): {
189
194
  duration_ms: number;
190
195
  thread_id: string | undefined;
191
196
  judge_thread_id: string | undefined;
192
197
  test_prompt: string | undefined;
193
198
  final_output: string | undefined;
199
+ messages: {
200
+ role: string;
201
+ content: string;
202
+ id?: string;
203
+ }[];
194
204
  };
195
205
  /**
196
206
  * Create a new LatticeEval instance
@@ -255,6 +265,11 @@ interface CaseRunResult {
255
265
  judge_thread_id?: string;
256
266
  test_prompt?: string;
257
267
  final_output?: string;
268
+ messages?: Array<{
269
+ role: string;
270
+ content: string;
271
+ id?: string;
272
+ }>;
258
273
  error_stack?: string;
259
274
  }
260
275
  /**
package/dist/index.d.ts CHANGED
@@ -10,7 +10,6 @@ type OutputFileContent = {
10
10
  };
11
11
  type OutputMessageContent = {
12
12
  type: "message_content";
13
- message: string;
14
13
  };
15
14
  type OutputType = OutputFileContent | OutputMessageContent;
16
15
  interface LatticeEvalProjectType {
@@ -170,6 +169,11 @@ interface LatticeEvalCaseRunResult {
170
169
  judge_thread_id?: string;
171
170
  test_prompt?: string;
172
171
  final_output?: string;
172
+ messages?: Array<{
173
+ role: string;
174
+ content: string;
175
+ id?: string;
176
+ }>;
173
177
  logs: LatticeEvalLogEvent[];
174
178
  }
175
179
  /**
@@ -185,12 +189,18 @@ declare class LatticeEval {
185
189
  private lastTestPrompt?;
186
190
  private lastFinalOutput?;
187
191
  private lastDurationMs;
192
+ private lastMessages;
188
193
  getLastRunMeta(): {
189
194
  duration_ms: number;
190
195
  thread_id: string | undefined;
191
196
  judge_thread_id: string | undefined;
192
197
  test_prompt: string | undefined;
193
198
  final_output: string | undefined;
199
+ messages: {
200
+ role: string;
201
+ content: string;
202
+ id?: string;
203
+ }[];
194
204
  };
195
205
  /**
196
206
  * Create a new LatticeEval instance
@@ -255,6 +265,11 @@ interface CaseRunResult {
255
265
  judge_thread_id?: string;
256
266
  test_prompt?: string;
257
267
  final_output?: string;
268
+ messages?: Array<{
269
+ role: string;
270
+ content: string;
271
+ id?: string;
272
+ }>;
258
273
  error_stack?: string;
259
274
  }
260
275
  /**
package/dist/index.js CHANGED
@@ -50,6 +50,7 @@ var LatticeEval = class {
50
50
  constructor(config) {
51
51
  this.inMemoryLogs = [];
52
52
  this.lastDurationMs = 0;
53
+ this.lastMessages = [];
53
54
  this.config = config;
54
55
  this.baseUrl = this.config.base_url;
55
56
  this.verbose = this.config.verbose ?? true;
@@ -60,7 +61,8 @@ var LatticeEval = class {
60
61
  thread_id: this.lastThreadId,
61
62
  judge_thread_id: this.lastJudgeThreadId,
62
63
  test_prompt: this.lastTestPrompt,
63
- final_output: this.lastFinalOutput
64
+ final_output: this.lastFinalOutput,
65
+ messages: this.lastMessages
64
66
  };
65
67
  }
66
68
  getInMemoryLogs() {
@@ -78,9 +80,19 @@ var LatticeEval = class {
78
80
  if (level === "error") {
79
81
  const keyInfo = this.getKeyInfo(data);
80
82
  console.log(` \u2717 ${message}${keyInfo ? ` ${keyInfo}` : ""}`);
81
- } else if (message.includes("Starting case") || message.includes("Case evaluation completed")) {
83
+ } else if (message.startsWith("Starting")) {
82
84
  const keyInfo = this.getKeyInfo(data);
83
85
  console.log(` ${message}${keyInfo ? ` ${keyInfo}` : ""}`);
86
+ } else if (message.includes("Case evaluation completed")) {
87
+ const keyInfo = this.getKeyInfo(data);
88
+ const pass = data?.pass;
89
+ const summary = data?.summary;
90
+ const status = pass ? "\u2713 PASS" : "\u2717 FAIL";
91
+ const reason = summary || (pass ? "Test passed" : "Test failed");
92
+ console.log(` ${status} ${message}${keyInfo ? ` ${keyInfo}` : ""}`);
93
+ if (reason) {
94
+ console.log(` Reason: ${reason}`);
95
+ }
84
96
  }
85
97
  }
86
98
  log(message, data) {
@@ -216,11 +228,15 @@ var LatticeEval = class {
216
228
  this.lastTestPrompt = void 0;
217
229
  this.lastFinalOutput = void 0;
218
230
  this.lastDurationMs = 0;
219
- this.log("Starting case evaluation", {
231
+ this.lastMessages = [];
232
+ const contentAssertion = evalCase.eval?.content_assertion || "";
233
+ const message = contentAssertion ? `Starting: ${contentAssertion}` : "Starting";
234
+ this.log(message, {
220
235
  case_id: evalCase.caseId,
221
236
  thread_id: threadId,
222
237
  steps_count: evalCase.steps?.length,
223
- output_type: evalCase.output?.type
238
+ output_type: evalCase.output?.type,
239
+ content_assertion: contentAssertion
224
240
  });
225
241
  let currentThreadId = threadId;
226
242
  let lastResponseData = null;
@@ -233,6 +249,38 @@ var LatticeEval = class {
233
249
  );
234
250
  currentThreadId = result.threadId;
235
251
  lastResponseData = result.responseData;
252
+ const existingIds = new Set(this.lastMessages.map((m) => m.id).filter(Boolean));
253
+ if (result.responseData?.messages && Array.isArray(result.responseData.messages)) {
254
+ for (const msg of result.responseData.messages) {
255
+ if (msg && typeof msg === "object") {
256
+ const msgId = msg.id || msg.lc_id || msg._id;
257
+ if (msgId && existingIds.has(msgId)) {
258
+ continue;
259
+ }
260
+ const role = msg.role || msg.getType?.() || msg.type || "unknown";
261
+ let content = "";
262
+ if (typeof msg.content === "string") {
263
+ content = msg.content;
264
+ } else if (Array.isArray(msg.content)) {
265
+ content = msg.content.map(
266
+ (c) => typeof c === "string" ? c : c?.text || (typeof c === "object" ? JSON.stringify(c) : String(c))
267
+ ).join("\n");
268
+ } else if (msg.content && typeof msg.content === "object") {
269
+ content = msg.content.text || JSON.stringify(msg.content);
270
+ } else {
271
+ content = String(msg.content || "");
272
+ }
273
+ this.lastMessages.push({
274
+ role,
275
+ content,
276
+ id: msgId
277
+ });
278
+ if (msgId) {
279
+ existingIds.add(msgId);
280
+ }
281
+ }
282
+ }
283
+ }
236
284
  }
237
285
  const finalAgentId = evalCase.steps[evalCase.steps.length - 1]?.agent_id || "";
238
286
  this.log("All agent steps completed", {
@@ -424,10 +472,12 @@ ${rubricsSection}
424
472
  pass
425
473
  });
426
474
  }
475
+ const summary = parsedResult.summary || testResultContent || "";
427
476
  this.log("Case evaluation completed", {
428
477
  case_id: evalCase.caseId,
429
478
  pass,
430
- final_score: finalScore
479
+ final_score: finalScore,
480
+ summary
431
481
  });
432
482
  const finishedAt = Date.now();
433
483
  this.lastDurationMs = finishedAt - startedAt;
@@ -466,6 +516,7 @@ async function evaluateLatticeCaseWithLogs(evalCase, config) {
466
516
  judge_thread_id: meta.judge_thread_id,
467
517
  test_prompt: meta.test_prompt,
468
518
  final_output: meta.final_output,
519
+ messages: meta.messages,
469
520
  logs: evaluator.getInMemoryLogs()
470
521
  };
471
522
  } catch (error) {
@@ -485,6 +536,7 @@ async function evaluateLatticeCaseWithLogs(evalCase, config) {
485
536
  judge_thread_id: meta.judge_thread_id,
486
537
  test_prompt: meta.test_prompt,
487
538
  final_output: meta.final_output,
539
+ messages: meta.messages,
488
540
  logs: evaluator.getInMemoryLogs()
489
541
  };
490
542
  }
@@ -632,6 +684,7 @@ var LatticeEvalSuite = class {
632
684
  judge_thread_id: run.judge_thread_id,
633
685
  test_prompt: run.test_prompt,
634
686
  final_output: run.final_output,
687
+ messages: run.messages,
635
688
  logs: run.logs
636
689
  };
637
690
  } catch (error) {
@@ -668,6 +721,7 @@ var LatticeEvalSuite = class {
668
721
  judge_thread_id: run.judge_thread_id,
669
722
  test_prompt: run.test_prompt,
670
723
  final_output: run.final_output,
724
+ messages: run.messages,
671
725
  logs: run.logs
672
726
  };
673
727
  } catch (error) {
@@ -936,12 +990,26 @@ Results saved to: ${batch_dir}`);
936
990
  lines.push(`\`\`\``);
937
991
  lines.push(``);
938
992
  }
993
+ if (payload.messages && Array.isArray(payload.messages) && payload.messages.length > 0) {
994
+ lines.push(`## Conversation Messages`);
995
+ lines.push(``);
996
+ for (let i = 0; i < payload.messages.length; i++) {
997
+ const msg = payload.messages[i];
998
+ const role = msg.role || "unknown";
999
+ const content = msg.content || "";
1000
+ lines.push(`### Message ${i + 1} (${role})`);
1001
+ lines.push(``);
1002
+ lines.push(`\`\`\``);
1003
+ lines.push(content);
1004
+ lines.push(`\`\`\``);
1005
+ lines.push(``);
1006
+ }
1007
+ }
939
1008
  if (payload.finalOutput) {
940
1009
  lines.push(`## Final Output`);
941
1010
  lines.push(``);
942
1011
  lines.push(`\`\`\``);
943
- const output = payload.finalOutput.length > 5e3 ? payload.finalOutput.substring(0, 5e3) + "\n\n... (truncated, see JSON for full output)" : payload.finalOutput;
944
- lines.push(output);
1012
+ lines.push(payload.finalOutput);
945
1013
  lines.push(`\`\`\``);
946
1014
  lines.push(``);
947
1015
  }
@@ -949,8 +1017,7 @@ Results saved to: ${batch_dir}`);
949
1017
  lines.push(`## Test Prompt`);
950
1018
  lines.push(``);
951
1019
  lines.push(`\`\`\``);
952
- const prompt = payload.testPrompt.length > 5e3 ? payload.testPrompt.substring(0, 5e3) + "\n\n... (truncated, see JSON for full prompt)" : payload.testPrompt;
953
- lines.push(prompt);
1020
+ lines.push(payload.testPrompt);
954
1021
  lines.push(`\`\`\``);
955
1022
  lines.push(``);
956
1023
  }
@@ -1056,7 +1123,8 @@ Results saved to: ${batch_dir}`);
1056
1123
  threadId: r.thread_id,
1057
1124
  judgeThreadId: r.judge_thread_id,
1058
1125
  finalOutput: r.final_output,
1059
- testPrompt: r.test_prompt
1126
+ testPrompt: r.test_prompt,
1127
+ messages: r.messages
1060
1128
  };
1061
1129
  await (0, import_promises.writeFile)(jsonPath, JSON.stringify(payload, null, 2), "utf-8");
1062
1130
  const mdPath = import_path.default.join(individualDir, `${baseFilename}.md`);