@axiom-lattice/agent-eval 2.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,366 @@
1
+ import { LLMConfig } from '@axiom-lattice/protocols';
2
+
3
+ interface LatticeAgentStepConfig { // Shape of each entry in `LatticeEvalCase.steps`.
4
+ agent_id: string; // Required agent identifier; presumably selects which agent runs this step — TODO confirm.
5
+ override_input_message?: string; // Optional; presumably replaces the case's input message for this step — TODO confirm.
6
+ }
7
+ type OutputFileContent = { // `OutputType` variant tagged "file_content".
8
+ type: "file_content"; // Literal discriminant for this variant.
9
+ file_path: string; // Presumably the file whose content is taken as the output — TODO confirm what the path is relative to.
10
+ };
11
+ type OutputMessageContent = { // `OutputType` variant tagged "message_content".
12
+ type: "message_content"; // Literal discriminant for this variant.
13
+ message: string; // NOTE(review): how this string is used is not evident from the declaration — confirm.
14
+ };
15
+ type OutputType = OutputFileContent | OutputMessageContent; // Union discriminated by the literal `type` field.
16
+ interface LatticeEvalProjectType { // Project definition accepted by the `LatticeEvalProject` constructor.
17
+ projectName: string;
18
+ version?: string;
19
+ description?: string;
20
+ suites: LatticeEvalSuiteType[]; // Required list of suites.
21
+ templates?: LatticeEvalTemplate[]; // Optional templates; presumably looked up by `templateId` from template-based cases — TODO confirm.
22
+ report_config?: LatticeEvalReportConfig; // Optional report/log output settings.
23
+ judge_agent_config: { // Required judge settings.
24
+ model: LLMConfig; // Type imported from '@axiom-lattice/protocols'.
25
+ };
26
+ lattice_server_config: {
27
+ base_url: string;
28
+ api_key: string; // Required here, whereas `LatticeEvalConfig` and `ResolvedConfig` declare `api_key` as optional.
29
+ };
30
+ concurrency?: number; // Optional; presumably the default parallelism for running cases — TODO confirm.
31
+ }
32
+ type LatticeEvalLogLevel = "debug" | "info" | "warn" | "error"; // Level type used by `LatticeEval.record` and `LatticeEvalLogEvent.level`.
33
+ interface LatticeEvalLogEvent { // One log entry; arrays of these appear as `logs` on case run results.
34
+ ts: string; // Presumably a timestamp; the string format is not specified here — TODO confirm.
35
+ level: LatticeEvalLogLevel;
36
+ message: string;
37
+ data?: Record<string, unknown>; // Optional structured payload with arbitrary keys.
38
+ }
39
+ interface LatticeEvalReportConfig { // Report output settings; referenced by `LatticeEvalProjectType.report_config`.
40
+ /**
41
+ * Output directory for each batch run.
42
+ * A subfolder will be created per batch.
43
+ */
44
+ output_dir: string; // The only required field of this interface.
45
+ /**
46
+ * Optional batch id. If not set, a timestamp-based id will be generated per run.
47
+ */
48
+ batch_id?: string;
49
+ /**
50
+ * When true, writes `report.json` into the batch folder.
51
+ * Defaults to true.
52
+ */
53
+ write_report_json?: boolean;
54
+ /**
55
+ * When true, writes per-case log files into the batch folder.
56
+ * Defaults to true.
57
+ */
58
+ write_case_logs?: boolean;
59
+ }
60
+ interface LatticeEvalBatchReport { // Report object returned as `report` by `LatticeEvalProject.runAllSuitesBatch`.
61
+ batch_id: string;
62
+ started_at: string; // String timestamp; format not specified here — TODO confirm.
63
+ finished_at: string; // String timestamp; format not specified here — TODO confirm.
64
+ project: { // Same three descriptive fields as declared on `LatticeEvalProjectType`.
65
+ projectName: string;
66
+ version?: string;
67
+ description?: string;
68
+ };
69
+ summary: { // Totals for the whole batch.
70
+ total_cases: number;
71
+ passed_cases: number;
72
+ failed_cases: number;
73
+ pass_rate: number; // NOTE(review): scale (0–1 vs. 0–100) is not specified here — confirm.
74
+ };
75
+ suites: Array<{ // Per-suite breakdown.
76
+ suiteName: string;
77
+ total_cases: number;
78
+ passed_cases: number;
79
+ failed_cases: number;
80
+ cases: Array<{ // Per-case outcome; everything except `caseId` is optional.
81
+ caseId: string;
82
+ pass?: boolean;
83
+ final_score?: number;
84
+ error?: string;
85
+ }>;
86
+ }>;
87
+ }
88
+ type LatticeEvalCaseType = LatticeEvalCase | LatticeEvalCaseWithTemplate; // A suite entry is either a fully specified case or a template-based case.
89
+ interface LatticeEvalSuiteType { // Suite definition; element type of `LatticeEvalProjectType.suites`.
90
+ suiteName: string;
91
+ version?: string;
92
+ cases: LatticeEvalCaseType[]; // May mix full cases and template-based cases.
93
+ }
94
+ interface LatticeEvalRubric { // One scoring rubric entry; used in the `eval.eval_rubrics` arrays.
95
+ dimension: string; // Presumably corresponds to `name` in `LatticeEvalResult.dimension_results` — TODO confirm.
96
+ weight: number; // NOTE(review): scale/normalization of weights is not specified here — confirm.
97
+ description: string;
98
+ }
99
+ interface LatticeEvalCase { // Fully specified case; the type accepted by `LatticeEval.evaluateCase`.
100
+ caseId: string;
101
+ input: {
102
+ message: string; // Required here; optional in `LatticeEvalCaseWithTemplate.input`.
103
+ files?: Record<string, string>; // Optional string-to-string map; presumably file path -> content — TODO confirm.
104
+ };
105
+ steps: LatticeAgentStepConfig[]; // Required list of agent steps.
106
+ output: OutputType; // Required; selects file-content or message-content output.
107
+ eval: {
108
+ content_assertion: string; // Required; presumably the expectation the judge checks the output against — TODO confirm.
109
+ eval_rubrics?: LatticeEvalRubric[]; // Optional rubric list.
110
+ };
111
+ }
112
+ interface LatticeEvalCaseWithTemplate { // Template-based case: declares no `steps`, and `output` / `input.message` are optional.
113
+ caseId: string;
114
+ templateId: string; // Presumably matches a `LatticeEvalTemplate.templateId` — TODO confirm.
115
+ input: {
116
+ message?: string;
117
+ files?: Record<string, string>;
118
+ variables?: Record<string, string>; // Optional string-to-string map; presumably values for the template's declared `variables` — TODO confirm.
119
+ };
120
+ output?: OutputType;
121
+ eval: { // Same shape as `LatticeEvalCase.eval`; `content_assertion` remains required.
122
+ content_assertion: string;
123
+ eval_rubrics?: LatticeEvalRubric[];
124
+ };
125
+ }
126
+ interface LatticeEvalTemplate { // Reusable case template; element type of `LatticeEvalProjectType.templates`.
127
+ templateId: string;
128
+ description?: string;
129
+ input_schema: { // Required object; both members are optional.
130
+ required_files?: string[]; // Presumably names expected in a case's `input.files` — TODO confirm.
131
+ variables?: string[]; // Presumably names expected in a case's `input.variables` — TODO confirm.
132
+ };
133
+ default_case: Omit<LatticeEvalCase, "caseId" | "eval"> & { // A `LatticeEvalCase` without `caseId` and with `eval` replaced by the optional shape below.
134
+ eval?: { // Unlike `LatticeEvalCase.eval`, carries no `content_assertion`.
135
+ eval_rubrics?: LatticeEvalRubric[];
136
+ };
137
+ };
138
+ }
139
+ interface LatticeEvalResult { // Evaluation outcome; resolved value of `LatticeEval.evaluateCase` and `evaluateLatticeCase`.
140
+ pass: boolean;
141
+ final_score: number; // NOTE(review): score range is not specified here — confirm.
142
+ dimension_results: { // One entry per scored dimension.
143
+ name: string;
144
+ score: number;
145
+ reason: string;
146
+ }[];
147
+ summary: string;
148
+ error?: string; // Optional error text.
149
+ }
150
+
151
+ /**
152
+ * Configuration for Lattice evaluation server
153
+ */
154
+ interface LatticeEvalConfig { // Accepted by the `LatticeEval` constructor and by both `evaluateLatticeCase*` functions.
155
+ base_url: string; // The only required field.
156
+ api_key?: string; // Optional here; required in `LatticeEvalProjectType.lattice_server_config`.
157
+ /**
158
+ * When true, prints detailed execution logs for each action.
159
+ * Defaults to true.
160
+ */
161
+ verbose?: boolean;
162
+ }
163
+ interface LatticeEvalCaseRunResult { // Resolved value of `evaluateLatticeCaseWithLogs`.
164
+ caseId: string;
165
+ result?: LatticeEvalResult; // Optional; presumably absent when the run failed — TODO confirm.
166
+ error?: string;
167
+ error_stack?: string;
168
+ duration_ms: number; // Required here; optional in the otherwise same-field `CaseRunResult`.
169
+ thread_id?: string;
170
+ judge_thread_id?: string;
171
+ test_prompt?: string;
172
+ final_output?: string;
173
+ logs: LatticeEvalLogEvent[]; // Always present.
174
+ }
175
+ /**
176
+ * LatticeEval class for evaluating Lattice evaluation cases
177
+ */
178
+ declare class LatticeEval {
179
+ private config; // Private members appear without types in this declaration output.
180
+ private baseUrl;
181
+ private verbose;
182
+ private inMemoryLogs;
183
+ private lastThreadId?;
184
+ private lastJudgeThreadId?;
185
+ private lastTestPrompt?;
186
+ private lastFinalOutput?;
187
+ private lastDurationMs;
188
+ getLastRunMeta(): { // Presumably reports the `last*` values from the most recent run on this instance — TODO confirm.
189
+ duration_ms: number;
190
+ thread_id: string | undefined;
191
+ judge_thread_id: string | undefined;
192
+ test_prompt: string | undefined;
193
+ final_output: string | undefined;
194
+ };
195
+ /**
196
+ * Create a new LatticeEval instance
197
+ * @param config Server configuration. NOTE(review): the signature below requires this argument, so the earlier "optional (defaults to localhost:3203)" wording does not match this declaration — confirm against the implementation.
198
+ */
199
+ constructor(config: LatticeEvalConfig);
200
+ getInMemoryLogs(): LatticeEvalLogEvent[]; // Returns log events; presumably those accumulated via `record` — TODO confirm.
201
+ record(level: LatticeEvalLogLevel, message: string, data?: Record<string, unknown>): void; // Parameters mirror the `level`/`message`/`data` fields of `LatticeEvalLogEvent`.
202
+ private log;
203
+ private getKeyInfo;
204
+ /**
205
+ * Execute a single agent step and return the thread ID and response data
206
+ */
207
+ private executeAgentStep;
208
+ /**
209
+ * Extract output content based on OutputType
210
+ */
211
+ private extractOutput;
212
+ /**
213
+ * Evaluate a single Lattice evaluation case
214
+ * @param evalCase The evaluation case to run
215
+ * @returns Evaluation result with pass/fail status and scores
216
+ */
217
+ evaluateCase(evalCase: LatticeEvalCase): Promise<LatticeEvalResult>; // Accepts only fully specified cases, not `LatticeEvalCaseWithTemplate`.
218
+ }
219
+ /**
220
+ * Evaluate a single Lattice evaluation case (backward compatibility function)
221
+ * @param evalCase The evaluation case to run
222
+ * @param config Optional server configuration (defaults to localhost:3203)
223
+ * @returns Evaluation result with pass/fail status and scores
224
+ * @deprecated Use LatticeEval class instead
225
+ */
226
+ declare function evaluateLatticeCase(evalCase: LatticeEvalCase, config?: LatticeEvalConfig): Promise<LatticeEvalResult>; // `config` is optional here, unlike the required `LatticeEval` constructor argument.
227
+ /**
228
+ * Evaluate a single Lattice evaluation case and always return logs (never throws).
229
+ */
230
+ declare function evaluateLatticeCaseWithLogs(evalCase: LatticeEvalCase, config?: LatticeEvalConfig): Promise<LatticeEvalCaseRunResult>; // The resolved value always carries `logs`; `result`, `error` and `error_stack` are optional.
231
+
232
+ /**
233
+ * Configuration resolved from project/suite hierarchy
234
+ */
235
+ interface ResolvedConfig { // Type of the `projectConfig` argument of the `LatticeEvalSuite` constructor.
236
+ lattice_server_config: {
237
+ base_url: string;
238
+ api_key?: string; // Optional here; required in `LatticeEvalProjectType.lattice_server_config`.
239
+ };
240
+ judge_agent_config?: { // Optional here; required in `LatticeEvalProjectType`.
241
+ model: LLMConfig;
242
+ };
243
+ concurrency: number; // Required here; optional in `LatticeEvalProjectType`.
244
+ }
245
+ /**
246
+ * Result with error handling
247
+ */
248
+ interface CaseRunResult { // Returned by the `runCase` / `runAllCases` / `runSuite` / `runAllSuites` methods.
249
+ caseId: string;
250
+ result?: LatticeEvalResult; // Optional; presumably absent when the run failed — TODO confirm.
251
+ error?: string;
252
+ logs: LatticeEvalLogEvent[]; // Always present.
253
+ duration_ms?: number; // Optional here; required in the otherwise same-field `LatticeEvalCaseRunResult`.
254
+ thread_id?: string;
255
+ judge_thread_id?: string;
256
+ test_prompt?: string;
257
+ final_output?: string;
258
+ error_stack?: string;
259
+ }
260
+ /**
261
+ * LatticeEvalSuite class manages a suite of evaluation cases
262
+ * with suite-level configuration
263
+ */
264
+ declare class LatticeEvalSuite { // NOTE(review): `LatticeEvalSuiteType` declares no configuration fields; configuration arrives via the `projectConfig` constructor argument.
265
+ private suite; // Private members appear without types in this declaration output.
266
+ private projectConfig;
267
+ private templates;
268
+ constructor(suite: LatticeEvalSuiteType, projectConfig: ResolvedConfig, templates?: Map<string, LatticeEvalTemplate>); // `templates` is optional; presumably keyed by `templateId` — TODO confirm.
269
+ /**
270
+ * Get resolved configuration from project
271
+ */
272
+ private getResolvedConfig;
273
+ /**
274
+ * Get suite name
275
+ */
276
+ getSuiteName(): string;
277
+ /**
278
+ * Get suite version
279
+ */
280
+ getVersion(): string | undefined; // `undefined` is possible because `LatticeEvalSuiteType.version` is optional.
281
+ /**
282
+ * Get all cases in this suite (resolved from templates if needed)
283
+ */
284
+ getCases(): LatticeEvalCase[]; // Returns only the fully specified case type.
285
+ /**
286
+ * Get a specific case by ID (resolved from template if needed)
287
+ */
288
+ getCase(caseId: string): LatticeEvalCase | undefined; // Presumably `undefined` when no case has this id — TODO confirm.
289
+ /**
290
+ * Run a single case in this suite with error handling
291
+ * @param caseId The case ID to run
292
+ * @returns Case run result with error handling
293
+ */
294
+ runCase(caseId: string): Promise<CaseRunResult>;
295
+ /**
296
+ * Run all cases in this suite with concurrency control and error isolation
297
+ * @param concurrency Optional concurrency limit (overrides project config)
298
+ * @returns Array of case run results with error handling
299
+ */
300
+ runAllCases(concurrency?: number): Promise<CaseRunResult[]>;
301
+ }
302
+
303
+ /**
304
+ * LatticeEvalProject class manages a project with multiple evaluation suites
305
+ * with project-level configuration
306
+ */
307
+ declare class LatticeEvalProject {
308
+ private project; // Private members appear without types in this declaration output.
309
+ private suites;
310
+ private reportConfig?;
311
+ constructor(project: LatticeEvalProjectType); // Takes the full project definition, including suites and server/judge settings.
312
+ /**
313
+ * Get project name
314
+ */
315
+ getProjectName(): string;
316
+ /**
317
+ * Get project version
318
+ */
319
+ getVersion(): string | undefined; // `undefined` is possible because `LatticeEvalProjectType.version` is optional.
320
+ /**
321
+ * Get project description
322
+ */
323
+ getDescription(): string | undefined; // `undefined` is possible because `LatticeEvalProjectType.description` is optional.
324
+ /**
325
+ * Get all suite names
326
+ */
327
+ getSuiteNames(): string[];
328
+ /**
329
+ * Get a specific suite by name
330
+ */
331
+ getSuite(suiteName: string): LatticeEvalSuite | undefined; // Presumably `undefined` when no suite has this name — TODO confirm.
332
+ /**
333
+ * Run a specific case in a specific suite
334
+ * @param suiteName The suite name
335
+ * @param caseId The case ID to run
336
+ * @returns Case run result with error handling
337
+ */
338
+ runCase(suiteName: string, caseId: string): Promise<CaseRunResult>; // NOTE(review): behavior for an unknown suite name is not stated here — confirm.
339
+ /**
340
+ * Run all cases in a specific suite with concurrency control and error isolation
341
+ * @param suiteName The suite name
342
+ * @param concurrency Optional concurrency limit (overrides project config)
343
+ * @returns Array of case run results with error handling
344
+ */
345
+ runSuite(suiteName: string, concurrency?: number): Promise<CaseRunResult[]>;
346
+ /**
347
+ * Run all cases in all suites with concurrency control and error isolation
348
+ * @param concurrency Optional concurrency limit (overrides project config)
349
+ * @returns Map of suite names to their case run results
350
+ */
351
+ runAllSuites(concurrency?: number): Promise<Map<string, CaseRunResult[]>>;
352
+ /**
353
+ * Run all suites as a "batch", build a report, and optionally write it to disk.
354
+ */
355
+ runAllSuitesBatch(concurrency?: number): Promise<{
356
+ batch_id: string;
357
+ batch_dir?: string; // Optional; presumably set only when artifacts are written to disk — TODO confirm.
358
+ results: Map<string, CaseRunResult[]>; // Same shape as the resolved value of `runAllSuites`.
359
+ report: LatticeEvalBatchReport;
360
+ }>;
361
+ private generateCaseMarkdown;
362
+ private generateMarkdownSummary;
363
+ private maybeWriteBatchArtifacts;
364
+ }
365
+
366
+ export { type CaseRunResult, type LatticeAgentStepConfig, LatticeEval, type LatticeEvalBatchReport, type LatticeEvalCase, type LatticeEvalCaseRunResult, type LatticeEvalCaseType, type LatticeEvalCaseWithTemplate, type LatticeEvalConfig, type LatticeEvalLogEvent, type LatticeEvalLogLevel, LatticeEvalProject, type LatticeEvalProjectType, type LatticeEvalReportConfig, type LatticeEvalResult, type LatticeEvalRubric, LatticeEvalSuite, type LatticeEvalSuiteType, type LatticeEvalTemplate, type OutputFileContent, type OutputMessageContent, type OutputType, type ResolvedConfig, evaluateLatticeCase, evaluateLatticeCaseWithLogs }; // Value exports: LatticeEval, LatticeEvalProject, LatticeEvalSuite, evaluateLatticeCase, evaluateLatticeCaseWithLogs; all other names are type-only.