@axiom-lattice/agent-eval 2.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env +29 -0
- package/.turbo/turbo-build.log +20 -0
- package/CHANGELOG.md +10 -0
- package/LICENSE +201 -0
- package/dist/index.d.mts +366 -0
- package/dist/index.d.ts +366 -0
- package/dist/index.js +1092 -0
- package/dist/index.js.map +1 -0
- package/dist/index.mjs +1055 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +51 -0
- package/src/LatticeEval.ts +615 -0
- package/src/LatticeEvalProject.ts +496 -0
- package/src/LatticeEvalSuite.ts +321 -0
- package/src/index.ts +4 -0
- package/src/test.ts +23 -0
- package/src/types.ts +160 -0
- package/tsconfig.json +33 -0
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
import { LLMConfig } from '@axiom-lattice/protocols';
|
|
2
|
+
|
|
3
|
+
/**
* Configuration for one agent step of an evaluation case
* (the element type of `LatticeEvalCase.steps`).
*/
interface LatticeAgentStepConfig {
|
|
4
|
+
/** Identifier of the agent used for this step. */
agent_id: string;
|
|
5
|
+
/** Optional message for this step; presumably replaces the case's `input.message` — TODO confirm against the runner. */
override_input_message?: string;
|
|
6
|
+
}
|
|
7
|
+
/**
* `OutputType` variant tagged `type: "file_content"`; identifies a file by `file_path`.
* NOTE(review): presumably the file whose content is taken as the case output — confirm in `LatticeEval.extractOutput`.
*/
type OutputFileContent = {
|
|
8
|
+
/** Discriminant tag for this variant. */
type: "file_content";
|
|
9
|
+
file_path: string;
|
|
10
|
+
};
|
|
11
|
+
/**
* `OutputType` variant tagged `type: "message_content"`; carries a `message` string.
* NOTE(review): how `message` is used when extracting output is not visible here — confirm in `LatticeEval.extractOutput`.
*/
type OutputMessageContent = {
|
|
12
|
+
/** Discriminant tag for this variant. */
type: "message_content";
|
|
13
|
+
message: string;
|
|
14
|
+
};
|
|
15
|
+
/** Union of the output descriptors, discriminated by the `type` field ("file_content" | "message_content"). */
type OutputType = OutputFileContent | OutputMessageContent;
|
|
16
|
+
/**
* Top-level project definition accepted by the `LatticeEvalProject` constructor.
*/
interface LatticeEvalProjectType {
|
|
17
|
+
projectName: string;
|
|
18
|
+
version?: string;
|
|
19
|
+
description?: string;
|
|
20
|
+
/** Evaluation suites belonging to this project. */
suites: LatticeEvalSuiteType[];
|
|
21
|
+
/** Optional templates; cases of type `LatticeEvalCaseWithTemplate` reference one by `templateId`. */
templates?: LatticeEvalTemplate[];
|
|
22
|
+
/** Optional report/batch output settings (see `LatticeEvalReportConfig`). */
report_config?: LatticeEvalReportConfig;
|
|
23
|
+
/** Model configuration for the judge agent. */
judge_agent_config: {
|
|
24
|
+
model: LLMConfig;
|
|
25
|
+
};
|
|
26
|
+
/**
* Connection settings for the Lattice server.
* NOTE(review): `api_key` is required here but optional in `LatticeEvalConfig` and `ResolvedConfig` — confirm which is intended.
*/
lattice_server_config: {
|
|
27
|
+
base_url: string;
|
|
28
|
+
api_key: string;
|
|
29
|
+
};
|
|
30
|
+
/** Optional project-level concurrency; the `run*` methods take a `concurrency` argument documented as overriding it. */
concurrency?: number;
|
|
31
|
+
}
|
|
32
|
+
/** Severity levels accepted by `LatticeEval.record` and stored in `LatticeEvalLogEvent.level`. */
type LatticeEvalLogLevel = "debug" | "info" | "warn" | "error";
|
|
33
|
+
/**
* A single log entry; arrays of these are returned by `LatticeEval.getInMemoryLogs()`
* and carried in the `logs` field of case run results.
*/
interface LatticeEvalLogEvent {
|
|
34
|
+
/** Timestamp as a string — NOTE(review): exact format (e.g. ISO 8601) is not visible here; confirm. */
ts: string;
|
|
35
|
+
level: LatticeEvalLogLevel;
|
|
36
|
+
message: string;
|
|
37
|
+
/** Optional structured payload attached to the entry. */
data?: Record<string, unknown>;
|
|
38
|
+
}
|
|
39
|
+
interface LatticeEvalReportConfig {
|
|
40
|
+
/**
|
|
41
|
+
* Output directory for each batch run.
|
|
42
|
+
* A subfolder will be created per batch.
|
|
43
|
+
*/
|
|
44
|
+
output_dir: string;
|
|
45
|
+
/**
|
|
46
|
+
* Optional batch id. If not set, a timestamp-based id will be generated per run.
|
|
47
|
+
*/
|
|
48
|
+
batch_id?: string;
|
|
49
|
+
/**
|
|
50
|
+
* When true, writes `report.json` into the batch folder.
|
|
51
|
+
* Defaults to true.
|
|
52
|
+
*/
|
|
53
|
+
write_report_json?: boolean;
|
|
54
|
+
/**
|
|
55
|
+
* When true, writes per-case log files into the batch folder.
|
|
56
|
+
* Defaults to true.
|
|
57
|
+
*/
|
|
58
|
+
write_case_logs?: boolean;
|
|
59
|
+
}
|
|
60
|
+
/**
* Report for one batch run; returned as `report` by `LatticeEvalProject.runAllSuitesBatch`.
*/
interface LatticeEvalBatchReport {
|
|
61
|
+
batch_id: string;
|
|
62
|
+
/** Start time as a string — NOTE(review): format not visible here; confirm. */
started_at: string;
|
|
63
|
+
/** Finish time as a string — NOTE(review): format not visible here; confirm. */
finished_at: string;
|
|
64
|
+
/** Identifying fields of the project (same names as in `LatticeEvalProjectType`). */
project: {
|
|
65
|
+
projectName: string;
|
|
66
|
+
version?: string;
|
|
67
|
+
description?: string;
|
|
68
|
+
};
|
|
69
|
+
/** Aggregate counts across the whole batch. */
summary: {
|
|
70
|
+
total_cases: number;
|
|
71
|
+
passed_cases: number;
|
|
72
|
+
failed_cases: number;
|
|
73
|
+
/** NOTE(review): scale (0–1 vs 0–100) is not visible here — confirm against the report builder. */
pass_rate: number;
|
|
74
|
+
};
|
|
75
|
+
/** Per-suite counts and per-case outcomes. */
suites: Array<{
|
|
76
|
+
suiteName: string;
|
|
77
|
+
total_cases: number;
|
|
78
|
+
passed_cases: number;
|
|
79
|
+
failed_cases: number;
|
|
80
|
+
cases: Array<{
|
|
81
|
+
caseId: string;
|
|
82
|
+
/** Optional; presumably absent when the case did not produce a result — confirm. */
pass?: boolean;
|
|
83
|
+
final_score?: number;
|
|
84
|
+
error?: string;
|
|
85
|
+
}>;
|
|
86
|
+
}>;
|
|
87
|
+
}
|
|
88
|
+
/** A suite entry: either a fully specified case or a case that references a template via `templateId`. */
type LatticeEvalCaseType = LatticeEvalCase | LatticeEvalCaseWithTemplate;
|
|
89
|
+
/**
* Definition of one evaluation suite; accepted by the `LatticeEvalSuite` constructor
* and listed in `LatticeEvalProjectType.suites`.
*/
interface LatticeEvalSuiteType {
|
|
90
|
+
suiteName: string;
|
|
91
|
+
version?: string;
|
|
92
|
+
/** Cases in the suite; entries may be plain cases or template-based cases. */
cases: LatticeEvalCaseType[];
|
|
93
|
+
}
|
|
94
|
+
/**
* One scoring rubric entry, used in the optional `eval_rubrics` lists of cases and templates.
*/
interface LatticeEvalRubric {
|
|
95
|
+
/** Name of the dimension being scored. */
dimension: string;
|
|
96
|
+
/** Numeric weight — NOTE(review): whether weights must sum to a fixed total is not visible here; confirm. */
weight: number;
|
|
97
|
+
description: string;
|
|
98
|
+
}
|
|
99
|
+
/**
* A fully specified evaluation case. This is the type accepted by `LatticeEval.evaluateCase`
* and returned by `LatticeEvalSuite.getCases()` / `getCase()`.
*/
interface LatticeEvalCase {
|
|
100
|
+
caseId: string;
|
|
101
|
+
/** Case input: a required message plus an optional string-to-string `files` map. */
input: {
|
|
102
|
+
message: string;
|
|
103
|
+
/** NOTE(review): presumably file path -> file content; confirm against the runner. */
files?: Record<string, string>;
|
|
104
|
+
};
|
|
105
|
+
/** Agent steps configured for this case. */
steps: LatticeAgentStepConfig[];
|
|
106
|
+
/** Describes which output (file content or message content) the case refers to. */
output: OutputType;
|
|
107
|
+
/** Evaluation criteria: a required assertion string and optional rubrics. */
eval: {
|
|
108
|
+
content_assertion: string;
|
|
109
|
+
eval_rubrics?: LatticeEvalRubric[];
|
|
110
|
+
};
|
|
111
|
+
}
|
|
112
|
+
/**
* A case that references a template by `templateId`.
* Compared with `LatticeEvalCase`: `input.message` and `output` are optional, there is no
* `steps` field, and `input` may carry `variables`.
* `LatticeEvalSuite.getCases()` is documented as returning cases "resolved from templates if needed".
*/
interface LatticeEvalCaseWithTemplate {
|
|
113
|
+
caseId: string;
|
|
114
|
+
/** Id of the `LatticeEvalTemplate` this case is based on. */
templateId: string;
|
|
115
|
+
input: {
|
|
116
|
+
message?: string;
|
|
117
|
+
files?: Record<string, string>;
|
|
118
|
+
/** NOTE(review): presumably values for the names listed in the template's `input_schema.variables`; confirm. */
variables?: Record<string, string>;
|
|
119
|
+
};
|
|
120
|
+
output?: OutputType;
|
|
121
|
+
eval: {
|
|
122
|
+
content_assertion: string;
|
|
123
|
+
eval_rubrics?: LatticeEvalRubric[];
|
|
124
|
+
};
|
|
125
|
+
}
|
|
126
|
+
/**
* A reusable case template, identified by `templateId` and referenced from
* `LatticeEvalCaseWithTemplate.templateId`.
*/
interface LatticeEvalTemplate {
|
|
127
|
+
templateId: string;
|
|
128
|
+
description?: string;
|
|
129
|
+
/** Declares the inputs a case may or must supply; both lists are optional. */
input_schema: {
|
|
130
|
+
required_files?: string[];
|
|
131
|
+
variables?: string[];
|
|
132
|
+
};
|
|
133
|
+
/**
* Default case fields: a `LatticeEvalCase` without `caseId`, and with `eval` replaced by an
* optional object that holds only `eval_rubrics`. Because the type is derived with `Omit`,
* `input.message`, `steps` and `output` remain required here.
*/
default_case: Omit<LatticeEvalCase, "caseId" | "eval"> & {
|
|
134
|
+
eval?: {
|
|
135
|
+
eval_rubrics?: LatticeEvalRubric[];
|
|
136
|
+
};
|
|
137
|
+
};
|
|
138
|
+
}
|
|
139
|
+
/**
* Result of evaluating one case; the resolved value of `LatticeEval.evaluateCase`
* and `evaluateLatticeCase`.
*/
interface LatticeEvalResult {
|
|
140
|
+
pass: boolean;
|
|
141
|
+
/** NOTE(review): score scale is not visible here — confirm against the judge prompt/implementation. */
final_score: number;
|
|
142
|
+
/** One entry per scored dimension, each with a score and a textual reason. */
dimension_results: {
|
|
143
|
+
name: string;
|
|
144
|
+
score: number;
|
|
145
|
+
reason: string;
|
|
146
|
+
}[];
|
|
147
|
+
summary: string;
|
|
148
|
+
/** Optional error message. */
error?: string;
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
/**
|
|
152
|
+
* Configuration for Lattice evaluation server
|
|
153
|
+
*/
|
|
154
|
+
interface LatticeEvalConfig {
|
|
155
|
+
base_url: string;
|
|
156
|
+
api_key?: string;
|
|
157
|
+
/**
|
|
158
|
+
* When true, prints detailed execution logs for each action.
|
|
159
|
+
* Defaults to true.
|
|
160
|
+
*/
|
|
161
|
+
verbose?: boolean;
|
|
162
|
+
}
|
|
163
|
+
/**
* Resolved value of `evaluateLatticeCaseWithLogs`: the evaluation result and/or error for one
* case, together with logs and run metadata.
* NOTE(review): `CaseRunResult` declares the same fields but with `duration_ms` optional — confirm
* whether the two types are meant to differ.
*/
interface LatticeEvalCaseRunResult {
|
|
164
|
+
caseId: string;
|
|
165
|
+
result?: LatticeEvalResult;
|
|
166
|
+
error?: string;
|
|
167
|
+
error_stack?: string;
|
|
168
|
+
/** Duration in milliseconds (per the field name). */
duration_ms: number;
|
|
169
|
+
thread_id?: string;
|
|
170
|
+
judge_thread_id?: string;
|
|
171
|
+
test_prompt?: string;
|
|
172
|
+
final_output?: string;
|
|
173
|
+
/** Log events collected for this run. */
logs: LatticeEvalLogEvent[];
|
|
174
|
+
}
|
|
175
|
+
/**
|
|
176
|
+
* Evaluates individual Lattice evaluation cases and keeps in-memory logs and metadata from the last run.
|
|
177
|
+
*/
|
|
178
|
+
declare class LatticeEval {
|
|
179
|
+
private config;
|
|
180
|
+
private baseUrl;
|
|
181
|
+
private verbose;
|
|
182
|
+
private inMemoryLogs;
|
|
183
|
+
private lastThreadId?;
|
|
184
|
+
private lastJudgeThreadId?;
|
|
185
|
+
private lastTestPrompt?;
|
|
186
|
+
private lastFinalOutput?;
|
|
187
|
+
private lastDurationMs;
|
|
188
|
+
getLastRunMeta(): {
|
|
189
|
+
duration_ms: number;
|
|
190
|
+
thread_id: string | undefined;
|
|
191
|
+
judge_thread_id: string | undefined;
|
|
192
|
+
test_prompt: string | undefined;
|
|
193
|
+
final_output: string | undefined;
|
|
194
|
+
};
|
|
195
|
+
/**
|
|
196
|
+
* Create a new LatticeEval instance
|
|
197
|
+
* @param config Server configuration (required; `base_url` must be provided, see `LatticeEvalConfig`)
|
|
198
|
+
*/
|
|
199
|
+
constructor(config: LatticeEvalConfig);
|
|
200
|
+
getInMemoryLogs(): LatticeEvalLogEvent[];
|
|
201
|
+
record(level: LatticeEvalLogLevel, message: string, data?: Record<string, unknown>): void;
|
|
202
|
+
private log;
|
|
203
|
+
private getKeyInfo;
|
|
204
|
+
/**
|
|
205
|
+
* Execute a single agent step and return the thread ID and response data
|
|
206
|
+
*/
|
|
207
|
+
private executeAgentStep;
|
|
208
|
+
/**
|
|
209
|
+
* Extract output content based on OutputType
|
|
210
|
+
*/
|
|
211
|
+
private extractOutput;
|
|
212
|
+
/**
|
|
213
|
+
* Evaluate a single Lattice evaluation case
|
|
214
|
+
* @param evalCase The evaluation case to run
|
|
215
|
+
* @returns Evaluation result with pass/fail status and scores
|
|
216
|
+
*/
|
|
217
|
+
evaluateCase(evalCase: LatticeEvalCase): Promise<LatticeEvalResult>;
|
|
218
|
+
}
|
|
219
|
+
/**
|
|
220
|
+
* Evaluate a single Lattice evaluation case (backward compatibility function)
|
|
221
|
+
* @param evalCase The evaluation case to run
|
|
222
|
+
* @param config Optional server configuration (defaults to localhost:3203)
|
|
223
|
+
* @returns Evaluation result with pass/fail status and scores
|
|
224
|
+
* @deprecated Use LatticeEval class instead
|
|
225
|
+
*/
|
|
226
|
+
declare function evaluateLatticeCase(evalCase: LatticeEvalCase, config?: LatticeEvalConfig): Promise<LatticeEvalResult>;
|
|
227
|
+
/**
|
|
228
|
+
* Evaluate a single Lattice evaluation case and always return logs (never throws).
|
|
229
|
+
*/
|
|
230
|
+
declare function evaluateLatticeCaseWithLogs(evalCase: LatticeEvalCase, config?: LatticeEvalConfig): Promise<LatticeEvalCaseRunResult>;
|
|
231
|
+
|
|
232
|
+
/**
|
|
233
|
+
* Configuration resolved from project/suite hierarchy
|
|
234
|
+
*/
|
|
235
|
+
interface ResolvedConfig {
|
|
236
|
+
lattice_server_config: {
|
|
237
|
+
base_url: string;
|
|
238
|
+
api_key?: string;
|
|
239
|
+
};
|
|
240
|
+
judge_agent_config?: {
|
|
241
|
+
model: LLMConfig;
|
|
242
|
+
};
|
|
243
|
+
concurrency: number;
|
|
244
|
+
}
|
|
245
|
+
/**
|
|
246
|
+
* Outcome of running one case: the evaluation `result` and/or an `error` message, plus logs and run metadata
|
|
247
|
+
*/
|
|
248
|
+
interface CaseRunResult {
|
|
249
|
+
caseId: string;
|
|
250
|
+
result?: LatticeEvalResult;
|
|
251
|
+
error?: string;
|
|
252
|
+
logs: LatticeEvalLogEvent[];
|
|
253
|
+
duration_ms?: number;
|
|
254
|
+
thread_id?: string;
|
|
255
|
+
judge_thread_id?: string;
|
|
256
|
+
test_prompt?: string;
|
|
257
|
+
final_output?: string;
|
|
258
|
+
error_stack?: string;
|
|
259
|
+
}
|
|
260
|
+
/**
|
|
261
|
+
* LatticeEvalSuite class manages a suite of evaluation cases
|
|
262
|
+
* with suite-level configuration
|
|
263
|
+
*/
|
|
264
|
+
declare class LatticeEvalSuite {
|
|
265
|
+
private suite;
|
|
266
|
+
private projectConfig;
|
|
267
|
+
private templates;
|
|
268
|
+
constructor(suite: LatticeEvalSuiteType, projectConfig: ResolvedConfig, templates?: Map<string, LatticeEvalTemplate>);
|
|
269
|
+
/**
|
|
270
|
+
* Get resolved configuration from project
|
|
271
|
+
*/
|
|
272
|
+
private getResolvedConfig;
|
|
273
|
+
/**
|
|
274
|
+
* Get suite name
|
|
275
|
+
*/
|
|
276
|
+
getSuiteName(): string;
|
|
277
|
+
/**
|
|
278
|
+
* Get suite version
|
|
279
|
+
*/
|
|
280
|
+
getVersion(): string | undefined;
|
|
281
|
+
/**
|
|
282
|
+
* Get all cases in this suite (resolved from templates if needed)
|
|
283
|
+
*/
|
|
284
|
+
getCases(): LatticeEvalCase[];
|
|
285
|
+
/**
|
|
286
|
+
* Get a specific case by ID (resolved from template if needed)
|
|
287
|
+
*/
|
|
288
|
+
getCase(caseId: string): LatticeEvalCase | undefined;
|
|
289
|
+
/**
|
|
290
|
+
* Run a single case in this suite with error handling
|
|
291
|
+
* @param caseId The case ID to run
|
|
292
|
+
* @returns Case run result with error handling
|
|
293
|
+
*/
|
|
294
|
+
runCase(caseId: string): Promise<CaseRunResult>;
|
|
295
|
+
/**
|
|
296
|
+
* Run all cases in this suite with concurrency control and error isolation
|
|
297
|
+
* @param concurrency Optional concurrency limit (overrides project config)
|
|
298
|
+
* @returns Array of case run results with error handling
|
|
299
|
+
*/
|
|
300
|
+
runAllCases(concurrency?: number): Promise<CaseRunResult[]>;
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
/**
|
|
304
|
+
* LatticeEvalProject class manages a project with multiple evaluation suites
|
|
305
|
+
* with project-level configuration
|
|
306
|
+
*/
|
|
307
|
+
declare class LatticeEvalProject {
|
|
308
|
+
private project;
|
|
309
|
+
private suites;
|
|
310
|
+
private reportConfig?;
|
|
311
|
+
constructor(project: LatticeEvalProjectType);
|
|
312
|
+
/**
|
|
313
|
+
* Get project name
|
|
314
|
+
*/
|
|
315
|
+
getProjectName(): string;
|
|
316
|
+
/**
|
|
317
|
+
* Get project version
|
|
318
|
+
*/
|
|
319
|
+
getVersion(): string | undefined;
|
|
320
|
+
/**
|
|
321
|
+
* Get project description
|
|
322
|
+
*/
|
|
323
|
+
getDescription(): string | undefined;
|
|
324
|
+
/**
|
|
325
|
+
* Get all suite names
|
|
326
|
+
*/
|
|
327
|
+
getSuiteNames(): string[];
|
|
328
|
+
/**
|
|
329
|
+
* Get a specific suite by name
|
|
330
|
+
*/
|
|
331
|
+
getSuite(suiteName: string): LatticeEvalSuite | undefined;
|
|
332
|
+
/**
|
|
333
|
+
* Run a specific case in a specific suite
|
|
334
|
+
* @param suiteName The suite name
|
|
335
|
+
* @param caseId The case ID to run
|
|
336
|
+
* @returns Case run result with error handling
|
|
337
|
+
*/
|
|
338
|
+
runCase(suiteName: string, caseId: string): Promise<CaseRunResult>;
|
|
339
|
+
/**
|
|
340
|
+
* Run all cases in a specific suite with concurrency control and error isolation
|
|
341
|
+
* @param suiteName The suite name
|
|
342
|
+
* @param concurrency Optional concurrency limit (overrides project config)
|
|
343
|
+
* @returns Array of case run results with error handling
|
|
344
|
+
*/
|
|
345
|
+
runSuite(suiteName: string, concurrency?: number): Promise<CaseRunResult[]>;
|
|
346
|
+
/**
|
|
347
|
+
* Run all cases in all suites with concurrency control and error isolation
|
|
348
|
+
* @param concurrency Optional concurrency limit (overrides project config)
|
|
349
|
+
* @returns Map of suite names to their case run results
|
|
350
|
+
*/
|
|
351
|
+
runAllSuites(concurrency?: number): Promise<Map<string, CaseRunResult[]>>;
|
|
352
|
+
/**
|
|
353
|
+
* Run all suites as a "batch", build a report, and optionally write it to disk.
|
|
354
|
+
*/
|
|
355
|
+
runAllSuitesBatch(concurrency?: number): Promise<{
|
|
356
|
+
batch_id: string;
|
|
357
|
+
batch_dir?: string;
|
|
358
|
+
results: Map<string, CaseRunResult[]>;
|
|
359
|
+
report: LatticeEvalBatchReport;
|
|
360
|
+
}>;
|
|
361
|
+
private generateCaseMarkdown;
|
|
362
|
+
private generateMarkdownSummary;
|
|
363
|
+
private maybeWriteBatchArtifacts;
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
export { type CaseRunResult, type LatticeAgentStepConfig, LatticeEval, type LatticeEvalBatchReport, type LatticeEvalCase, type LatticeEvalCaseRunResult, type LatticeEvalCaseType, type LatticeEvalCaseWithTemplate, type LatticeEvalConfig, type LatticeEvalLogEvent, type LatticeEvalLogLevel, LatticeEvalProject, type LatticeEvalProjectType, type LatticeEvalReportConfig, type LatticeEvalResult, type LatticeEvalRubric, LatticeEvalSuite, type LatticeEvalSuiteType, type LatticeEvalTemplate, type OutputFileContent, type OutputMessageContent, type OutputType, type ResolvedConfig, evaluateLatticeCase, evaluateLatticeCaseWithLogs };
|