@axiom-lattice/agent-eval 2.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,321 @@
1
+ import type { LLMConfig } from "@axiom-lattice/protocols";
2
+ import type {
3
+ LatticeEvalSuiteType,
4
+ LatticeEvalCase,
5
+ LatticeEvalCaseType,
6
+ LatticeEvalCaseWithTemplate,
7
+ LatticeEvalTemplate,
8
+ LatticeEvalLogEvent,
9
+ LatticeEvalResult,
10
+ } from "./types";
11
+ import {
12
+ evaluateLatticeCaseWithLogs,
13
+ LatticeEvalConfig,
14
+ } from "./LatticeEval";
15
+
16
/**
 * Configuration a suite runs with, as resolved from the project/suite hierarchy.
 */
export interface ResolvedConfig {
  /** Connection settings for the Lattice server the cases are evaluated against. */
  lattice_server_config: {
    /** Forwarded to the evaluator as `base_url`. */
    base_url: string;
    /** Forwarded to the evaluator as `api_key`; may be omitted. */
    api_key?: string;
  };
  /** Optional judge agent settings (the LLM configuration used as the judge model). */
  judge_agent_config?: {
    model: LLMConfig;
  };
  /** Number of cases to run concurrently. */
  concurrency: number;
}
29
+
30
/**
 * Outcome of running one case, with failures reported as data instead of thrown.
 *
 * Apart from `caseId`, the fields are copied from the evaluator's run output.
 * When the case cannot be run at all, only `caseId`, `error` and an empty
 * `logs` array are populated.
 */
export interface CaseRunResult {
  /** ID of the case this result belongs to. */
  caseId: string;
  /** Evaluation verdict, when the evaluator produced one. */
  result?: LatticeEvalResult;
  /** Error message reported by the evaluator or captured from a thrown exception. */
  error?: string;
  /** Log events collected during the run; empty when the case could not be run. */
  logs: LatticeEvalLogEvent[];
  /** Run duration in milliseconds, as reported by the evaluator. */
  duration_ms?: number;
  /** Thread ID reported by the evaluator for the run. */
  thread_id?: string;
  /** Thread ID reported by the evaluator for the judge. */
  judge_thread_id?: string;
  /** Test prompt reported by the evaluator. */
  test_prompt?: string;
  /** Final output reported by the evaluator. */
  final_output?: string;
  /** Stack trace accompanying `error`, when available. */
  error_stack?: string;
}
45
+
46
/**
 * Runs async tasks with a cap on how many are in flight at once, isolating failures.
 *
 * Each task's outcome is stored at the task's own index, so the returned array
 * lines up with `tasks` regardless of completion order. A task that throws or
 * rejects is recorded as `{ success: false, error }` and does not stop the
 * others; this function never rejects because of a task failure.
 *
 * @param tasks Factories that start the work when called.
 * @param concurrency Maximum number of tasks in flight. Values that are not
 *   greater than zero (0, negatives, NaN, or a value missing at runtime) are
 *   treated as 1 so the run always makes progress instead of spinning forever.
 * @returns One entry per task, in input order.
 */
async function limitConcurrency<T>(
  tasks: Array<() => Promise<T>>,
  concurrency: number
): Promise<Array<{ success: boolean; result?: T; error?: string }>> {
  const results: Array<{ success: boolean; result?: T; error?: string }> = [];

  // `concurrency > 0` is false for 0, negatives, NaN and undefined; all of those
  // fall back to serial execution. Math.ceil keeps fractional limits behaving as
  // "fewer than `concurrency` already running", and Math.min caps Infinity.
  const limit = concurrency > 0 ? Math.ceil(concurrency) : 1;
  const workerCount = Math.min(limit, tasks.length);

  let nextIndex = 0;

  // Each worker repeatedly claims the next unstarted task until none remain.
  // Claiming is a synchronous increment, so no two workers take the same index.
  const worker = async (): Promise<void> => {
    while (nextIndex < tasks.length) {
      const taskIndex = nextIndex++;
      try {
        const result = await tasks[taskIndex]();
        results[taskIndex] = { success: true, result };
      } catch (error) {
        // Capture the failure in the result slot; never let it escape the worker.
        results[taskIndex] = {
          success: false,
          error: error instanceof Error ? error.message : String(error),
        };
      }
    }
  };

  // Workers cannot reject (every task call is wrapped above), so Promise.all
  // simply waits for the pool to drain.
  await Promise.all(Array.from({ length: workerCount }, () => worker()));

  return results;
}
109
+
110
/**
 * Expands a template-backed case into a standalone case.
 *
 * Merge rules:
 * - `input.message`: the case's message, falling back to the template default
 *   when the case's message is null or undefined.
 * - `input.files`: template default files overlaid with the case's files
 *   (the case wins on key collisions).
 * - `steps`: always taken from the template.
 * - `output`: the case's output if set, otherwise the template default.
 * - `eval.content_assertion`: always taken from the case.
 * - `eval.eval_rubrics`: the case's rubrics if set, otherwise the template's.
 *
 * NOTE(review): `templateCase.input.variables` is not read here — confirm
 * whether variable substitution is expected to happen elsewhere.
 *
 * @param templateCase Case that references a template by `templateId`.
 * @param templates Known templates keyed by template ID.
 * @returns The fully resolved case.
 * @throws Error if `templateCase.templateId` is not present in `templates`.
 */
function resolveTemplateCase(
  templateCase: LatticeEvalCaseWithTemplate,
  templates: Map<string, LatticeEvalTemplate>
): LatticeEvalCase {
  const template = templates.get(templateCase.templateId);
  if (!template) {
    throw new Error(`Template not found: ${templateCase.templateId}`);
  }

  const defaults = template.default_case;
  const overrides = templateCase.input;

  const message = overrides.message ?? defaults.input.message;
  const files = { ...defaults.input.files, ...overrides.files };
  const output = templateCase.output || defaults.output;
  const rubrics = templateCase.eval.eval_rubrics || defaults.eval?.eval_rubrics;

  return {
    caseId: templateCase.caseId,
    input: { message, files },
    steps: defaults.steps,
    output,
    eval: {
      content_assertion: templateCase.eval.content_assertion,
      eval_rubrics: rubrics,
    },
  };
}
143
+
144
/**
 * Type guard that tells template-backed cases apart from fully specified ones.
 *
 * @param case_ Case entry as stored in a suite.
 * @returns `true` when the entry has a `templateId` property.
 */
function isTemplateCase(
  case_: LatticeEvalCaseType
): case_ is LatticeEvalCaseWithTemplate {
  const referencesTemplate = "templateId" in case_;
  return referencesTemplate;
}
152
+
153
/**
 * Manages a suite of evaluation cases together with the configuration they
 * run under.
 *
 * Cases may be fully specified or template-backed; template-backed cases are
 * resolved against the `templates` map whenever they are read or run.
 */
export class LatticeEvalSuite {
  private readonly suite: LatticeEvalSuiteType;
  private readonly projectConfig: ResolvedConfig;
  private readonly templates: Map<string, LatticeEvalTemplate>;

  /**
   * @param suite Suite definition (name, optional version, cases).
   * @param projectConfig Configuration the cases run with.
   * @param templates Templates keyed by template ID; defaults to an empty map.
   */
  constructor(
    suite: LatticeEvalSuiteType,
    projectConfig: ResolvedConfig,
    templates: Map<string, LatticeEvalTemplate> = new Map()
  ) {
    this.suite = suite;
    this.projectConfig = projectConfig;
    this.templates = templates;
  }

  /**
   * Get resolved configuration from project
   */
  private getResolvedConfig(): ResolvedConfig {
    return this.projectConfig;
  }

  /**
   * Get suite name
   */
  getSuiteName(): string {
    return this.suite.suiteName;
  }

  /**
   * Get suite version
   */
  getVersion(): string | undefined {
    return this.suite.version;
  }

  /**
   * Get all cases in this suite (resolved from templates if needed).
   *
   * @throws Error if any template-backed case references an unknown template.
   */
  getCases(): LatticeEvalCase[] {
    return this.suite.cases.map((case_) => this.resolveCase(case_));
  }

  /**
   * Get a specific case by ID (resolved from its template if needed).
   *
   * @returns The resolved case, or `undefined` when no case has that ID.
   * @throws Error if the case references an unknown template.
   */
  getCase(caseId: string): LatticeEvalCase | undefined {
    const case_ = this.suite.cases.find((c) => c.caseId === caseId);
    return case_ ? this.resolveCase(case_) : undefined;
  }

  /**
   * Run a single case in this suite. Never rejects: lookup, template
   * resolution and evaluation failures are all reported through `error`.
   *
   * @param caseId The case ID to run
   * @returns Case run result with error handling
   */
  async runCase(caseId: string): Promise<CaseRunResult> {
    try {
      const case_ = this.suite.cases.find((c) => c.caseId === caseId);
      if (!case_) {
        return {
          caseId,
          error: `Case not found: ${caseId}`,
          logs: [],
        };
      }
      return await this.evaluateCase(case_);
    } catch (error) {
      return LatticeEvalSuite.toFailure(caseId, error);
    }
  }

  /**
   * Run all cases in this suite with concurrency control and error isolation.
   *
   * @param concurrency Optional concurrency limit (overrides project config).
   *   A limit that is not greater than zero falls back to 1, because such a
   *   value would otherwise prevent the scheduler from starting any case.
   * @returns One result per case, in suite order.
   */
  async runAllCases(concurrency?: number): Promise<CaseRunResult[]> {
    const requested = concurrency ?? this.getResolvedConfig().concurrency;
    const maxConcurrency = requested > 0 ? requested : 1;

    // One lazily started task per case; evaluateCase already converts failures
    // into CaseRunResult values, so these tasks are not expected to reject.
    const tasks = this.suite.cases.map(
      (case_) => () => this.evaluateCase(case_)
    );

    const taskResults = await limitConcurrency(tasks, maxConcurrency);

    // Safety net for a task the scheduler reports as failed or without a result.
    return taskResults.map((taskResult, index) => {
      if (taskResult.success && taskResult.result) {
        return taskResult.result;
      }
      return {
        caseId: this.suite.cases[index].caseId,
        error: taskResult.error || "Unknown error",
        logs: [],
      };
    });
  }

  /** Resolve a suite entry to a full case, expanding its template if it has one. */
  private resolveCase(case_: LatticeEvalCaseType): LatticeEvalCase {
    return isTemplateCase(case_)
      ? resolveTemplateCase(case_, this.templates)
      : case_;
  }

  /**
   * Resolve and evaluate one suite entry, converting any thrown error into a
   * failed CaseRunResult. Shared by runCase and runAllCases so both report
   * results identically.
   *
   * NOTE(review): only `base_url` and `api_key` are forwarded to the
   * evaluator; `judge_agent_config` is not — confirm that this is intended.
   */
  private async evaluateCase(
    case_: LatticeEvalCaseType
  ): Promise<CaseRunResult> {
    try {
      const evalCase = this.resolveCase(case_);
      const serverConfig = this.getResolvedConfig().lattice_server_config;
      const evalConfig: LatticeEvalConfig = {
        base_url: serverConfig.base_url,
        api_key: serverConfig.api_key,
      };

      const run = await evaluateLatticeCaseWithLogs(evalCase, evalConfig);
      return {
        caseId: evalCase.caseId,
        result: run.result,
        error: run.error,
        error_stack: run.error_stack,
        duration_ms: run.duration_ms,
        thread_id: run.thread_id,
        judge_thread_id: run.judge_thread_id,
        test_prompt: run.test_prompt,
        final_output: run.final_output,
        logs: run.logs,
      };
    } catch (error) {
      return LatticeEvalSuite.toFailure(case_.caseId, error);
    }
  }

  /**
   * Build the result for a case that threw before producing a run, keeping
   * the stack trace when the thrown value is an Error.
   */
  private static toFailure(caseId: string, error: unknown): CaseRunResult {
    if (error instanceof Error) {
      return {
        caseId,
        error: error.message,
        error_stack: error.stack,
        logs: [],
      };
    }
    return {
      caseId,
      error: String(error),
      logs: [],
    };
  }
}
package/src/index.ts ADDED
@@ -0,0 +1,4 @@
1
+ export * from "./types";
2
+ export * from "./LatticeEval";
3
+ export * from "./LatticeEvalSuite";
4
+ export * from "./LatticeEvalProject";
package/src/test.ts ADDED
@@ -0,0 +1,23 @@
1
+ import { LatticeEvalProject } from "./LatticeEvalProject";
2
+ import { fuliEvalProject } from "./mock/fuli_eval_project";
3
+
4
/**
 * Logs `error` under `label` and terminates the process with exit code 1.
 */
function exitWithError(label: string, error: unknown): never {
  console.error(label, error);
  process.exit(1);
}

/**
 * Test runner for the evaluation project.
 *
 * Builds the project from the mock definition and runs every suite as one
 * batch; per the original note, logging and persistence are handled inside
 * `runAllSuitesBatch`. A failing batch is logged and exits with code 1.
 */
async function runTest(): Promise<void> {
  const project = new LatticeEvalProject(fuliEvalProject);

  try {
    await project.runAllSuitesBatch();
  } catch (error) {
    exitWithError("Error running tests:", error);
  }
}

// Run the test; anything that escapes runTest itself is treated as fatal.
runTest().catch((error) => {
  exitWithError("Fatal error:", error);
});
package/src/types.ts ADDED
@@ -0,0 +1,160 @@
1
+ import { LLMConfig } from "@axiom-lattice/protocols";
2
+
3
/**
 * One step of a case: which agent to run and, optionally, a replacement input.
 */
export interface LatticeAgentStepConfig {
  /** ID of the agent to run for this step. */
  agent_id: string;
  /** Optional message override for this step's input. */
  override_input_message?: string;
}
7
/** Output selector pointing at the content of a file. */
export type OutputFileContent = {
  type: "file_content";
  file_path: string;
};

/** Output selector pointing at message content. */
export type OutputMessageContent = {
  type: "message_content";
  message: string;
};

/** Output selector, discriminated by its `type` field. */
export type OutputType = OutputFileContent | OutputMessageContent;
16
+
17
/**
 * Top-level definition of an evaluation project: its suites, shared templates
 * and the configuration the suites run with.
 */
export interface LatticeEvalProjectType {
  projectName: string;
  version?: string;
  description?: string;
  /** Suites that make up the project. */
  suites: LatticeEvalSuiteType[];
  /** Templates that template-backed cases can reference by `templateId`. */
  templates?: LatticeEvalTemplate[];
  /** Optional report settings for batch runs. */
  report_config?: LatticeEvalReportConfig;
  /** Judge agent settings (the LLM configuration used as the judge model). */
  judge_agent_config: {
    model: LLMConfig;
  };
  /** Connection settings for the Lattice server. */
  lattice_server_config: {
    base_url: string;
    api_key: string;
  };
  /** Number of cases to run concurrently (default: 1). */
  concurrency?: number;
}
33
+
34
/** Severity levels a log event can carry. */
export type LatticeEvalLogLevel = "debug" | "info" | "warn" | "error";

/** A single log entry recorded while evaluating a case. */
export interface LatticeEvalLogEvent {
  /** ISO timestamp. */
  ts: string;
  level: LatticeEvalLogLevel;
  message: string;
  /** Optional structured payload attached to the entry. */
  data?: Record<string, unknown>;
}
42
+
43
/** Settings that control where and how batch-run reports are written. */
export interface LatticeEvalReportConfig {
  /**
   * Output directory for batch runs.
   * A subfolder is created for each batch.
   */
  output_dir: string;
  /**
   * Optional batch ID. When omitted, a timestamp-based ID is generated per run.
   */
  batch_id?: string;
  /**
   * Whether to write `report.json` into the batch folder.
   * Defaults to true.
   */
  write_report_json?: boolean;
  /**
   * Whether to write per-case log files into the batch folder.
   * Defaults to true.
   */
  write_case_logs?: boolean;
}
64
+
65
/** Summary of one batch run across every suite of a project. */
export interface LatticeEvalBatchReport {
  batch_id: string;
  started_at: string;
  finished_at: string;
  /** Identifying fields of the project that was run. */
  project: {
    projectName: string;
    version?: string;
    description?: string;
  };
  /** Totals across all suites. */
  summary: {
    total_cases: number;
    passed_cases: number;
    failed_cases: number;
    /** Pass rate in the range 0-1. */
    pass_rate: number;
  };
  /** Per-suite totals with the outcome of each case. */
  suites: {
    suiteName: string;
    total_cases: number;
    passed_cases: number;
    failed_cases: number;
    cases: {
      caseId: string;
      pass?: boolean;
      final_score?: number;
      error?: string;
    }[];
  }[];
}
93
+
94
/** A suite entry: either a fully specified case or one that references a template. */
export type LatticeEvalCaseType = LatticeEvalCase | LatticeEvalCaseWithTemplate;

/** A named collection of evaluation cases. */
export interface LatticeEvalSuiteType {
  suiteName: string;
  version?: string;
  /** Cases in the suite, in the order they are declared. */
  cases: LatticeEvalCaseType[];
}
101
+
102
/** One scoring dimension used to evaluate an output. */
export interface LatticeEvalRubric {
  /** Name of the dimension being scored. */
  dimension: string;
  /** Weight assigned to this dimension. */
  weight: number;
  /** Description of the dimension. */
  description: string;
}
107
+
108
+
109
/** A fully specified evaluation case. */
export interface LatticeEvalCase {
  caseId: string;
  /** Input for the case: a message and optional files keyed by string. */
  input: {
    message: string;
    files?: Record<string, string>;
  };
  /** Agent steps configured for the case. */
  steps: LatticeAgentStepConfig[];
  /** What content to check in the output. */
  output: OutputType;
  eval: {
    /** Expected natural-language description of the output. */
    content_assertion: string;
    /** Rubrics to evaluate the output. */
    eval_rubrics?: LatticeEvalRubric[];
  };
}
123
+
124
/**
 * An evaluation case that references a template by `templateId` and supplies
 * only its own input, output and eval fields.
 */
export interface LatticeEvalCaseWithTemplate {
  caseId: string;
  /** ID of the template this case references. */
  templateId: string;
  input: {
    message?: string;
    files?: Record<string, string>;
    variables?: Record<string, string>;
  };
  /** What content to check in the output. */
  output?: OutputType;
  eval: {
    /** Expected natural-language description of the output. */
    content_assertion: string;
    /** Rubrics to evaluate the output. */
    eval_rubrics?: LatticeEvalRubric[];
  };
}
138
+
139
/** A reusable case skeleton that template-backed cases reference by ID. */
export interface LatticeEvalTemplate {
  templateId: string;
  description?: string;
  /** Names of the files and variables declared for this template. */
  input_schema: {
    required_files?: string[];
    variables?: string[];
  };
  /**
   * Default case fields: a case without `caseId`, whose `eval` is optional and
   * carries only rubrics.
   */
  default_case: Omit<LatticeEvalCase, "caseId" | "eval"> & {
    eval?: { eval_rubrics?: LatticeEvalRubric[] };
  };
}
149
+
150
/** Verdict produced by evaluating one case. */
export interface LatticeEvalResult {
  /** Whether the case passed. */
  pass: boolean;
  final_score: number;
  /** Score and reason for each evaluated dimension. */
  dimension_results: Array<{
    name: string;
    score: number;
    reason: string;
  }>;
  summary: string;
  /** Error message if the case failed to run. */
  error?: string;
}
package/tsconfig.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2020",
4
+ "module": "preserve",
5
+ "lib": [
6
+ "ES2020"
7
+ ],
8
+ "outDir": "./dist",
9
+ "rootDir": "./src",
10
+ "strict": true,
11
+ "moduleResolution": "Bundler",
12
+ "esModuleInterop": true,
13
+ "skipLibCheck": true,
14
+ "forceConsistentCasingInFileNames": true,
15
+ "resolveJsonModule": true,
16
+ "declaration": true,
17
+ "declarationMap": true,
18
+ "types": [
19
+ "node",
20
+ "jest"
21
+ ],
22
+ "sourceMap": true,
23
+ "incremental": true, // Make sure incremental compilation is enabled
24
+ "tsBuildInfoFile": "./.tsbuildinfo" // Location of the build info file
25
+ },
26
+ "include": [
27
+ "src/index.ts"
28
+ ],
29
+ "exclude": [
30
+ "node_modules",
31
+ "dist"
32
+ ]
33
+ }