npm - @axiom-lattice/agent-eval - Versions diffs - 2.1.9 - Mend

@axiom-lattice/agent-eval 2.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/.env +29 -0
package/.turbo/turbo-build.log +20 -0
package/CHANGELOG.md +10 -0
package/LICENSE +201 -0
package/dist/index.d.mts +366 -0
package/dist/index.d.ts +366 -0
package/dist/index.js +1092 -0
package/dist/index.js.map +1 -0
package/dist/index.mjs +1055 -0
package/dist/index.mjs.map +1 -0
package/package.json +51 -0
package/src/LatticeEval.ts +615 -0
package/src/LatticeEvalProject.ts +496 -0
package/src/LatticeEvalSuite.ts +321 -0
package/src/index.ts +4 -0
package/src/test.ts +23 -0
package/src/types.ts +160 -0
package/tsconfig.json +33 -0

package/src/LatticeEvalSuite.ts ADDED Viewed

@@ -0,0 +1,321 @@
+import type { LLMConfig } from "@axiom-lattice/protocols";
+import type {
+  LatticeEvalSuiteType,
+  LatticeEvalCase,
+  LatticeEvalCaseType,
+  LatticeEvalCaseWithTemplate,
+  LatticeEvalTemplate,
+  LatticeEvalLogEvent,
+  LatticeEvalResult,
+} from "./types";
+import {
+  evaluateLatticeCaseWithLogs,
+  LatticeEvalConfig,
+} from "./LatticeEval";
+/**
+ * Configuration resolved from project/suite hierarchy.
+ * LatticeEvalSuite reads `lattice_server_config` to build the per-case
+ * evaluation config and uses `concurrency` as the default parallelism
+ * when running all cases.
+ */
+export interface ResolvedConfig {
+  lattice_server_config: { // forwarded as base_url / api_key to evaluateLatticeCaseWithLogs
+    base_url: string;
+    api_key?: string;
+  };
+  judge_agent_config?: { // not read by LatticeEvalSuite in this file
+    model: LLMConfig;
+  };
+  concurrency: number; // Number of cases to run concurrently
+}
+/**
+ * Result of running one case, with error handling.
+ * On success every field except `caseId` is copied from the object returned
+ * by evaluateLatticeCaseWithLogs; when the run throws, or the case cannot be
+ * found, only `caseId`, `error` and an empty `logs` array are populated.
+ */
+export interface CaseRunResult {
+  caseId: string;
+  result?: LatticeEvalResult;
+  error?: string; // error message; set when the run failed
+  logs: LatticeEvalLogEvent[];
+  duration_ms?: number;
+  thread_id?: string;
+  judge_thread_id?: string;
+  test_prompt?: string;
+  final_output?: string;
+  error_stack?: string;
+}
+/**
+ * Run async tasks with a bounded level of parallelism and per-task error
+ * isolation: a task that throws or rejects is recorded as a failure and never
+ * prevents the remaining tasks from running.
+ *
+ * @param tasks Task factories; each one is invoked at most once, in index order.
+ * @param concurrency Maximum number of tasks in flight. Fractional values are
+ *   rounded up; zero, negative and NaN values fall back to 1 so the call can
+ *   never spin without making progress.
+ * @returns One entry per task, at the same index as the task: either
+ *   `{ success: true, result }` or `{ success: false, error }`. This promise
+ *   never rejects because of a task failure.
+ */
+async function limitConcurrency<T>(
+  tasks: Array<() => Promise<T>>,
+  concurrency: number
+): Promise<Array<{ success: boolean; result?: T; error?: string }>> {
+  const results = new Array<{ success: boolean; result?: T; error?: string }>(
+    tasks.length
+  );
+  // `NaN > 0` is false, so NaN also takes the sequential fallback.
+  const limit = concurrency > 0 ? Math.ceil(concurrency) : 1;
+  const workerCount = Math.min(tasks.length, limit);
+  let nextIndex = 0;
+  // Turn any thrown value into a message without risking a second throw
+  // (String() can itself throw, e.g. for objects without a prototype).
+  const describeError = (error: unknown): string => {
+    if (error instanceof Error) {
+      return error.message;
+    }
+    try {
+      return String(error);
+    } catch {
+      return "Unknown error";
+    }
+  };
+  // Each worker repeatedly claims the next unstarted task until none are left.
+  // Claiming happens synchronously, so two workers never take the same index.
+  const worker = async (): Promise<void> => {
+    while (nextIndex < tasks.length) {
+      const taskIndex = nextIndex++;
+      try {
+        const result = await tasks[taskIndex]();
+        results[taskIndex] = { success: true, result };
+      } catch (error) {
+        results[taskIndex] = { success: false, error: describeError(error) };
+      }
+    }
+  };
+  // Workers swallow every task error, so Promise.all cannot reject here.
+  await Promise.all(Array.from({ length: workerCount }, () => worker()));
+  return results;
+}
+/**
+ * Expand a template-based case into a complete case by layering the case's
+ * own values over the referenced template's `default_case`.
+ *
+ * @param templateCase Case that references a template through `templateId`.
+ * @param templates Templates keyed by template id.
+ * @returns The fully populated case.
+ * @throws Error when `templates` has no entry for the case's `templateId`.
+ */
+function resolveTemplateCase(
+  templateCase: LatticeEvalCaseWithTemplate,
+  templates: Map<string, LatticeEvalTemplate>
+): LatticeEvalCase {
+  const { caseId, templateId } = templateCase;
+  const template = templates.get(templateId);
+  if (!template) {
+    throw new Error(`Template not found: ${templateId}`);
+  }
+  const defaults = template.default_case;
+  const overrides = templateCase.input;
+  // The case message wins; fall back to the template's only when it is absent.
+  const message = overrides.message ?? defaults.input.message;
+  // Template files first, so a case file with the same key replaces it.
+  const files = { ...defaults.input.files, ...overrides.files };
+  const output = templateCase.output || defaults.output;
+  const rubrics =
+    templateCase.eval.eval_rubrics || defaults.eval?.eval_rubrics;
+  // Steps always come from the template; the assertion always from the case.
+  return {
+    caseId,
+    input: { message, files },
+    steps: defaults.steps,
+    output,
+    eval: {
+      content_assertion: templateCase.eval.content_assertion,
+      eval_rubrics: rubrics,
+    },
+  };
+}
+/**
+ * Type guard: a case counts as template-based when it carries a
+ * `templateId` property.
+ */
+function isTemplateCase(
+  case_: LatticeEvalCaseType
+): case_ is LatticeEvalCaseWithTemplate {
+  const referencesTemplate = "templateId" in case_;
+  return referencesTemplate;
+}
+/**
+ * LatticeEvalSuite manages a suite of evaluation cases and runs them with
+ * the project-level configuration it was constructed with.
+ *
+ * Cases that reference a template are expanded against `templates` each
+ * time they are read or run. Failures while resolving or evaluating a case
+ * are reported inside the returned CaseRunResult rather than thrown by
+ * `runCase` / `runAllCases`.
+ */
+export class LatticeEvalSuite {
+  private readonly suite: LatticeEvalSuiteType;
+  private readonly projectConfig: ResolvedConfig;
+  private readonly templates: Map<string, LatticeEvalTemplate>;
+  /**
+   * @param suite Suite definition (name, optional version, cases).
+   * @param projectConfig Resolved project configuration used for every run.
+   * @param templates Templates keyed by template id; defaults to none.
+   */
+  constructor(
+    suite: LatticeEvalSuiteType,
+    projectConfig: ResolvedConfig,
+    templates: Map<string, LatticeEvalTemplate> = new Map()
+  ) {
+    this.suite = suite;
+    this.projectConfig = projectConfig;
+    this.templates = templates;
+  }
+  /**
+   * Get resolved configuration from project
+   */
+  private getResolvedConfig(): ResolvedConfig {
+    return this.projectConfig;
+  }
+  /**
+   * Expand a template-based case, or return a plain case unchanged.
+   * @throws Error when the referenced template is not registered.
+   */
+  private resolveCase(case_: LatticeEvalCaseType): LatticeEvalCase {
+    return isTemplateCase(case_)
+      ? resolveTemplateCase(case_, this.templates)
+      : case_;
+  }
+  /**
+   * Evaluate one fully resolved case and copy the run's fields into a
+   * CaseRunResult. Errors thrown by the evaluation propagate to the caller.
+   */
+  private async executeCase(evalCase: LatticeEvalCase): Promise<CaseRunResult> {
+    const server = this.getResolvedConfig().lattice_server_config;
+    const evalConfig: LatticeEvalConfig = {
+      base_url: server.base_url,
+      api_key: server.api_key,
+    };
+    const run = await evaluateLatticeCaseWithLogs(evalCase, evalConfig);
+    return {
+      caseId: evalCase.caseId,
+      result: run.result,
+      error: run.error,
+      error_stack: run.error_stack,
+      duration_ms: run.duration_ms,
+      thread_id: run.thread_id,
+      judge_thread_id: run.judge_thread_id,
+      test_prompt: run.test_prompt,
+      final_output: run.final_output,
+      logs: run.logs,
+    };
+  }
+  /**
+   * Build the result reported for a case whose run threw.
+   */
+  private failureResult(caseId: string, error: unknown): CaseRunResult {
+    return {
+      caseId,
+      error: error instanceof Error ? error.message : String(error),
+      logs: [],
+    };
+  }
+  /**
+   * Get suite name
+   */
+  getSuiteName(): string {
+    return this.suite.suiteName;
+  }
+  /**
+   * Get suite version
+   */
+  getVersion(): string | undefined {
+    return this.suite.version;
+  }
+  /**
+   * Get all cases in this suite (resolved from templates if needed)
+   * @throws Error when any case references a template that is not registered.
+   */
+  getCases(): LatticeEvalCase[] {
+    return this.suite.cases.map((case_) => this.resolveCase(case_));
+  }
+  /**
+   * Get a specific case by ID (resolved from template if needed)
+   * @returns The resolved case, or undefined when no case has that id.
+   * @throws Error when the case references a template that is not registered.
+   */
+  getCase(caseId: string): LatticeEvalCase | undefined {
+    const case_ = this.suite.cases.find((c) => c.caseId === caseId);
+    return case_ ? this.resolveCase(case_) : undefined;
+  }
+  /**
+   * Run a single case in this suite with error handling
+   * @param caseId The case ID to run
+   * @returns Case run result; lookup, template and evaluation failures are
+   *   reported through its `error` field instead of being thrown
+   */
+  async runCase(caseId: string): Promise<CaseRunResult> {
+    try {
+      const evalCase = this.getCase(caseId);
+      if (!evalCase) {
+        return {
+          caseId,
+          error: `Case not found: ${caseId}`,
+          logs: [],
+        };
+      }
+      return await this.executeCase(evalCase);
+    } catch (error) {
+      return this.failureResult(caseId, error);
+    }
+  }
+  /**
+   * Run all cases in this suite with concurrency control and error isolation
+   * @param concurrency Optional concurrency limit (overrides project config);
+   *   zero, negative and NaN limits are treated as 1
+   * @returns One result per case, in suite order; failures are reported
+   *   through each result's `error` field
+   */
+  async runAllCases(concurrency?: number): Promise<CaseRunResult[]> {
+    const requested = concurrency ?? this.getResolvedConfig().concurrency;
+    // A limit only makes sense when positive; normalise anything else
+    // (0, negative, NaN) to sequential execution instead of passing it on.
+    const maxConcurrency = requested > 0 ? requested : 1;
+    // One task per case; each task reports its own failure as a result.
+    const tasks = this.suite.cases.map(
+      (case_) => async (): Promise<CaseRunResult> => {
+        try {
+          return await this.executeCase(this.resolveCase(case_));
+        } catch (error) {
+          return this.failureResult(case_.caseId, error);
+        }
+      }
+    );
+    const taskResults = await limitConcurrency(tasks, maxConcurrency);
+    // Tasks already return CaseRunResult; the fallback only covers a task
+    // that the scheduler itself recorded as failed.
+    return taskResults.map((taskResult, index) =>
+      taskResult.success && taskResult.result
+        ? taskResult.result
+        : {
+            caseId: this.suite.cases[index].caseId,
+            error: taskResult.error || "Unknown error",
+            logs: [],
+          }
+    );
+  }
+}

package/src/index.ts ADDED Viewed

@@ -0,0 +1,4 @@
+export * from "./types";
+export * from "./LatticeEval";
+export * from "./LatticeEvalSuite";
+export * from "./LatticeEvalProject";

package/src/test.ts ADDED Viewed

@@ -0,0 +1,23 @@
+import { LatticeEvalProject } from "./LatticeEvalProject";
+import { fuliEvalProject } from "./mock/fuli_eval_project";
+/**
+ * Entry point of the evaluation run: builds the project from the mock
+ * definition and executes every suite as a single batch. A failing batch is
+ * logged and terminates the process with exit code 1.
+ */
+async function runTest() {
+  const evalProject = new LatticeEvalProject(fuliEvalProject);
+  try {
+    // Run all suites as a batch. Logging and persistence are handled internally.
+    await evalProject.runAllSuitesBatch();
+  } catch (failure) {
+    console.error("Error running tests:", failure);
+    process.exit(1);
+  }
+}
+// Start the run; if runTest itself rejects, log it and exit with status 1
+runTest().catch((error) => {
+  console.error("Fatal error:", error);
+  process.exit(1);
+});

package/src/types.ts ADDED Viewed

@@ -0,0 +1,160 @@
+import { LLMConfig } from "@axiom-lattice/protocols";
+export interface LatticeAgentStepConfig { // element type of a case's `steps` list
+    agent_id: string;
+    override_input_message?: string; // NOTE(review): presumably replaces the case input message for this step — confirm in LatticeEval.ts
+}
+export type OutputFileContent = { // output variant tagged "file_content"
+    type: "file_content";
+    file_path: string;
+}
+export type OutputMessageContent = { // output variant tagged "message_content"
+    type: "message_content";
+    message: string;
+}
+export type OutputType = OutputFileContent | OutputMessageContent // union discriminated by the `type` field
+export interface LatticeEvalProjectType { // top-level project definition: suites plus shared configuration
+    projectName: string;
+    version?: string;
+    description?: string;
+    suites: LatticeEvalSuiteType[];
+    templates?: LatticeEvalTemplate[]; // templates that template-based cases refer to through `templateId`
+    report_config?: LatticeEvalReportConfig;
+    judge_agent_config: {
+        model: LLMConfig;
+    }
+    lattice_server_config: {
+        base_url: string;
+        api_key: string; // required here, whereas ResolvedConfig declares api_key as optional
+    }
+    concurrency?: number; // Number of cases to run concurrently (default: 1)
+}
+export type LatticeEvalLogLevel = "debug" | "info" | "warn" | "error"; // allowed values of LatticeEvalLogEvent.level
+export interface LatticeEvalLogEvent { // one log entry; CaseRunResult.logs is a list of these
+    ts: string; // ISO timestamp
+    level: LatticeEvalLogLevel;
+    message: string;
+    data?: Record<string, unknown>; // optional structured payload attached to the entry
+}
+export interface LatticeEvalReportConfig { // settings for what a batch run writes to disk (see LatticeEvalProjectType.report_config)
+    /**
+     * Output directory for each batch run.
+     * A subfolder will be created per batch.
+     */
+    output_dir: string;
+    /**
+     * Optional batch id. If not set, a timestamp-based id will be generated per run.
+     */
+    batch_id?: string;
+    /**
+     * When true, writes `report.json` into the batch folder.
+     * Defaults to true.
+     */
+    write_report_json?: boolean;
+    /**
+     * When true, writes per-case log files into the batch folder.
+     * Defaults to true.
+     */
+    write_case_logs?: boolean;
+}
+export interface LatticeEvalBatchReport { // report for one batch: project info, overall totals, then per-suite and per-case outcomes
+    batch_id: string;
+    started_at: string; // NOTE(review): presumably an ISO timestamp like LatticeEvalLogEvent.ts — confirm
+    finished_at: string;
+    project: {
+        projectName: string;
+        version?: string;
+        description?: string;
+    };
+    summary: {
+        total_cases: number;
+        passed_cases: number;
+        failed_cases: number;
+        pass_rate: number; // 0-1
+    };
+    suites: Array<{
+        suiteName: string;
+        total_cases: number;
+        passed_cases: number;
+        failed_cases: number;
+        cases: Array<{
+            caseId: string;
+            pass?: boolean;
+            final_score?: number;
+            error?: string;
+        }>;
+    }>;
+}
+export type LatticeEvalCaseType = LatticeEvalCase | LatticeEvalCaseWithTemplate; // a suite entry: either a complete case or one that refers to a template
+export interface LatticeEvalSuiteType { // a named group of cases
+    suiteName: string;
+    version?: string;
+    cases: LatticeEvalCaseType[]; // may mix complete cases and template-based cases
+}
+export interface LatticeEvalRubric { // one scoring criterion in a case's `eval_rubrics`
+    dimension: string;
+    weight: number; // NOTE(review): scale and normalisation are not visible here — confirm how weights are combined
+    description: string;
+}
+export interface LatticeEvalCase { // a complete, self-contained evaluation case
+    caseId: string;
+    input: {
+        message: string;
+        files?: Record<string, string>; // NOTE(review): presumably file path -> file content — confirm
+    }
+    steps: LatticeAgentStepConfig[];
+    output: OutputType // what content to check in the output
+    eval: {
+        content_assertion: string; // expected output, described in natural language
+        eval_rubrics?: LatticeEvalRubric[] // rubrics to evaluate the output
+    }
+}
+export interface LatticeEvalCaseWithTemplate { // a case whose missing parts come from a template's default_case
+    caseId: string;
+    templateId: string; // id of the template to resolve this case against
+    input: {
+        message?: string; // overrides the template's default message when set
+        files?: Record<string, string>; // merged over the template's default files
+        variables?: Record<string, string>; // NOTE(review): not read by resolveTemplateCase in LatticeEvalSuite.ts — confirm where these are applied
+    }
+    output?: OutputType // what content to check in the output
+    eval: {
+        content_assertion: string; // expected output, described in natural language
+        eval_rubrics?: LatticeEvalRubric[] // rubrics to evaluate the output
+    }
+}
+export interface LatticeEvalTemplate { // reusable case skeleton referenced by LatticeEvalCaseWithTemplate.templateId
+    templateId: string;
+    description?: string;
+    input_schema: { // NOTE(review): no validation against this schema is visible in LatticeEvalSuite.ts — confirm where it is enforced
+        required_files?: string[];
+        variables?: string[];
+    }
+    default_case: Omit<LatticeEvalCase, "caseId" | "eval"> & { eval?: { eval_rubrics?: LatticeEvalRubric[] } }; // defaults without caseId; only rubrics may be defaulted, never content_assertion
+}
+export interface LatticeEvalResult { // evaluation outcome for one case (see CaseRunResult.result)
+    pass: boolean;
+    final_score: number;
+    dimension_results: { // NOTE(review): presumably one entry per rubric dimension — confirm
+        name: string;
+        score: number;
+        reason: string;
+    }[];
+    summary: string;
+    error?: string; // Error message if the case failed to run
+}

package/tsconfig.json ADDED Viewed

@@ -0,0 +1,33 @@
+{
+  "compilerOptions": {
+    "target": "ES2020",
+    "module": "preserve",
+    "lib": [
+      "ES2020"
+    ],
+    "outDir": "./dist",
+    "rootDir": "./src",
+    "strict": true,
+    "moduleResolution": "Bundler",
+    "esModuleInterop": true,
+    "skipLibCheck": true,
+    "forceConsistentCasingInFileNames": true,
+    "resolveJsonModule": true,
+    "declaration": true,
+    "declarationMap": true,
+    "types": [
+      "node",
+      "jest"
+    ],
+    "sourceMap": true,
+    "incremental": true, // make sure incremental compilation is enabled
+    "tsBuildInfoFile": "./.tsbuildinfo" // where the incremental build info file is written
+  },
+  "include": [
+    "src/index.ts"
+  ],
+  "exclude": [
+    "node_modules",
+    "dist"
+  ]
+}