@aigne/cli 1.48.4-beta.5 → 1.49.0-beta.5

This diff represents the content of publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,28 @@
1
1
  # Changelog
2
2
 
3
+ ## [1.49.0-beta.5](https://github.com/AIGNE-io/aigne-framework/compare/cli-v1.48.4-beta.5...cli-v1.49.0-beta.5) (2025-09-25)
4
+
5
+
6
+ ### Features
7
+
8
+ * **cli:** add new eval command for assessing AI agent performance using custom datasets ([#535](https://github.com/AIGNE-io/aigne-framework/issues/535)) ([9da967b](https://github.com/AIGNE-io/aigne-framework/commit/9da967b01ef9eeee4c5e1242934cf08e14815753))
9
+ * **cli:** add new eval command for assessing AI agent performance using custom datasets ([#535](https://github.com/AIGNE-io/aigne-framework/issues/535)) ([9da967b](https://github.com/AIGNE-io/aigne-framework/commit/9da967b01ef9eeee4c5e1242934cf08e14815753))
10
+
11
+
12
+ ### Dependencies
13
+
14
+ * The following workspace dependencies were updated
15
+ * dependencies
16
+ * @aigne/agent-library bumped to 1.21.46-beta.5
17
+ * @aigne/agentic-memory bumped to 1.0.46-beta.5
18
+ * @aigne/aigne-hub bumped to 0.10.0-beta.5
19
+ * @aigne/core bumped to 1.61.0-beta.4
20
+ * @aigne/default-memory bumped to 1.2.9-beta.5
21
+ * @aigne/observability-api bumped to 0.10.5-beta
22
+ * @aigne/openai bumped to 0.16.0-beta.5
23
+ * devDependencies
24
+ * @aigne/test-utils bumped to 0.5.53-beta.4
25
+
3
26
  ## [1.48.4-beta.5](https://github.com/AIGNE-io/aigne-framework/compare/cli-v1.48.4-beta.4...cli-v1.48.4-beta.5) (2025-09-24)
4
27
 
5
28
 
@@ -4,6 +4,7 @@ import { asciiLogo } from "../utils/ascii-logo.js";
4
4
  import { createAppCommands } from "./app.js";
5
5
  import { createCreateCommand } from "./create.js";
6
6
  import { createDeployCommands } from "./deploy.js";
7
+ import { createEvalCommand } from "./eval.js";
7
8
  import { createHubCommand } from "./hub.js";
8
9
  import { createObservabilityCommand } from "./observe.js";
9
10
  import { createRunCommand } from "./run.js";
@@ -15,6 +16,7 @@ export function createAIGNECommand(options) {
15
16
  .usage(`${asciiLogo}\n$0 <command> [options]`)
16
17
  .version(AIGNE_CLI_VERSION)
17
18
  .command(createRunCommand(options))
19
+ .command(createEvalCommand(options))
18
20
  .command(createTestCommand(options))
19
21
  .command(createCreateCommand())
20
22
  .command(createServeMCPCommand(options))
@@ -0,0 +1,11 @@
1
+ import type { CommandModule } from "yargs";
2
+ export declare function createEvalCommand({ aigneFilePath, }?: {
3
+ aigneFilePath?: string;
4
+ }): CommandModule<unknown, {
5
+ path?: string;
6
+ agent?: string;
7
+ dataset?: string;
8
+ evaluator?: string;
9
+ concurrency?: number;
10
+ output?: string;
11
+ }>;
@@ -0,0 +1,109 @@
1
+ import { isAbsolute, resolve } from "node:path";
2
+ import { exists } from "@aigne/agent-library/utils/fs.js";
3
+ import { z } from "zod";
4
+ import { runEvaluationPipeline } from "../utils/evaluation/core.js";
5
+ import { FileDataset } from "../utils/evaluation/dataset.js";
6
+ import { LLMEvaluator } from "../utils/evaluation/evaluator.js";
7
+ import { ConsoleReporter, CsvReporter } from "../utils/evaluation/reporter.js";
8
+ import { DefaultRunnerWithConcurrency } from "../utils/evaluation/runner.js";
9
+ import { loadAIGNE } from "../utils/load-aigne.js";
10
+ const schema = z.object({
11
+ path: z.string().optional(),
12
+ agent: z.string(),
13
+ dataset: z.string(),
14
+ evaluator: z.string().optional(),
15
+ concurrency: z.number().optional(),
16
+ output: z.string().optional(),
17
+ });
18
+ const getResolvePath = (path) => {
19
+ return isAbsolute(path) ? path : resolve(process.cwd(), path);
20
+ };
21
+ export function createEvalCommand({ aigneFilePath, } = {}) {
22
+ return {
23
+ command: "eval [path] [agent]",
24
+ describe: "Evaluate AIGNE for the specified path",
25
+ builder: async (yargs) => {
26
+ return yargs
27
+ .positional("path", {
28
+ type: "string",
29
+ describe: "Path to the agents directory or URL to an aigne project",
30
+ default: ".",
31
+ })
32
+ .positional("agent", {
33
+ type: "string",
34
+ describe: "Name of the agent to evaluate",
35
+ })
36
+ .positional("dataset", {
37
+ type: "string",
38
+ describe: "Path to the dataset file",
39
+ })
40
+ .positional("evaluator", {
41
+ type: "string",
42
+ describe: "Name of the evaluator to use",
43
+ })
44
+ .positional("output", {
45
+ alias: "o",
46
+ type: "string",
47
+ describe: "Path to the output file",
48
+ })
49
+ .positional("concurrency", {
50
+ type: "number",
51
+ describe: "Concurrency level",
52
+ default: 1,
53
+ })
54
+ .help(false)
55
+ .version(false)
56
+ .strict(false);
57
+ },
58
+ handler: async (options) => {
59
+ const parsedOptions = await schema.safeParseAsync(options);
60
+ if (!parsedOptions.success) {
61
+ throw new Error(`Invalid options: ${JSON.stringify(parsedOptions.error.format())}`);
62
+ }
63
+ const { agent: entryAgent, dataset: datasetPath, evaluator: evaluatorName, concurrency, } = parsedOptions.data;
64
+ const path = parsedOptions.data?.path;
65
+ const aigne = await loadAIGNE({ path: aigneFilePath || path || "." });
66
+ const resolvedDatasetPath = getResolvePath(datasetPath);
67
+ if (!(await exists(resolvedDatasetPath))) {
68
+ throw new Error("Dataset file does not exist");
69
+ }
70
+ const { chat } = aigne.cli;
71
+ const agent = chat && chat.name === entryAgent
72
+ ? chat
73
+ : aigne.cli.agents[entryAgent] ||
74
+ aigne.agents[entryAgent] ||
75
+ aigne.skills[entryAgent] ||
76
+ aigne.mcpServer.agents[entryAgent];
77
+ if (!agent)
78
+ throw new Error("Entry agent does not exist");
79
+ agent.model = agent.model ?? aigne.model;
80
+ let evaluatorAgent;
81
+ if (evaluatorName) {
82
+ evaluatorAgent =
83
+ aigne.cli.agents[evaluatorName] ||
84
+ aigne.agents[evaluatorName] ||
85
+ aigne.skills[evaluatorName] ||
86
+ aigne.mcpServer.agents[evaluatorName];
87
+ }
88
+ if (evaluatorAgent) {
89
+ evaluatorAgent.model = evaluatorAgent.model ?? aigne.model;
90
+ }
91
+ const dataset = new FileDataset(resolvedDatasetPath);
92
+ const runner = new DefaultRunnerWithConcurrency(agent, aigne);
93
+ const evaluator = new LLMEvaluator(aigne, evaluatorAgent);
94
+ const reporters = [new ConsoleReporter()];
95
+ if (options.output) {
96
+ const resolvedReporterPath = getResolvePath(options.output);
97
+ const reporter = new CsvReporter(resolvedReporterPath);
98
+ reporters.push(reporter);
99
+ }
100
+ await runEvaluationPipeline({
101
+ dataset,
102
+ runner,
103
+ evaluators: [evaluator],
104
+ reporters: reporters,
105
+ options: { concurrency },
106
+ });
107
+ },
108
+ };
109
+ }
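Taken together, the command above takes the project path and agent name positionally, while dataset, evaluator, concurrency, and output are supplied as flags (they are declared with .positional() but do not appear in the "eval [path] [agent]" command string, and strict mode is disabled). Assuming the published binary is named aigne, and with placeholder agent and evaluator names, an invocation would look something like:

  aigne eval ./my-project chat --dataset ./eval/cases.json --evaluator judge --concurrency 4 --output ./eval/results.csv

The dataset path must point to an existing JSON file; --output is optional and, when given, a CSV report is written in addition to the console summary.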
@@ -0,0 +1,8 @@
1
+ import type { Dataset, Evaluator, Reporter, Runner, RunOptions } from "./type.js";
2
+ export declare function runEvaluationPipeline(params: {
3
+ dataset: Dataset;
4
+ runner: Runner;
5
+ evaluators: Evaluator[];
6
+ reporters?: Reporter[];
7
+ options?: RunOptions;
8
+ }): Promise<void>;
@@ -0,0 +1,80 @@
1
+ import { Listr } from "@aigne/listr2";
2
+ import { ConsoleReporter } from "./reporter.js";
3
+ function aggregateSummary(results, duration) {
4
+ const total = results.length;
5
+ const scores = results.flatMap((r) => r.evaluations.map((e) => e.score));
6
+ const successRate = Number((scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0).toFixed(2));
7
+ const latencies = results.map((r) => r.latency || 0);
8
+ const totalTokens = results.reduce((a, r) => a + (r.usage?.inputTokens || 0) + (r.usage?.outputTokens || 0), 0);
9
+ const errors = results.filter((r) => r.error).length;
10
+ return {
11
+ total,
12
+ successRate,
13
+ duration: Number(duration.toFixed(3)),
14
+ avgLatency: latencies.reduce((a, b) => a + b, 0) / (latencies.length || 1),
15
+ maxLatency: Math.max(...latencies, 0),
16
+ minLatency: Math.min(...latencies, 0),
17
+ totalTokens,
18
+ errorCount: errors,
19
+ scoreDistribution: {
20
+ min: Math.min(...scores, 0),
21
+ max: Math.max(...scores, 0),
22
+ mean: successRate,
23
+ median: scores.length ? scores.sort((a, b) => a - b)[Math.floor(scores.length / 2)] : 0,
24
+ variance: scores.length > 1
25
+ ? scores.reduce((a, b) => a + (b - successRate) ** 2, 0) / scores.length
26
+ : 0,
27
+ },
28
+ };
29
+ }
30
+ export async function runEvaluationPipeline(params) {
31
+ const now = Date.now();
32
+ const { dataset, runner, evaluators, reporters = [new ConsoleReporter()], options } = params;
33
+ const results = [];
34
+ const task1 = new Listr([
35
+ {
36
+ title: "Load dataset",
37
+ task: async (ctx, _task) => {
38
+ ctx.items = await dataset.loadWithOptions();
39
+ },
40
+ },
41
+ ]);
42
+ const { items } = await task1.run();
43
+ const task2 = new Listr(items.map((item) => {
44
+ const input = JSON.stringify(item.input);
45
+ return {
46
+ title: `Run evaluations with input: ${input.length > 100 ? `${input.slice(0, 100)}...` : input}`,
47
+ task: async (ctx, task) => {
48
+ task.output = `Start running agent with input: ${JSON.stringify(item.input, null, 2)}`;
49
+ const runnerResults = await runner.run([item], options);
50
+ for await (const result of runnerResults) {
51
+ task.output = `Start running evaluation with: ${JSON.stringify({
52
+ input: result.input,
53
+ output: result.output,
54
+ expected: result.expected,
55
+ }, null, 2)}`;
56
+ const evaluations = [];
57
+ for (const evaluator of evaluators) {
58
+ const evals = await evaluator.evaluate(result);
59
+ evaluations.push(...evals);
60
+ }
61
+ results.push({ ...result, evaluations });
62
+ task.output = `Finish running evaluation`;
63
+ }
64
+ ctx.results = results;
65
+ },
66
+ };
67
+ }), {
68
+ concurrent: options?.concurrency ? Math.min(items.length, options?.concurrency) : false,
69
+ exitOnError: true,
70
+ rendererOptions: {
71
+ collapseSubtasks: false,
72
+ },
73
+ });
74
+ await task2.run();
75
+ const summary = aggregateSummary(results, (Date.now() - now) / 1000);
76
+ const report = { dataset: dataset.name, results, summary };
77
+ for (const reporter of reporters) {
78
+ await reporter.report(report);
79
+ }
80
+ }
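For orientation, here is a minimal sketch of driving runEvaluationPipeline directly rather than through the eval command. It assumes the AIGNE instance carries a chat model and that these evaluation modules are importable from the CLI's build output; the import specifiers are illustrative, not documented package exports:

  import { AIAgent, AIGNE } from "@aigne/core";
  import { runEvaluationPipeline } from "./utils/evaluation/core.js";   // illustrative paths
  import { JsonDataset } from "./utils/evaluation/dataset.js";
  import { LLMEvaluator } from "./utils/evaluation/evaluator.js";
  import { ConsoleReporter } from "./utils/evaluation/reporter.js";
  import { DefaultRunnerWithConcurrency } from "./utils/evaluation/runner.js";

  const aigne = new AIGNE();                                            // assumed to be configured with a chat model
  const agent = AIAgent.from({ name: "chat", instructions: "Answer concisely." });

  await runEvaluationPipeline({
    dataset: new JsonDataset([
      { id: 1, input: { message: "What is 2 + 2?" }, expected: { message: "4" } },
    ]),
    runner: new DefaultRunnerWithConcurrency(agent, aigne),
    evaluators: [new LLMEvaluator(aigne)],
    reporters: [new ConsoleReporter()],
    options: { concurrency: 2 },
  });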
@@ -0,0 +1,15 @@
1
+ import type { Dataset, DatasetItem } from "./type.js";
2
+ export declare class FileDataset implements Dataset {
3
+ name: string;
4
+ private filePath;
5
+ constructor(filePath: string);
6
+ load(): Promise<DatasetItem[]>;
7
+ loadWithOptions(): Promise<DatasetItem[]>;
8
+ }
9
+ export declare class JsonDataset implements Dataset {
10
+ name: string;
11
+ private data;
12
+ constructor(data: DatasetItem[]);
13
+ load(): Promise<DatasetItem[]>;
14
+ loadWithOptions(): Promise<DatasetItem[]>;
15
+ }
@@ -0,0 +1,61 @@
1
+ import fs from "node:fs/promises";
2
+ import { z } from "zod";
3
+ const recordSchema = z.record(z.any());
4
+ const datasetItemSchema = z.object({
5
+ id: z.union([z.string(), z.number()]),
6
+ input: recordSchema,
7
+ output: recordSchema.optional(),
8
+ expected: recordSchema.optional(),
9
+ metadata: recordSchema.optional(),
10
+ tags: z.array(z.string()).optional(),
11
+ selected: z.boolean().optional(),
12
+ });
13
+ const datasetSchema = z.array(datasetItemSchema);
14
+ export class FileDataset {
15
+ name = "file-dataset";
16
+ filePath;
17
+ constructor(filePath) {
18
+ this.filePath = filePath;
19
+ }
20
+ async load() {
21
+ let list;
22
+ try {
23
+ list = await fs.readFile(this.filePath, "utf-8");
24
+ }
25
+ catch (err) {
26
+ throw new Error(`Failed to read dataset file: ${err.message}`);
27
+ }
28
+ let parsed;
29
+ try {
30
+ parsed = JSON.parse(list);
31
+ }
32
+ catch (err) {
33
+ throw new Error(`Invalid JSON in dataset file: ${err.message}`);
34
+ }
35
+ const result = await datasetSchema.safeParseAsync(parsed);
36
+ if (!result.success) {
37
+ throw new Error(`Invalid dataset file: ${JSON.stringify(result.error.format())}`);
38
+ }
39
+ return result.data;
40
+ }
41
+ async loadWithOptions() {
42
+ return this.load();
43
+ }
44
+ }
45
+ export class JsonDataset {
46
+ name = "json-dataset";
47
+ data;
48
+ constructor(data) {
49
+ this.data = data;
50
+ }
51
+ async load() {
52
+ const result = await datasetSchema.safeParseAsync(this.data);
53
+ if (!result.success) {
54
+ throw new Error(`Invalid dataset file: ${JSON.stringify(result.error.format())}`);
55
+ }
56
+ return result.data;
57
+ }
58
+ async loadWithOptions() {
59
+ return this.load();
60
+ }
61
+ }
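A dataset file accepted by FileDataset is simply a JSON array matching datasetSchema above; the values below are illustrative:

  [
    {
      "id": 1,
      "input": { "message": "Translate 'hello' into French" },
      "expected": { "message": "bonjour" },
      "tags": ["translation"]
    },
    {
      "id": "capital-jp",
      "input": { "message": "What is the capital of Japan?" },
      "expected": { "message": "Tokyo" },
      "selected": true
    }
  ]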
@@ -0,0 +1,9 @@
1
+ import { type Agent, AIGNE } from "@aigne/core";
2
+ import type { DatasetItem, Evaluation, Evaluator } from "./type.js";
3
+ export declare class LLMEvaluator implements Evaluator {
4
+ private readonly aigne;
5
+ private readonly agent;
6
+ name: string;
7
+ constructor(aigne?: AIGNE, agent?: Agent);
8
+ evaluate(dataset: DatasetItem): Promise<Evaluation[]>;
9
+ }
@@ -0,0 +1,107 @@
1
+ import { AIAgent, AIGNE } from "@aigne/core";
2
+ import { z } from "zod";
3
+ const EVALUATOR_PROMPT = `
4
+ # Instructions
5
+ You are an expert evaluator. Your task is to evaluate the quality of AI-generated responses.
6
+ You will be given:
7
+ 1. User Input (Prompt)
8
+ 2. AI-generated Output
9
+ 3. Expected Output
10
+
11
+ ## Evaluation Methods
12
+ Follow these three correlation checks before assigning a score:
13
+ 1. **AI Output vs User Input**: Check if the AI response is relevant to the user input.
14
+ 2. **Expected Output vs User Input**: Check if the expected output is relevant to the user input.
15
+ 3. **AI Output vs Expected Output**: Check the similarity and alignment between the AI output and the expected output.
16
+
17
+ Then assign a rating and a score based on the overall quality.
18
+
19
+ ## Criteria
20
+ - **Instruction following**: Does the AI response follow the prompt’s requirements?
21
+ - **Groundedness**: Is the AI response consistent with the expected output and free from irrelevant information?
22
+ - **Completeness**: Does the AI response fully address the task?
23
+ - **Accuracy/Correctness**: Is the AI response factually correct and logically consistent?
24
+ - **Fluency**: Is the AI response clear, structured, and easy to read?
25
+
26
+ ## Rating Rubric (1–5)
27
+ - **5 - Very Good**: Highly relevant, closely aligned with the expected output, accurate, complete, and fluent.
28
+ - **4 - Good**: Relevant, mostly aligned with the expected output, generally accurate and complete, only minor issues.
29
+ - **3 - Ok**: Somewhat relevant, partially aligned, or missing important details.
30
+ - **2 - Bad**: Weak relevance, low similarity with expected output, contains significant errors or omissions.
31
+ - **1 - Very Bad**: Irrelevant, fails to align with expected output, or completely incorrect.
32
+
33
+ ## Evaluation Steps
34
+ 1. Compare the **semantic content** of AI Output vs Expected Output.
35
+ - Ignore JSON keys, object structure, formatting, whitespace, capitalization, and minor punctuation differences.
36
+ - If meaning is the same but phrasing differs slightly, assign a higher score (4–5).
37
+ - If AI output deviates significantly, assign a lower score (1–2).
38
+ - If AI output is empty, assign a lower score (1–2).
39
+ 2. Assess against criteria: instruction following, groundedness, completeness, correctness, fluency.
40
+ 3. Assign a 1–5 integer score.
41
+ 4. Provide reasoning, and explicitly justify why this result is **not** a 1/2/3 case (why it avoids being a negative example).
42
+
43
+ # Response Output Format
44
+ Your output must strictly follow this three-line format:
45
+ - First line: rating (Very Good, Good, Ok, Bad, Very Bad)
46
+ - Second line: reasoning (must include justification why it is not a 1, 2, or 3 if scored higher)
47
+ - Third line: SCORE: [1-5]
48
+
49
+ Example:
50
+ Good
51
+ The response follows most instructions and is largely consistent with the expected output, but it omits one detail. This prevents it from being 5. However, it is more accurate and complete than an "Ok" response, so it deserves 4.
52
+ SCORE: 4
53
+
54
+ # User Inputs and AI-generated Response
55
+ ### Input
56
+ {{input}}
57
+
58
+ ### AI-generated Output
59
+ {{output}}
60
+
61
+ ### Expected Output
62
+ {{expectedOutput}}
63
+ `;
64
+ const defaultAgent = AIAgent.from({
65
+ name: "LLMEvaluator",
66
+ instructions: EVALUATOR_PROMPT,
67
+ inputSchema: z.object({
68
+ input: z.string().describe("The input content to analyze"),
69
+ output: z.string().describe("The output content to analyze"),
70
+ expectedOutput: z.string().describe("The expected output content to analyze"),
71
+ }),
72
+ outputSchema: z.object({
73
+ rating: z
74
+ .enum(["Very Good", "Good", "Ok", "Bad", "Very Bad"])
75
+ .describe("The rating of the output"),
76
+ reasoning: z.string().describe("The reasoning of the rating, including justification"),
77
+ score: z.number().int().min(1).max(5).describe("The score of the output, 1–5, 5 is the best"),
78
+ }),
79
+ });
80
+ const defaultAigne = new AIGNE();
81
+ export class LLMEvaluator {
82
+ aigne;
83
+ agent;
84
+ name = "llm-as-judge";
85
+ constructor(aigne = defaultAigne, agent = defaultAgent) {
86
+ this.aigne = aigne;
87
+ this.agent = agent;
88
+ }
89
+ async evaluate(dataset) {
90
+ const result = await this.aigne.invoke(this.agent, {
91
+ input: typeof dataset.input === "string"
92
+ ? dataset.input
93
+ : JSON.stringify(dataset.input, null, 2),
94
+ output: dataset.output ? JSON.stringify(dataset.output, null, 2) : "",
95
+ expectedOutput: JSON.stringify(dataset.expected, null, 2),
96
+ }, { returnMetadata: true });
97
+ return [
98
+ {
99
+ name: this.name,
100
+ rating: result.rating,
101
+ score: result.score,
102
+ reason: result.reasoning,
103
+ usage: result?.$meta?.usage ?? {},
104
+ },
105
+ ];
106
+ }
107
+ }
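As a rough sketch, the evaluator can also be invoked on a single run result outside the pipeline; this assumes the default (or supplied) AIGNE instance has a chat model available, and the import path is illustrative:

  import { LLMEvaluator } from "./utils/evaluation/evaluator.js"; // illustrative path

  const evaluator = new LLMEvaluator();          // falls back to a default AIGNE instance and the built-in judge agent
  const [evaluation] = await evaluator.evaluate({
    id: 1,
    input: { message: "What is 2 + 2?" },
    output: { message: "4" },
    expected: { message: "4" },
  });
  // evaluation -> { name: "llm-as-judge", rating, score, reason, usage }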
@@ -0,0 +1,28 @@
1
+ import type { Report, Reporter } from "./type.js";
2
+ export declare class BaseReporter implements Reporter {
3
+ name: string;
4
+ report(_report: Report): Promise<void>;
5
+ protected formatReport(report: Report): {
6
+ header: string;
7
+ key: string;
8
+ width: number;
9
+ value: string | number;
10
+ }[][];
11
+ protected formatSummary(summary: Report["summary"]): {
12
+ header: string;
13
+ key: string;
14
+ width: number;
15
+ value: string | number;
16
+ }[];
17
+ }
18
+ export declare class ConsoleReporter extends BaseReporter {
19
+ name: string;
20
+ report(report: Report): Promise<void>;
21
+ }
22
+ export declare class CsvReporter extends BaseReporter {
23
+ private filePath;
24
+ name: string;
25
+ constructor(filePath: string);
26
+ private writeCsv;
27
+ report(report: Report): Promise<void>;
28
+ }
@@ -0,0 +1,202 @@
1
+ import fs from "node:fs";
2
+ import path from "node:path";
3
+ import { format } from "@fast-csv/format";
4
+ import chalk from "chalk";
5
+ import Table from "cli-table3";
6
+ const borderColor = chalk.green;
7
+ const chars = {
8
+ top: borderColor("─"),
9
+ "top-mid": borderColor("┬"),
10
+ "top-left": borderColor("┌"),
11
+ "top-right": borderColor("┐"),
12
+ bottom: borderColor("─"),
13
+ "bottom-mid": borderColor("┴"),
14
+ "bottom-left": borderColor("└"),
15
+ "bottom-right": borderColor("┘"),
16
+ left: borderColor("│"),
17
+ "left-mid": borderColor("├"),
18
+ mid: borderColor("─"),
19
+ "mid-mid": borderColor("┼"),
20
+ right: borderColor("│"),
21
+ "right-mid": borderColor("┤"),
22
+ middle: borderColor("│"),
23
+ };
24
+ export class BaseReporter {
25
+ name = "base";
26
+ async report(_report) {
27
+ throw new Error("Not implemented");
28
+ }
29
+ formatReport(report) {
30
+ return report.results.map((r) => {
31
+ return [
32
+ { header: "ID", key: "ID", width: 10, value: r.id },
33
+ { header: "Input", key: "Input", width: 40, value: JSON.stringify(r.input) },
34
+ {
35
+ header: "Output",
36
+ key: "Output",
37
+ width: 40,
38
+ value: r.output ? JSON.stringify(r.output) : "-",
39
+ },
40
+ {
41
+ header: "Expected",
42
+ key: "Expected",
43
+ width: 40,
44
+ value: r.expected ? JSON.stringify(r.expected) : "-",
45
+ },
46
+ { header: "Error", key: "Error", width: 20, value: r.error ?? "-" },
47
+ {
48
+ header: "Evaluations",
49
+ key: "Evaluations",
50
+ width: 30,
51
+ value: r.evaluations.map((e) => `${e.name}:${e.score}`).join(", "),
52
+ },
53
+ {
54
+ header: "Rating",
55
+ key: "Rating",
56
+ width: 20,
57
+ value: r.evaluations.map((e) => `${e.rating}`).join(", "),
58
+ },
59
+ {
60
+ header: "Reason",
61
+ key: "Reason",
62
+ width: 50,
63
+ value: r.evaluations
64
+ .map((e) => e.reason ?? "")
65
+ .filter(Boolean)
66
+ .join(" | "),
67
+ },
68
+ {
69
+ header: "Latency",
70
+ key: "Latency",
71
+ width: 15,
72
+ value: r.latency ? `${r.latency.toFixed(2)}s` : "-",
73
+ },
74
+ {
75
+ header: "Tokens",
76
+ key: "Tokens",
77
+ width: 40,
78
+ value: r.usage
79
+ ? `${(r.usage.inputTokens || 0) + (r.usage.outputTokens || 0)} (input:${r.usage.inputTokens || 0}, output:${r.usage.outputTokens || 0})`
80
+ : "-",
81
+ },
82
+ ];
83
+ });
84
+ }
85
+ formatSummary(summary) {
86
+ return [
87
+ {
88
+ header: "Total",
89
+ key: "Total",
90
+ width: 10,
91
+ value: summary.total,
92
+ },
93
+ {
94
+ header: "Success Rate",
95
+ key: "SuccessRate",
96
+ width: 15,
97
+ value: summary.successRate,
98
+ },
99
+ {
100
+ header: "Total Duration",
101
+ key: "Duration",
102
+ width: 15,
103
+ value: summary.duration ? `${summary.duration.toFixed(3)}s` : "-",
104
+ },
105
+ {
106
+ header: "Avg Latency",
107
+ key: "AvgLatency",
108
+ width: 15,
109
+ value: summary.avgLatency ? `${summary.avgLatency.toFixed(3)}s` : "-",
110
+ },
111
+ {
112
+ header: "Total Tokens",
113
+ key: "TotalTokens",
114
+ width: 15,
115
+ value: summary.totalTokens ?? "-",
116
+ },
117
+ {
118
+ header: "Errors",
119
+ key: "Errors",
120
+ width: 8,
121
+ value: summary.errorCount ?? 0,
122
+ },
123
+ ];
124
+ }
125
+ }
126
+ export class ConsoleReporter extends BaseReporter {
127
+ name = "console";
128
+ async report(report) {
129
+ const summary = report.summary;
130
+ console.log("\n=== 📊 Evaluation Summary ===");
131
+ const summaryList = this.formatSummary(summary);
132
+ const summaryTable = new Table({
133
+ head: summaryList.map((h) => h.header),
134
+ colWidths: summaryList.map((h) => h.width),
135
+ chars,
136
+ });
137
+ summaryTable.push(summaryList.map((h) => h.value));
138
+ console.log(summaryTable.toString());
139
+ const list = this.formatReport(report);
140
+ if (!list.length)
141
+ return;
142
+ console.log("\n=== 📋 Detailed Results ===");
143
+ const head = list[0]?.map((h) => h.header) ?? [];
144
+ const colWidths = list[0]?.map((h) => h.width) ?? [];
145
+ const detailTable = new Table({
146
+ head,
147
+ colWidths,
148
+ wordWrap: true,
149
+ chars,
150
+ });
151
+ for (const r of list) {
152
+ detailTable.push(r.map((h) => h.value));
153
+ }
154
+ console.log(detailTable.toString());
155
+ const failed = report.results.filter((r) => r.error);
156
+ if (failed.length) {
157
+ console.log(chalk.red("\n=== ❌ Failed Cases ==="));
158
+ for (const f of failed) {
159
+ console.log(`#${f.id} Input: ${JSON.stringify(f.input)}\n Expected: ${f.expected ? JSON.stringify(f.expected) : "-"}\n Output: ${f.output ? JSON.stringify(f.output) : "-"}\n Error: ${f.error ?? "-"}\n`);
160
+ }
161
+ }
162
+ }
163
+ }
164
+ export class CsvReporter extends BaseReporter {
165
+ filePath;
166
+ name = "csv";
167
+ constructor(filePath) {
168
+ super();
169
+ this.filePath = filePath;
170
+ }
171
+ async writeCsv(filePath, data, headers) {
172
+ fs.mkdirSync(path.dirname(filePath), { recursive: true });
173
+ const stream = format({ headers });
174
+ const writeStream = fs.createWriteStream(filePath);
175
+ stream.pipe(writeStream);
176
+ for (const row of data) {
177
+ stream.write(row);
178
+ }
179
+ stream.end();
180
+ await new Promise((resolve, reject) => {
181
+ writeStream.on("finish", resolve);
182
+ writeStream.on("error", reject);
183
+ });
184
+ }
185
+ async report(report) {
186
+ const list = this.formatReport(report);
187
+ if (list.length > 0) {
188
+ const resultsHeaders = list[0]?.map((h) => h.header) ?? [];
189
+ const resultsRows = list.map((row) => {
190
+ const record = {};
191
+ for (const item of row) {
192
+ record[item.header] = item.value;
193
+ }
194
+ return record;
195
+ });
196
+ const ext = path.extname(this.filePath).toLowerCase();
197
+ const outputFile = ext ? this.filePath : `${this.filePath}.csv`;
198
+ await this.writeCsv(outputFile, resultsRows, resultsHeaders);
199
+ console.log(`✅ Results CSV saved to ${outputFile}`);
200
+ }
201
+ }
202
+ }
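In the eval command these reporters are composed as shown earlier: a ConsoleReporter is always present and a CsvReporter is appended when --output is given. A small sketch of the same composition, with illustrative import paths:

  import type { Report } from "./utils/evaluation/type.js";        // illustrative paths
  import { ConsoleReporter, CsvReporter } from "./utils/evaluation/reporter.js";

  async function publish(report: Report) {
    const reporters = [
      new ConsoleReporter(),
      new CsvReporter("./eval/results"), // ".csv" is appended because this path has no extension
    ];
    for (const reporter of reporters) {
      await reporter.report(report);     // the same { dataset, results, summary } object goes to every reporter
    }
  }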
@@ -0,0 +1,16 @@
1
+ import { type Agent, AIGNE } from "@aigne/core";
2
+ import type { DatasetItem, Runner, RunOptions, RunResult } from "./type.js";
3
+ export declare class DefaultRunner implements Runner {
4
+ private agent;
5
+ private aigne;
6
+ name: string;
7
+ constructor(agent: Agent, aigne?: AIGNE);
8
+ run(dataset: DatasetItem[], options?: RunOptions): AsyncGenerator<RunResult>;
9
+ }
10
+ export declare class DefaultRunnerWithConcurrency implements Runner {
11
+ private agent;
12
+ private aigne;
13
+ name: string;
14
+ constructor(agent: Agent, aigne?: AIGNE);
15
+ run(dataset: DatasetItem[], options?: RunOptions): AsyncGenerator<RunResult>;
16
+ }
@@ -0,0 +1,129 @@
1
+ import { AIGNE } from "@aigne/core";
2
+ export class DefaultRunner {
3
+ agent;
4
+ aigne;
5
+ name = "default-runner";
6
+ constructor(agent, aigne = new AIGNE()) {
7
+ this.agent = agent;
8
+ this.aigne = aigne;
9
+ }
10
+ async *run(dataset, options) {
11
+ const timeoutMs = options?.timeoutMs ?? 0;
12
+ const runTask = async (item) => {
13
+ const start = Date.now();
14
+ options?.hooks?.onBeforeRun?.(item);
15
+ try {
16
+ const execPromise = this.aigne.invoke(this.agent, item.input, { returnMetadata: true });
17
+ const result = timeoutMs > 0 ? await withTimeout(execPromise, timeoutMs, item.id) : await execPromise;
18
+ const { $meta, ...output } = result;
19
+ options?.hooks?.onAfterRun?.(result);
20
+ return {
21
+ ...item,
22
+ output,
23
+ latency: (Date.now() - start) / 1000,
24
+ usage: $meta?.usage || {},
25
+ };
26
+ }
27
+ catch (err) {
28
+ options?.hooks?.onError?.(err);
29
+ return {
30
+ ...item,
31
+ error: err.message,
32
+ };
33
+ }
34
+ };
35
+ for (const item of dataset) {
36
+ yield await runTask(item);
37
+ }
38
+ }
39
+ }
40
+ function withTimeout(promise, ms, id) {
41
+ return new Promise((resolve, reject) => {
42
+ const timer = setTimeout(() => {
43
+ reject(new Error(`Task ${id} timed out after ${ms}ms`));
44
+ }, ms);
45
+ promise
46
+ .then((res) => {
47
+ clearTimeout(timer);
48
+ resolve(res);
49
+ })
50
+ .catch((err) => {
51
+ clearTimeout(timer);
52
+ reject(err);
53
+ });
54
+ });
55
+ }
56
+ export class DefaultRunnerWithConcurrency {
57
+ agent;
58
+ aigne;
59
+ name = "default-runner-with-concurrency";
60
+ constructor(agent, aigne = new AIGNE()) {
61
+ this.agent = agent;
62
+ this.aigne = aigne;
63
+ }
64
+ async *run(dataset, options) {
65
+ const concurrency = options?.concurrency ?? 1;
66
+ const timeoutMs = options?.timeoutMs ?? 0;
67
+ let index = 0;
68
+ const yieldQueue = [];
69
+ let waitingResolve = null;
70
+ let activeWorkers = 0;
71
+ const runTask = async (item) => {
72
+ const start = Date.now();
73
+ options?.hooks?.onBeforeRun?.(item);
74
+ try {
75
+ const execPromise = this.aigne.invoke(this.agent, item.input, { returnMetadata: true });
76
+ const result = timeoutMs > 0 ? await withTimeout(execPromise, timeoutMs, item.id) : await execPromise;
77
+ const { $meta, ...output } = result;
78
+ options?.hooks?.onAfterRun?.(result);
79
+ return {
80
+ ...item,
81
+ output,
82
+ latency: (Date.now() - start) / 1000,
83
+ usage: $meta?.usage || {},
84
+ };
85
+ }
86
+ catch (err) {
87
+ options?.hooks?.onError?.(err);
88
+ return {
89
+ ...item,
90
+ error: err.message,
91
+ };
92
+ }
93
+ };
94
+ const worker = async () => {
95
+ activeWorkers++;
96
+ try {
97
+ while (true) {
98
+ const currentIndex = index++;
99
+ if (currentIndex >= dataset.length)
100
+ break;
101
+ const item = dataset[currentIndex];
102
+ if (!item)
103
+ continue;
104
+ const res = await runTask(item);
105
+ yieldQueue.push(res);
106
+ waitingResolve?.();
107
+ }
108
+ }
109
+ finally {
110
+ activeWorkers--;
111
+ waitingResolve?.();
112
+ }
113
+ };
114
+ Array.from({ length: Math.min(concurrency, dataset.length) }, () => worker());
115
+ while (yieldQueue.length > 0 || activeWorkers > 0) {
116
+ if (yieldQueue.length > 0) {
117
+ const result = yieldQueue.shift();
118
+ if (result)
119
+ yield result;
120
+ }
121
+ else {
122
+ await new Promise((resolve) => {
123
+ waitingResolve = resolve;
124
+ });
125
+ waitingResolve = null;
126
+ }
127
+ }
128
+ }
129
+ }
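A hedged sketch of using the sequential runner on its own, with a per-item timeout and an error hook; the import path is again illustrative and the AIGNE instance is assumed to be configured with a model:

  import { AIAgent, AIGNE } from "@aigne/core";
  import { DefaultRunner } from "./utils/evaluation/runner.js"; // illustrative path

  const aigne = new AIGNE();                                    // assumed to carry a default chat model
  const agent = AIAgent.from({ name: "chat", instructions: "Answer concisely." });
  const runner = new DefaultRunner(agent, aigne);

  const items = [{ id: 1, input: { message: "ping" } }];
  for await (const result of runner.run(items, {
    timeoutMs: 30_000,                                          // items fail with a timeout error instead of hanging
    hooks: { onError: (err) => console.error("run failed:", err.message) },
  })) {
    console.log(result.id, result.latency ?? "-", result.error ?? "ok");
  }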
@@ -0,0 +1,68 @@
1
+ export interface DatasetItem {
2
+ id: string | number;
3
+ input: Record<string, any>;
4
+ output?: Record<string, any>;
5
+ expected?: Record<string, any>;
6
+ metadata?: Record<string, any>;
7
+ tags?: string[];
8
+ selected?: boolean;
9
+ }
10
+ export interface Dataset {
11
+ name: string;
12
+ load(): Promise<DatasetItem[]>;
13
+ loadWithOptions(options?: {
14
+ filter?: (item: DatasetItem) => boolean;
15
+ limit?: number;
16
+ }): Promise<DatasetItem[]>;
17
+ }
18
+ export interface RunOptions {
19
+ timeoutMs?: number;
20
+ concurrency?: number;
21
+ hooks?: {
22
+ onBeforeRun?: (item: DatasetItem) => void;
23
+ onAfterRun?: (result: RunResult) => void;
24
+ onError?: (err: Error) => void;
25
+ };
26
+ }
27
+ export interface RunResult extends DatasetItem {
28
+ error?: string;
29
+ latency?: number;
30
+ usage?: {
31
+ inputTokens: number;
32
+ outputTokens: number;
33
+ };
34
+ }
35
+ export interface Runner {
36
+ name: string;
37
+ run(dataset: DatasetItem[], options?: RunOptions): AsyncGenerator<RunResult>;
38
+ }
39
+ export interface Evaluation {
40
+ name: string;
41
+ score: number;
42
+ reason?: string;
43
+ [key: string]: any;
44
+ }
45
+ export interface Evaluator {
46
+ name: string;
47
+ evaluate(result: RunResult): Promise<Evaluation[]>;
48
+ }
49
+ export interface EvaluationSummary {
50
+ total: number;
51
+ successRate: number;
52
+ avgLatency?: number;
53
+ totalTokens?: number;
54
+ errorCount?: number;
55
+ [key: string]: any;
56
+ }
57
+ export interface EvaluationResult extends RunResult {
58
+ evaluations: Evaluation[];
59
+ }
60
+ export interface Report {
61
+ dataset: string;
62
+ results: EvaluationResult[];
63
+ summary: EvaluationSummary;
64
+ }
65
+ export interface Reporter {
66
+ name: string;
67
+ report(report: Report): Promise<void>;
68
+ }
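These interfaces are what the pipeline programs against, so a custom evaluator only needs a name and an evaluate method. A minimal, hypothetical exact-match evaluator as a sketch (import path illustrative):

  import type { Evaluation, Evaluator, RunResult } from "./utils/evaluation/type.js"; // illustrative path

  class ExactMatchEvaluator implements Evaluator {
    name = "exact-match";

    async evaluate(result: RunResult): Promise<Evaluation[]> {
      // naive structural comparison of the agent output against the expected output
      const matched = JSON.stringify(result.output ?? {}) === JSON.stringify(result.expected ?? {});
      return [
        {
          name: this.name,
          score: matched ? 5 : 1,
          reason: matched ? "output matches expected output" : "output differs from expected output",
        },
      ];
    }
  }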
@@ -0,0 +1 @@
1
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@aigne/cli",
3
- "version": "1.48.4-beta.5",
3
+ "version": "1.49.0-beta.5",
4
4
  "description": "Your command center for agent development",
5
5
  "publishConfig": {
6
6
  "access": "public"
@@ -47,6 +47,7 @@
47
47
  "dependencies": {
48
48
  "@aigne/listr2": "^1.0.10",
49
49
  "@aigne/marked-terminal": "^7.3.2",
50
+ "@fast-csv/format": "^5.0.5",
50
51
  "@inquirer/core": "^10.2.2",
51
52
  "@inquirer/figures": "^1.0.13",
52
53
  "@inquirer/prompts": "^7.8.6",
@@ -81,13 +82,13 @@
81
82
  "yargs": "^18.0.0",
82
83
  "yoctocolors-cjs": "^2.1.3",
83
84
  "zod": "^3.25.67",
84
- "@aigne/agent-library": "^1.21.46-beta.4",
85
- "@aigne/agentic-memory": "^1.0.46-beta.4",
86
- "@aigne/aigne-hub": "^0.10.0-beta.4",
87
- "@aigne/core": "^1.61.0-beta.3",
88
- "@aigne/default-memory": "^1.2.9-beta.4",
89
- "@aigne/openai": "^0.16.0-beta.4",
90
- "@aigne/observability-api": "^0.10.4"
85
+ "@aigne/agent-library": "^1.21.46-beta.5",
86
+ "@aigne/agentic-memory": "^1.0.46-beta.5",
87
+ "@aigne/aigne-hub": "^0.10.0-beta.5",
88
+ "@aigne/core": "^1.61.0-beta.4",
89
+ "@aigne/default-memory": "^1.2.9-beta.5",
90
+ "@aigne/observability-api": "^0.10.5-beta",
91
+ "@aigne/openai": "^0.16.0-beta.5"
91
92
  },
92
93
  "devDependencies": {
93
94
  "@inquirer/testing": "^2.1.50",
@@ -104,7 +105,7 @@
104
105
  "rimraf": "^6.0.1",
105
106
  "typescript": "^5.9.2",
106
107
  "ufo": "^1.6.1",
107
- "@aigne/test-utils": "^0.5.53-beta.3"
108
+ "@aigne/test-utils": "^0.5.53-beta.4"
108
109
  },
109
110
  "scripts": {
110
111
  "lint": "tsc --noEmit",