@agentica/benchmark 0.8.2 → 0.8.3-dev.20250227

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,181 +1,180 @@
1
- import { IAgenticaTokenUsage } from "@agentica/core";
2
-
3
- import { IAgenticaCallBenchmarkEvent } from "../structures/IAgenticaCallBenchmarkEvent";
4
- import { IAgenticaCallBenchmarkResult } from "../structures/IAgenticaCallBenchmarkResult";
5
- import { MathUtil } from "../utils/MathUtil";
6
- import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
7
- import { AgenticaPromptReporter } from "./AgenticaPromptReporter";
8
-
9
- export namespace AgenticaCallBenchmarkReporter {
10
- export const markdown = (
11
- result: IAgenticaCallBenchmarkResult,
12
- ): Record<string, string> =>
13
- Object.fromEntries([
14
- ["./README.md", writeIndex(result)],
15
- ...result.experiments
16
- .map((exp) => [
17
- [`./${exp.scenario.name}/README.md`, writeExperimentIndex(exp)],
18
- ...exp.events.map((event, i) => [
19
- `./${exp.scenario.name}/${i + 1}.${event.type}.md`,
20
- writeExperimentEvent(event, i),
21
- ]),
22
- ])
23
- .flat(),
24
- ]);
25
-
26
- const writeIndex = (result: IAgenticaCallBenchmarkResult): string => {
27
- const events: IAgenticaCallBenchmarkEvent[] = result.experiments
28
- .map((r) => r.events)
29
- .flat();
30
- const average: number =
31
- events
32
- .map((e) => e.completed_at.getTime() - e.started_at.getTime())
33
- .reduce((a, b) => a + b, 0) / events.length;
34
- const aggregate: IAgenticaTokenUsage.IComponent<"aggregate"> =
35
- result.usage.aggregate;
36
- return [
37
- "# LLM Function Call Benchmark",
38
- "## Summary",
39
- ` - Aggregation:`,
40
- ` - Scenarios: #${result.experiments.length.toLocaleString()}`,
41
- ` - Trial: ${events.length}`,
42
- ` - Success: ${events.filter((e) => e.type === "success").length}`,
43
- ` - Failure: ${events.filter((e) => e.type === "failure").length}`,
44
- ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`,
45
- ` - Token Usage`,
46
- ` - Total: ${aggregate.total.toLocaleString()}`,
47
- ` - Input`,
48
- ` - Total: ${aggregate.input.total.toLocaleString()}`,
49
- ` - Cached: ${aggregate.input.cached.toLocaleString()}`,
50
- ` - Output:`,
51
- ` - Total: ${aggregate.output.total.toLocaleString()}`,
52
- ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
53
- ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
54
- ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
55
- "",
56
- "## Experiments",
57
- " Name | Select | Call | Time/Avg ",
58
- ":-----|:-------|:-----|----------:",
59
- ...result.experiments.map((exp) =>
60
- [
61
- `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
62
- drawStatus(
63
- exp.events,
64
- (e) => e.type !== "error" && e.select === true,
65
- ),
66
- drawStatus(exp.events, (e) => e.type !== "error" && e.call === true),
67
- `${MathUtil.round(
68
- exp.events
69
- .map((e) => e.completed_at.getTime() - e.started_at.getTime())
70
- .reduce((a, b) => a + b, 0) / exp.events.length,
71
- ).toLocaleString()} ms`,
72
- ].join(" | "),
73
- ),
74
- ].join("\n");
75
- };
76
-
77
- const writeExperimentIndex = (
78
- exp: IAgenticaCallBenchmarkResult.IExperiment,
79
- ): string => {
80
- return [
81
- `# ${exp.scenario.name}`,
82
- "## Summary",
83
- ` - Scenarios: #${exp.events.length.toLocaleString()}`,
84
- ` - Success: ${exp.events.filter((e) => e.type === "success").length}`,
85
- ` - Failure: ${exp.events.filter((e) => e.type === "failure").length}`,
86
- ` - Average Time: ${MathUtil.round(
87
- exp.events
88
- .map((e) => e.completed_at.getTime() - e.started_at.getTime())
89
- .reduce((a, b) => a + b, 0) / exp.events.length,
90
- ).toLocaleString()} ms`,
91
- "",
92
- "## Events",
93
- " Name | Type | Time",
94
- ":-----|:-----|----:",
95
- ...exp.events.map((e, i) =>
96
- [
97
- `[${i + 1}.](./${i + 1}.${e.type}.md)`,
98
- e.type,
99
- `${MathUtil.round(e.completed_at.getTime() - e.started_at.getTime())} ms`,
100
- ].join(" | "),
101
- ),
102
- "",
103
- "## Scenario",
104
- "### User Prompt",
105
- exp.scenario.text,
106
- "",
107
- "### Expected",
108
- "```json",
109
- JSON.stringify(
110
- AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
111
- null,
112
- 2,
113
- ),
114
- "```",
115
- ].join("\n");
116
- };
117
-
118
- const writeExperimentEvent = (
119
- event: IAgenticaCallBenchmarkEvent,
120
- index: number,
121
- ): string => {
122
- return [
123
- `# ${index + 1}. ${event.type}`,
124
- "## Summary",
125
- ` - Name: ${event.scenario.name}`,
126
- ` - Type: ${event.type}`,
127
- ` - Time: ${MathUtil.round(
128
- event.completed_at.getTime() - event.started_at.getTime(),
129
- ).toLocaleString()} ms`,
130
- ...(event.type !== "error"
131
- ? [
132
- ` - Select: ${event.select ? "✅" : "❌"}`,
133
- ` - Call: ${event.call ? "✅" : "❌"}`,
134
- ]
135
- : []),
136
- ` - Token Usage: ${event.usage.toLocaleString()}`,
137
- "",
138
- "## Scenario",
139
- "### User Prompt",
140
- event.scenario.text,
141
- "",
142
- "### Expected",
143
- "```json",
144
- JSON.stringify(
145
- AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
146
- null,
147
- 2,
148
- ),
149
- "```",
150
- "",
151
- "## Prompt Histories",
152
- ...event.prompts.map(AgenticaPromptReporter.markdown),
153
- "",
154
- ...(event.type === "error"
155
- ? [
156
- "## Error",
157
- "```json",
158
- JSON.stringify(
159
- AgenticaBenchmarkUtil.errorToJson(event.error),
160
- null,
161
- 2,
162
- ),
163
- "```",
164
- ]
165
- : []),
166
- ].join("\n");
167
- };
168
-
169
- const drawStatus = (
170
- events: IAgenticaCallBenchmarkEvent[],
171
- success: (e: IAgenticaCallBenchmarkEvent) => boolean,
172
- ): string => {
173
- const count: number = Math.floor(
174
- (events.filter(success).length / events.length) * 10,
175
- );
176
- return (
177
- new Array(count).fill("■").join("") +
178
- new Array(10 - count).fill("□").join("")
179
- );
180
- };
181
- }
1
+ import { IAgenticaTokenUsage } from "@agentica/core";
2
+
3
+ import { IAgenticaCallBenchmarkEvent } from "../structures/IAgenticaCallBenchmarkEvent";
4
+ import { IAgenticaCallBenchmarkResult } from "../structures/IAgenticaCallBenchmarkResult";
5
+ import { MathUtil } from "../utils/MathUtil";
6
+ import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
7
+ import { AgenticaPromptReporter } from "./AgenticaPromptReporter";
8
+
9
+ export namespace AgenticaCallBenchmarkReporter {
10
+ export const markdown = (
11
+ result: IAgenticaCallBenchmarkResult,
12
+ ): Record<string, string> =>
13
+ Object.fromEntries([
14
+ ["./README.md", writeIndex(result)],
15
+ ...result.experiments
16
+ .map((exp) => [
17
+ [`./${exp.scenario.name}/README.md`, writeExperimentIndex(exp)],
18
+ ...exp.events.map((event, i) => [
19
+ `./${exp.scenario.name}/${i + 1}.${event.type}.md`,
20
+ writeExperimentEvent(event, i),
21
+ ]),
22
+ ])
23
+ .flat(),
24
+ ]);
25
+
26
+ const writeIndex = (result: IAgenticaCallBenchmarkResult): string => {
27
+ const events: IAgenticaCallBenchmarkEvent[] = result.experiments
28
+ .map((r) => r.events)
29
+ .flat();
30
+ const average: number =
31
+ events
32
+ .map((e) => e.completed_at.getTime() - e.started_at.getTime())
33
+ .reduce((a, b) => a + b, 0) / events.length;
34
+ const aggregate: IAgenticaTokenUsage.IComponent = result.usage.aggregate;
35
+ return [
36
+ "# LLM Function Call Benchmark",
37
+ "## Summary",
38
+ ` - Aggregation:`,
39
+ ` - Scenarios: #${result.experiments.length.toLocaleString()}`,
40
+ ` - Trial: ${events.length}`,
41
+ ` - Success: ${events.filter((e) => e.type === "success").length}`,
42
+ ` - Failure: ${events.filter((e) => e.type === "failure").length}`,
43
+ ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`,
44
+ ` - Token Usage`,
45
+ ` - Total: ${aggregate.total.toLocaleString()}`,
46
+ ` - Input`,
47
+ ` - Total: ${aggregate.input.total.toLocaleString()}`,
48
+ ` - Cached: ${aggregate.input.cached.toLocaleString()}`,
49
+ ` - Output:`,
50
+ ` - Total: ${aggregate.output.total.toLocaleString()}`,
51
+ ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
52
+ ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
53
+ ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
54
+ "",
55
+ "## Experiments",
56
+ " Name | Select | Call | Time/Avg ",
57
+ ":-----|:-------|:-----|----------:",
58
+ ...result.experiments.map((exp) =>
59
+ [
60
+ `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
61
+ drawStatus(
62
+ exp.events,
63
+ (e) => e.type !== "error" && e.select === true,
64
+ ),
65
+ drawStatus(exp.events, (e) => e.type !== "error" && e.call === true),
66
+ `${MathUtil.round(
67
+ exp.events
68
+ .map((e) => e.completed_at.getTime() - e.started_at.getTime())
69
+ .reduce((a, b) => a + b, 0) / exp.events.length,
70
+ ).toLocaleString()} ms`,
71
+ ].join(" | "),
72
+ ),
73
+ ].join("\n");
74
+ };
75
+
76
+ const writeExperimentIndex = (
77
+ exp: IAgenticaCallBenchmarkResult.IExperiment,
78
+ ): string => {
79
+ return [
80
+ `# ${exp.scenario.name}`,
81
+ "## Summary",
82
+ ` - Scenarios: #${exp.events.length.toLocaleString()}`,
83
+ ` - Success: ${exp.events.filter((e) => e.type === "success").length}`,
84
+ ` - Failure: ${exp.events.filter((e) => e.type === "failure").length}`,
85
+ ` - Average Time: ${MathUtil.round(
86
+ exp.events
87
+ .map((e) => e.completed_at.getTime() - e.started_at.getTime())
88
+ .reduce((a, b) => a + b, 0) / exp.events.length,
89
+ ).toLocaleString()} ms`,
90
+ "",
91
+ "## Events",
92
+ " Name | Type | Time",
93
+ ":-----|:-----|----:",
94
+ ...exp.events.map((e, i) =>
95
+ [
96
+ `[${i + 1}.](./${i + 1}.${e.type}.md)`,
97
+ e.type,
98
+ `${MathUtil.round(e.completed_at.getTime() - e.started_at.getTime())} ms`,
99
+ ].join(" | "),
100
+ ),
101
+ "",
102
+ "## Scenario",
103
+ "### User Prompt",
104
+ exp.scenario.text,
105
+ "",
106
+ "### Expected",
107
+ "```json",
108
+ JSON.stringify(
109
+ AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
110
+ null,
111
+ 2,
112
+ ),
113
+ "```",
114
+ ].join("\n");
115
+ };
116
+
117
+ const writeExperimentEvent = (
118
+ event: IAgenticaCallBenchmarkEvent,
119
+ index: number,
120
+ ): string => {
121
+ return [
122
+ `# ${index + 1}. ${event.type}`,
123
+ "## Summary",
124
+ ` - Name: ${event.scenario.name}`,
125
+ ` - Type: ${event.type}`,
126
+ ` - Time: ${MathUtil.round(
127
+ event.completed_at.getTime() - event.started_at.getTime(),
128
+ ).toLocaleString()} ms`,
129
+ ...(event.type !== "error"
130
+ ? [
131
+ ` - Select: ${event.select ? "✅" : "❌"}`,
132
+ ` - Call: ${event.call ? "✅" : "❌"}`,
133
+ ]
134
+ : []),
135
+ ` - Token Usage: ${event.usage.toLocaleString()}`,
136
+ "",
137
+ "## Scenario",
138
+ "### User Prompt",
139
+ event.scenario.text,
140
+ "",
141
+ "### Expected",
142
+ "```json",
143
+ JSON.stringify(
144
+ AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
145
+ null,
146
+ 2,
147
+ ),
148
+ "```",
149
+ "",
150
+ "## Prompt Histories",
151
+ ...event.prompts.map(AgenticaPromptReporter.markdown),
152
+ "",
153
+ ...(event.type === "error"
154
+ ? [
155
+ "## Error",
156
+ "```json",
157
+ JSON.stringify(
158
+ AgenticaBenchmarkUtil.errorToJson(event.error),
159
+ null,
160
+ 2,
161
+ ),
162
+ "```",
163
+ ]
164
+ : []),
165
+ ].join("\n");
166
+ };
167
+
168
+ const drawStatus = (
169
+ events: IAgenticaCallBenchmarkEvent[],
170
+ success: (e: IAgenticaCallBenchmarkEvent) => boolean,
171
+ ): string => {
172
+ const count: number = Math.floor(
173
+ (events.filter(success).length / events.length) * 10,
174
+ );
175
+ return (
176
+ new Array(count).fill("■").join("") +
177
+ new Array(10 - count).fill("□").join("")
178
+ );
179
+ };
180
+ }
@@ -1,43 +1,43 @@
1
- import { IAgenticaPrompt } from "@agentica/core";
2
-
3
- export namespace AgenticaPromptReporter {
4
- export const markdown = (p: IAgenticaPrompt): string => {
5
- if (p.type === "text")
6
- return [`### Text (${p.role})`, p.text, ""].join("\n");
7
- else if (p.type === "select" || p.type === "cancel")
8
- return [
9
- `### ${p.type === "select" ? "Select" : "Cancel"}`,
10
- ...p.operations
11
- .map((op) => [
12
- `#### ${op.name}`,
13
- ` - controller: ${op.controller.name}`,
14
- ` - function: ${op.function.name}`,
15
- ` - reason: ${op.reason}`,
16
- "",
17
- ...(!!op.function.description?.length
18
- ? [op.function.description, ""]
19
- : []),
20
- ])
21
- .flat(),
22
- ].join("\n");
23
- else if (p.type === "describe")
24
- return [
25
- "### Describe",
26
- ...p.executions.map((e) => ` - ${e.name}`),
27
- "",
28
- ...p.text.split("\n").map((s) => `> ${s}`),
29
- "",
30
- ].join("\n");
31
- return [
32
- "### Execute",
33
- ` - name: ${p.name}`,
34
- ` - controller: ${p.controller.name}`,
35
- ` - function: ${p.function.name}`,
36
- "",
37
- "```json",
38
- JSON.stringify(p.arguments, null, 2),
39
- "```",
40
- "",
41
- ].join("\n");
42
- };
43
- }
1
+ import { IAgenticaPrompt } from "@agentica/core";
2
+
3
+ export namespace AgenticaPromptReporter {
4
+ export const markdown = (p: IAgenticaPrompt): string => {
5
+ if (p.type === "text")
6
+ return [`### Text (${p.role})`, p.text, ""].join("\n");
7
+ else if (p.type === "select" || p.type === "cancel")
8
+ return [
9
+ `### ${p.type === "select" ? "Select" : "Cancel"}`,
10
+ ...p.operations
11
+ .map((op) => [
12
+ `#### ${op.name}`,
13
+ ` - controller: ${op.controller.name}`,
14
+ ` - function: ${op.function.name}`,
15
+ ` - reason: ${op.reason}`,
16
+ "",
17
+ ...(!!op.function.description?.length
18
+ ? [op.function.description, ""]
19
+ : []),
20
+ ])
21
+ .flat(),
22
+ ].join("\n");
23
+ else if (p.type === "describe")
24
+ return [
25
+ "### Describe",
26
+ ...p.executions.map((e) => ` - ${e.name}`),
27
+ "",
28
+ ...p.text.split("\n").map((s) => `> ${s}`),
29
+ "",
30
+ ].join("\n");
31
+ return [
32
+ "### Execute",
33
+ ` - name: ${p.name}`,
34
+ ` - controller: ${p.controller.name}`,
35
+ ` - function: ${p.function.name}`,
36
+ "",
37
+ "```json",
38
+ JSON.stringify(p.arguments, null, 2),
39
+ "```",
40
+ "",
41
+ ].join("\n");
42
+ };
43
+ }