@agentica/benchmark 0.8.2 → 0.8.3-dev.20250227

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,212 +1,210 @@
1
- import { IAgenticaTokenUsage } from "@agentica/core";
2
-
3
- import { IAgenticaSelectBenchmarkEvent } from "../structures/IAgenticaSelectBenchmarkEvent";
4
- import { IAgenticaSelectBenchmarkResult } from "../structures/IAgenticaSelectBenchmarkResult";
5
- import { MathUtil } from "../utils/MathUtil";
6
- import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
7
-
8
- /**
9
- * @internal
10
- */
11
- export namespace AgenticaSelectBenchmarkReporter {
12
- export const markdown = (
13
- result: IAgenticaSelectBenchmarkResult,
14
- ): Record<string, string> =>
15
- Object.fromEntries([
16
- ["./README.md", writeIndex(result)],
17
- ...result.experiments
18
- .map((exp) => [
19
- [`./${exp.scenario.name}/README.md`, writeExperimentIndex(exp)],
20
- ...exp.events.map((event, i) => [
21
- `./${exp.scenario.name}/${i + 1}.${event.type}.md`,
22
- writeExperimentEvent(event, i),
23
- ]),
24
- ])
25
- .flat(),
26
- ]);
27
-
28
- const writeIndex = (result: IAgenticaSelectBenchmarkResult): string => {
29
- const events: IAgenticaSelectBenchmarkEvent[] = result.experiments
30
- .map((r) => r.events)
31
- .flat();
32
- const average: number =
33
- events
34
- .map((e) => e.completed_at.getTime() - e.started_at.getTime())
35
- .reduce((a, b) => a + b, 0) / events.length;
36
- const aggregate: IAgenticaTokenUsage.IComponent<"aggregate"> =
37
- result.usage.aggregate;
38
- return [
39
- "# LLM Function Selection Benchmark",
40
- "## Summary",
41
- ` - Aggregation:`,
42
- ` - Scenarios: #${result.experiments.length.toLocaleString()}`,
43
- ` - Trial: ${events.length}`,
44
- ` - Success: ${events.filter((e) => e.type === "success").length}`,
45
- ` - Failure: ${events.filter((e) => e.type === "failure").length}`,
46
- ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`,
47
- ` - Token Usage`,
48
- ` - Total: ${aggregate.total.toLocaleString()}`,
49
- ` - Input`,
50
- ` - Total: ${aggregate.input.total.toLocaleString()}`,
51
- ` - Cached: ${aggregate.input.cached.toLocaleString()}`,
52
- ` - Output:`,
53
- ` - Total: ${aggregate.output.total.toLocaleString()}`,
54
- ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
55
- ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
56
- ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
57
- "",
58
- "## Experiments",
59
- " Name | Status | Time/Avg ",
60
- ":-----|:-------|----------:",
61
- ...result.experiments.map((exp) =>
62
- [
63
- `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
64
- (() => {
65
- const success: number = Math.floor(
66
- (exp.events.filter((e) => e.type === "success").length /
67
- exp.events.length) *
68
- 10,
69
- );
70
- return (
71
-             new Array(success).fill("■").join("") +
72
- new Array(10 - success).fill("□").join("")
73
- );
74
- })(),
75
- MathUtil.round(
76
- exp.events
77
- .map(
78
- (event) =>
79
- event.completed_at.getTime() - event.started_at.getTime(),
80
- )
81
- .reduce((a, b) => a + b, 0) / exp.events.length,
82
- ).toLocaleString() + " ms",
83
- ].join(" | "),
84
- ),
85
- ].join("\n");
86
- };
87
-
88
- const writeExperimentIndex = (
89
- exp: IAgenticaSelectBenchmarkResult.IExperiment,
90
- ): string => {
91
- const aggregate: IAgenticaTokenUsage.IComponent<"aggregate"> =
92
- exp.usage.aggregate;
93
- return [
94
- `# ${exp.scenario.name}`,
95
- "## Summary",
96
- " - Aggregation:",
97
- ` - Trial: ${exp.events.length}`,
98
- ` - Success: ${exp.events.filter((e) => e.type === "success").length}`,
99
- ` - Failure: ${exp.events.filter((e) => e.type === "failure").length}`,
100
- ` - Average Time: ${MathUtil.round(
101
- exp.events
102
- .map(
103
- (event) =>
104
- event.completed_at.getTime() - event.started_at.getTime(),
105
- )
106
- .reduce((a, b) => a + b, 0) / exp.events.length,
107
- ).toLocaleString()} ms`,
108
- ` - Token Usage`,
109
- ` - Total: ${aggregate.total.toLocaleString()}`,
110
- ` - Input`,
111
- ` - Total: ${aggregate.input.total.toLocaleString()}`,
112
- ` - Cached: ${aggregate.input.cached.toLocaleString()}`,
113
- ` - Output:`,
114
- ` - Total: ${aggregate.output.total.toLocaleString()}`,
115
- ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
116
- ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
117
- ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
118
- "",
119
- "## Events",
120
- " No | Type | Time",
121
- "---:|:-----|----:",
122
- ...exp.events.map((e, i) =>
123
- [
124
- `[${i + 1}.](./${i + 1}.${e.type}.md)`,
125
- e.type,
126
- MathUtil.round(e.completed_at.getTime() - e.started_at.getTime()) +
127
- " ms",
128
- ].join(" | "),
129
- ),
130
- "",
131
- "## Scenario",
132
- "### User Prompt",
133
- exp.scenario.text,
134
- "",
135
- "### Expected",
136
- "```json",
137
- JSON.stringify(
138
- AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
139
- null,
140
- 2,
141
- ),
142
- "```",
143
- ].join("\n");
144
- };
145
-
146
- const writeExperimentEvent = (
147
- event: IAgenticaSelectBenchmarkEvent,
148
- index: number,
149
- ): string => {
150
- return [
151
- `# ${index + 1}. ${event.type}`,
152
- `## Summary`,
153
- ` - Name: ${event.scenario.name}`,
154
- ` - Type: ${event.type}`,
155
- ` - Time: ${(event.completed_at.getTime() - event.started_at.getTime()).toLocaleString()} ms`,
156
- ...(event.type !== "error"
157
- ? [
158
- " - Token Usage",
159
- ` - Total: ${event.usage.aggregate.toLocaleString()}`,
160
- ` - Prompt`,
161
- ` - Total: ${event.usage.aggregate.input.total.toLocaleString()}`,
162
- ` - Cached: ${event.usage.aggregate.input.cached.toLocaleString()}`,
163
- ` - Completion:`,
164
- ` - Total: ${event.usage.aggregate.output.total.toLocaleString()}`,
165
- ` - Reasoning: ${event.usage.aggregate.output.reasoning.toLocaleString()}`,
166
- ` - Accepted Prediction: ${event.usage.aggregate.output.accepted_prediction.toLocaleString()}`,
167
- ` - Rejected Prediction: ${event.usage.aggregate.output.rejected_prediction.toLocaleString()}`,
168
- ]
169
- : []),
170
- "",
171
- "## Scenario",
172
- "### User Prompt",
173
- event.scenario.text,
174
- "",
175
- "### Expected",
176
- "```json",
177
- JSON.stringify(
178
- AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
179
- null,
180
- 2,
181
- ),
182
- "```",
183
- "",
184
- ...(event.type === "success" || event.type === "failure"
185
- ? [
186
- "## Result",
187
- ...event.selected.map((s) =>
188
- [
189
- `### ${s.name}`,
190
- ` - Controller: \`${s.controller.name}\``,
191
- ` - Function: \`${s.function.name}\``,
192
- ` - Reason: ${s.reason}`,
193
- "",
194
- ...(s.function.description ? [s.function.description, ""] : []),
195
- ].join("\n"),
196
- ),
197
- ]
198
- : []),
199
- ...(event.type === "error"
200
- ? [
201
- "## Error",
202
- "```json",
203
- AgenticaBenchmarkUtil.errorToJson(
204
- JSON.stringify(event.error, null, 2),
205
- ),
206
- "```",
207
- "",
208
- ]
209
- : []),
210
- ].join("\n");
211
- };
212
- }
1
+ import { IAgenticaTokenUsage } from "@agentica/core";
2
+
3
+ import { IAgenticaSelectBenchmarkEvent } from "../structures/IAgenticaSelectBenchmarkEvent";
4
+ import { IAgenticaSelectBenchmarkResult } from "../structures/IAgenticaSelectBenchmarkResult";
5
+ import { MathUtil } from "../utils/MathUtil";
6
+ import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
7
+
8
+ /**
9
+ * @internal
10
+ */
11
+ export namespace AgenticaSelectBenchmarkReporter {
12
+ export const markdown = (
13
+ result: IAgenticaSelectBenchmarkResult,
14
+ ): Record<string, string> =>
15
+ Object.fromEntries([
16
+ ["./README.md", writeIndex(result)],
17
+ ...result.experiments
18
+ .map((exp) => [
19
+ [`./${exp.scenario.name}/README.md`, writeExperimentIndex(exp)],
20
+ ...exp.events.map((event, i) => [
21
+ `./${exp.scenario.name}/${i + 1}.${event.type}.md`,
22
+ writeExperimentEvent(event, i),
23
+ ]),
24
+ ])
25
+ .flat(),
26
+ ]);
27
+
28
+ const writeIndex = (result: IAgenticaSelectBenchmarkResult): string => {
29
+ const events: IAgenticaSelectBenchmarkEvent[] = result.experiments
30
+ .map((r) => r.events)
31
+ .flat();
32
+ const average: number =
33
+ events
34
+ .map((e) => e.completed_at.getTime() - e.started_at.getTime())
35
+ .reduce((a, b) => a + b, 0) / events.length;
36
+ const aggregate: IAgenticaTokenUsage.IComponent = result.usage.aggregate;
37
+ return [
38
+ "# LLM Function Selection Benchmark",
39
+ "## Summary",
40
+ ` - Aggregation:`,
41
+ ` - Scenarios: #${result.experiments.length.toLocaleString()}`,
42
+ ` - Trial: ${events.length}`,
43
+ ` - Success: ${events.filter((e) => e.type === "success").length}`,
44
+ ` - Failure: ${events.filter((e) => e.type === "failure").length}`,
45
+ ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`,
46
+ ` - Token Usage`,
47
+ ` - Total: ${aggregate.total.toLocaleString()}`,
48
+ ` - Input`,
49
+ ` - Total: ${aggregate.input.total.toLocaleString()}`,
50
+ ` - Cached: ${aggregate.input.cached.toLocaleString()}`,
51
+ ` - Output:`,
52
+ ` - Total: ${aggregate.output.total.toLocaleString()}`,
53
+ ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
54
+ ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
55
+ ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
56
+ "",
57
+ "## Experiments",
58
+ " Name | Status | Time/Avg ",
59
+ ":-----|:-------|----------:",
60
+ ...result.experiments.map((exp) =>
61
+ [
62
+ `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
63
+ (() => {
64
+ const success: number = Math.floor(
65
+ (exp.events.filter((e) => e.type === "success").length /
66
+ exp.events.length) *
67
+ 10,
68
+ );
69
+ return (
70
+ new Array(success).fill("■").join("") +
71
+             new Array(10 - success).fill("□").join("")
72
+ );
73
+ })(),
74
+ MathUtil.round(
75
+ exp.events
76
+ .map(
77
+ (event) =>
78
+ event.completed_at.getTime() - event.started_at.getTime(),
79
+ )
80
+ .reduce((a, b) => a + b, 0) / exp.events.length,
81
+ ).toLocaleString() + " ms",
82
+ ].join(" | "),
83
+ ),
84
+ ].join("\n");
85
+ };
86
+
87
+ const writeExperimentIndex = (
88
+ exp: IAgenticaSelectBenchmarkResult.IExperiment,
89
+ ): string => {
90
+ const aggregate: IAgenticaTokenUsage.IComponent = exp.usage.aggregate;
91
+ return [
92
+ `# ${exp.scenario.name}`,
93
+ "## Summary",
94
+ " - Aggregation:",
95
+ ` - Trial: ${exp.events.length}`,
96
+ ` - Success: ${exp.events.filter((e) => e.type === "success").length}`,
97
+ ` - Failure: ${exp.events.filter((e) => e.type === "failure").length}`,
98
+ ` - Average Time: ${MathUtil.round(
99
+ exp.events
100
+ .map(
101
+ (event) =>
102
+ event.completed_at.getTime() - event.started_at.getTime(),
103
+ )
104
+ .reduce((a, b) => a + b, 0) / exp.events.length,
105
+ ).toLocaleString()} ms`,
106
+ ` - Token Usage`,
107
+ ` - Total: ${aggregate.total.toLocaleString()}`,
108
+ ` - Input`,
109
+ ` - Total: ${aggregate.input.total.toLocaleString()}`,
110
+ ` - Cached: ${aggregate.input.cached.toLocaleString()}`,
111
+ ` - Output:`,
112
+ ` - Total: ${aggregate.output.total.toLocaleString()}`,
113
+ ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
114
+ ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
115
+ ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
116
+ "",
117
+ "## Events",
118
+ " No | Type | Time",
119
+ "---:|:-----|----:",
120
+ ...exp.events.map((e, i) =>
121
+ [
122
+ `[${i + 1}.](./${i + 1}.${e.type}.md)`,
123
+ e.type,
124
+ MathUtil.round(e.completed_at.getTime() - e.started_at.getTime()) +
125
+ " ms",
126
+ ].join(" | "),
127
+ ),
128
+ "",
129
+ "## Scenario",
130
+ "### User Prompt",
131
+ exp.scenario.text,
132
+ "",
133
+ "### Expected",
134
+ "```json",
135
+ JSON.stringify(
136
+ AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
137
+ null,
138
+ 2,
139
+ ),
140
+ "```",
141
+ ].join("\n");
142
+ };
143
+
144
+ const writeExperimentEvent = (
145
+ event: IAgenticaSelectBenchmarkEvent,
146
+ index: number,
147
+ ): string => {
148
+ return [
149
+ `# ${index + 1}. ${event.type}`,
150
+ `## Summary`,
151
+ ` - Name: ${event.scenario.name}`,
152
+ ` - Type: ${event.type}`,
153
+ ` - Time: ${(event.completed_at.getTime() - event.started_at.getTime()).toLocaleString()} ms`,
154
+ ...(event.type !== "error"
155
+ ? [
156
+ " - Token Usage",
157
+ ` - Total: ${event.usage.aggregate.toLocaleString()}`,
158
+ ` - Prompt`,
159
+ ` - Total: ${event.usage.aggregate.input.total.toLocaleString()}`,
160
+ ` - Cached: ${event.usage.aggregate.input.cached.toLocaleString()}`,
161
+ ` - Completion:`,
162
+ ` - Total: ${event.usage.aggregate.output.total.toLocaleString()}`,
163
+ ` - Reasoning: ${event.usage.aggregate.output.reasoning.toLocaleString()}`,
164
+ ` - Accepted Prediction: ${event.usage.aggregate.output.accepted_prediction.toLocaleString()}`,
165
+ ` - Rejected Prediction: ${event.usage.aggregate.output.rejected_prediction.toLocaleString()}`,
166
+ ]
167
+ : []),
168
+ "",
169
+ "## Scenario",
170
+ "### User Prompt",
171
+ event.scenario.text,
172
+ "",
173
+ "### Expected",
174
+ "```json",
175
+ JSON.stringify(
176
+ AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
177
+ null,
178
+ 2,
179
+ ),
180
+ "```",
181
+ "",
182
+ ...(event.type === "success" || event.type === "failure"
183
+ ? [
184
+ "## Result",
185
+ ...event.selected.map((s) =>
186
+ [
187
+ `### ${s.name}`,
188
+ ` - Controller: \`${s.controller.name}\``,
189
+ ` - Function: \`${s.function.name}\``,
190
+ ` - Reason: ${s.reason}`,
191
+ "",
192
+ ...(s.function.description ? [s.function.description, ""] : []),
193
+ ].join("\n"),
194
+ ),
195
+ ]
196
+ : []),
197
+ ...(event.type === "error"
198
+ ? [
199
+ "## Error",
200
+ "```json",
201
+ AgenticaBenchmarkUtil.errorToJson(
202
+ JSON.stringify(event.error, null, 2),
203
+ ),
204
+ "```",
205
+ "",
206
+ ]
207
+ : []),
208
+ ].join("\n");
209
+ };
210
+ }
@@ -1,58 +1,58 @@
1
- import { IAgenticaOperation } from "@agentica/core";
2
-
3
- /**
4
- * Expected operation determinant.
5
- *
6
- * `IAgenticaBenchmarkExpected` is a type for determining what
7
- * operation is expected in the benchmarking.
8
- *
9
- * And `IAgenticaBenchmarkExpected` is an union type of 4 types,
10
- * especially designed for the detailed determination of the expected
11
- * operations.
12
- *
13
- * @author Samchon
14
- */
15
- export type IAgenticaBenchmarkExpected =
16
- | IAgenticaBenchmarkExpected.IAllOf
17
- | IAgenticaBenchmarkExpected.IAnyOf
18
- | IAgenticaBenchmarkExpected.IArray
19
- | IAgenticaBenchmarkExpected.IStandalone;
20
- export namespace IAgenticaBenchmarkExpected {
21
- /**
22
- * All of them must meet the condition, but sequence is not important.
23
- */
24
- export interface IAllOf {
25
- type: "allOf";
26
- allOf: Array<
27
- Exclude<IAgenticaBenchmarkExpected, IAgenticaBenchmarkExpected.IAllOf>
28
- >;
29
- }
30
-
31
- /**
32
- * At least one of them must meet the condition.
33
- */
34
- export interface IAnyOf {
35
- type: "anyOf";
36
- anyOf: Array<
37
- Exclude<IAgenticaBenchmarkExpected, IAgenticaBenchmarkExpected.IAnyOf>
38
- >;
39
- }
40
-
41
- /**
42
- * All of them must meet the condition, and sequence is important.
43
- */
44
- export interface IArray {
45
- type: "array";
46
- items: Array<
47
- Exclude<IAgenticaBenchmarkExpected, IAgenticaBenchmarkExpected.IArray>
48
- >;
49
- }
50
-
51
- /**
52
- * Standalone operation.
53
- */
54
- export interface IStandalone {
55
- type: "standalone";
56
- operation: IAgenticaOperation;
57
- }
58
- }
1
+ import { IAgenticaOperation } from "@agentica/core";
2
+
3
+ /**
4
+ * Expected operation determinant.
5
+ *
6
+ * `IAgenticaBenchmarkExpected` is a type for determining what
7
+ * operation is expected in the benchmarking.
8
+ *
9
+ * And `IAgenticaBenchmarkExpected` is an union type of 4 types,
10
+ * especially designed for the detailed determination of the expected
11
+ * operations.
12
+ *
13
+ * @author Samchon
14
+ */
15
+ export type IAgenticaBenchmarkExpected =
16
+ | IAgenticaBenchmarkExpected.IAllOf
17
+ | IAgenticaBenchmarkExpected.IAnyOf
18
+ | IAgenticaBenchmarkExpected.IArray
19
+ | IAgenticaBenchmarkExpected.IStandalone;
20
+ export namespace IAgenticaBenchmarkExpected {
21
+ /**
22
+ * All of them must meet the condition, but sequence is not important.
23
+ */
24
+ export interface IAllOf {
25
+ type: "allOf";
26
+ allOf: Array<
27
+ Exclude<IAgenticaBenchmarkExpected, IAgenticaBenchmarkExpected.IAllOf>
28
+ >;
29
+ }
30
+
31
+ /**
32
+ * At least one of them must meet the condition.
33
+ */
34
+ export interface IAnyOf {
35
+ type: "anyOf";
36
+ anyOf: Array<
37
+ Exclude<IAgenticaBenchmarkExpected, IAgenticaBenchmarkExpected.IAnyOf>
38
+ >;
39
+ }
40
+
41
+ /**
42
+ * All of them must meet the condition, and sequence is important.
43
+ */
44
+ export interface IArray {
45
+ type: "array";
46
+ items: Array<
47
+ Exclude<IAgenticaBenchmarkExpected, IAgenticaBenchmarkExpected.IArray>
48
+ >;
49
+ }
50
+
51
+ /**
52
+ * Standalone operation.
53
+ */
54
+ export interface IStandalone {
55
+ type: "standalone";
56
+ operation: IAgenticaOperation;
57
+ }
58
+ }