@agentica/benchmark 0.43.3 → 0.44.0-dev.20260313-2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,220 +1,220 @@
1
- /**
2
- * @module
3
- * This file contains functions to work with AgenticaSelectBenchmarkReporter.
4
- *
5
- * @author Wrtn Technologies
6
- */
7
- import type { AgenticaTokenUsage } from "@agentica/core";
8
-
9
- import type { IAgenticaSelectBenchmarkEvent } from "../structures/IAgenticaSelectBenchmarkEvent";
10
- import type { IAgenticaSelectBenchmarkResult } from "../structures/IAgenticaSelectBenchmarkResult";
11
-
12
- import { MathUtil } from "../utils/MathUtil";
13
-
14
- import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
15
-
16
- /**
17
- * @internal
18
- */
19
- export const AgenticaSelectBenchmarkReporter = {
20
- markdown,
21
- };
22
-
23
- export function markdown(result: IAgenticaSelectBenchmarkResult): Record<string, string> {
24
- const iterator = [
25
- ["./README.md", writeIndex(result)],
26
- ...result.experiments
27
- .map<[string, string][]>(exp => [
28
- [`./${exp.scenario.name}/README.md`, writeExperimentIndex(exp)],
29
- ...exp.events.map<[string, string]>((event, i) => [
30
- `./${exp.scenario.name}/${i + 1}.${event.type}.md`,
31
- writeExperimentEvent(event, i),
32
- ]),
33
- ])
34
- .flat(),
35
- ] satisfies [string, string][];
36
-
37
- return Object.fromEntries(iterator);
38
- }
39
-
40
- function writeIndex(result: IAgenticaSelectBenchmarkResult): string {
41
- const events: IAgenticaSelectBenchmarkEvent[] = result.experiments
42
- .map(r => r.events)
43
- .flat();
44
- const average: number
45
- = events
46
- .map(e => e.completed_at.getTime() - e.started_at.getTime())
47
- .reduce((a, b) => a + b, 0) / events.length;
48
- const aggregate: AgenticaTokenUsage.IComponent = result.usage.aggregate;
49
- return [
50
- "# LLM Function Selection Benchmark",
51
- "## Summary",
52
- ` - Aggregation:`,
53
- ` - Scenarios: #${result.experiments.length.toLocaleString()}`,
54
- ` - Trial: ${events.length}`,
55
- ` - Success: ${events.filter(e => e.type === "success").length}`,
56
- ` - Failure: ${events.filter(e => e.type === "failure").length}`,
57
- ` - Error: ${events.filter(e => e.type === "error").length}`,
58
- ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`,
59
- ` - Token Usage`,
60
- ` - Total: ${aggregate.total.toLocaleString()}`,
61
- ` - Input`,
62
- ` - Total: ${aggregate.input.total.toLocaleString()}`,
63
- ` - Cached: ${aggregate.input.cached.toLocaleString()}`,
64
- ` - Output:`,
65
- ` - Total: ${aggregate.output.total.toLocaleString()}`,
66
- ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
67
- ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
68
- ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
69
- "",
70
- "## Experiments",
71
- " Name | Status | Time/Avg ",
72
- ":-----|:-------|----------:",
73
- ...result.experiments.map(exp =>
74
- [
75
- `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
76
- (() => {
77
- const success: number = Math.floor(
78
- (exp.events.filter(e => e.type === "success").length
79
- / exp.events.length)
80
- * 10,
81
- );
82
- return (
83
- Array.from({ length: success }).fill("■").join("")
84
- + Array.from({ length: 10 - success }).fill("□").join("")
85
- );
86
- })(),
87
- `${MathUtil.round(
88
- exp.events
89
- .map(
90
- event =>
91
- event.completed_at.getTime() - event.started_at.getTime(),
92
- )
93
- .reduce((a, b) => a + b, 0) / exp.events.length,
94
- ).toLocaleString()} ms`,
95
- ].join(" | "),
96
- ),
97
- ].join("\n");
98
- }
99
-
100
- function writeExperimentIndex(exp: IAgenticaSelectBenchmarkResult.IExperiment): string {
101
- const aggregate: AgenticaTokenUsage.IComponent = exp.usage.aggregate;
102
- return [
103
- `# ${exp.scenario.name}`,
104
- "## Summary",
105
- " - Aggregation:",
106
- ` - Trial: ${exp.events.length}`,
107
- ` - Success: ${exp.events.filter(e => e.type === "success").length}`,
108
- ` - Failure: ${exp.events.filter(e => e.type === "failure").length}`,
109
- ` - Error: ${exp.events.filter(e => e.type === "error").length}`,
110
- ` - Average Time: ${MathUtil.round(
111
- exp.events
112
- .map(
113
- event =>
114
- event.completed_at.getTime() - event.started_at.getTime(),
115
- )
116
- .reduce((a, b) => a + b, 0) / exp.events.length,
117
- ).toLocaleString()} ms`,
118
- ` - Token Usage`,
119
- ` - Total: ${aggregate.total.toLocaleString()}`,
120
- ` - Input`,
121
- ` - Total: ${aggregate.input.total.toLocaleString()}`,
122
- ` - Cached: ${aggregate.input.cached.toLocaleString()}`,
123
- ` - Output:`,
124
- ` - Total: ${aggregate.output.total.toLocaleString()}`,
125
- ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
126
- ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
127
- ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
128
- "",
129
- "## Events",
130
- " No | Type | Time",
131
- "---:|:-----|----:",
132
- ...exp.events.map((e, i) =>
133
- [
134
- `[${i + 1}.](./${i + 1}.${e.type}.md)`,
135
- e.type,
136
- `${MathUtil.round(e.completed_at.getTime() - e.started_at.getTime())
137
- } ms`,
138
- ].join(" | "),
139
- ),
140
- "",
141
- "## Scenario",
142
- "### User Prompt",
143
- exp.scenario.text,
144
- "",
145
- "### Expected",
146
- "```json",
147
- JSON.stringify(
148
- AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
149
- null,
150
- 2,
151
- ),
152
- "```",
153
- ].join("\n");
154
- }
155
-
156
- function writeExperimentEvent(event: IAgenticaSelectBenchmarkEvent, index: number): string {
157
- return [
158
- `# ${index + 1}. ${event.type}`,
159
- `## Summary`,
160
- ` - Name: ${event.scenario.name}`,
161
- ` - Type: ${event.type}`,
162
- ` - Time: ${(event.completed_at.getTime() - event.started_at.getTime()).toLocaleString()} ms`,
163
- ...(event.type !== "error"
164
- ? [
165
- " - Token Usage",
166
- ` - Total: ${event.usage.aggregate.total.toLocaleString()}`,
167
- ` - Prompt`,
168
- ` - Total: ${event.usage.aggregate.input.total.toLocaleString()}`,
169
- ` - Cached: ${event.usage.aggregate.input.cached.toLocaleString()}`,
170
- ` - Completion:`,
171
- ` - Total: ${event.usage.aggregate.output.total.toLocaleString()}`,
172
- ` - Reasoning: ${event.usage.aggregate.output.reasoning.toLocaleString()}`,
173
- ` - Accepted Prediction: ${event.usage.aggregate.output.accepted_prediction.toLocaleString()}`,
174
- ` - Rejected Prediction: ${event.usage.aggregate.output.rejected_prediction.toLocaleString()}`,
175
- ]
176
- : []),
177
- "",
178
- "## Scenario",
179
- "### User Prompt",
180
- event.scenario.text,
181
- "",
182
- "### Expected",
183
- "```json",
184
- JSON.stringify(
185
- AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
186
- null,
187
- 2,
188
- ),
189
- "```",
190
- "",
191
- ...(event.type === "success" || event.type === "failure"
192
- ? [
193
- "## Result",
194
- ...event.selected.map(s =>
195
- [
196
- `### ${s.operation.name}`,
197
- ` - Controller: \`${s.operation.controller.name}\``,
198
- ` - Function: \`${s.operation.function.name}\``,
199
- ` - Reason: ${s.reason}`,
200
- "",
201
- ...(s.operation.function.description !== undefined && s.operation.function.description !== ""
202
- ? [s.operation.function.description, ""]
203
- : []),
204
- ].join("\n"),
205
- ),
206
- ]
207
- : []),
208
- ...(event.type === "error"
209
- ? [
210
- "## Error",
211
- "```json",
212
- AgenticaBenchmarkUtil.errorToJson(
213
- JSON.stringify(event.error, null, 2),
214
- ),
215
- "```",
216
- "",
217
- ]
218
- : []),
219
- ].join("\n");
220
- }
1
+ /**
2
+ * @module
3
+ * This file contains functions to work with AgenticaSelectBenchmarkReporter.
4
+ *
5
+ * @author Wrtn Technologies
6
+ */
7
+ import type { AgenticaTokenUsage } from "@agentica/core";
8
+
9
+ import type { IAgenticaSelectBenchmarkEvent } from "../structures/IAgenticaSelectBenchmarkEvent";
10
+ import type { IAgenticaSelectBenchmarkResult } from "../structures/IAgenticaSelectBenchmarkResult";
11
+
12
+ import { MathUtil } from "../utils/MathUtil";
13
+
14
+ import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
15
+
16
+ /**
17
+ * @internal
18
+ */
19
+ export const AgenticaSelectBenchmarkReporter = {
20
+ markdown,
21
+ };
22
+
23
+ export function markdown(result: IAgenticaSelectBenchmarkResult): Record<string, string> {
24
+ const iterator = [
25
+ ["./README.md", writeIndex(result)],
26
+ ...result.experiments
27
+ .map<[string, string][]>(exp => [
28
+ [`./${exp.scenario.name}/README.md`, writeExperimentIndex(exp)],
29
+ ...exp.events.map<[string, string]>((event, i) => [
30
+ `./${exp.scenario.name}/${i + 1}.${event.type}.md`,
31
+ writeExperimentEvent(event, i),
32
+ ]),
33
+ ])
34
+ .flat(),
35
+ ] satisfies [string, string][];
36
+
37
+ return Object.fromEntries(iterator);
38
+ }
39
+
40
+ function writeIndex(result: IAgenticaSelectBenchmarkResult): string {
41
+ const events: IAgenticaSelectBenchmarkEvent[] = result.experiments
42
+ .map(r => r.events)
43
+ .flat();
44
+ const average: number
45
+ = events
46
+ .map(e => e.completed_at.getTime() - e.started_at.getTime())
47
+ .reduce((a, b) => a + b, 0) / events.length;
48
+ const aggregate: AgenticaTokenUsage.IComponent = result.usage.aggregate;
49
+ return [
50
+ "# LLM Function Selection Benchmark",
51
+ "## Summary",
52
+ ` - Aggregation:`,
53
+ ` - Scenarios: #${result.experiments.length.toLocaleString()}`,
54
+ ` - Trial: ${events.length}`,
55
+ ` - Success: ${events.filter(e => e.type === "success").length}`,
56
+ ` - Failure: ${events.filter(e => e.type === "failure").length}`,
57
+ ` - Error: ${events.filter(e => e.type === "error").length}`,
58
+ ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`,
59
+ ` - Token Usage`,
60
+ ` - Total: ${aggregate.total.toLocaleString()}`,
61
+ ` - Input`,
62
+ ` - Total: ${aggregate.input.total.toLocaleString()}`,
63
+ ` - Cached: ${aggregate.input.cached.toLocaleString()}`,
64
+ ` - Output:`,
65
+ ` - Total: ${aggregate.output.total.toLocaleString()}`,
66
+ ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
67
+ ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
68
+ ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
69
+ "",
70
+ "## Experiments",
71
+ " Name | Status | Time/Avg ",
72
+ ":-----|:-------|----------:",
73
+ ...result.experiments.map(exp =>
74
+ [
75
+ `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
76
+ (() => {
77
+ const success: number = Math.floor(
78
+ (exp.events.filter(e => e.type === "success").length
79
+ / exp.events.length)
80
+ * 10,
81
+ );
82
+ return (
83
+ Array.from({ length: success }).fill("■").join("")
84
+ + Array.from({ length: 10 - success }).fill("□").join("")
85
+ );
86
+ })(),
87
+ `${MathUtil.round(
88
+ exp.events
89
+ .map(
90
+ event =>
91
+ event.completed_at.getTime() - event.started_at.getTime(),
92
+ )
93
+ .reduce((a, b) => a + b, 0) / exp.events.length,
94
+ ).toLocaleString()} ms`,
95
+ ].join(" | "),
96
+ ),
97
+ ].join("\n");
98
+ }
99
+
100
+ function writeExperimentIndex(exp: IAgenticaSelectBenchmarkResult.IExperiment): string {
101
+ const aggregate: AgenticaTokenUsage.IComponent = exp.usage.aggregate;
102
+ return [
103
+ `# ${exp.scenario.name}`,
104
+ "## Summary",
105
+ " - Aggregation:",
106
+ ` - Trial: ${exp.events.length}`,
107
+ ` - Success: ${exp.events.filter(e => e.type === "success").length}`,
108
+ ` - Failure: ${exp.events.filter(e => e.type === "failure").length}`,
109
+ ` - Error: ${exp.events.filter(e => e.type === "error").length}`,
110
+ ` - Average Time: ${MathUtil.round(
111
+ exp.events
112
+ .map(
113
+ event =>
114
+ event.completed_at.getTime() - event.started_at.getTime(),
115
+ )
116
+ .reduce((a, b) => a + b, 0) / exp.events.length,
117
+ ).toLocaleString()} ms`,
118
+ ` - Token Usage`,
119
+ ` - Total: ${aggregate.total.toLocaleString()}`,
120
+ ` - Input`,
121
+ ` - Total: ${aggregate.input.total.toLocaleString()}`,
122
+ ` - Cached: ${aggregate.input.cached.toLocaleString()}`,
123
+ ` - Output:`,
124
+ ` - Total: ${aggregate.output.total.toLocaleString()}`,
125
+ ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
126
+ ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
127
+ ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
128
+ "",
129
+ "## Events",
130
+ " No | Type | Time",
131
+ "---:|:-----|----:",
132
+ ...exp.events.map((e, i) =>
133
+ [
134
+ `[${i + 1}.](./${i + 1}.${e.type}.md)`,
135
+ e.type,
136
+ `${MathUtil.round(e.completed_at.getTime() - e.started_at.getTime())
137
+ } ms`,
138
+ ].join(" | "),
139
+ ),
140
+ "",
141
+ "## Scenario",
142
+ "### User Prompt",
143
+ exp.scenario.text,
144
+ "",
145
+ "### Expected",
146
+ "```json",
147
+ JSON.stringify(
148
+ AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
149
+ null,
150
+ 2,
151
+ ),
152
+ "```",
153
+ ].join("\n");
154
+ }
155
+
156
+ function writeExperimentEvent(event: IAgenticaSelectBenchmarkEvent, index: number): string {
157
+ return [
158
+ `# ${index + 1}. ${event.type}`,
159
+ `## Summary`,
160
+ ` - Name: ${event.scenario.name}`,
161
+ ` - Type: ${event.type}`,
162
+ ` - Time: ${(event.completed_at.getTime() - event.started_at.getTime()).toLocaleString()} ms`,
163
+ ...(event.type !== "error"
164
+ ? [
165
+ " - Token Usage",
166
+ ` - Total: ${event.usage.aggregate.total.toLocaleString()}`,
167
+ ` - Prompt`,
168
+ ` - Total: ${event.usage.aggregate.input.total.toLocaleString()}`,
169
+ ` - Cached: ${event.usage.aggregate.input.cached.toLocaleString()}`,
170
+ ` - Completion:`,
171
+ ` - Total: ${event.usage.aggregate.output.total.toLocaleString()}`,
172
+ ` - Reasoning: ${event.usage.aggregate.output.reasoning.toLocaleString()}`,
173
+ ` - Accepted Prediction: ${event.usage.aggregate.output.accepted_prediction.toLocaleString()}`,
174
+ ` - Rejected Prediction: ${event.usage.aggregate.output.rejected_prediction.toLocaleString()}`,
175
+ ]
176
+ : []),
177
+ "",
178
+ "## Scenario",
179
+ "### User Prompt",
180
+ event.scenario.text,
181
+ "",
182
+ "### Expected",
183
+ "```json",
184
+ JSON.stringify(
185
+ AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
186
+ null,
187
+ 2,
188
+ ),
189
+ "```",
190
+ "",
191
+ ...(event.type === "success" || event.type === "failure"
192
+ ? [
193
+ "## Result",
194
+ ...event.selected.map(s =>
195
+ [
196
+ `### ${s.operation.name}`,
197
+ ` - Controller: \`${s.operation.controller.name}\``,
198
+ ` - Function: \`${s.operation.function.name}\``,
199
+ ` - Reason: ${s.reason}`,
200
+ "",
201
+ ...(s.operation.function.description !== undefined && s.operation.function.description !== ""
202
+ ? [s.operation.function.description, ""]
203
+ : []),
204
+ ].join("\n"),
205
+ ),
206
+ ]
207
+ : []),
208
+ ...(event.type === "error"
209
+ ? [
210
+ "## Error",
211
+ "```json",
212
+ AgenticaBenchmarkUtil.errorToJson(
213
+ JSON.stringify(event.error, null, 2),
214
+ ),
215
+ "```",
216
+ "",
217
+ ]
218
+ : []),
219
+ ].join("\n");
220
+ }
@@ -1,74 +1,74 @@
1
- /**
2
- * @module
3
- * This file contains the implementation of the IAgenticaBenchmarkExpected class.
4
- *
5
- * @author Wrtn Technologies
6
- */
7
- import type { AgenticaOperation } from "@agentica/core";
8
-
9
- /**
10
- * Expected operation determinant.
11
- *
12
- * `IAgenticaBenchmarkExpected` is a type for determining what
13
- * operation is expected in the benchmarking.
14
- *
15
- * And `IAgenticaBenchmarkExpected` is an union type of 4 types,
16
- * especially designed for the detailed determination of the expected
17
- * operations.
18
- *
19
- * @author Samchon
20
- */
21
- export type IAgenticaBenchmarkExpected =
22
- | IAgenticaBenchmarkExpected.IAllOf
23
- | IAgenticaBenchmarkExpected.IAnyOf
24
- | IAgenticaBenchmarkExpected.IArray
25
- | IAgenticaBenchmarkExpected.IStandalone;
26
-
27
- export namespace IAgenticaBenchmarkExpected {
28
- /**
29
- * All of them must meet the condition, but sequence is not important.
30
- */
31
- export interface IAllOf {
32
- type: "allOf";
33
- allOf: Array<
34
- Exclude<
35
- IAgenticaBenchmarkExpected,
36
- IAgenticaBenchmarkExpected.IAllOf
37
- >
38
- >;
39
- }
40
-
41
- /**
42
- * At least one of them must meet the condition.
43
- */
44
- export interface IAnyOf {
45
- type: "anyOf";
46
- anyOf: Array<
47
- Exclude<
48
- IAgenticaBenchmarkExpected,
49
- IAgenticaBenchmarkExpected.IAnyOf
50
- >
51
- >;
52
- }
53
-
54
- /**
55
- * All of them must meet the condition, and sequence is important.
56
- */
57
- export interface IArray {
58
- type: "array";
59
- items: Array<
60
- Exclude<
61
- IAgenticaBenchmarkExpected,
62
- IAgenticaBenchmarkExpected.IArray
63
- >
64
- >;
65
- }
66
-
67
- /**
68
- * Standalone operation.
69
- */
70
- export interface IStandalone {
71
- type: "standalone";
72
- operation: AgenticaOperation;
73
- }
74
- }
1
+ /**
2
+ * @module
3
+ * This file contains the implementation of the IAgenticaBenchmarkExpected class.
4
+ *
5
+ * @author Wrtn Technologies
6
+ */
7
+ import type { AgenticaOperation } from "@agentica/core";
8
+
9
+ /**
10
+ * Expected operation determinant.
11
+ *
12
+ * `IAgenticaBenchmarkExpected` is a type for determining what
13
+ * operation is expected in the benchmarking.
14
+ *
15
+ * And `IAgenticaBenchmarkExpected` is an union type of 4 types,
16
+ * especially designed for the detailed determination of the expected
17
+ * operations.
18
+ *
19
+ * @author Samchon
20
+ */
21
+ export type IAgenticaBenchmarkExpected
22
+ = | IAgenticaBenchmarkExpected.IAllOf
23
+ | IAgenticaBenchmarkExpected.IAnyOf
24
+ | IAgenticaBenchmarkExpected.IArray
25
+ | IAgenticaBenchmarkExpected.IStandalone;
26
+
27
+ export namespace IAgenticaBenchmarkExpected {
28
+ /**
29
+ * All of them must meet the condition, but sequence is not important.
30
+ */
31
+ export interface IAllOf {
32
+ type: "allOf";
33
+ allOf: Array<
34
+ Exclude<
35
+ IAgenticaBenchmarkExpected,
36
+ IAgenticaBenchmarkExpected.IAllOf
37
+ >
38
+ >;
39
+ }
40
+
41
+ /**
42
+ * At least one of them must meet the condition.
43
+ */
44
+ export interface IAnyOf {
45
+ type: "anyOf";
46
+ anyOf: Array<
47
+ Exclude<
48
+ IAgenticaBenchmarkExpected,
49
+ IAgenticaBenchmarkExpected.IAnyOf
50
+ >
51
+ >;
52
+ }
53
+
54
+ /**
55
+ * All of them must meet the condition, and sequence is important.
56
+ */
57
+ export interface IArray {
58
+ type: "array";
59
+ items: Array<
60
+ Exclude<
61
+ IAgenticaBenchmarkExpected,
62
+ IAgenticaBenchmarkExpected.IArray
63
+ >
64
+ >;
65
+ }
66
+
67
+ /**
68
+ * Standalone operation.
69
+ */
70
+ export interface IStandalone {
71
+ type: "standalone";
72
+ operation: AgenticaOperation;
73
+ }
74
+ }