@agentica/benchmark 0.12.1 → 0.12.2-dev.20250314

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,215 +1,215 @@
1
- import { AgenticaTokenUsage } from "@agentica/core";
2
- import { ILlmSchema } from "@samchon/openapi";
3
-
4
- import { IAgenticaSelectBenchmarkEvent } from "../structures/IAgenticaSelectBenchmarkEvent";
5
- import { IAgenticaSelectBenchmarkResult } from "../structures/IAgenticaSelectBenchmarkResult";
6
- import { MathUtil } from "../utils/MathUtil";
7
- import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
8
-
9
/**
 * Markdown report writer for the LLM function selection benchmark.
 *
 * @internal
 */
export namespace AgenticaSelectBenchmarkReporter {
  /**
   * Minimal shape needed to measure how long a trial took.
   *
   * Only `getTime()` is used on the two timestamps.
   */
  interface ITimed {
    started_at: { getTime(): number };
    completed_at: { getTime(): number };
  }

  /**
   * Render a benchmark result into a set of markdown documents.
   *
   * The returned dictionary contains `./README.md` (overall index), one
   * `./{scenario}/README.md` per experiment, and one
   * `./{scenario}/{no}.{type}.md` per trial event of that experiment.
   *
   * @param result Benchmark result to report
   * @returns Dictionary of relative file path to markdown content
   */
  export const markdown = <Model extends ILlmSchema.Model>(
    result: IAgenticaSelectBenchmarkResult<Model>,
  ): Record<string, string> => {
    const files: Array<[string, string]> = [
      ["./README.md", writeIndex(result)],
      ...result.experiments.flatMap((exp): Array<[string, string]> => [
        [`./${exp.scenario.name}/README.md`, writeExperimentIndex(exp)],
        ...exp.events.map((event, i): [string, string] => [
          `./${exp.scenario.name}/${i + 1}.${event.type}.md`,
          writeExperimentEvent(event, i),
        ]),
      ]),
    ];
    return Object.fromEntries(files);
  };

  /** Elapsed time of a single trial, in milliseconds. */
  const elapsed = (event: ITimed): number =>
    event.completed_at.getTime() - event.started_at.getTime();

  /**
   * Mean elapsed time of the given trials, in milliseconds.
   *
   * Returns 0 for an empty list instead of the `NaN` that `0 / 0` would give.
   */
  const averageTime = (events: ReadonlyArray<ITimed>): number =>
    events.length === 0
      ? 0
      : events.reduce((sum, e) => sum + elapsed(e), 0) / events.length;

  /** Number of events whose `type` equals the given one. */
  const countByType = (
    events: ReadonlyArray<{ type: string }>,
    type: string,
  ): number => events.filter((e) => e.type === type).length;

  /**
   * Ten-cell status bar: one filled cell per full 10% of successful trials.
   *
   * An experiment without events yields an all-empty bar; the unguarded
   * ratio would be `NaN` and `new Array(NaN)` throws a `RangeError`.
   */
  const writeStatusBar = (events: ReadonlyArray<{ type: string }>): string => {
    const ratio: number =
      events.length === 0 ? 0 : countByType(events, "success") / events.length;
    const filled: number = Math.floor(ratio * 10);
    return "■".repeat(filled) + "□".repeat(10 - filled);
  };

  /** Token usage bullet list shared by the overall and experiment indexes. */
  const writeTokenUsage = (
    aggregate: AgenticaTokenUsage.IComponent,
  ): string[] => [
    ` - Token Usage`,
    ` - Total: ${aggregate.total.toLocaleString()}`,
    ` - Input`,
    ` - Total: ${aggregate.input.total.toLocaleString()}`,
    ` - Cached: ${aggregate.input.cached.toLocaleString()}`,
    ` - Output:`,
    ` - Total: ${aggregate.output.total.toLocaleString()}`,
    ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
    ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
    ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
  ];

  /**
   * Write the top-level `README.md`: overall summary plus one table row
   * (link, status bar, average time) per experiment.
   */
  const writeIndex = <Model extends ILlmSchema.Model>(
    result: IAgenticaSelectBenchmarkResult<Model>,
  ): string => {
    const events: IAgenticaSelectBenchmarkEvent<Model>[] =
      result.experiments.flatMap((exp) => exp.events);
    return [
      "# LLM Function Selection Benchmark",
      "## Summary",
      ` - Aggregation:`,
      ` - Scenarios: #${result.experiments.length.toLocaleString()}`,
      ` - Trial: ${events.length}`,
      ` - Success: ${countByType(events, "success")}`,
      ` - Failure: ${countByType(events, "failure")}`,
      ` - Average Time: ${MathUtil.round(averageTime(events)).toLocaleString()} ms`,
      ...writeTokenUsage(result.usage.aggregate),
      "",
      "## Experiments",
      " Name | Status | Time/Avg ",
      ":-----|:-------|----------:",
      ...result.experiments.map((exp) =>
        [
          `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
          writeStatusBar(exp.events),
          MathUtil.round(averageTime(exp.events)).toLocaleString() + " ms",
        ].join(" | "),
      ),
    ].join("\n");
  };

  /**
   * Write an experiment's `README.md`: summary, table of trial events, and
   * the scenario (user prompt and expected operations).
   */
  const writeExperimentIndex = <Model extends ILlmSchema.Model>(
    exp: IAgenticaSelectBenchmarkResult.IExperiment<Model>,
  ): string =>
    [
      `# ${exp.scenario.name}`,
      "## Summary",
      " - Aggregation:",
      ` - Trial: ${exp.events.length}`,
      ` - Success: ${countByType(exp.events, "success")}`,
      ` - Failure: ${countByType(exp.events, "failure")}`,
      ` - Average Time: ${MathUtil.round(averageTime(exp.events)).toLocaleString()} ms`,
      ...writeTokenUsage(exp.usage.aggregate),
      "",
      "## Events",
      " No | Type | Time",
      "---:|:-----|----:",
      ...exp.events.map((e, i) =>
        [
          `[${i + 1}.](./${i + 1}.${e.type}.md)`,
          e.type,
          MathUtil.round(elapsed(e)) + " ms",
        ].join(" | "),
      ),
      "",
      "## Scenario",
      "### User Prompt",
      exp.scenario.text,
      "",
      "### Expected",
      "```json",
      JSON.stringify(
        AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
        null,
        2,
      ),
      "```",
    ].join("\n");

  /**
   * Write the document of a single trial event.
   *
   * Token usage is written for every event type except `"error"`, the
   * selected operations for `"success"` and `"failure"`, and the error
   * payload for `"error"`.
   *
   * @param event Trial event to describe
   * @param index Zero-based position of the event within its experiment
   */
  const writeExperimentEvent = <Model extends ILlmSchema.Model>(
    event: IAgenticaSelectBenchmarkEvent<Model>,
    index: number,
  ): string =>
    [
      `# ${index + 1}. ${event.type}`,
      `## Summary`,
      ` - Name: ${event.scenario.name}`,
      ` - Type: ${event.type}`,
      ` - Time: ${elapsed(event).toLocaleString()} ms`,
      ...(event.type !== "error"
        ? [
            " - Token Usage",
            // Print the numeric total; calling toLocaleString() on the
            // aggregate object itself renders "[object Object]".
            ` - Total: ${event.usage.aggregate.total.toLocaleString()}`,
            ` - Prompt`,
            ` - Total: ${event.usage.aggregate.input.total.toLocaleString()}`,
            ` - Cached: ${event.usage.aggregate.input.cached.toLocaleString()}`,
            ` - Completion:`,
            ` - Total: ${event.usage.aggregate.output.total.toLocaleString()}`,
            ` - Reasoning: ${event.usage.aggregate.output.reasoning.toLocaleString()}`,
            ` - Accepted Prediction: ${event.usage.aggregate.output.accepted_prediction.toLocaleString()}`,
            ` - Rejected Prediction: ${event.usage.aggregate.output.rejected_prediction.toLocaleString()}`,
          ]
        : []),
      "",
      "## Scenario",
      "### User Prompt",
      event.scenario.text,
      "",
      "### Expected",
      "```json",
      JSON.stringify(
        AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
        null,
        2,
      ),
      "```",
      "",
      ...(event.type === "success" || event.type === "failure"
        ? [
            "## Result",
            ...event.selected.map((s) =>
              [
                `### ${s.operation.name}`,
                ` - Controller: \`${s.operation.controller.name}\``,
                ` - Function: \`${s.operation.function.name}\``,
                ` - Reason: ${s.reason}`,
                "",
                ...(s.operation.function.description
                  ? [s.operation.function.description, ""]
                  : []),
              ].join("\n"),
            ),
          ]
        : []),
      ...(event.type === "error"
        ? [
            "## Error",
            "```json",
            // Convert first, stringify second (same order as expectedToJson
            // above): JSON.stringify() on a raw Error yields "{}" because
            // its properties are not enumerable.
            // NOTE(review): assumes errorToJson accepts the raw error value
            // and returns something JSON-serializable - confirm.
            JSON.stringify(
              AgenticaBenchmarkUtil.errorToJson(event.error),
              null,
              2,
            ),
            "```",
            "",
          ]
        : []),
    ].join("\n");
}
1
+ import { AgenticaTokenUsage } from "@agentica/core";
2
+ import { ILlmSchema } from "@samchon/openapi";
3
+
4
+ import { IAgenticaSelectBenchmarkEvent } from "../structures/IAgenticaSelectBenchmarkEvent";
5
+ import { IAgenticaSelectBenchmarkResult } from "../structures/IAgenticaSelectBenchmarkResult";
6
+ import { MathUtil } from "../utils/MathUtil";
7
+ import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
8
+
9
/**
 * Markdown report writer for the LLM function selection benchmark.
 *
 * @internal
 */
export namespace AgenticaSelectBenchmarkReporter {
  /**
   * Minimal shape needed to measure how long a trial took.
   *
   * Only `getTime()` is used on the two timestamps.
   */
  interface ITimed {
    started_at: { getTime(): number };
    completed_at: { getTime(): number };
  }

  /**
   * Render a benchmark result into a set of markdown documents.
   *
   * The returned dictionary contains `./README.md` (overall index), one
   * `./{scenario}/README.md` per experiment, and one
   * `./{scenario}/{no}.{type}.md` per trial event of that experiment.
   *
   * @param result Benchmark result to report
   * @returns Dictionary of relative file path to markdown content
   */
  export const markdown = <Model extends ILlmSchema.Model>(
    result: IAgenticaSelectBenchmarkResult<Model>,
  ): Record<string, string> => {
    const files: Array<[string, string]> = [
      ["./README.md", writeIndex(result)],
      ...result.experiments.flatMap((exp): Array<[string, string]> => [
        [`./${exp.scenario.name}/README.md`, writeExperimentIndex(exp)],
        ...exp.events.map((event, i): [string, string] => [
          `./${exp.scenario.name}/${i + 1}.${event.type}.md`,
          writeExperimentEvent(event, i),
        ]),
      ]),
    ];
    return Object.fromEntries(files);
  };

  /** Elapsed time of a single trial, in milliseconds. */
  const elapsed = (event: ITimed): number =>
    event.completed_at.getTime() - event.started_at.getTime();

  /**
   * Mean elapsed time of the given trials, in milliseconds.
   *
   * Returns 0 for an empty list instead of the `NaN` that `0 / 0` would give.
   */
  const averageTime = (events: ReadonlyArray<ITimed>): number =>
    events.length === 0
      ? 0
      : events.reduce((sum, e) => sum + elapsed(e), 0) / events.length;

  /** Number of events whose `type` equals the given one. */
  const countByType = (
    events: ReadonlyArray<{ type: string }>,
    type: string,
  ): number => events.filter((e) => e.type === type).length;

  /**
   * Ten-cell status bar: one filled cell per full 10% of successful trials.
   *
   * An experiment without events yields an all-empty bar; the unguarded
   * ratio would be `NaN` and `new Array(NaN)` throws a `RangeError`.
   */
  const writeStatusBar = (events: ReadonlyArray<{ type: string }>): string => {
    const ratio: number =
      events.length === 0 ? 0 : countByType(events, "success") / events.length;
    const filled: number = Math.floor(ratio * 10);
    return "■".repeat(filled) + "□".repeat(10 - filled);
  };

  /** Token usage bullet list shared by the overall and experiment indexes. */
  const writeTokenUsage = (
    aggregate: AgenticaTokenUsage.IComponent,
  ): string[] => [
    ` - Token Usage`,
    ` - Total: ${aggregate.total.toLocaleString()}`,
    ` - Input`,
    ` - Total: ${aggregate.input.total.toLocaleString()}`,
    ` - Cached: ${aggregate.input.cached.toLocaleString()}`,
    ` - Output:`,
    ` - Total: ${aggregate.output.total.toLocaleString()}`,
    ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
    ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
    ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
  ];

  /**
   * Write the top-level `README.md`: overall summary plus one table row
   * (link, status bar, average time) per experiment.
   */
  const writeIndex = <Model extends ILlmSchema.Model>(
    result: IAgenticaSelectBenchmarkResult<Model>,
  ): string => {
    const events: IAgenticaSelectBenchmarkEvent<Model>[] =
      result.experiments.flatMap((exp) => exp.events);
    return [
      "# LLM Function Selection Benchmark",
      "## Summary",
      ` - Aggregation:`,
      ` - Scenarios: #${result.experiments.length.toLocaleString()}`,
      ` - Trial: ${events.length}`,
      ` - Success: ${countByType(events, "success")}`,
      ` - Failure: ${countByType(events, "failure")}`,
      ` - Average Time: ${MathUtil.round(averageTime(events)).toLocaleString()} ms`,
      ...writeTokenUsage(result.usage.aggregate),
      "",
      "## Experiments",
      " Name | Status | Time/Avg ",
      ":-----|:-------|----------:",
      ...result.experiments.map((exp) =>
        [
          `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
          writeStatusBar(exp.events),
          MathUtil.round(averageTime(exp.events)).toLocaleString() + " ms",
        ].join(" | "),
      ),
    ].join("\n");
  };

  /**
   * Write an experiment's `README.md`: summary, table of trial events, and
   * the scenario (user prompt and expected operations).
   */
  const writeExperimentIndex = <Model extends ILlmSchema.Model>(
    exp: IAgenticaSelectBenchmarkResult.IExperiment<Model>,
  ): string =>
    [
      `# ${exp.scenario.name}`,
      "## Summary",
      " - Aggregation:",
      ` - Trial: ${exp.events.length}`,
      ` - Success: ${countByType(exp.events, "success")}`,
      ` - Failure: ${countByType(exp.events, "failure")}`,
      ` - Average Time: ${MathUtil.round(averageTime(exp.events)).toLocaleString()} ms`,
      ...writeTokenUsage(exp.usage.aggregate),
      "",
      "## Events",
      " No | Type | Time",
      "---:|:-----|----:",
      ...exp.events.map((e, i) =>
        [
          `[${i + 1}.](./${i + 1}.${e.type}.md)`,
          e.type,
          MathUtil.round(elapsed(e)) + " ms",
        ].join(" | "),
      ),
      "",
      "## Scenario",
      "### User Prompt",
      exp.scenario.text,
      "",
      "### Expected",
      "```json",
      JSON.stringify(
        AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
        null,
        2,
      ),
      "```",
    ].join("\n");

  /**
   * Write the document of a single trial event.
   *
   * Token usage is written for every event type except `"error"`, the
   * selected operations for `"success"` and `"failure"`, and the error
   * payload for `"error"`.
   *
   * @param event Trial event to describe
   * @param index Zero-based position of the event within its experiment
   */
  const writeExperimentEvent = <Model extends ILlmSchema.Model>(
    event: IAgenticaSelectBenchmarkEvent<Model>,
    index: number,
  ): string =>
    [
      `# ${index + 1}. ${event.type}`,
      `## Summary`,
      ` - Name: ${event.scenario.name}`,
      ` - Type: ${event.type}`,
      ` - Time: ${elapsed(event).toLocaleString()} ms`,
      ...(event.type !== "error"
        ? [
            " - Token Usage",
            // Print the numeric total; calling toLocaleString() on the
            // aggregate object itself renders "[object Object]".
            ` - Total: ${event.usage.aggregate.total.toLocaleString()}`,
            ` - Prompt`,
            ` - Total: ${event.usage.aggregate.input.total.toLocaleString()}`,
            ` - Cached: ${event.usage.aggregate.input.cached.toLocaleString()}`,
            ` - Completion:`,
            ` - Total: ${event.usage.aggregate.output.total.toLocaleString()}`,
            ` - Reasoning: ${event.usage.aggregate.output.reasoning.toLocaleString()}`,
            ` - Accepted Prediction: ${event.usage.aggregate.output.accepted_prediction.toLocaleString()}`,
            ` - Rejected Prediction: ${event.usage.aggregate.output.rejected_prediction.toLocaleString()}`,
          ]
        : []),
      "",
      "## Scenario",
      "### User Prompt",
      event.scenario.text,
      "",
      "### Expected",
      "```json",
      JSON.stringify(
        AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
        null,
        2,
      ),
      "```",
      "",
      ...(event.type === "success" || event.type === "failure"
        ? [
            "## Result",
            ...event.selected.map((s) =>
              [
                `### ${s.operation.name}`,
                ` - Controller: \`${s.operation.controller.name}\``,
                ` - Function: \`${s.operation.function.name}\``,
                ` - Reason: ${s.reason}`,
                "",
                ...(s.operation.function.description
                  ? [s.operation.function.description, ""]
                  : []),
              ].join("\n"),
            ),
          ]
        : []),
      ...(event.type === "error"
        ? [
            "## Error",
            "```json",
            // Convert first, stringify second (same order as expectedToJson
            // above): JSON.stringify() on a raw Error yields "{}" because
            // its properties are not enumerable.
            // NOTE(review): assumes errorToJson accepts the raw error value
            // and returns something JSON-serializable - confirm.
            JSON.stringify(
              AgenticaBenchmarkUtil.errorToJson(event.error),
              null,
              2,
            ),
            "```",
            "",
          ]
        : []),
    ].join("\n");
}
@@ -1,68 +1,68 @@
1
- import { AgenticaOperation } from "@agentica/core";
2
- import { ILlmSchema } from "@samchon/openapi";
3
-
4
/**
 * Determinant of the operation(s) a benchmark scenario expects.
 *
 * `IAgenticaBenchmarkExpected` is a discriminated union of four shapes,
 * told apart by their `type` property: {@link IAgenticaBenchmarkExpected.IAllOf},
 * {@link IAgenticaBenchmarkExpected.IAnyOf},
 * {@link IAgenticaBenchmarkExpected.IArray} and
 * {@link IAgenticaBenchmarkExpected.IStandalone}.
 *
 * @author Samchon
 */
export type IAgenticaBenchmarkExpected<Model extends ILlmSchema.Model> =
  | IAgenticaBenchmarkExpected.IAllOf<Model>
  | IAgenticaBenchmarkExpected.IAnyOf<Model>
  | IAgenticaBenchmarkExpected.IArray<Model>
  | IAgenticaBenchmarkExpected.IStandalone<Model>;
export namespace IAgenticaBenchmarkExpected {
  /**
   * Every element must be satisfied; their order does not matter.
   *
   * Elements may be any expectation except another `IAllOf`.
   */
  export interface IAllOf<Model extends ILlmSchema.Model> {
    type: "allOf";
    allOf: Exclude<IAgenticaBenchmarkExpected<Model>, IAllOf<Model>>[];
  }

  /**
   * At least one element must be satisfied.
   *
   * Elements may be any expectation except another `IAnyOf`.
   */
  export interface IAnyOf<Model extends ILlmSchema.Model> {
    type: "anyOf";
    anyOf: Exclude<IAgenticaBenchmarkExpected<Model>, IAnyOf<Model>>[];
  }

  /**
   * Every element must be satisfied, in the listed order.
   *
   * Elements may be any expectation except another `IArray`.
   */
  export interface IArray<Model extends ILlmSchema.Model> {
    type: "array";
    items: Exclude<IAgenticaBenchmarkExpected<Model>, IArray<Model>>[];
  }

  /**
   * A single expected operation.
   */
  export interface IStandalone<Model extends ILlmSchema.Model> {
    type: "standalone";
    operation: AgenticaOperation<Model>;
  }
}
1
+ import { AgenticaOperation } from "@agentica/core";
2
+ import { ILlmSchema } from "@samchon/openapi";
3
+
4
/**
 * Determinant of the operation(s) a benchmark scenario expects.
 *
 * `IAgenticaBenchmarkExpected` is a discriminated union of four shapes,
 * told apart by their `type` property: {@link IAgenticaBenchmarkExpected.IAllOf},
 * {@link IAgenticaBenchmarkExpected.IAnyOf},
 * {@link IAgenticaBenchmarkExpected.IArray} and
 * {@link IAgenticaBenchmarkExpected.IStandalone}.
 *
 * @author Samchon
 */
export type IAgenticaBenchmarkExpected<Model extends ILlmSchema.Model> =
  | IAgenticaBenchmarkExpected.IAllOf<Model>
  | IAgenticaBenchmarkExpected.IAnyOf<Model>
  | IAgenticaBenchmarkExpected.IArray<Model>
  | IAgenticaBenchmarkExpected.IStandalone<Model>;
export namespace IAgenticaBenchmarkExpected {
  /**
   * Every element must be satisfied; their order does not matter.
   *
   * Elements may be any expectation except another `IAllOf`.
   */
  export interface IAllOf<Model extends ILlmSchema.Model> {
    type: "allOf";
    allOf: Exclude<IAgenticaBenchmarkExpected<Model>, IAllOf<Model>>[];
  }

  /**
   * At least one element must be satisfied.
   *
   * Elements may be any expectation except another `IAnyOf`.
   */
  export interface IAnyOf<Model extends ILlmSchema.Model> {
    type: "anyOf";
    anyOf: Exclude<IAgenticaBenchmarkExpected<Model>, IAnyOf<Model>>[];
  }

  /**
   * Every element must be satisfied, in the listed order.
   *
   * Elements may be any expectation except another `IArray`.
   */
  export interface IArray<Model extends ILlmSchema.Model> {
    type: "array";
    items: Exclude<IAgenticaBenchmarkExpected<Model>, IArray<Model>>[];
  }

  /**
   * A single expected operation.
   */
  export interface IStandalone<Model extends ILlmSchema.Model> {
    type: "standalone";
    operation: AgenticaOperation<Model>;
  }
}