@agentica/benchmark 0.44.0-dev.20260313-2 → 0.44.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,281 +1,281 @@
1
- import type { MicroAgentica } from "@agentica/core";
2
- import type { tags } from "typia";
3
-
4
- /**
5
- * @module
6
- * This file contains the implementation of the MicroAgenticaCallBenchmark class.
7
- *
8
- * @author Wrtn Technologies
9
- */
10
- import { AgenticaTokenUsage } from "@agentica/core";
11
- import { Semaphore } from "tstl";
12
-
13
- import type { IAgenticaCallBenchmarkEvent } from "./structures/IAgenticaCallBenchmarkEvent";
14
- import type { IAgenticaCallBenchmarkResult } from "./structures/IAgenticaCallBenchmarkResult";
15
- import type { IAgenticaCallBenchmarkScenario } from "./structures/IAgenticaCallBenchmarkScenario";
16
-
17
- import { AgenticaBenchmarkPredicator } from "./internal/AgenticaBenchmarkPredicator";
18
- import { AgenticaCallBenchmarkReporter } from "./internal/AgenticaCallBenchmarkReporter";
19
-
20
- /**
21
- * LLM function calling selection benchmark.
22
- *
23
- * `MicroAgenticaCallBenchmark` is a class for the benchmark of the
24
- * LLM (Large Language Model) function calling part. It utilizes both
25
- * `selector` and `caller` agents and tests whether the expected
26
- * {@link IAgenticaOperation operations} are properly selected and
27
- * called from the given
28
- * {@link IAgenticaCallBenchmarkScenario scenarios}.
29
- *
30
- * Note that, this `MicroAgenticaCallBenchmark` consumes a lot of time and
31
- * LLM token costs because it needs the whole process of the
32
- * {@link MicroAgentica} class with a lot of repetitions. If you don't want
33
- * such a heavy benchmark, consider using
34
- * {@link AgenticaSelectBenchmark} instead. In my experience,
35
- * {@link MicroAgentica} does not fail at function calling, so the function
36
- * selection benchmark is much more economical.
37
- *
38
- * @author Samchon
39
- */
40
- export class MicroAgenticaCallBenchmark {
41
- private agent_: MicroAgentica;
42
- private scenarios_: IAgenticaCallBenchmarkScenario[];
43
- private config_: MicroAgenticaCallBenchmark.IConfig;
44
- private result_: IAgenticaCallBenchmarkResult | null;
45
-
46
- /**
47
- * Initializer Constructor.
48
- *
49
- * @param props Properties of the selection benchmark
50
- */
51
- public constructor(props: MicroAgenticaCallBenchmark.IProps) {
52
- this.agent_ = props.agent;
53
- this.scenarios_ = props.scenarios.slice();
54
- this.config_ = {
55
- repeat: props.config?.repeat ?? 10,
56
- simultaneous: props.config?.simultaneous ?? 10,
57
- consent: props.config?.consent ?? 3,
58
- };
59
- this.result_ = null;
60
- }
61
-
62
- /**
63
- * Execute the benchmark.
64
- *
65
- * Execute the benchmark of the LLM function calling, and returns
66
- * the result of the benchmark.
67
- *
68
- * If you wanna see progress of the benchmark, you can pass a callback
69
- * function as the argument of the `listener`. The callback function
70
- * would be called whenever a benchmark event occurs.
71
- *
72
- * Also, you can publish a markdown format report by calling
73
- * the {@link report} function after the benchmark execution.
74
- *
75
- * @param listener Callback function listening the benchmark events
76
- * @returns Results of the function calling benchmark
77
- */
78
- public async execute(
79
- listener?: (event: IAgenticaCallBenchmarkEvent) => void,
80
- ): Promise<IAgenticaCallBenchmarkResult> {
81
- const started_at: Date = new Date();
82
- const semaphore: Semaphore = new Semaphore(this.config_.simultaneous);
83
- const task = this.scenarios_.map(async (scenario) => {
84
- const events: IAgenticaCallBenchmarkEvent[]
85
- = await Promise.all(
86
- Array.from({ length: this.config_.repeat }).map(async () => {
87
- await semaphore.acquire();
88
- const e: IAgenticaCallBenchmarkEvent
89
- = await this.step(scenario);
90
- await semaphore.release();
91
-
92
- if (listener !== undefined) {
93
- listener(e);
94
- }
95
-
96
- return e;
97
- }),
98
- );
99
- return {
100
- scenario,
101
- events,
102
- usage: events
103
- .filter(e => e.type !== "error")
104
- .map(e => e.usage)
105
- .reduce((acc, cur) => AgenticaTokenUsage.plus(acc, cur), AgenticaTokenUsage.zero()),
106
- };
107
- });
108
- const experiments: IAgenticaCallBenchmarkResult.IExperiment[]
109
- = await Promise.all(task);
110
- return (this.result_ = {
111
- experiments,
112
- started_at,
113
- completed_at: new Date(),
114
- usage: experiments
115
- .map(p => p.usage)
116
- .reduce((acc, cur) => AgenticaTokenUsage.plus(acc, cur), AgenticaTokenUsage.zero()),
117
- });
118
- }
119
-
120
- /**
121
- * Report the benchmark result as markdown files.
122
- *
123
- * Report the benchmark result {@link execute}d by
124
- * `MicroAgenticaCallBenchmark` as markdown files, and returns a dictionary
125
- * object of the markdown reporting files. The key of the dictionary
126
- * would be file name, and the value would be the markdown content.
127
- *
128
- * For reference, the markdown files are composed like below:
129
- *
130
- * - `./README.md`
131
- * - `./scenario-1/README.md`
132
- * - `./scenario-1/1.success.md`
133
- * - `./scenario-1/2.failure.md`
134
- * - `./scenario-1/3.error.md`
135
- *
136
- * @returns Dictionary of markdown files.
137
- */
138
- public report(): Record<string, string> {
139
- if (this.result_ === null) {
140
- throw new Error("Benchmark is not executed yet.");
141
- }
142
- return AgenticaCallBenchmarkReporter.markdown(this.result_);
143
- }
144
-
145
- private async step(
146
- scenario: IAgenticaCallBenchmarkScenario,
147
- ): Promise<IAgenticaCallBenchmarkEvent> {
148
- const agent: MicroAgentica = this.agent_.clone();
149
- const started_at: Date = new Date();
150
- const success = () =>
151
- AgenticaBenchmarkPredicator.success({
152
- expected: scenario.expected,
153
- operations: agent
154
- .getHistories()
155
- .filter(p => p.type === "execute")
156
- .map(p => p.operation),
157
- strict: false,
158
- });
159
- const out = (): IAgenticaCallBenchmarkEvent => {
160
- const select = AgenticaBenchmarkPredicator.success({
161
- expected: scenario.expected,
162
- operations: agent
163
- .getHistories()
164
- .filter(p => p.type === "execute")
165
- .map(p => p.operation),
166
- strict: false,
167
- });
168
- const call = success();
169
- return {
170
- type: (call ? "success" : "failure") as "failure",
171
- scenario,
172
- select,
173
- call,
174
- prompts: agent.getHistories(),
175
- usage: agent.getTokenUsage(),
176
- started_at,
177
- completed_at: new Date(),
178
- } satisfies IAgenticaCallBenchmarkEvent.IFailure;
179
- };
180
-
181
- try {
182
- await agent.conversate(scenario.text);
183
- if (success()) {
184
- return out();
185
- }
186
-
187
- for (let i: number = 0; i < this.config_.consent; ++i) {
188
- const next: string | null
189
- = await AgenticaBenchmarkPredicator.isNext(agent);
190
- if (next === null) {
191
- break;
192
- }
193
-
194
- await agent.conversate(next);
195
- if (success()) {
196
- return out();
197
- }
198
- }
199
- return out();
200
- }
201
- catch (error) {
202
- return {
203
- type: "error",
204
- scenario,
205
- prompts: agent.getHistories(),
206
- usage: agent.getTokenUsage(),
207
- error,
208
- started_at,
209
- completed_at: new Date(),
210
- };
211
- }
212
- }
213
- }
214
- export namespace MicroAgenticaCallBenchmark {
215
- /**
216
- * Properties of the {@link MicroAgenticaCallBenchmark} constructor.
217
- */
218
- export interface IProps {
219
- /**
220
- * AI agent instance.
221
- */
222
- agent: MicroAgentica;
223
-
224
- /**
225
- * List of scenarios what you expect.
226
- */
227
- scenarios: IAgenticaCallBenchmarkScenario[];
228
-
229
- /**
230
- * Configuration for the benchmark.
231
- */
232
- config?: Partial<IConfig>;
233
- }
234
-
235
- /**
236
- * Configuration for the benchmark.
237
- *
238
- * `MicroAgenticaCallBenchmark.IConfig` is a data structure which
239
- * represents a configuration for the benchmark, especially the
240
- * capacity information of the benchmark execution.
241
- */
242
- export interface IConfig {
243
- /**
244
- * Repeat count.
245
- *
246
- * The number of repeating count for the benchmark execution
247
- * for each scenario.
248
- *
249
- * @default 10
250
- */
251
- repeat: number & tags.Type<"uint32"> & tags.Minimum<1>;
252
-
253
- /**
254
- * Simultaneous count.
255
- *
256
- * The number of simultaneous count for the parallel benchmark
257
- * execution.
258
- *
259
- * If you configure this property greater than `1`, the benchmark
260
- * for each scenario would be executed in parallel in the given
261
- * count.
262
- *
263
- * @default 10
264
- */
265
- simultaneous: number & tags.Type<"uint32"> & tags.Minimum<1>;
266
-
267
- /**
268
- * Number of consents.
269
- *
270
- * AI agent sometimes asks user to consent to the function
271
- * calling, and perform it at the next step.
272
- *
273
- * This property represents the number of consents to allow.
274
- * If the number of consents from the AI agent exceeds the
275
- * configured value, the benchmark will fail.
276
- *
277
- * @default 3
278
- */
279
- consent: number;
280
- }
281
- }
1
+ import type { MicroAgentica } from "@agentica/core";
2
+ import type { tags } from "typia";
3
+
4
+ /**
5
+ * @module
6
+ * This file contains the implementation of the MicroAgenticaCallBenchmark class.
7
+ *
8
+ * @author Wrtn Technologies
9
+ */
10
+ import { AgenticaTokenUsage } from "@agentica/core";
11
+ import { Semaphore } from "tstl";
12
+
13
+ import type { IAgenticaCallBenchmarkEvent } from "./structures/IAgenticaCallBenchmarkEvent";
14
+ import type { IAgenticaCallBenchmarkResult } from "./structures/IAgenticaCallBenchmarkResult";
15
+ import type { IAgenticaCallBenchmarkScenario } from "./structures/IAgenticaCallBenchmarkScenario";
16
+
17
+ import { AgenticaBenchmarkPredicator } from "./internal/AgenticaBenchmarkPredicator";
18
+ import { AgenticaCallBenchmarkReporter } from "./internal/AgenticaCallBenchmarkReporter";
19
+
20
+ /**
21
+ * LLM function calling selection benchmark.
22
+ *
23
+ * `MicroAgenticaCallBenchmark` is a class for the benchmark of the
24
+ * LLM (Large Language Model) function calling part. It utilizes both
25
+ * `selector` and `caller` agents and tests whether the expected
26
+ * {@link IAgenticaOperation operations} are properly selected and
27
+ * called from the given
28
+ * {@link IAgenticaCallBenchmarkScenario scenarios}.
29
+ *
30
+ * Note that, this `MicroAgenticaCallBenchmark` consumes a lot of time and
31
+ * LLM token costs because it needs the whole process of the
32
+ * {@link MicroAgentica} class with a lot of repetitions. If you don't want
33
+ * such a heavy benchmark, consider using
34
+ * {@link AgenticaSelectBenchmark} instead. In my experience,
35
+ * {@link MicroAgentica} does not fail at function calling, so the function
36
+ * selection benchmark is much more economical.
37
+ *
38
+ * @author Samchon
39
+ */
40
+ export class MicroAgenticaCallBenchmark {
41
+ private agent_: MicroAgentica;
42
+ private scenarios_: IAgenticaCallBenchmarkScenario[];
43
+ private config_: MicroAgenticaCallBenchmark.IConfig;
44
+ private result_: IAgenticaCallBenchmarkResult | null;
45
+
46
+ /**
47
+ * Initializer Constructor.
48
+ *
49
+ * @param props Properties of the selection benchmark
50
+ */
51
+ public constructor(props: MicroAgenticaCallBenchmark.IProps) {
52
+ this.agent_ = props.agent;
53
+ this.scenarios_ = props.scenarios.slice();
54
+ this.config_ = {
55
+ repeat: props.config?.repeat ?? 10,
56
+ simultaneous: props.config?.simultaneous ?? 10,
57
+ consent: props.config?.consent ?? 3,
58
+ };
59
+ this.result_ = null;
60
+ }
61
+
62
+ /**
63
+ * Execute the benchmark.
64
+ *
65
+ * Execute the benchmark of the LLM function calling, and returns
66
+ * the result of the benchmark.
67
+ *
68
+ * If you wanna see progress of the benchmark, you can pass a callback
69
+ * function as the argument of the `listener`. The callback function
70
+ * would be called whenever a benchmark event occurs.
71
+ *
72
+ * Also, you can publish a markdown format report by calling
73
+ * the {@link report} function after the benchmark execution.
74
+ *
75
+ * @param listener Callback function listening the benchmark events
76
+ * @returns Results of the function calling benchmark
77
+ */
78
+ public async execute(
79
+ listener?: (event: IAgenticaCallBenchmarkEvent) => void,
80
+ ): Promise<IAgenticaCallBenchmarkResult> {
81
+ const started_at: Date = new Date();
82
+ const semaphore: Semaphore = new Semaphore(this.config_.simultaneous);
83
+ const task = this.scenarios_.map(async (scenario) => {
84
+ const events: IAgenticaCallBenchmarkEvent[]
85
+ = await Promise.all(
86
+ Array.from({ length: this.config_.repeat }).map(async () => {
87
+ await semaphore.acquire();
88
+ const e: IAgenticaCallBenchmarkEvent
89
+ = await this.step(scenario);
90
+ await semaphore.release();
91
+
92
+ if (listener !== undefined) {
93
+ listener(e);
94
+ }
95
+
96
+ return e;
97
+ }),
98
+ );
99
+ return {
100
+ scenario,
101
+ events,
102
+ usage: events
103
+ .filter(e => e.type !== "error")
104
+ .map(e => e.usage)
105
+ .reduce((acc, cur) => AgenticaTokenUsage.plus(acc, cur), AgenticaTokenUsage.zero()),
106
+ };
107
+ });
108
+ const experiments: IAgenticaCallBenchmarkResult.IExperiment[]
109
+ = await Promise.all(task);
110
+ return (this.result_ = {
111
+ experiments,
112
+ started_at,
113
+ completed_at: new Date(),
114
+ usage: experiments
115
+ .map(p => p.usage)
116
+ .reduce((acc, cur) => AgenticaTokenUsage.plus(acc, cur), AgenticaTokenUsage.zero()),
117
+ });
118
+ }
119
+
120
+ /**
121
+ * Report the benchmark result as markdown files.
122
+ *
123
+ * Report the benchmark result {@link execute}d by
124
+ * `MicroAgenticaCallBenchmark` as markdown files, and returns a dictionary
125
+ * object of the markdown reporting files. The key of the dictionary
126
+ * would be file name, and the value would be the markdown content.
127
+ *
128
+ * For reference, the markdown files are composed like below:
129
+ *
130
+ * - `./README.md`
131
+ * - `./scenario-1/README.md`
132
+ * - `./scenario-1/1.success.md`
133
+ * - `./scenario-1/2.failure.md`
134
+ * - `./scenario-1/3.error.md`
135
+ *
136
+ * @returns Dictionary of markdown files.
137
+ */
138
+ public report(): Record<string, string> {
139
+ if (this.result_ === null) {
140
+ throw new Error("Benchmark is not executed yet.");
141
+ }
142
+ return AgenticaCallBenchmarkReporter.markdown(this.result_);
143
+ }
144
+
145
+ private async step(
146
+ scenario: IAgenticaCallBenchmarkScenario,
147
+ ): Promise<IAgenticaCallBenchmarkEvent> {
148
+ const agent: MicroAgentica = this.agent_.clone();
149
+ const started_at: Date = new Date();
150
+ const success = () =>
151
+ AgenticaBenchmarkPredicator.success({
152
+ expected: scenario.expected,
153
+ operations: agent
154
+ .getHistories()
155
+ .filter(p => p.type === "execute")
156
+ .map(p => p.operation),
157
+ strict: false,
158
+ });
159
+ const out = (): IAgenticaCallBenchmarkEvent => {
160
+ const select = AgenticaBenchmarkPredicator.success({
161
+ expected: scenario.expected,
162
+ operations: agent
163
+ .getHistories()
164
+ .filter(p => p.type === "execute")
165
+ .map(p => p.operation),
166
+ strict: false,
167
+ });
168
+ const call = success();
169
+ return {
170
+ type: (call ? "success" : "failure") as "failure",
171
+ scenario,
172
+ select,
173
+ call,
174
+ prompts: agent.getHistories(),
175
+ usage: agent.getTokenUsage(),
176
+ started_at,
177
+ completed_at: new Date(),
178
+ } satisfies IAgenticaCallBenchmarkEvent.IFailure;
179
+ };
180
+
181
+ try {
182
+ await agent.conversate(scenario.text);
183
+ if (success()) {
184
+ return out();
185
+ }
186
+
187
+ for (let i: number = 0; i < this.config_.consent; ++i) {
188
+ const next: string | null
189
+ = await AgenticaBenchmarkPredicator.isNext(agent);
190
+ if (next === null) {
191
+ break;
192
+ }
193
+
194
+ await agent.conversate(next);
195
+ if (success()) {
196
+ return out();
197
+ }
198
+ }
199
+ return out();
200
+ }
201
+ catch (error) {
202
+ return {
203
+ type: "error",
204
+ scenario,
205
+ prompts: agent.getHistories(),
206
+ usage: agent.getTokenUsage(),
207
+ error,
208
+ started_at,
209
+ completed_at: new Date(),
210
+ };
211
+ }
212
+ }
213
+ }
214
+ export namespace MicroAgenticaCallBenchmark {
215
+ /**
216
+ * Properties of the {@link MicroAgenticaCallBenchmark} constructor.
217
+ */
218
+ export interface IProps {
219
+ /**
220
+ * AI agent instance.
221
+ */
222
+ agent: MicroAgentica;
223
+
224
+ /**
225
+ * List of scenarios what you expect.
226
+ */
227
+ scenarios: IAgenticaCallBenchmarkScenario[];
228
+
229
+ /**
230
+ * Configuration for the benchmark.
231
+ */
232
+ config?: Partial<IConfig>;
233
+ }
234
+
235
+ /**
236
+ * Configuration for the benchmark.
237
+ *
238
+ * `MicroAgenticaCallBenchmark.IConfig` is a data structure which
239
+ * represents a configuration for the benchmark, especially the
240
+ * capacity information of the benchmark execution.
241
+ */
242
+ export interface IConfig {
243
+ /**
244
+ * Repeat count.
245
+ *
246
+ * The number of repeating count for the benchmark execution
247
+ * for each scenario.
248
+ *
249
+ * @default 10
250
+ */
251
+ repeat: number & tags.Type<"uint32"> & tags.Minimum<1>;
252
+
253
+ /**
254
+ * Simultaneous count.
255
+ *
256
+ * The number of simultaneous count for the parallel benchmark
257
+ * execution.
258
+ *
259
+ * If you configure this property greater than `1`, the benchmark
260
+ * for each scenario would be executed in parallel in the given
261
+ * count.
262
+ *
263
+ * @default 10
264
+ */
265
+ simultaneous: number & tags.Type<"uint32"> & tags.Minimum<1>;
266
+
267
+ /**
268
+ * Number of consents.
269
+ *
270
+ * AI agent sometimes asks user to consent to the function
271
+ * calling, and perform it at the next step.
272
+ *
273
+ * This property represents the number of consents to allow.
274
+ * If the number of consents from the AI agent exceeds the
275
+ * configured value, the benchmark will fail.
276
+ *
277
+ * @default 3
278
+ */
279
+ consent: number;
280
+ }
281
+ }
package/src/index.ts CHANGED
@@ -1,3 +1,3 @@
1
- export * from "./AgenticaCallBenchmark";
2
- export * from "./AgenticaSelectBenchmark";
3
- export * from "./MicroAgenticaCallBenchmark";
1
+ export * from "./AgenticaCallBenchmark";
2
+ export * from "./AgenticaSelectBenchmark";
3
+ export * from "./MicroAgenticaCallBenchmark";