@agentica/benchmark 0.12.1 → 0.12.2-dev.20250314
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +326 -326
- package/package.json +2 -2
- package/src/AgenticaCallBenchmark.ts +263 -263
- package/src/AgenticaSelectBenchmark.ts +248 -248
- package/src/index.ts +3 -3
- package/src/internal/AgenticaBenchmarkPredicator.ts +220 -220
- package/src/internal/AgenticaBenchmarkUtil.ts +44 -44
- package/src/internal/AgenticaCallBenchmarkReporter.ts +193 -193
- package/src/internal/AgenticaPromptReporter.ts +46 -46
- package/src/internal/AgenticaSelectBenchmarkReporter.ts +215 -215
- package/src/structures/IAgenticaBenchmarkExpected.ts +68 -68
- package/src/structures/IAgenticaCallBenchmarkEvent.ts +113 -113
- package/src/structures/IAgenticaCallBenchmarkResult.ts +70 -70
- package/src/structures/IAgenticaCallBenchmarkScenario.ts +43 -43
- package/src/structures/IAgenticaSelectBenchmarkEvent.ts +120 -120
- package/src/structures/IAgenticaSelectBenchmarkResult.ts +72 -72
- package/src/structures/IAgenticaSelectBenchmarkScenario.ts +43 -43
- package/src/utils/MathUtil.ts +3 -3
package/README.md
CHANGED
|
@@ -1,327 +1,327 @@
|
|
|
1
|
-
# `@agentica/benchmark`
|
|
2
|
-

|
|
3
|
-
|
|
4
|
-
[](https://github.com/wrtnlabs/agentica/blob/master/LICENSE)
|
|
5
|
-
[](https://www.npmjs.com/package/@agentica/benchmark)
|
|
6
|
-
[](https://www.npmjs.com/package/@agentica/benchmark)
|
|
7
|
-
[](https://github.com/wrtnlabs/agentica/actions?query=workflow%3Abuild)
|
|
8
|
-
|
|
9
|
-
Benchmark program of `Agentica`.
|
|
10
|
-
|
|
11
|
-
`Agentica` is the simplest Agentic AI library specialized in **LLM Function Calling**, and `@agentica/benchmark` is the benchmark tool for that library. It supports two quantitative benchmark tools, `AgenticaSelectBenchmark` and `AgenticaCallBenchmark`, which measure the quality of function selecting and function calling.
|
|
12
|
-
|
|
13
|
-
Here is an example report generated by `@agentica/benchmark`, measuring the function calling quality of the "Shopping Mall" scenario. The benchmark scenario measured below is exactly the same as the one in the recorded video, and you can see that every function call succeeded without any error.
|
|
14
|
-
|
|
15
|
-
> https://github.com/user-attachments/assets/01604b53-aca4-41cb-91aa-3faf63549ea6
|
|
16
|
-
>
|
|
17
|
-
> Benchmark of Shopping Mall Scenario
|
|
18
|
-
>
|
|
19
|
-
> - [Benchmark Report](https://github.com/wrtnlabs/agentica/tree/main/test/examples/benchmarks/select)
|
|
20
|
-
> - Swagger Document: https://shopping-be.wrtn.ai/editor
|
|
21
|
-
> - Repository: https://github.com/wrtnlabs/shopping-backend
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
## How to use
|
|
27
|
-
### Setup
|
|
28
|
-
```bash
|
|
29
|
-
npm install @agentica/core @agentica/benchmark @samchon/openapi typia
|
|
30
|
-
npx typia setup
|
|
31
|
-
```
|
|
32
|
-
|
|
33
|
-
Install `@agentica/benchmark` with its dependent libraries.
|
|
34
|
-
|
|
35
|
-
Note that, you have to install not only `@agentica/core` or `@agentica/benchmark` libraries, but also [`@samchon/openapi`](https://github.com/samchon/openapi) and [`typia`](https://github.com/samchon/typia) too.
|
|
36
|
-
|
|
37
|
-
`@samchon/openapi` is an OpenAPI specification library which can convert Swagger/OpenAPI document to LLM function calling schema. And `typia` is a transformer (compiler) library which can compose LLM function calling schema from a TypeScript class type.
|
|
38
|
-
|
|
39
|
-
By the way, as `typia` is a transformer library analyzing TypeScript source code in the compilation level, it needs additional setup command `npx typia setup`.
|
|
40
|
-
|
|
41
|
-
### Function selecting Benchmark
|
|
42
|
-
```typescript
|
|
43
|
-
import { AgenticaSelectBenchmark } from "@agentica/benchmark";
|
|
44
|
-
import { Agentica, IAgenticaOperation } from "@agentica/core";
|
|
45
|
-
import { HttpLlm, IHttpConnection, OpenApi } from "@samchon/openapi";
|
|
46
|
-
import fs from "fs";
|
|
47
|
-
import OpenAI from "openai";
|
|
48
|
-
import path from "path";
|
|
49
|
-
|
|
50
|
-
const main = async (): Promise<void> => {
|
|
51
|
-
// CREATE AI AGENT
|
|
52
|
-
const agent: Agentica<"chatgpt"> = new Agentica({
|
|
53
|
-
model: "chatgpt",
|
|
54
|
-
vendor: {
|
|
55
|
-
api: new OpenAI({
|
|
56
|
-
apiKey: "YOUR_OPENAI_API_KEY",
|
|
57
|
-
}),
|
|
58
|
-
model: "gpt-4o-mini",
|
|
59
|
-
},
|
|
60
|
-
controllers: [
|
|
61
|
-
{
|
|
62
|
-
protocol: "http",
|
|
63
|
-
name: "shopping",
|
|
64
|
-
application: HttpLlm.application({
|
|
65
|
-
model: "chatgpt",
|
|
66
|
-
document: await fetch(
|
|
67
|
-
"https://shopping-be.wrtn.ai/editor/swagger.json",
|
|
68
|
-
).then((res) => res.json()),
|
|
69
|
-
}),
|
|
70
|
-
connection: {
|
|
71
|
-
host: "https://shopping-be.wrtn.ai",
|
|
72
|
-
},
|
|
73
|
-
},
|
|
74
|
-
],
|
|
75
|
-
});
|
|
76
|
-
|
|
77
|
-
// DO BENCHMARK
|
|
78
|
-
const find = (method: OpenApi.Method, path: string): IAgenticaOperation => {
|
|
79
|
-
const found = agent
|
|
80
|
-
.getOperations()
|
|
81
|
-
.find(
|
|
82
|
-
(op) =>
|
|
83
|
-
op.protocol === "http" &&
|
|
84
|
-
op.function.method === method &&
|
|
85
|
-
op.function.path === path,
|
|
86
|
-
);
|
|
87
|
-
if (!found) throw new Error(`Operation not found: ${method} ${path}`);
|
|
88
|
-
return found;
|
|
89
|
-
};
|
|
90
|
-
const benchmark: AgenticaSelectBenchmark<"chatgpt"> =
|
|
91
|
-
new AgenticaSelectBenchmark({
|
|
92
|
-
agent,
|
|
93
|
-
config: {
|
|
94
|
-
repeat: 4,
|
|
95
|
-
},
|
|
96
|
-
scenarios: [
|
|
97
|
-
{
|
|
98
|
-
name: "order",
|
|
99
|
-
text: [
|
|
100
|
-
"I wanna see every sales in the shopping mall",
|
|
101
|
-
"",
|
|
102
|
-
"And then show me the detailed information about the Macbook.",
|
|
103
|
-
"",
|
|
104
|
-
"After that, select the most expensive stock",
|
|
105
|
-
"from the Macbook, and put it into my shopping cart.",
|
|
106
|
-
"And take the shopping cart to the order.",
|
|
107
|
-
"",
|
|
108
|
-
"At last, I'll publish it by cash payment, and my address is",
|
|
109
|
-
"",
|
|
110
|
-
" - country: South Korea",
|
|
111
|
-
" - city/province: Seoul",
|
|
112
|
-
" - department: Wrtn Apartment",
|
|
113
|
-
" - Possession: 101-1411",
|
|
114
|
-
].join("\n"),
|
|
115
|
-
expected: {
|
|
116
|
-
type: "array",
|
|
117
|
-
items: [
|
|
118
|
-
{
|
|
119
|
-
type: "standalone",
|
|
120
|
-
operation: find("patch", "/shoppings/customers/sales"),
|
|
121
|
-
},
|
|
122
|
-
{
|
|
123
|
-
type: "standalone",
|
|
124
|
-
operation: find("get", "/shoppings/customers/sales/{id}"),
|
|
125
|
-
},
|
|
126
|
-
{
|
|
127
|
-
type: "anyOf",
|
|
128
|
-
anyOf: [
|
|
129
|
-
{
|
|
130
|
-
type: "standalone",
|
|
131
|
-
operation: find("post", "/shoppings/customers/orders"),
|
|
132
|
-
},
|
|
133
|
-
{
|
|
134
|
-
type: "standalone",
|
|
135
|
-
operation: find("post", "/shoppings/customers/orders/direct"),
|
|
136
|
-
},
|
|
137
|
-
],
|
|
138
|
-
},
|
|
139
|
-
{
|
|
140
|
-
type: "standalone",
|
|
141
|
-
operation: find(
|
|
142
|
-
"post",
|
|
143
|
-
"/shoppings/customers/orders/{orderId}/publish",
|
|
144
|
-
),
|
|
145
|
-
},
|
|
146
|
-
],
|
|
147
|
-
},
|
|
148
|
-
},
|
|
149
|
-
],
|
|
150
|
-
});
|
|
151
|
-
await benchmark.execute();
|
|
152
|
-
|
|
153
|
-
// REPORT
|
|
154
|
-
const docs: Record<string, string> = benchmark.report();
|
|
155
|
-
const root: string = `docs/benchmarks/call`;
|
|
156
|
-
|
|
157
|
-
await rmdir(root);
|
|
158
|
-
for (const [key, value] of Object.entries(docs)) {
|
|
159
|
-
await mkdir(path.join(root, key.split("/").slice(0, -1).join("/")));
|
|
160
|
-
await fs.promises.writeFile(path.join(root, key), value, "utf8");
|
|
161
|
-
}
|
|
162
|
-
};
|
|
163
|
-
```
|
|
164
|
-
|
|
165
|
-
> Benchmark of Shopping Mall Scenario
|
|
166
|
-
>
|
|
167
|
-
> - [Benchmark Report](https://github.com/wrtnlabs/agentica/tree/main/test/examples/benchmarks/select)
|
|
168
|
-
> - Swagger Document: https://shopping-be.wrtn.ai/editor
|
|
169
|
-
> - Repository: https://github.com/wrtnlabs/shopping-backend
|
|
170
|
-
|
|
171
|
-
Benchmark function selecting quality.
|
|
172
|
-
|
|
173
|
-
You can measure a benchmark that AI agent can select proper functions from the user's conversations by the LLM (Large Language Model) function calling feature. Create `Agentica` and `AgenticaSelectBenchmark` typed instances, and execute the benchmark with your specific scenarios like above.
|
|
174
|
-
|
|
175
|
-
If you have written sufficient and proper descriptions for the functions (or API operations) and DTO schema types, the success ratio of `AgenticaSelectBenchmark` will be higher. Otherwise, if the descriptions are insufficient or of poor quality, you may get an alarming benchmark report. If you want to see what an `AgenticaSelectBenchmark` report looks like, please click the [benchmark report link](https://github.com/wrtnlabs/agentica/tree/main/test/examples/benchmarks/select) above.
|
|
176
|
-
|
|
177
|
-
### Function Calling Benchmark
|
|
178
|
-
> Benchmark of Shopping Mall Scenario
|
|
179
|
-
>
|
|
180
|
-
> - [Benchmark Report](https://github.com/wrtnlabs/agentica/tree/main/test/examples/benchmarks/call)
|
|
181
|
-
> - Swagger Document: https://shopping-be.wrtn.ai/editor
|
|
182
|
-
> - Repository: https://github.com/wrtnlabs/shopping-backend
|
|
183
|
-
|
|
184
|
-
```typescript
|
|
185
|
-
import { AgenticaCallBenchmark } from "@agentica/benchmark";
|
|
186
|
-
import { Agentica, IAgenticaOperation } from "@agentica/core";
|
|
187
|
-
import { HttpLlm, IHttpConnection, OpenApi } from "@samchon/openapi";
|
|
188
|
-
import fs from "fs";
|
|
189
|
-
import OpenAI from "openai";
|
|
190
|
-
import path from "path";
|
|
191
|
-
|
|
192
|
-
const main = async (): Promise<void> => {
|
|
193
|
-
// CREATE AI AGENT
|
|
194
|
-
const agent: Agentica<"chatgpt"> = new Agentica({
|
|
195
|
-
model: "chatgpt",
|
|
196
|
-
vendor: {
|
|
197
|
-
api: new OpenAI({
|
|
198
|
-
apiKey: "YOUR_OPENAI_API_KEY",
|
|
199
|
-
}),
|
|
200
|
-
model: "gpt-4o-mini",
|
|
201
|
-
},
|
|
202
|
-
controllers: [
|
|
203
|
-
{
|
|
204
|
-
protocol: "http",
|
|
205
|
-
name: "shopping",
|
|
206
|
-
application: HttpLlm.application({
|
|
207
|
-
model: "chatgpt",
|
|
208
|
-
document: await fetch(
|
|
209
|
-
"https://shopping-be.wrtn.ai/editor/swagger.json",
|
|
210
|
-
).then((res) => res.json()),
|
|
211
|
-
}),
|
|
212
|
-
connection: {
|
|
213
|
-
host: "https://shopping-be.wrtn.ai",
|
|
214
|
-
},
|
|
215
|
-
},
|
|
216
|
-
],
|
|
217
|
-
});
|
|
218
|
-
|
|
219
|
-
// DO BENCHMARK
|
|
220
|
-
const find = (method: OpenApi.Method, path: string): IAgenticaOperation => {
|
|
221
|
-
const found = agent
|
|
222
|
-
.getOperations()
|
|
223
|
-
.find(
|
|
224
|
-
(op) =>
|
|
225
|
-
op.protocol === "http" &&
|
|
226
|
-
op.function.method === method &&
|
|
227
|
-
op.function.path === path,
|
|
228
|
-
);
|
|
229
|
-
if (!found) throw new Error(`Operation not found: ${method} ${path}`);
|
|
230
|
-
return found;
|
|
231
|
-
};
|
|
232
|
-
const benchmark: AgenticaCallBenchmark<"chatgpt"> =
|
|
233
|
-
new AgenticaCallBenchmark({
|
|
234
|
-
agent,
|
|
235
|
-
config: {
|
|
236
|
-
repeat: 4,
|
|
237
|
-
},
|
|
238
|
-
scenarios: [
|
|
239
|
-
{
|
|
240
|
-
name: "order",
|
|
241
|
-
text: [
|
|
242
|
-
"I wanna see every sales in the shopping mall",
|
|
243
|
-
"",
|
|
244
|
-
"And then show me the detailed information about the Macbook.",
|
|
245
|
-
"",
|
|
246
|
-
"After that, select the most expensive stock",
|
|
247
|
-
"from the Macbook, and put it into my shopping cart.",
|
|
248
|
-
"And take the shopping cart to the order.",
|
|
249
|
-
"",
|
|
250
|
-
"At last, I'll publish it by cash payment, and my address is",
|
|
251
|
-
"",
|
|
252
|
-
" - country: South Korea",
|
|
253
|
-
" - city/province: Seoul",
|
|
254
|
-
" - department: Wrtn Apartment",
|
|
255
|
-
" - Possession: 101-1411",
|
|
256
|
-
].join("\n"),
|
|
257
|
-
expected: {
|
|
258
|
-
type: "array",
|
|
259
|
-
items: [
|
|
260
|
-
{
|
|
261
|
-
type: "standalone",
|
|
262
|
-
operation: find("patch", "/shoppings/customers/sales"),
|
|
263
|
-
},
|
|
264
|
-
{
|
|
265
|
-
type: "standalone",
|
|
266
|
-
operation: find("get", "/shoppings/customers/sales/{id}"),
|
|
267
|
-
},
|
|
268
|
-
{
|
|
269
|
-
type: "anyOf",
|
|
270
|
-
anyOf: [
|
|
271
|
-
{
|
|
272
|
-
type: "standalone",
|
|
273
|
-
operation: find("post", "/shoppings/customers/orders"),
|
|
274
|
-
},
|
|
275
|
-
{
|
|
276
|
-
type: "standalone",
|
|
277
|
-
operation: find("post", "/shoppings/customers/orders/direct"),
|
|
278
|
-
},
|
|
279
|
-
],
|
|
280
|
-
},
|
|
281
|
-
{
|
|
282
|
-
type: "standalone",
|
|
283
|
-
operation: find(
|
|
284
|
-
"post",
|
|
285
|
-
"/shoppings/customers/orders/{orderId}/publish",
|
|
286
|
-
),
|
|
287
|
-
},
|
|
288
|
-
],
|
|
289
|
-
},
|
|
290
|
-
},
|
|
291
|
-
],
|
|
292
|
-
});
|
|
293
|
-
await benchmark.execute();
|
|
294
|
-
|
|
295
|
-
// REPORT
|
|
296
|
-
const docs: Record<string, string> = benchmark.report();
|
|
297
|
-
const root: string = `docs/benchmarks/call`;
|
|
298
|
-
|
|
299
|
-
await rmdir(root);
|
|
300
|
-
for (const [key, value] of Object.entries(docs)) {
|
|
301
|
-
await mkdir(path.join(root, key.split("/").slice(0, -1).join("/")));
|
|
302
|
-
await fs.promises.writeFile(path.join(root, key), value, "utf8");
|
|
303
|
-
}
|
|
304
|
-
};
|
|
305
|
-
```
|
|
306
|
-
|
|
307
|
-
Benchmark function calling quality.
|
|
308
|
-
|
|
309
|
-
You can measure a benchmark that AI agent can call proper functions from the user's conversations by the LLM (Large Language Model) function calling feature. Create `Agentica` and `AgenticaCallBenchmark` typed instances, and execute the benchmark with your specific scenarios like above.
|
|
310
|
-
|
|
311
|
-
If you have written sufficient and proper descriptions for the functions (or API operations) and DTO schema types, the success ratio of `AgenticaCallBenchmark` will be higher. Otherwise, if the descriptions are insufficient or of poor quality, you may get an alarming benchmark report. If you want to see what an `AgenticaCallBenchmark` report looks like, please click the [benchmark report link](https://github.com/wrtnlabs/agentica/tree/main/test/examples/benchmarks/call) above.
|
|
312
|
-
|
|
313
|
-
For reference, `@agentica/core` tends not to fail at argument filling in LLM function calling. So it is fine to stop at the [`AgenticaSelectBenchmark`](#function-selecting-benchmark) stage, because function calling with argument filling consumes much more time and many more LLM tokens.
|
|
314
|
-
|
|
315
|
-
Also, the current `AgenticaCallBenchmark` has been designed to perform multiple LLM function calls from just one conversation text. However, a multiple LLM function calling benchmark actually requires the [#Multi Turn Benchmark](#multi-turn-benchmark) feature of the [#Roadmap](#roadmap). Therefore, [`AgenticaSelectBenchmark`](#function-selecting-benchmark) is more economical than `AgenticaCallBenchmark`.
|
|
316
|
-
|
|
317
|
-
> In the above "Shopping Mall" scenario, function selecting benchmark ends in 4 seconds, but function calling benchmark consumes about 3 minutes.
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
## Roadmap
|
|
323
|
-
### Multi Turn Benchmark
|
|
324
|
-
Will support multi-turn benchmark for [#Function Calling Benchmark](#function-calling-benchmark).
|
|
325
|
-
|
|
326
|
-
### Estimator Agent
|
|
1
|
+
# `@agentica/benchmark`
|
|
2
|
+

|
|
3
|
+
|
|
4
|
+
[](https://github.com/wrtnlabs/agentica/blob/master/LICENSE)
|
|
5
|
+
[](https://www.npmjs.com/package/@agentica/benchmark)
|
|
6
|
+
[](https://www.npmjs.com/package/@agentica/benchmark)
|
|
7
|
+
[](https://github.com/wrtnlabs/agentica/actions?query=workflow%3Abuild)
|
|
8
|
+
|
|
9
|
+
Benchmark program of `Agentica`.
|
|
10
|
+
|
|
11
|
+
`Agentica` is the simplest Agentic AI library specialized in **LLM Function Calling**, and `@agentica/benchmark` is the benchmark tool for that library. It supports two quantitative benchmark tools, `AgenticaSelectBenchmark` and `AgenticaCallBenchmark`, which measure the quality of function selecting and function calling.
|
|
12
|
+
|
|
13
|
+
Here is an example report generated by `@agentica/benchmark`, measuring the function calling quality of the "Shopping Mall" scenario. The benchmark scenario measured below is exactly the same as the one in the recorded video, and you can see that every function call succeeded without any error.
|
|
14
|
+
|
|
15
|
+
> https://github.com/user-attachments/assets/01604b53-aca4-41cb-91aa-3faf63549ea6
|
|
16
|
+
>
|
|
17
|
+
> Benchmark of Shopping Mall Scenario
|
|
18
|
+
>
|
|
19
|
+
> - [Benchmark Report](https://github.com/wrtnlabs/agentica/tree/main/test/examples/benchmarks/select)
|
|
20
|
+
> - Swagger Document: https://shopping-be.wrtn.ai/editor
|
|
21
|
+
> - Repository: https://github.com/wrtnlabs/shopping-backend
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
## How to use
|
|
27
|
+
### Setup
|
|
28
|
+
```bash
|
|
29
|
+
npm install @agentica/core @agentica/benchmark @samchon/openapi typia
|
|
30
|
+
npx typia setup
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Install `@agentica/benchmark` with its dependent libraries.
|
|
34
|
+
|
|
35
|
+
Note that, you have to install not only `@agentica/core` or `@agentica/benchmark` libraries, but also [`@samchon/openapi`](https://github.com/samchon/openapi) and [`typia`](https://github.com/samchon/typia) too.
|
|
36
|
+
|
|
37
|
+
`@samchon/openapi` is an OpenAPI specification library which can convert Swagger/OpenAPI document to LLM function calling schema. And `typia` is a transformer (compiler) library which can compose LLM function calling schema from a TypeScript class type.
|
|
38
|
+
|
|
39
|
+
By the way, as `typia` is a transformer library analyzing TypeScript source code in the compilation level, it needs additional setup command `npx typia setup`.
|
|
40
|
+
|
|
41
|
+
### Function selecting Benchmark
|
|
42
|
+
```typescript
|
|
43
|
+
import { AgenticaSelectBenchmark } from "@agentica/benchmark";
|
|
44
|
+
import { Agentica, IAgenticaOperation } from "@agentica/core";
|
|
45
|
+
import { HttpLlm, IHttpConnection, OpenApi } from "@samchon/openapi";
|
|
46
|
+
import fs from "fs";
|
|
47
|
+
import OpenAI from "openai";
|
|
48
|
+
import path from "path";
|
|
49
|
+
|
|
50
|
+
const main = async (): Promise<void> => {
|
|
51
|
+
// CREATE AI AGENT
|
|
52
|
+
const agent: Agentica<"chatgpt"> = new Agentica({
|
|
53
|
+
model: "chatgpt",
|
|
54
|
+
vendor: {
|
|
55
|
+
api: new OpenAI({
|
|
56
|
+
apiKey: "YOUR_OPENAI_API_KEY",
|
|
57
|
+
}),
|
|
58
|
+
model: "gpt-4o-mini",
|
|
59
|
+
},
|
|
60
|
+
controllers: [
|
|
61
|
+
{
|
|
62
|
+
protocol: "http",
|
|
63
|
+
name: "shopping",
|
|
64
|
+
application: HttpLlm.application({
|
|
65
|
+
model: "chatgpt",
|
|
66
|
+
document: await fetch(
|
|
67
|
+
"https://shopping-be.wrtn.ai/editor/swagger.json",
|
|
68
|
+
).then((res) => res.json()),
|
|
69
|
+
}),
|
|
70
|
+
connection: {
|
|
71
|
+
host: "https://shopping-be.wrtn.ai",
|
|
72
|
+
},
|
|
73
|
+
},
|
|
74
|
+
],
|
|
75
|
+
});
|
|
76
|
+
|
|
77
|
+
// DO BENCHMARK
|
|
78
|
+
const find = (method: OpenApi.Method, path: string): IAgenticaOperation => {
|
|
79
|
+
const found = agent
|
|
80
|
+
.getOperations()
|
|
81
|
+
.find(
|
|
82
|
+
(op) =>
|
|
83
|
+
op.protocol === "http" &&
|
|
84
|
+
op.function.method === method &&
|
|
85
|
+
op.function.path === path,
|
|
86
|
+
);
|
|
87
|
+
if (!found) throw new Error(`Operation not found: ${method} ${path}`);
|
|
88
|
+
return found;
|
|
89
|
+
};
|
|
90
|
+
const benchmark: AgenticaSelectBenchmark<"chatgpt"> =
|
|
91
|
+
new AgenticaSelectBenchmark({
|
|
92
|
+
agent,
|
|
93
|
+
config: {
|
|
94
|
+
repeat: 4,
|
|
95
|
+
},
|
|
96
|
+
scenarios: [
|
|
97
|
+
{
|
|
98
|
+
name: "order",
|
|
99
|
+
text: [
|
|
100
|
+
"I wanna see every sales in the shopping mall",
|
|
101
|
+
"",
|
|
102
|
+
"And then show me the detailed information about the Macbook.",
|
|
103
|
+
"",
|
|
104
|
+
"After that, select the most expensive stock",
|
|
105
|
+
"from the Macbook, and put it into my shopping cart.",
|
|
106
|
+
"And take the shopping cart to the order.",
|
|
107
|
+
"",
|
|
108
|
+
"At last, I'll publish it by cash payment, and my address is",
|
|
109
|
+
"",
|
|
110
|
+
" - country: South Korea",
|
|
111
|
+
" - city/province: Seoul",
|
|
112
|
+
" - department: Wrtn Apartment",
|
|
113
|
+
" - Possession: 101-1411",
|
|
114
|
+
].join("\n"),
|
|
115
|
+
expected: {
|
|
116
|
+
type: "array",
|
|
117
|
+
items: [
|
|
118
|
+
{
|
|
119
|
+
type: "standalone",
|
|
120
|
+
operation: find("patch", "/shoppings/customers/sales"),
|
|
121
|
+
},
|
|
122
|
+
{
|
|
123
|
+
type: "standalone",
|
|
124
|
+
operation: find("get", "/shoppings/customers/sales/{id}"),
|
|
125
|
+
},
|
|
126
|
+
{
|
|
127
|
+
type: "anyOf",
|
|
128
|
+
anyOf: [
|
|
129
|
+
{
|
|
130
|
+
type: "standalone",
|
|
131
|
+
operation: find("post", "/shoppings/customers/orders"),
|
|
132
|
+
},
|
|
133
|
+
{
|
|
134
|
+
type: "standalone",
|
|
135
|
+
operation: find("post", "/shoppings/customers/orders/direct"),
|
|
136
|
+
},
|
|
137
|
+
],
|
|
138
|
+
},
|
|
139
|
+
{
|
|
140
|
+
type: "standalone",
|
|
141
|
+
operation: find(
|
|
142
|
+
"post",
|
|
143
|
+
"/shoppings/customers/orders/{orderId}/publish",
|
|
144
|
+
),
|
|
145
|
+
},
|
|
146
|
+
],
|
|
147
|
+
},
|
|
148
|
+
},
|
|
149
|
+
],
|
|
150
|
+
});
|
|
151
|
+
await benchmark.execute();
|
|
152
|
+
|
|
153
|
+
// REPORT
|
|
154
|
+
const docs: Record<string, string> = benchmark.report();
|
|
155
|
+
const root: string = `docs/benchmarks/call`;
|
|
156
|
+
|
|
157
|
+
await rmdir(root);
|
|
158
|
+
for (const [key, value] of Object.entries(docs)) {
|
|
159
|
+
await mkdir(path.join(root, key.split("/").slice(0, -1).join("/")));
|
|
160
|
+
await fs.promises.writeFile(path.join(root, key), value, "utf8");
|
|
161
|
+
}
|
|
162
|
+
};
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
> Benchmark of Shopping Mall Scenario
|
|
166
|
+
>
|
|
167
|
+
> - [Benchmark Report](https://github.com/wrtnlabs/agentica/tree/main/test/examples/benchmarks/select)
|
|
168
|
+
> - Swagger Document: https://shopping-be.wrtn.ai/editor
|
|
169
|
+
> - Repository: https://github.com/wrtnlabs/shopping-backend
|
|
170
|
+
|
|
171
|
+
Benchmark function selecting quality.
|
|
172
|
+
|
|
173
|
+
You can measure a benchmark that AI agent can select proper functions from the user's conversations by the LLM (Large Language Model) function calling feature. Create `Agentica` and `AgenticaSelectBenchmark` typed instances, and execute the benchmark with your specific scenarios like above.
|
|
174
|
+
|
|
175
|
+
If you have written sufficient and proper descriptions for the functions (or API operations) and DTO schema types, the success ratio of `AgenticaSelectBenchmark` will be higher. Otherwise, if the descriptions are insufficient or of poor quality, you may get an alarming benchmark report. If you want to see what an `AgenticaSelectBenchmark` report looks like, please click the [benchmark report link](https://github.com/wrtnlabs/agentica/tree/main/test/examples/benchmarks/select) above.
|
|
176
|
+
|
|
177
|
+
### Function Calling Benchmark
|
|
178
|
+
> Benchmark of Shopping Mall Scenario
|
|
179
|
+
>
|
|
180
|
+
> - [Benchmark Report](https://github.com/wrtnlabs/agentica/tree/main/test/examples/benchmarks/call)
|
|
181
|
+
> - Swagger Document: https://shopping-be.wrtn.ai/editor
|
|
182
|
+
> - Repository: https://github.com/wrtnlabs/shopping-backend
|
|
183
|
+
|
|
184
|
+
```typescript
|
|
185
|
+
import { AgenticaCallBenchmark } from "@agentica/benchmark";
|
|
186
|
+
import { Agentica, IAgenticaOperation } from "@agentica/core";
|
|
187
|
+
import { HttpLlm, IHttpConnection, OpenApi } from "@samchon/openapi";
|
|
188
|
+
import fs from "fs";
|
|
189
|
+
import OpenAI from "openai";
|
|
190
|
+
import path from "path";
|
|
191
|
+
|
|
192
|
+
const main = async (): Promise<void> => {
|
|
193
|
+
// CREATE AI AGENT
|
|
194
|
+
const agent: Agentica<"chatgpt"> = new Agentica({
|
|
195
|
+
model: "chatgpt",
|
|
196
|
+
vendor: {
|
|
197
|
+
api: new OpenAI({
|
|
198
|
+
apiKey: "YOUR_OPENAI_API_KEY",
|
|
199
|
+
}),
|
|
200
|
+
model: "gpt-4o-mini",
|
|
201
|
+
},
|
|
202
|
+
controllers: [
|
|
203
|
+
{
|
|
204
|
+
protocol: "http",
|
|
205
|
+
name: "shopping",
|
|
206
|
+
application: HttpLlm.application({
|
|
207
|
+
model: "chatgpt",
|
|
208
|
+
document: await fetch(
|
|
209
|
+
"https://shopping-be.wrtn.ai/editor/swagger.json",
|
|
210
|
+
).then((res) => res.json()),
|
|
211
|
+
}),
|
|
212
|
+
connection: {
|
|
213
|
+
host: "https://shopping-be.wrtn.ai",
|
|
214
|
+
},
|
|
215
|
+
},
|
|
216
|
+
],
|
|
217
|
+
});
|
|
218
|
+
|
|
219
|
+
// DO BENCHMARK
|
|
220
|
+
const find = (method: OpenApi.Method, path: string): IAgenticaOperation => {
|
|
221
|
+
const found = agent
|
|
222
|
+
.getOperations()
|
|
223
|
+
.find(
|
|
224
|
+
(op) =>
|
|
225
|
+
op.protocol === "http" &&
|
|
226
|
+
op.function.method === method &&
|
|
227
|
+
op.function.path === path,
|
|
228
|
+
);
|
|
229
|
+
if (!found) throw new Error(`Operation not found: ${method} ${path}`);
|
|
230
|
+
return found;
|
|
231
|
+
};
|
|
232
|
+
const benchmark: AgenticaCallBenchmark<"chatgpt"> =
|
|
233
|
+
new AgenticaCallBenchmark({
|
|
234
|
+
agent,
|
|
235
|
+
config: {
|
|
236
|
+
repeat: 4,
|
|
237
|
+
},
|
|
238
|
+
scenarios: [
|
|
239
|
+
{
|
|
240
|
+
name: "order",
|
|
241
|
+
text: [
|
|
242
|
+
"I wanna see every sales in the shopping mall",
|
|
243
|
+
"",
|
|
244
|
+
"And then show me the detailed information about the Macbook.",
|
|
245
|
+
"",
|
|
246
|
+
"After that, select the most expensive stock",
|
|
247
|
+
"from the Macbook, and put it into my shopping cart.",
|
|
248
|
+
"And take the shopping cart to the order.",
|
|
249
|
+
"",
|
|
250
|
+
"At last, I'll publish it by cash payment, and my address is",
|
|
251
|
+
"",
|
|
252
|
+
" - country: South Korea",
|
|
253
|
+
" - city/province: Seoul",
|
|
254
|
+
" - department: Wrtn Apartment",
|
|
255
|
+
" - Possession: 101-1411",
|
|
256
|
+
].join("\n"),
|
|
257
|
+
expected: {
|
|
258
|
+
type: "array",
|
|
259
|
+
items: [
|
|
260
|
+
{
|
|
261
|
+
type: "standalone",
|
|
262
|
+
operation: find("patch", "/shoppings/customers/sales"),
|
|
263
|
+
},
|
|
264
|
+
{
|
|
265
|
+
type: "standalone",
|
|
266
|
+
operation: find("get", "/shoppings/customers/sales/{id}"),
|
|
267
|
+
},
|
|
268
|
+
{
|
|
269
|
+
type: "anyOf",
|
|
270
|
+
anyOf: [
|
|
271
|
+
{
|
|
272
|
+
type: "standalone",
|
|
273
|
+
operation: find("post", "/shoppings/customers/orders"),
|
|
274
|
+
},
|
|
275
|
+
{
|
|
276
|
+
type: "standalone",
|
|
277
|
+
operation: find("post", "/shoppings/customers/orders/direct"),
|
|
278
|
+
},
|
|
279
|
+
],
|
|
280
|
+
},
|
|
281
|
+
{
|
|
282
|
+
type: "standalone",
|
|
283
|
+
operation: find(
|
|
284
|
+
"post",
|
|
285
|
+
"/shoppings/customers/orders/{orderId}/publish",
|
|
286
|
+
),
|
|
287
|
+
},
|
|
288
|
+
],
|
|
289
|
+
},
|
|
290
|
+
},
|
|
291
|
+
],
|
|
292
|
+
});
|
|
293
|
+
await benchmark.execute();
|
|
294
|
+
|
|
295
|
+
// REPORT
|
|
296
|
+
const docs: Record<string, string> = benchmark.report();
|
|
297
|
+
const root: string = `docs/benchmarks/call`;
|
|
298
|
+
|
|
299
|
+
await rmdir(root);
|
|
300
|
+
for (const [key, value] of Object.entries(docs)) {
|
|
301
|
+
await mkdir(path.join(root, key.split("/").slice(0, -1).join("/")));
|
|
302
|
+
await fs.promises.writeFile(path.join(root, key), value, "utf8");
|
|
303
|
+
}
|
|
304
|
+
};
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
Benchmark function calling quality.
|
|
308
|
+
|
|
309
|
+
You can measure a benchmark that AI agent can call proper functions from the user's conversations by the LLM (Large Language Model) function calling feature. Create `Agentica` and `AgenticaCallBenchmark` typed instances, and execute the benchmark with your specific scenarios like above.
|
|
310
|
+
|
|
311
|
+
If you have written sufficient and proper descriptions for the functions (or API operations) and DTO schema types, the success ratio of `AgenticaCallBenchmark` will be higher. Otherwise, if the descriptions are insufficient or of poor quality, you may get an alarming benchmark report. If you want to see what an `AgenticaCallBenchmark` report looks like, please click the [benchmark report link](https://github.com/wrtnlabs/agentica/tree/main/test/examples/benchmarks/call) above.
|
|
312
|
+
|
|
313
|
+
For reference, `@agentica/core` tends not to fail at argument filling in LLM function calling. So it is fine to stop at the [`AgenticaSelectBenchmark`](#function-selecting-benchmark) stage, because function calling with argument filling consumes much more time and many more LLM tokens.
|
|
314
|
+
|
|
315
|
+
Also, the current `AgenticaCallBenchmark` has been designed to perform multiple LLM function calls from just one conversation text. However, a multiple LLM function calling benchmark actually requires the [#Multi Turn Benchmark](#multi-turn-benchmark) feature of the [#Roadmap](#roadmap). Therefore, [`AgenticaSelectBenchmark`](#function-selecting-benchmark) is more economical than `AgenticaCallBenchmark`.
|
|
316
|
+
|
|
317
|
+
> In the above "Shopping Mall" scenario, function selecting benchmark ends in 4 seconds, but function calling benchmark consumes about 3 minutes.
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
## Roadmap
|
|
323
|
+
### Multi Turn Benchmark
|
|
324
|
+
Will support multi-turn benchmark for [#Function Calling Benchmark](#function-calling-benchmark).
|
|
325
|
+
|
|
326
|
+
### Estimator Agent
|
|
327
327
|
We will create some benchmark features that can analyze conversation context and issue summary reports or provide quantitative evaluations.
|