@agentica/benchmark 0.12.20 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -33
- package/lib/AgenticaCallBenchmark.d.ts +12 -6
- package/lib/AgenticaCallBenchmark.js +24 -18
- package/lib/AgenticaCallBenchmark.js.map +1 -1
- package/lib/AgenticaSelectBenchmark.d.ts +12 -6
- package/lib/AgenticaSelectBenchmark.js +14 -12
- package/lib/AgenticaSelectBenchmark.js.map +1 -1
- package/lib/index.mjs +315 -236
- package/lib/index.mjs.map +1 -1
- package/lib/internal/AgenticaBenchmarkPredicator.d.ts +38 -29
- package/lib/internal/AgenticaBenchmarkPredicator.js +100 -84
- package/lib/internal/AgenticaBenchmarkPredicator.js.map +1 -1
- package/lib/internal/AgenticaBenchmarkUtil.d.ts +21 -6
- package/lib/internal/AgenticaBenchmarkUtil.js +39 -33
- package/lib/internal/AgenticaBenchmarkUtil.js.map +1 -1
- package/lib/internal/AgenticaCallBenchmarkReporter.d.ts +6 -5
- package/lib/internal/AgenticaCallBenchmarkReporter.js +130 -126
- package/lib/internal/AgenticaCallBenchmarkReporter.js.map +1 -1
- package/lib/internal/AgenticaPromptReporter.d.ts +13 -5
- package/lib/internal/AgenticaPromptReporter.js +45 -41
- package/lib/internal/AgenticaPromptReporter.js.map +1 -1
- package/lib/internal/AgenticaSelectBenchmarkReporter.d.ts +3 -1
- package/lib/internal/AgenticaSelectBenchmarkReporter.js +153 -150
- package/lib/internal/AgenticaSelectBenchmarkReporter.js.map +1 -1
- package/lib/structures/IAgenticaBenchmarkExpected.d.ts +8 -2
- package/lib/structures/IAgenticaCallBenchmarkEvent.d.ts +9 -3
- package/lib/structures/IAgenticaCallBenchmarkResult.d.ts +10 -4
- package/lib/structures/IAgenticaCallBenchmarkScenario.d.ts +8 -2
- package/lib/structures/IAgenticaSelectBenchmarkEvent.d.ts +9 -3
- package/lib/structures/IAgenticaSelectBenchmarkResult.d.ts +10 -4
- package/lib/structures/IAgenticaSelectBenchmarkScenario.d.ts +8 -2
- package/lib/utils/MathUtil.d.ts +15 -3
- package/lib/utils/MathUtil.js +15 -4
- package/lib/utils/MathUtil.js.map +1 -1
- package/package.json +12 -10
- package/src/AgenticaCallBenchmark.ts +64 -45
- package/src/AgenticaSelectBenchmark.ts +42 -30
- package/src/internal/AgenticaBenchmarkPredicator.ts +208 -186
- package/src/internal/AgenticaBenchmarkUtil.ts +58 -40
- package/src/internal/AgenticaCallBenchmarkReporter.ts +180 -182
- package/src/internal/AgenticaPromptReporter.ts +46 -33
- package/src/internal/AgenticaSelectBenchmarkReporter.ts +205 -203
- package/src/structures/IAgenticaBenchmarkExpected.ts +9 -2
- package/src/structures/IAgenticaCallBenchmarkEvent.ts +9 -3
- package/src/structures/IAgenticaCallBenchmarkResult.ts +10 -4
- package/src/structures/IAgenticaCallBenchmarkScenario.ts +8 -2
- package/src/structures/IAgenticaSelectBenchmarkEvent.ts +9 -3
- package/src/structures/IAgenticaSelectBenchmarkResult.ts +10 -4
- package/src/structures/IAgenticaSelectBenchmarkScenario.ts +8 -2
- package/src/utils/MathUtil.ts +16 -3
package/README.md
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
# `@agentica/benchmark`
|
|
2
|
+
|
|
2
3
|

|
|
3
4
|
|
|
4
5
|
[](https://github.com/wrtnlabs/agentica/blob/master/LICENSE)
|
|
@@ -20,11 +21,10 @@ Here is an example report generated by `@agentica/benchmark` measuring function
|
|
|
20
21
|
> - Swagger Document: https://shopping-be.wrtn.ai/editor
|
|
21
22
|
> - Repository: https://github.com/wrtnlabs/shopping-backend
|
|
22
23
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
24
|
## How to use
|
|
25
|
+
|
|
27
26
|
### Setup
|
|
27
|
+
|
|
28
28
|
```bash
|
|
29
29
|
npm install @agentica/core @agentica/benchmark @samchon/openapi typia
|
|
30
30
|
npx typia setup
|
|
@@ -32,22 +32,23 @@ npx typia setup
|
|
|
32
32
|
|
|
33
33
|
Install `@agentica/benchmark` with its dependent libraries.
|
|
34
34
|
|
|
35
|
-
Note that, you have to install not only `@agentica/core` or `@agentica/benchmark` libraries, but also [`@samchon/openapi`](https://github.com/samchon/openapi) and [`typia`](https://github.com/samchon/typia) too.
|
|
35
|
+
Note that you have to install not only the `@agentica/core` and `@agentica/benchmark` libraries, but also [`@samchon/openapi`](https://github.com/samchon/openapi) and [`typia`](https://github.com/samchon/typia).
|
|
36
36
|
|
|
37
37
|
`@samchon/openapi` is an OpenAPI specification library which can convert Swagger/OpenAPI document to LLM function calling schema. And `typia` is a transformer (compiler) library which can compose LLM function calling schema from a TypeScript class type.
|
|
38
38
|
|
|
39
39
|
By the way, as `typia` is a transformer library that analyzes TypeScript source code at the compilation level, it needs the additional setup command `npx typia setup`.
|
|
40
40
|
|
|
41
41
|
### Function selecting Benchmark
|
|
42
|
+
|
|
42
43
|
```typescript
|
|
44
|
+
import fs from "node:fs";
|
|
45
|
+
import path from "node:path";
|
|
43
46
|
import { AgenticaSelectBenchmark } from "@agentica/benchmark";
|
|
44
47
|
import { Agentica, IAgenticaOperation } from "@agentica/core";
|
|
45
48
|
import { HttpLlm, IHttpConnection, OpenApi } from "@samchon/openapi";
|
|
46
|
-
import fs from "fs";
|
|
47
49
|
import OpenAI from "openai";
|
|
48
|
-
import path from "path";
|
|
49
50
|
|
|
50
|
-
|
|
51
|
+
async function main(): Promise<void> {
|
|
51
52
|
// CREATE AI AGENT
|
|
52
53
|
const agent: Agentica<"chatgpt"> = new Agentica({
|
|
53
54
|
model: "chatgpt",
|
|
@@ -65,7 +66,7 @@ const main = async (): Promise<void> => {
|
|
|
65
66
|
model: "chatgpt",
|
|
66
67
|
document: await fetch(
|
|
67
68
|
"https://shopping-be.wrtn.ai/editor/swagger.json",
|
|
68
|
-
).then(
|
|
69
|
+
).then(res => res.json()),
|
|
69
70
|
}),
|
|
70
71
|
connection: {
|
|
71
72
|
host: "https://shopping-be.wrtn.ai",
|
|
@@ -79,16 +80,18 @@ const main = async (): Promise<void> => {
|
|
|
79
80
|
const found = agent
|
|
80
81
|
.getOperations()
|
|
81
82
|
.find(
|
|
82
|
-
|
|
83
|
-
op.protocol === "http"
|
|
84
|
-
op.function.method === method
|
|
85
|
-
op.function.path === path,
|
|
83
|
+
op =>
|
|
84
|
+
op.protocol === "http"
|
|
85
|
+
&& op.function.method === method
|
|
86
|
+
&& op.function.path === path,
|
|
86
87
|
);
|
|
87
|
-
if (!found)
|
|
88
|
+
if (!found) {
|
|
89
|
+
throw new Error(`Operation not found: ${method} ${path}`);
|
|
90
|
+
}
|
|
88
91
|
return found;
|
|
89
92
|
};
|
|
90
|
-
const benchmark: AgenticaSelectBenchmark<"chatgpt">
|
|
91
|
-
new AgenticaSelectBenchmark({
|
|
93
|
+
const benchmark: AgenticaSelectBenchmark<"chatgpt">
|
|
94
|
+
= new AgenticaSelectBenchmark({
|
|
92
95
|
agent,
|
|
93
96
|
config: {
|
|
94
97
|
repeat: 4,
|
|
@@ -159,7 +162,7 @@ const main = async (): Promise<void> => {
|
|
|
159
162
|
await mkdir(path.join(root, key.split("/").slice(0, -1).join("/")));
|
|
160
163
|
await fs.promises.writeFile(path.join(root, key), value, "utf8");
|
|
161
164
|
}
|
|
162
|
-
}
|
|
165
|
+
}
|
|
163
166
|
```
|
|
164
167
|
|
|
165
168
|
> Benchmark of Shopping Mall Scenario
|
|
@@ -175,6 +178,7 @@ You can measure a benchmark that AI agent can select proper functions from the u
|
|
|
175
178
|
If you have written sufficient and proper descriptions for the functions (or API operations) and DTO schema types, the success ratio of `AgenticaSelectBenchmark` will be higher. Otherwise, if the descriptions are insufficient or of poor quality, you may get an alarming benchmark report. If you want to see what `AgenticaSelectBenchmark` reports, please click the [benchmark report link](https://github.com/wrtnlabs/agentica/tree/main/test/examples/benchmarks/select) above.
|
|
176
179
|
|
|
177
180
|
### Function Calling Benchmark
|
|
181
|
+
|
|
178
182
|
> Benchmark of Shopping Mall Scenario
|
|
179
183
|
>
|
|
180
184
|
> - [Benchmark Report](https://github.com/wrtnlabs/agentica/tree/main/test/examples/benchmarks/call)
|
|
@@ -182,14 +186,14 @@ If you have written enough and proper descriptions to the functions (or API oper
|
|
|
182
186
|
> - Repository: https://github.com/wrtnlabs/shopping-backend
|
|
183
187
|
|
|
184
188
|
```typescript
|
|
189
|
+
import fs from "node:fs";
|
|
190
|
+
import path from "node:path";
|
|
185
191
|
import { AgenticaCallBenchmark } from "@agentica/benchmark";
|
|
186
192
|
import { Agentica, IAgenticaOperation } from "@agentica/core";
|
|
187
193
|
import { HttpLlm, IHttpConnection, OpenApi } from "@samchon/openapi";
|
|
188
|
-
import fs from "fs";
|
|
189
194
|
import OpenAI from "openai";
|
|
190
|
-
import path from "path";
|
|
191
195
|
|
|
192
|
-
|
|
196
|
+
async function main(): Promise<void> {
|
|
193
197
|
// CREATE AI AGENT
|
|
194
198
|
const agent: Agentica<"chatgpt"> = new Agentica({
|
|
195
199
|
model: "chatgpt",
|
|
@@ -207,7 +211,7 @@ const main = async (): Promise<void> => {
|
|
|
207
211
|
model: "chatgpt",
|
|
208
212
|
document: await fetch(
|
|
209
213
|
"https://shopping-be.wrtn.ai/editor/swagger.json",
|
|
210
|
-
).then(
|
|
214
|
+
).then(res => res.json()),
|
|
211
215
|
}),
|
|
212
216
|
connection: {
|
|
213
217
|
host: "https://shopping-be.wrtn.ai",
|
|
@@ -221,16 +225,18 @@ const main = async (): Promise<void> => {
|
|
|
221
225
|
const found = agent
|
|
222
226
|
.getOperations()
|
|
223
227
|
.find(
|
|
224
|
-
|
|
225
|
-
op.protocol === "http"
|
|
226
|
-
op.function.method === method
|
|
227
|
-
op.function.path === path,
|
|
228
|
+
op =>
|
|
229
|
+
op.protocol === "http"
|
|
230
|
+
&& op.function.method === method
|
|
231
|
+
&& op.function.path === path,
|
|
228
232
|
);
|
|
229
|
-
if (!found)
|
|
233
|
+
if (!found) {
|
|
234
|
+
throw new Error(`Operation not found: ${method} ${path}`);
|
|
235
|
+
}
|
|
230
236
|
return found;
|
|
231
237
|
};
|
|
232
|
-
const benchmark: AgenticaSelectBenchmark<"chatgpt">
|
|
233
|
-
new AgenticaSelectBenchmark({
|
|
238
|
+
const benchmark: AgenticaSelectBenchmark<"chatgpt">
|
|
239
|
+
= new AgenticaSelectBenchmark({
|
|
234
240
|
agent,
|
|
235
241
|
config: {
|
|
236
242
|
repeat: 4,
|
|
@@ -301,7 +307,7 @@ const main = async (): Promise<void> => {
|
|
|
301
307
|
await mkdir(path.join(root, key.split("/").slice(0, -1).join("/")));
|
|
302
308
|
await fs.promises.writeFile(path.join(root, key), value, "utf8");
|
|
303
309
|
}
|
|
304
|
-
}
|
|
310
|
+
}
|
|
305
311
|
```
|
|
306
312
|
|
|
307
313
|
Benchmark function calling quality.
|
|
@@ -310,18 +316,18 @@ You can measure a benchmark that AI agent can call proper functions from the use
|
|
|
310
316
|
|
|
311
317
|
If you have written sufficient and proper descriptions for the functions (or API operations) and DTO schema types, the success ratio of `AgenticaCallBenchmark` will be higher. Otherwise, if the descriptions are insufficient or of poor quality, you may get an alarming benchmark report. If you want to see what `AgenticaCallBenchmark` reports, please click the [benchmark report link](https://github.com/wrtnlabs/agentica/tree/main/test/examples/benchmarks/call) above.
|
|
312
318
|
|
|
313
|
-
For reference, `@agentica/core` tends not to failed on arguments filling of LLM function calling. So it is okay that ending up with [`AgenticaSelectBenchmark`](#function-selecting-benchmark) stage, because function calling with arguments filling spends much more times and LLM tokens.
|
|
319
|
+
For reference, `@agentica/core` tends not to fail at argument filling in LLM function calling. So it is okay to stop at the [`AgenticaSelectBenchmark`](#function-selecting-benchmark) stage, because function calling with argument filling consumes much more time and many more LLM tokens.
|
|
314
320
|
|
|
315
321
|
Also, the current `AgenticaCallBenchmark` has been designed to perform multiple LLM function calls from just one conversation text. However, a multiple LLM function calling benchmark actually requires the [#Multi Turn Benchmark](#multi-turn-benchmark) feature of the [#Roadmap](#roadmap). Therefore, [`AgenticaSelectBenchmark`](#function-selecting-benchmark) is more economical than `AgenticaCallBenchmark`.
|
|
316
322
|
|
|
317
323
|
> In the above "Shopping Mall" scenario, function selecting benchmark ends in 4 seconds, but function calling benchmark consumes about 3 minutes.
|
|
318
324
|
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
325
|
## Roadmap
|
|
326
|
+
|
|
323
327
|
### Multi Turn Benchmark
|
|
328
|
+
|
|
324
329
|
Will support multi-turn benchmark for [#Function Calling Benchmark](#function-calling-benchmark).
|
|
325
330
|
|
|
326
331
|
### Estimator Agent
|
|
327
|
-
|
|
332
|
+
|
|
333
|
+
We will create some benchmark features that can analyze conversation context and issue summary reports or provide quantitative evaluations.
|
|
@@ -1,9 +1,15 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
1
|
+
/**
|
|
2
|
+
* @module
|
|
3
|
+
* This file contains the implementation of the AgenticaCallBenchmark class.
|
|
4
|
+
*
|
|
5
|
+
* @author Wrtn Technologies
|
|
6
|
+
*/
|
|
7
|
+
import type { Agentica } from "@agentica/core";
|
|
8
|
+
import type { ILlmSchema } from "@samchon/openapi";
|
|
9
|
+
import type { tags } from "typia";
|
|
10
|
+
import type { IAgenticaCallBenchmarkEvent } from "./structures/IAgenticaCallBenchmarkEvent";
|
|
11
|
+
import type { IAgenticaCallBenchmarkResult } from "./structures/IAgenticaCallBenchmarkResult";
|
|
12
|
+
import type { IAgenticaCallBenchmarkScenario } from "./structures/IAgenticaCallBenchmarkScenario";
|
|
7
13
|
/**
|
|
8
14
|
* LLM function calling selection benchmark.
|
|
9
15
|
*
|
|
@@ -71,31 +71,33 @@ class AgenticaCallBenchmark {
|
|
|
71
71
|
return __awaiter(this, void 0, void 0, function* () {
|
|
72
72
|
const started_at = new Date();
|
|
73
73
|
const semaphore = new tstl_1.Semaphore(this.config_.simultaneous);
|
|
74
|
-
const
|
|
75
|
-
const events = yield Promise.all(
|
|
74
|
+
const task = this.scenarios_.map((scenario) => __awaiter(this, void 0, void 0, function* () {
|
|
75
|
+
const events = yield Promise.all(Array.from({ length: this.config_.repeat }).map(() => __awaiter(this, void 0, void 0, function* () {
|
|
76
76
|
yield semaphore.acquire();
|
|
77
77
|
const e = yield this.step(scenario);
|
|
78
78
|
yield semaphore.release();
|
|
79
|
-
if (listener !== undefined)
|
|
79
|
+
if (listener !== undefined) {
|
|
80
80
|
listener(e);
|
|
81
|
+
}
|
|
81
82
|
return e;
|
|
82
83
|
})));
|
|
83
84
|
return {
|
|
84
85
|
scenario,
|
|
85
86
|
events,
|
|
86
87
|
usage: events
|
|
87
|
-
.filter(
|
|
88
|
-
.map(
|
|
89
|
-
.reduce(core_1.AgenticaTokenUsage.plus, core_1.AgenticaTokenUsage.zero()),
|
|
88
|
+
.filter(e => e.type !== "error")
|
|
89
|
+
.map(e => e.usage)
|
|
90
|
+
.reduce((acc, cur) => core_1.AgenticaTokenUsage.plus(acc, cur), core_1.AgenticaTokenUsage.zero()),
|
|
90
91
|
};
|
|
91
|
-
}))
|
|
92
|
+
}));
|
|
93
|
+
const experiments = yield Promise.all(task);
|
|
92
94
|
return (this.result_ = {
|
|
93
95
|
experiments,
|
|
94
96
|
started_at,
|
|
95
97
|
completed_at: new Date(),
|
|
96
98
|
usage: experiments
|
|
97
|
-
.map(
|
|
98
|
-
.reduce(core_1.AgenticaTokenUsage.plus, core_1.AgenticaTokenUsage.zero()),
|
|
99
|
+
.map(p => p.usage)
|
|
100
|
+
.reduce((acc, cur) => core_1.AgenticaTokenUsage.plus(acc, cur), core_1.AgenticaTokenUsage.zero()),
|
|
99
101
|
});
|
|
100
102
|
});
|
|
101
103
|
}
|
|
@@ -118,8 +120,9 @@ class AgenticaCallBenchmark {
|
|
|
118
120
|
* @returns Dictionary of markdown files.
|
|
119
121
|
*/
|
|
120
122
|
report() {
|
|
121
|
-
if (this.result_ === null)
|
|
123
|
+
if (this.result_ === null) {
|
|
122
124
|
throw new Error("Benchmark is not executed yet.");
|
|
125
|
+
}
|
|
123
126
|
return AgenticaCallBenchmarkReporter_1.AgenticaCallBenchmarkReporter.markdown(this.result_);
|
|
124
127
|
}
|
|
125
128
|
step(scenario) {
|
|
@@ -130,8 +133,8 @@ class AgenticaCallBenchmark {
|
|
|
130
133
|
expected: scenario.expected,
|
|
131
134
|
operations: agent
|
|
132
135
|
.getPromptHistories()
|
|
133
|
-
.filter(
|
|
134
|
-
.map(
|
|
136
|
+
.filter(p => p.type === "execute")
|
|
137
|
+
.map(p => p.operation),
|
|
135
138
|
strict: false,
|
|
136
139
|
});
|
|
137
140
|
const out = () => {
|
|
@@ -139,10 +142,10 @@ class AgenticaCallBenchmark {
|
|
|
139
142
|
expected: scenario.expected,
|
|
140
143
|
operations: agent
|
|
141
144
|
.getPromptHistories()
|
|
142
|
-
.filter(
|
|
143
|
-
.map(
|
|
145
|
+
.filter(p => p.type === "select")
|
|
146
|
+
.map(p => p.selections)
|
|
144
147
|
.flat()
|
|
145
|
-
.map(
|
|
148
|
+
.map(p => p.operation),
|
|
146
149
|
strict: false,
|
|
147
150
|
});
|
|
148
151
|
const call = success();
|
|
@@ -159,15 +162,18 @@ class AgenticaCallBenchmark {
|
|
|
159
162
|
};
|
|
160
163
|
try {
|
|
161
164
|
yield agent.conversate(scenario.text);
|
|
162
|
-
if (success())
|
|
165
|
+
if (success()) {
|
|
163
166
|
return out();
|
|
167
|
+
}
|
|
164
168
|
for (let i = 0; i < this.config_.consent; ++i) {
|
|
165
169
|
const next = yield AgenticaBenchmarkPredicator_1.AgenticaBenchmarkPredicator.isNext(agent);
|
|
166
|
-
if (next === null)
|
|
170
|
+
if (next === null) {
|
|
167
171
|
break;
|
|
172
|
+
}
|
|
168
173
|
yield agent.conversate(next);
|
|
169
|
-
if (success())
|
|
174
|
+
if (success()) {
|
|
170
175
|
return out();
|
|
176
|
+
}
|
|
171
177
|
}
|
|
172
178
|
return out();
|
|
173
179
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"AgenticaCallBenchmark.js","sourceRoot":"","sources":["../src/AgenticaCallBenchmark.ts"],"names":[],"mappings":";;;;;;;;;;;;
|
|
1
|
+
{"version":3,"file":"AgenticaCallBenchmark.js","sourceRoot":"","sources":["../src/AgenticaCallBenchmark.ts"],"names":[],"mappings":";;;;;;;;;;;;AAaA,yCAAoD;AACpD,+BAAiC;AACjC,wFAAqF;AACrF,4FAAyF;AAEzF;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAa,qBAAqB;IAMhC;;;;OAIG;IACH,YAAmB,KAA0C;;QAC3D,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC,KAAK,CAAC;QAC1B,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC,SAAS,CAAC,KAAK,EAAE,CAAC;QAC1C,IAAI,CAAC,OAAO,GAAG;YACb,MAAM,EAAE,MAAA,MAAA,KAAK,CAAC,MAAM,0CAAE,MAAM,mCAAI,EAAE;YAClC,YAAY,EAAE,MAAA,MAAA,KAAK,CAAC,MAAM,0CAAE,YAAY,mCAAI,EAAE;YAC9C,OAAO,EAAE,MAAA,MAAA,KAAK,CAAC,MAAM,0CAAE,OAAO,mCAAI,CAAC;SACpC,CAAC;QACF,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;IACtB,CAAC;IAED;;;;;;;;;;;;;;;OAeG;IACU,OAAO,CAClB,QAA8D;;YAE9D,MAAM,UAAU,GAAS,IAAI,IAAI,EAAE,CAAC;YACpC,MAAM,SAAS,GAAc,IAAI,gBAAS,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;YACtE,MAAM,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAO,QAAQ,EAAE,EAAE;gBAClD,MAAM,MAAM,GACR,MAAM,OAAO,CAAC,GAAG,CACjB,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,GAAG,CAAC,GAAS,EAAE;oBACzD,MAAM,SAAS,CAAC,OAAO,EAAE,CAAC;oBAC1B,MAAM,CAAC,GACH,MAAM,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;oBAC9B,MAAM,SAAS,CAAC,OAAO,EAAE,CAAC;oBAE1B,IAAI,QAAQ,KAAK,SAAS,EAAE,CAAC;wBAC3B,QAAQ,CAAC,CAAC,CAAC,CAAC;oBACd,CAAC;oBAED,OAAO,CAAC,CAAC;gBACX,CAAC,CAAA,CAAC,CACH,CAAC;gBACJ,OAAO;oBACL,QAAQ;oBACR,MAAM;oBACN,KAAK,EAAE,MAAM;yBACV,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,OAAO,CAAC;yBAC/B,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;yBACjB,MAAM,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE,CAAC,yBAAkB,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,yBAAkB,CAAC,IAAI,EAAE,CAAC;iBACtF,CAAC;YACJ,CAAC,CAAA,CAAC,CAAC;YACH,MAAM,WAAW,GACb,MAAM,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YAC5B,OAAO,CAAC,IAAI,CAAC,OAAO,GAAG;gBACrB,WAAW;gBACX,UAAU;gBACV,YAAY,EAAE,IAAI,IAAI,EAAE;gBACxB,KAAK,EAAE,WAAW;qBACf,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;qBACjB,MAAM,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE,CAAC,yBAAkB,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,yBAAkB,CAAC,IAAI,EAAE,CAAC;aACtF,CAAC,C
AAC;QACL,CAAC;KAAA;IAED;;;;;;;;;;;;;;;;;OAiBG;IACI,MAAM;QACX,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;YAC1B,MAAM,IAAI,KAAK,CAAC,gCAAgC,CAAC,CAAC;QACpD,CAAC;QACD,OAAO,6DAA6B,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAC9D,CAAC;IAEa,IAAI,CAChB,QAA+C;;YAE/C,MAAM,KAAK,GAAoB,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC;YACnD,MAAM,UAAU,GAAS,IAAI,IAAI,EAAE,CAAC;YACpC,MAAM,OAAO,GAAG,GAAG,EAAE,CACnB,yDAA2B,CAAC,OAAO,CAAC;gBAClC,QAAQ,EAAE,QAAQ,CAAC,QAAQ;gBAC3B,UAAU,EAAE,KAAK;qBACd,kBAAkB,EAAE;qBACpB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,SAAS,CAAC;qBACjC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;gBACxB,MAAM,EAAE,KAAK;aACd,CAAC,CAAC;YACL,MAAM,GAAG,GAAG,GAAuC,EAAE;gBACnD,MAAM,MAAM,GAAG,yDAA2B,CAAC,OAAO,CAAC;oBACjD,QAAQ,EAAE,QAAQ,CAAC,QAAQ;oBAC3B,UAAU,EAAE,KAAK;yBACd,kBAAkB,EAAE;yBACpB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC;yBAChC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC;yBACtB,IAAI,EAAE;yBACN,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;oBACxB,MAAM,EAAE,KAAK;iBACd,CAAC,CAAC;gBACH,MAAM,IAAI,GAAG,OAAO,EAAE,CAAC;gBACvB,OAAO;oBACL,IAAI,EAAE,CAAC,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,SAAS,CAAc;oBACjD,QAAQ;oBACR,MAAM;oBACN,IAAI;oBACJ,OAAO,EAAE,KAAK,CAAC,kBAAkB,EAAE;oBACnC,KAAK,EAAE,KAAK,CAAC,aAAa,EAAE;oBAC5B,UAAU;oBACV,YAAY,EAAE,IAAI,IAAI,EAAE;iBAC6B,CAAC;YAC1D,CAAC,CAAC;YAEF,IAAI,CAAC;gBACH,MAAM,KAAK,CAAC,UAAU,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;gBACtC,IAAI,OAAO,EAAE,EAAE,CAAC;oBACd,OAAO,GAAG,EAAE,CAAC;gBACf,CAAC;gBAED,KAAK,IAAI,CAAC,GAAW,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,EAAE,CAAC;oBACtD,MAAM,IAAI,GACN,MAAM,yDAA2B,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;oBACpD,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;wBAClB,MAAM;oBACR,CAAC;oBAED,MAAM,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;oBAC7B,IAAI,OAAO,EAAE,EAAE,CAAC;wBACd,OAAO,GAAG,EAAE,CAAC;oBACf,CAAC;gBACH,CAAC;gBACD,OAAO,GAAG,EAAE,CAAC;YACf,CAAC;YACD,OAAO,KAAK,EAAE,CAAC;gBACb,OAAO;oBACL,IAAI,EAAE,OAAO;oBACb,QAAQ;oBACR,OAAO,EAAE,KAAK,CAAC,kBAAkB,EAAE;oBACnC,KAAK,EAAE,KAAK,CAAC,aAAa,EAAE;oBAC5B,KAAK;oBACL,
UAAU;oBACV,YAAY,EAAE,IAAI,IAAI,EAAE;iBACzB,CAAC;YACJ,CAAC;QACH,CAAC;KAAA;CACF;AA/KD,sDA+KC"}
|
|
@@ -1,9 +1,15 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
1
|
+
/**
|
|
2
|
+
* @module
|
|
3
|
+
* This file contains the implementation of the AgenticaSelectBenchmark class.
|
|
4
|
+
*
|
|
5
|
+
* @author Wrtn Technologies
|
|
6
|
+
*/
|
|
7
|
+
import type { Agentica } from "@agentica/core";
|
|
8
|
+
import type { ILlmSchema } from "@samchon/openapi";
|
|
9
|
+
import type { tags } from "typia";
|
|
10
|
+
import type { IAgenticaSelectBenchmarkEvent } from "./structures/IAgenticaSelectBenchmarkEvent";
|
|
11
|
+
import type { IAgenticaSelectBenchmarkResult } from "./structures/IAgenticaSelectBenchmarkResult";
|
|
12
|
+
import type { IAgenticaSelectBenchmarkScenario } from "./structures/IAgenticaSelectBenchmarkScenario";
|
|
7
13
|
/**
|
|
8
14
|
* LLM function calling selection benchmark.
|
|
9
15
|
*
|
|
@@ -70,21 +70,22 @@ class AgenticaSelectBenchmark {
|
|
|
70
70
|
const started_at = new Date();
|
|
71
71
|
const semaphore = new tstl_1.Semaphore(this.config_.simultaneous);
|
|
72
72
|
const experiments = yield Promise.all(this.scenarios_.map((scenario) => __awaiter(this, void 0, void 0, function* () {
|
|
73
|
-
const events = yield Promise.all(
|
|
73
|
+
const events = yield Promise.all(Array.from({ length: this.config_.repeat }).map(() => __awaiter(this, void 0, void 0, function* () {
|
|
74
74
|
yield semaphore.acquire();
|
|
75
75
|
const e = yield this.step(scenario);
|
|
76
76
|
yield semaphore.release();
|
|
77
|
-
if (listener !== undefined)
|
|
77
|
+
if (listener !== undefined) {
|
|
78
78
|
listener(e);
|
|
79
|
+
}
|
|
79
80
|
return e;
|
|
80
81
|
})));
|
|
81
82
|
return {
|
|
82
83
|
scenario,
|
|
83
84
|
events,
|
|
84
85
|
usage: events
|
|
85
|
-
.filter(
|
|
86
|
-
.map(
|
|
87
|
-
.reduce(core_1.AgenticaTokenUsage.plus, core_1.AgenticaTokenUsage.zero()),
|
|
86
|
+
.filter(e => e.type !== "error")
|
|
87
|
+
.map(e => e.usage)
|
|
88
|
+
.reduce((acc, cur) => core_1.AgenticaTokenUsage.plus(acc, cur), core_1.AgenticaTokenUsage.zero()),
|
|
88
89
|
};
|
|
89
90
|
})));
|
|
90
91
|
return (this.result_ = {
|
|
@@ -92,8 +93,8 @@ class AgenticaSelectBenchmark {
|
|
|
92
93
|
started_at,
|
|
93
94
|
completed_at: new Date(),
|
|
94
95
|
usage: experiments
|
|
95
|
-
.map(
|
|
96
|
-
.reduce(core_1.AgenticaTokenUsage.plus, core_1.AgenticaTokenUsage.zero()),
|
|
96
|
+
.map(p => p.usage)
|
|
97
|
+
.reduce((acc, cur) => core_1.AgenticaTokenUsage.plus(acc, cur), core_1.AgenticaTokenUsage.zero()),
|
|
97
98
|
});
|
|
98
99
|
});
|
|
99
100
|
}
|
|
@@ -117,8 +118,9 @@ class AgenticaSelectBenchmark {
|
|
|
117
118
|
* @returns Dictionary of markdown files.
|
|
118
119
|
*/
|
|
119
120
|
report() {
|
|
120
|
-
if (this.result_ === null)
|
|
121
|
+
if (this.result_ === null) {
|
|
121
122
|
throw new Error("Benchmark is not executed yet.");
|
|
123
|
+
}
|
|
122
124
|
return AgenticaSelectBenchmarkReporter_1.AgenticaSelectBenchmarkReporter.markdown(this.result_);
|
|
123
125
|
}
|
|
124
126
|
step(scenario) {
|
|
@@ -134,13 +136,13 @@ class AgenticaSelectBenchmark {
|
|
|
134
136
|
usage,
|
|
135
137
|
})), { histories: this.histories_.slice(), stack: [], ready: () => true, dispatch: () => __awaiter(this, void 0, void 0, function* () { }) }));
|
|
136
138
|
const selected = prompts
|
|
137
|
-
.filter(
|
|
138
|
-
.map(
|
|
139
|
+
.filter(p => p.type === "select")
|
|
140
|
+
.map(p => p.selections)
|
|
139
141
|
.flat();
|
|
140
142
|
return {
|
|
141
143
|
type: AgenticaBenchmarkPredicator_1.AgenticaBenchmarkPredicator.success({
|
|
142
144
|
expected: scenario.expected,
|
|
143
|
-
operations: selected.map(
|
|
145
|
+
operations: selected.map(s => s.operation),
|
|
144
146
|
})
|
|
145
147
|
? "success"
|
|
146
148
|
: "failure",
|
|
@@ -148,7 +150,7 @@ class AgenticaSelectBenchmark {
|
|
|
148
150
|
selected,
|
|
149
151
|
usage,
|
|
150
152
|
assistantPrompts: prompts
|
|
151
|
-
.filter(
|
|
153
|
+
.filter(p => p.type === "text")
|
|
152
154
|
.filter((p) => p.role === "assistant"),
|
|
153
155
|
started_at,
|
|
154
156
|
completed_at: new Date(),
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"AgenticaSelectBenchmark.js","sourceRoot":"","sources":["../src/AgenticaSelectBenchmark.ts"],"names":[],"mappings":";;;;;;;;;;;;
|
|
1
|
+
{"version":3,"file":"AgenticaSelectBenchmark.js","sourceRoot":"","sources":["../src/AgenticaSelectBenchmark.ts"],"names":[],"mappings":";;;;;;;;;;;;AAkBA,yCAGwB;AACxB,sGAAmG;AACnG,+BAAiC;AACjC,wFAAqF;AACrF,gGAA6F;AAE7F;;;;;;;;;;;;;;;;GAgBG;AACH,MAAa,uBAAuB;IAOlC;;;;OAIG;IACH,YAAmB,KAA4C;;QAC7D,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC,KAAK,CAAC;QAC1B,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC,SAAS,CAAC,KAAK,EAAE,CAAC;QAC1C,IAAI,CAAC,OAAO,GAAG;YACb,MAAM,EAAE,MAAA,MAAA,KAAK,CAAC,MAAM,0CAAE,MAAM,mCAAI,EAAE;YAClC,YAAY,EAAE,MAAA,MAAA,KAAK,CAAC,MAAM,0CAAE,YAAY,mCAAI,EAAE;SAC/C,CAAC;QACF,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC,KAAK,CAAC,kBAAkB,EAAE,CAAC,KAAK,EAAE,CAAC;QAC3D,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;IACtB,CAAC;IAED;;;;;;;;;;;;;;;OAeG;IACU,OAAO,CAClB,QAAgE;;YAEhE,MAAM,UAAU,GAAS,IAAI,IAAI,EAAE,CAAC;YACpC,MAAM,SAAS,GAAc,IAAI,gBAAS,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;YACtE,MAAM,WAAW,GACb,MAAM,OAAO,CAAC,GAAG,CACjB,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAO,QAAQ,EAAE,EAAE;gBACrC,MAAM,MAAM,GACR,MAAM,OAAO,CAAC,GAAG,CACjB,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,GAAG,CAAC,GAAS,EAAE;oBACzD,MAAM,SAAS,CAAC,OAAO,EAAE,CAAC;oBAC1B,MAAM,CAAC,GACH,MAAM,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;oBAC9B,MAAM,SAAS,CAAC,OAAO,EAAE,CAAC;oBAC1B,IAAI,QAAQ,KAAK,SAAS,EAAE,CAAC;wBAC3B,QAAQ,CAAC,CAAC,CAAC,CAAC;oBACd,CAAC;oBACD,OAAO,CAAC,CAAC;gBACX,CAAC,CAAA,CAAC,CACH,CAAC;gBACJ,OAAO;oBACL,QAAQ;oBACR,MAAM;oBACN,KAAK,EAAE,MAAM;yBACV,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,OAAO,CAAC;yBAC/B,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;yBACjB,MAAM,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE,CAAC,yBAAkB,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,yBAAkB,CAAC,IAAI,EAAE,CAAC;iBACtF,CAAC;YACJ,CAAC,CAAA,CAAC,CACH,CAAC;YACJ,OAAO,CAAC,IAAI,CAAC,OAAO,GAAG;gBACrB,WAAW;gBACX,UAAU;gBACV,YAAY,EAAE,IAAI,IAAI,EAAE;gBACxB,KAAK,EAAE,WAAW;qBACf,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;qBACjB,MAAM,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE,CAAC,yBAAkB,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,yBAAkB,CAAC,IAAI,EAAE,CAAC;aACtF,CAAC,C
AAC;QACL,CAAC;KAAA;IAED;;;;;;;;;;;;;;;;;;OAkBG;IACI,MAAM;QACX,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;YAC1B,MAAM,IAAI,KAAK,CAAC,gCAAgC,CAAC,CAAC;QACpD,CAAC;QACD,OAAO,iEAA+B,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAChE,CAAC;IAEa,IAAI,CAChB,QAAiD;;YAEjD,MAAM,UAAU,GAAS,IAAI,IAAI,EAAE,CAAC;YACpC,IAAI,CAAC;gBACH,MAAM,KAAK,GAAuB,yBAAkB,CAAC,IAAI,EAAE,CAAC;gBAC5D,MAAM,OAAO,GACT,MAAM,uDAA0B,CAAC,OAAO,CAAC,gCACtC,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC;oBACxB,MAAM,EAAE,IAAI,yBAAkB,CAAC;wBAC7B,IAAI,EAAE,MAAM;wBACZ,IAAI,EAAE,QAAQ,CAAC,IAAI;qBACpB,CAAC;oBACF,KAAK;iBACN,CAAC,KACF,SAAS,EAAE,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE,EAClC,KAAK,EAAE,EAAE,EACT,KAAK,EAAE,GAAG,EAAE,CAAC,IAAI,EACjB,QAAQ,EAAE,GAAS,EAAE,gDAAE,CAAC,CAAA,GACQ,CAAC,CAAC;gBACtC,MAAM,QAAQ,GAAwC,OAAO;qBAC1D,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC;qBAChC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC;qBACtB,IAAI,EAAE,CAAC;gBACV,OAAO;oBACL,IAAI,EAAE,yDAA2B,CAAC,OAAO,CAAC;wBACxC,QAAQ,EAAE,QAAQ,CAAC,QAAQ;wBAC3B,UAAU,EAAE,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;qBAC3C,CAAC;wBACA,CAAC,CAAC,SAAS;wBACX,CAAC,CAAC,SAAS;oBACb,QAAQ;oBACR,QAAQ;oBACR,KAAK;oBACL,gBAAgB,EAAE,OAAO;yBACtB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,MAAM,CAAC;yBAC9B,MAAM,CACL,CAAC,CAAC,EAAwC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,WAAW,CACpE;oBACH,UAAU;oBACV,YAAY,EAAE,IAAI,IAAI,EAAE;iBAGqB,CAAC;YAClD,CAAC;YACD,OAAO,KAAK,EAAE,CAAC;gBACb,OAAO;oBACL,IAAI,EAAE,OAAO;oBACb,QAAQ;oBACR,KAAK;oBACL,UAAU;oBACV,YAAY,EAAE,IAAI,IAAI,EAAE;iBAC6B,CAAC;YAC1D,CAAC;QACH,CAAC;KAAA;CACF;AAjKD,0DAiKC"}
|