@agentica/benchmark 0.12.21 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -33
- package/lib/AgenticaCallBenchmark.d.ts +12 -6
- package/lib/AgenticaCallBenchmark.js +24 -18
- package/lib/AgenticaCallBenchmark.js.map +1 -1
- package/lib/AgenticaSelectBenchmark.d.ts +12 -6
- package/lib/AgenticaSelectBenchmark.js +14 -12
- package/lib/AgenticaSelectBenchmark.js.map +1 -1
- package/lib/index.mjs +315 -236
- package/lib/index.mjs.map +1 -1
- package/lib/internal/AgenticaBenchmarkPredicator.d.ts +38 -29
- package/lib/internal/AgenticaBenchmarkPredicator.js +100 -84
- package/lib/internal/AgenticaBenchmarkPredicator.js.map +1 -1
- package/lib/internal/AgenticaBenchmarkUtil.d.ts +21 -6
- package/lib/internal/AgenticaBenchmarkUtil.js +39 -33
- package/lib/internal/AgenticaBenchmarkUtil.js.map +1 -1
- package/lib/internal/AgenticaCallBenchmarkReporter.d.ts +6 -5
- package/lib/internal/AgenticaCallBenchmarkReporter.js +130 -126
- package/lib/internal/AgenticaCallBenchmarkReporter.js.map +1 -1
- package/lib/internal/AgenticaPromptReporter.d.ts +13 -5
- package/lib/internal/AgenticaPromptReporter.js +45 -41
- package/lib/internal/AgenticaPromptReporter.js.map +1 -1
- package/lib/internal/AgenticaSelectBenchmarkReporter.d.ts +3 -1
- package/lib/internal/AgenticaSelectBenchmarkReporter.js +153 -150
- package/lib/internal/AgenticaSelectBenchmarkReporter.js.map +1 -1
- package/lib/structures/IAgenticaBenchmarkExpected.d.ts +8 -2
- package/lib/structures/IAgenticaCallBenchmarkEvent.d.ts +9 -3
- package/lib/structures/IAgenticaCallBenchmarkResult.d.ts +10 -4
- package/lib/structures/IAgenticaCallBenchmarkScenario.d.ts +8 -2
- package/lib/structures/IAgenticaSelectBenchmarkEvent.d.ts +9 -3
- package/lib/structures/IAgenticaSelectBenchmarkResult.d.ts +10 -4
- package/lib/structures/IAgenticaSelectBenchmarkScenario.d.ts +8 -2
- package/lib/utils/MathUtil.d.ts +15 -3
- package/lib/utils/MathUtil.js +15 -4
- package/lib/utils/MathUtil.js.map +1 -1
- package/package.json +12 -10
- package/src/AgenticaCallBenchmark.ts +64 -45
- package/src/AgenticaSelectBenchmark.ts +42 -30
- package/src/internal/AgenticaBenchmarkPredicator.ts +208 -186
- package/src/internal/AgenticaBenchmarkUtil.ts +58 -40
- package/src/internal/AgenticaCallBenchmarkReporter.ts +180 -182
- package/src/internal/AgenticaPromptReporter.ts +46 -33
- package/src/internal/AgenticaSelectBenchmarkReporter.ts +205 -203
- package/src/structures/IAgenticaBenchmarkExpected.ts +9 -2
- package/src/structures/IAgenticaCallBenchmarkEvent.ts +9 -3
- package/src/structures/IAgenticaCallBenchmarkResult.ts +10 -4
- package/src/structures/IAgenticaCallBenchmarkScenario.ts +8 -2
- package/src/structures/IAgenticaSelectBenchmarkEvent.ts +9 -3
- package/src/structures/IAgenticaSelectBenchmarkResult.ts +10 -4
- package/src/structures/IAgenticaSelectBenchmarkScenario.ts +8 -2
- package/src/utils/MathUtil.ts +16 -3
|
@@ -1,13 +1,20 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
1
|
+
/**
|
|
2
|
+
* @module
|
|
3
|
+
* This file contains the implementation of the AgenticaCallBenchmark class.
|
|
4
|
+
*
|
|
5
|
+
* @author Wrtn Technologies
|
|
6
|
+
*/
|
|
7
|
+
import type { Agentica } from "@agentica/core";
|
|
8
|
+
import type { ILlmSchema } from "@samchon/openapi";
|
|
9
|
+
import type { tags } from "typia";
|
|
10
|
+
import type { IAgenticaCallBenchmarkEvent } from "./structures/IAgenticaCallBenchmarkEvent";
|
|
11
|
+
import type { IAgenticaCallBenchmarkResult } from "./structures/IAgenticaCallBenchmarkResult";
|
|
5
12
|
|
|
13
|
+
import type { IAgenticaCallBenchmarkScenario } from "./structures/IAgenticaCallBenchmarkScenario";
|
|
14
|
+
import { AgenticaTokenUsage } from "@agentica/core";
|
|
15
|
+
import { Semaphore } from "tstl";
|
|
6
16
|
import { AgenticaBenchmarkPredicator } from "./internal/AgenticaBenchmarkPredicator";
|
|
7
17
|
import { AgenticaCallBenchmarkReporter } from "./internal/AgenticaCallBenchmarkReporter";
|
|
8
|
-
import { IAgenticaCallBenchmarkEvent } from "./structures/IAgenticaCallBenchmarkEvent";
|
|
9
|
-
import { IAgenticaCallBenchmarkResult } from "./structures/IAgenticaCallBenchmarkResult";
|
|
10
|
-
import { IAgenticaCallBenchmarkScenario } from "./structures/IAgenticaCallBenchmarkScenario";
|
|
11
18
|
|
|
12
19
|
/**
|
|
13
20
|
* LLM function calling selection benchmark.
|
|
@@ -72,37 +79,40 @@ export class AgenticaCallBenchmark<Model extends ILlmSchema.Model> {
|
|
|
72
79
|
): Promise<IAgenticaCallBenchmarkResult<Model>> {
|
|
73
80
|
const started_at: Date = new Date();
|
|
74
81
|
const semaphore: Semaphore = new Semaphore(this.config_.simultaneous);
|
|
75
|
-
const
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
await
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
82
|
+
const task = this.scenarios_.map(async (scenario) => {
|
|
83
|
+
const events: IAgenticaCallBenchmarkEvent<Model>[]
|
|
84
|
+
= await Promise.all(
|
|
85
|
+
Array.from({ length: this.config_.repeat }).map(async () => {
|
|
86
|
+
await semaphore.acquire();
|
|
87
|
+
const e: IAgenticaCallBenchmarkEvent<Model>
|
|
88
|
+
= await this.step(scenario);
|
|
89
|
+
await semaphore.release();
|
|
90
|
+
|
|
91
|
+
if (listener !== undefined) {
|
|
92
|
+
listener(e);
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
return e;
|
|
96
|
+
}),
|
|
97
|
+
);
|
|
98
|
+
return {
|
|
99
|
+
scenario,
|
|
100
|
+
events,
|
|
101
|
+
usage: events
|
|
102
|
+
.filter(e => e.type !== "error")
|
|
103
|
+
.map(e => e.usage)
|
|
104
|
+
.reduce((acc, cur) => AgenticaTokenUsage.plus(acc, cur), AgenticaTokenUsage.zero()),
|
|
105
|
+
};
|
|
106
|
+
});
|
|
107
|
+
const experiments: IAgenticaCallBenchmarkResult.IExperiment<Model>[]
|
|
108
|
+
= await Promise.all(task);
|
|
99
109
|
return (this.result_ = {
|
|
100
110
|
experiments,
|
|
101
111
|
started_at,
|
|
102
112
|
completed_at: new Date(),
|
|
103
113
|
usage: experiments
|
|
104
|
-
.map(
|
|
105
|
-
.reduce(AgenticaTokenUsage.plus, AgenticaTokenUsage.zero()),
|
|
114
|
+
.map(p => p.usage)
|
|
115
|
+
.reduce((acc, cur) => AgenticaTokenUsage.plus(acc, cur), AgenticaTokenUsage.zero()),
|
|
106
116
|
});
|
|
107
117
|
}
|
|
108
118
|
|
|
@@ -125,8 +135,9 @@ export class AgenticaCallBenchmark<Model extends ILlmSchema.Model> {
|
|
|
125
135
|
* @returns Dictionary of markdown files.
|
|
126
136
|
*/
|
|
127
137
|
public report(): Record<string, string> {
|
|
128
|
-
if (this.result_ === null)
|
|
138
|
+
if (this.result_ === null) {
|
|
129
139
|
throw new Error("Benchmark is not executed yet.");
|
|
140
|
+
}
|
|
130
141
|
return AgenticaCallBenchmarkReporter.markdown(this.result_);
|
|
131
142
|
}
|
|
132
143
|
|
|
@@ -140,8 +151,8 @@ export class AgenticaCallBenchmark<Model extends ILlmSchema.Model> {
|
|
|
140
151
|
expected: scenario.expected,
|
|
141
152
|
operations: agent
|
|
142
153
|
.getPromptHistories()
|
|
143
|
-
.filter(
|
|
144
|
-
.map(
|
|
154
|
+
.filter(p => p.type === "execute")
|
|
155
|
+
.map(p => p.operation),
|
|
145
156
|
strict: false,
|
|
146
157
|
});
|
|
147
158
|
const out = (): IAgenticaCallBenchmarkEvent<Model> => {
|
|
@@ -149,10 +160,10 @@ export class AgenticaCallBenchmark<Model extends ILlmSchema.Model> {
|
|
|
149
160
|
expected: scenario.expected,
|
|
150
161
|
operations: agent
|
|
151
162
|
.getPromptHistories()
|
|
152
|
-
.filter(
|
|
153
|
-
.map(
|
|
163
|
+
.filter(p => p.type === "select")
|
|
164
|
+
.map(p => p.selections)
|
|
154
165
|
.flat()
|
|
155
|
-
.map(
|
|
166
|
+
.map(p => p.operation),
|
|
156
167
|
strict: false,
|
|
157
168
|
});
|
|
158
169
|
const call = success();
|
|
@@ -170,17 +181,25 @@ export class AgenticaCallBenchmark<Model extends ILlmSchema.Model> {
|
|
|
170
181
|
|
|
171
182
|
try {
|
|
172
183
|
await agent.conversate(scenario.text);
|
|
173
|
-
if (success())
|
|
184
|
+
if (success()) {
|
|
185
|
+
return out();
|
|
186
|
+
}
|
|
187
|
+
|
|
174
188
|
for (let i: number = 0; i < this.config_.consent; ++i) {
|
|
175
|
-
const next: string | null
|
|
176
|
-
await AgenticaBenchmarkPredicator.isNext(agent);
|
|
177
|
-
if (next === null)
|
|
189
|
+
const next: string | null
|
|
190
|
+
= await AgenticaBenchmarkPredicator.isNext(agent);
|
|
191
|
+
if (next === null) {
|
|
192
|
+
break;
|
|
193
|
+
}
|
|
178
194
|
|
|
179
195
|
await agent.conversate(next);
|
|
180
|
-
if (success())
|
|
196
|
+
if (success()) {
|
|
197
|
+
return out();
|
|
198
|
+
}
|
|
181
199
|
}
|
|
182
200
|
return out();
|
|
183
|
-
}
|
|
201
|
+
}
|
|
202
|
+
catch (error) {
|
|
184
203
|
return {
|
|
185
204
|
type: "error",
|
|
186
205
|
scenario,
|
|
@@ -1,21 +1,29 @@
|
|
|
1
|
-
|
|
1
|
+
/**
|
|
2
|
+
* @module
|
|
3
|
+
* This file contains the implementation of the AgenticaSelectBenchmark class.
|
|
4
|
+
*
|
|
5
|
+
* @author Wrtn Technologies
|
|
6
|
+
*/
|
|
7
|
+
import type {
|
|
2
8
|
Agentica,
|
|
3
9
|
AgenticaContext,
|
|
4
10
|
AgenticaOperationSelection,
|
|
5
11
|
AgenticaPrompt,
|
|
12
|
+
} from "@agentica/core";
|
|
13
|
+
import type { ILlmSchema } from "@samchon/openapi";
|
|
14
|
+
import type { tags } from "typia";
|
|
15
|
+
import type { IAgenticaSelectBenchmarkEvent } from "./structures/IAgenticaSelectBenchmarkEvent";
|
|
16
|
+
import type { IAgenticaSelectBenchmarkResult } from "./structures/IAgenticaSelectBenchmarkResult";
|
|
17
|
+
import type { IAgenticaSelectBenchmarkScenario } from "./structures/IAgenticaSelectBenchmarkScenario";
|
|
18
|
+
|
|
19
|
+
import {
|
|
6
20
|
AgenticaTextPrompt,
|
|
7
21
|
AgenticaTokenUsage,
|
|
8
22
|
} from "@agentica/core";
|
|
9
23
|
import { ChatGptSelectFunctionAgent } from "@agentica/core/src/chatgpt/ChatGptSelectFunctionAgent";
|
|
10
|
-
import { ILlmSchema } from "@samchon/openapi";
|
|
11
24
|
import { Semaphore } from "tstl";
|
|
12
|
-
import { tags } from "typia";
|
|
13
|
-
|
|
14
25
|
import { AgenticaBenchmarkPredicator } from "./internal/AgenticaBenchmarkPredicator";
|
|
15
26
|
import { AgenticaSelectBenchmarkReporter } from "./internal/AgenticaSelectBenchmarkReporter";
|
|
16
|
-
import { IAgenticaSelectBenchmarkEvent } from "./structures/IAgenticaSelectBenchmarkEvent";
|
|
17
|
-
import { IAgenticaSelectBenchmarkResult } from "./structures/IAgenticaSelectBenchmarkResult";
|
|
18
|
-
import { IAgenticaSelectBenchmarkScenario } from "./structures/IAgenticaSelectBenchmarkScenario";
|
|
19
27
|
|
|
20
28
|
/**
|
|
21
29
|
* LLM function calling selection benchmark.
|
|
@@ -78,17 +86,19 @@ export class AgenticaSelectBenchmark<Model extends ILlmSchema.Model> {
|
|
|
78
86
|
): Promise<IAgenticaSelectBenchmarkResult<Model>> {
|
|
79
87
|
const started_at: Date = new Date();
|
|
80
88
|
const semaphore: Semaphore = new Semaphore(this.config_.simultaneous);
|
|
81
|
-
const experiments: IAgenticaSelectBenchmarkResult.IExperiment<Model>[]
|
|
82
|
-
await Promise.all(
|
|
89
|
+
const experiments: IAgenticaSelectBenchmarkResult.IExperiment<Model>[]
|
|
90
|
+
= await Promise.all(
|
|
83
91
|
this.scenarios_.map(async (scenario) => {
|
|
84
|
-
const events: IAgenticaSelectBenchmarkEvent<Model>[]
|
|
85
|
-
await Promise.all(
|
|
86
|
-
|
|
92
|
+
const events: IAgenticaSelectBenchmarkEvent<Model>[]
|
|
93
|
+
= await Promise.all(
|
|
94
|
+
Array.from({ length: this.config_.repeat }).map(async () => {
|
|
87
95
|
await semaphore.acquire();
|
|
88
|
-
const e: IAgenticaSelectBenchmarkEvent<Model>
|
|
89
|
-
await this.step(scenario);
|
|
96
|
+
const e: IAgenticaSelectBenchmarkEvent<Model>
|
|
97
|
+
= await this.step(scenario);
|
|
90
98
|
await semaphore.release();
|
|
91
|
-
if (listener !== undefined)
|
|
99
|
+
if (listener !== undefined) {
|
|
100
|
+
listener(e);
|
|
101
|
+
}
|
|
92
102
|
return e;
|
|
93
103
|
}),
|
|
94
104
|
);
|
|
@@ -96,9 +106,9 @@ export class AgenticaSelectBenchmark<Model extends ILlmSchema.Model> {
|
|
|
96
106
|
scenario,
|
|
97
107
|
events,
|
|
98
108
|
usage: events
|
|
99
|
-
.filter(
|
|
100
|
-
.map(
|
|
101
|
-
.reduce(AgenticaTokenUsage.plus, AgenticaTokenUsage.zero()),
|
|
109
|
+
.filter(e => e.type !== "error")
|
|
110
|
+
.map(e => e.usage)
|
|
111
|
+
.reduce((acc, cur) => AgenticaTokenUsage.plus(acc, cur), AgenticaTokenUsage.zero()),
|
|
102
112
|
};
|
|
103
113
|
}),
|
|
104
114
|
);
|
|
@@ -107,8 +117,8 @@ export class AgenticaSelectBenchmark<Model extends ILlmSchema.Model> {
|
|
|
107
117
|
started_at,
|
|
108
118
|
completed_at: new Date(),
|
|
109
119
|
usage: experiments
|
|
110
|
-
.map(
|
|
111
|
-
.reduce(AgenticaTokenUsage.plus, AgenticaTokenUsage.zero()),
|
|
120
|
+
.map(p => p.usage)
|
|
121
|
+
.reduce((acc, cur) => AgenticaTokenUsage.plus(acc, cur), AgenticaTokenUsage.zero()),
|
|
112
122
|
});
|
|
113
123
|
}
|
|
114
124
|
|
|
@@ -132,8 +142,9 @@ export class AgenticaSelectBenchmark<Model extends ILlmSchema.Model> {
|
|
|
132
142
|
* @returns Dictionary of markdown files.
|
|
133
143
|
*/
|
|
134
144
|
public report(): Record<string, string> {
|
|
135
|
-
if (this.result_ === null)
|
|
145
|
+
if (this.result_ === null) {
|
|
136
146
|
throw new Error("Benchmark is not executed yet.");
|
|
147
|
+
}
|
|
137
148
|
return AgenticaSelectBenchmarkReporter.markdown(this.result_);
|
|
138
149
|
}
|
|
139
150
|
|
|
@@ -143,8 +154,8 @@ export class AgenticaSelectBenchmark<Model extends ILlmSchema.Model> {
|
|
|
143
154
|
const started_at: Date = new Date();
|
|
144
155
|
try {
|
|
145
156
|
const usage: AgenticaTokenUsage = AgenticaTokenUsage.zero();
|
|
146
|
-
const prompts: AgenticaPrompt<Model>[]
|
|
147
|
-
await ChatGptSelectFunctionAgent.execute({
|
|
157
|
+
const prompts: AgenticaPrompt<Model>[]
|
|
158
|
+
= await ChatGptSelectFunctionAgent.execute({
|
|
148
159
|
...this.agent_.getContext({
|
|
149
160
|
prompt: new AgenticaTextPrompt({
|
|
150
161
|
role: "user",
|
|
@@ -158,13 +169,13 @@ export class AgenticaSelectBenchmark<Model extends ILlmSchema.Model> {
|
|
|
158
169
|
dispatch: async () => {},
|
|
159
170
|
} satisfies AgenticaContext<Model>);
|
|
160
171
|
const selected: AgenticaOperationSelection<Model>[] = prompts
|
|
161
|
-
.filter(
|
|
162
|
-
.map(
|
|
172
|
+
.filter(p => p.type === "select")
|
|
173
|
+
.map(p => p.selections)
|
|
163
174
|
.flat();
|
|
164
175
|
return {
|
|
165
176
|
type: AgenticaBenchmarkPredicator.success({
|
|
166
177
|
expected: scenario.expected,
|
|
167
|
-
operations: selected.map(
|
|
178
|
+
operations: selected.map(s => s.operation),
|
|
168
179
|
})
|
|
169
180
|
? "success"
|
|
170
181
|
: "failure",
|
|
@@ -172,16 +183,17 @@ export class AgenticaSelectBenchmark<Model extends ILlmSchema.Model> {
|
|
|
172
183
|
selected,
|
|
173
184
|
usage,
|
|
174
185
|
assistantPrompts: prompts
|
|
175
|
-
.filter(
|
|
186
|
+
.filter(p => p.type === "text")
|
|
176
187
|
.filter(
|
|
177
188
|
(p): p is AgenticaTextPrompt<"assistant"> => p.role === "assistant",
|
|
178
189
|
),
|
|
179
190
|
started_at,
|
|
180
191
|
completed_at: new Date(),
|
|
181
192
|
} satisfies
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
}
|
|
193
|
+
| IAgenticaSelectBenchmarkEvent.ISuccess<Model>
|
|
194
|
+
| IAgenticaSelectBenchmarkEvent.IFailure<Model>;
|
|
195
|
+
}
|
|
196
|
+
catch (error) {
|
|
185
197
|
return {
|
|
186
198
|
type: "error",
|
|
187
199
|
scenario,
|