@agentica/benchmark 0.44.0-dev.20260313-2 → 0.44.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +218 -218
- package/package.json +4 -4
- package/src/AgenticaCallBenchmark.ts +281 -281
- package/src/AgenticaSelectBenchmark.ts +282 -282
- package/src/MicroAgenticaCallBenchmark.ts +281 -281
- package/src/index.ts +3 -3
- package/src/internal/AgenticaBenchmarkPredicator.ts +240 -240
- package/src/internal/AgenticaBenchmarkUtil.ts +60 -60
- package/src/internal/AgenticaCallBenchmarkReporter.ts +192 -192
- package/src/internal/AgenticaPromptReporter.ts +63 -63
- package/src/internal/AgenticaSelectBenchmarkReporter.ts +220 -220
- package/src/structures/IAgenticaBenchmarkExpected.ts +74 -74
- package/src/structures/IAgenticaCallBenchmarkEvent.ts +118 -118
- package/src/structures/IAgenticaCallBenchmarkResult.ts +75 -75
- package/src/structures/IAgenticaCallBenchmarkScenario.ts +45 -45
- package/src/structures/IAgenticaSelectBenchmarkEvent.ts +122 -122
- package/src/structures/IAgenticaSelectBenchmarkResult.ts +75 -75
- package/src/structures/IAgenticaSelectBenchmarkScenario.ts +45 -45
- package/src/utils/MathUtil.ts +16 -16
|
@@ -1,282 +1,282 @@
|
|
|
1
|
-
import type {
|
|
2
|
-
Agentica,
|
|
3
|
-
AgenticaContext,
|
|
4
|
-
AgenticaEvent,
|
|
5
|
-
AgenticaHistory,
|
|
6
|
-
AgenticaOperationSelection,
|
|
7
|
-
} from "@agentica/core";
|
|
8
|
-
import type { tags } from "typia";
|
|
9
|
-
|
|
10
|
-
/**
|
|
11
|
-
* @module
|
|
12
|
-
* This file contains the implementation of the AgenticaSelectBenchmark class.
|
|
13
|
-
*
|
|
14
|
-
* @author Wrtn Technologies
|
|
15
|
-
*/
|
|
16
|
-
import { AgenticaTokenUsage, factory, orchestrate } from "@agentica/core";
|
|
17
|
-
import { Semaphore } from "tstl";
|
|
18
|
-
import { v4 } from "uuid";
|
|
19
|
-
|
|
20
|
-
import type { IAgenticaSelectBenchmarkEvent } from "./structures/IAgenticaSelectBenchmarkEvent";
|
|
21
|
-
import type { IAgenticaSelectBenchmarkResult } from "./structures/IAgenticaSelectBenchmarkResult";
|
|
22
|
-
import type { IAgenticaSelectBenchmarkScenario } from "./structures/IAgenticaSelectBenchmarkScenario";
|
|
23
|
-
|
|
24
|
-
import { AgenticaBenchmarkPredicator } from "./internal/AgenticaBenchmarkPredicator";
|
|
25
|
-
import { AgenticaSelectBenchmarkReporter } from "./internal/AgenticaSelectBenchmarkReporter";
|
|
26
|
-
|
|
27
|
-
/**
|
|
28
|
-
* LLM function calling selection benchmark.
|
|
29
|
-
*
|
|
30
|
-
* `AgenticaSelectBenchmark` is a class for the benchmark of the
|
|
31
|
-
* LLM (Large Model Language) function calling's selection part.
|
|
32
|
-
* It utilizes the `selector` agent and tests whether the expected
|
|
33
|
-
* {@link IAgenticaOperation operations} are properly selected from
|
|
34
|
-
* the given {@link IAgenticaSelectBenchmarkScenario scenarios}.
|
|
35
|
-
*
|
|
36
|
-
* Note that, this `AgenticaSelectBenchmark` class measures only the
|
|
37
|
-
* selection benchmark, testing whether the `selector` agent can select
|
|
38
|
-
* candidate functions to call as expected. Therefore, it does not test
|
|
39
|
-
* about the actual function calling which is done by the `executor` agent.
|
|
40
|
-
* If you want that feature, use {@link AgenticaCallBenchmark} class instead.
|
|
41
|
-
*
|
|
42
|
-
* @author Samchon
|
|
43
|
-
*/
|
|
44
|
-
export class AgenticaSelectBenchmark {
|
|
45
|
-
private agent_: Agentica;
|
|
46
|
-
private scenarios_: IAgenticaSelectBenchmarkScenario[];
|
|
47
|
-
private config_: AgenticaSelectBenchmark.IConfig;
|
|
48
|
-
private histories_: AgenticaHistory[];
|
|
49
|
-
private result_: IAgenticaSelectBenchmarkResult | null;
|
|
50
|
-
|
|
51
|
-
/**
|
|
52
|
-
* Initializer Constructor.
|
|
53
|
-
*
|
|
54
|
-
* @param props Properties of the selection benchmark
|
|
55
|
-
*/
|
|
56
|
-
public constructor(props: AgenticaSelectBenchmark.IProps) {
|
|
57
|
-
this.agent_ = props.agent;
|
|
58
|
-
this.scenarios_ = props.scenarios.slice();
|
|
59
|
-
this.config_ = {
|
|
60
|
-
repeat: props.config?.repeat ?? 10,
|
|
61
|
-
simultaneous: props.config?.simultaneous ?? 10,
|
|
62
|
-
};
|
|
63
|
-
this.histories_ = props.agent.getHistories().slice();
|
|
64
|
-
this.result_ = null;
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
/**
|
|
68
|
-
* Execute the benchmark.
|
|
69
|
-
*
|
|
70
|
-
* Execute the benchmark of the LLM function selection, and returns
|
|
71
|
-
* the result of the benchmark.
|
|
72
|
-
*
|
|
73
|
-
* If you wanna see progress of the benchmark, you can pass a callback
|
|
74
|
-
* function as the argument of the `listener`. The callback function
|
|
75
|
-
* would be called whenever a benchmark event is occurred.
|
|
76
|
-
*
|
|
77
|
-
* Also, you can publish a markdown format report by calling
|
|
78
|
-
* the {@link report} function after the benchmark execution.
|
|
79
|
-
*
|
|
80
|
-
* @param listener Callback function listening the benchmark events
|
|
81
|
-
* @returns Results of the function selection benchmark
|
|
82
|
-
*/
|
|
83
|
-
public async execute(
|
|
84
|
-
listener?: (event: IAgenticaSelectBenchmarkEvent) => void,
|
|
85
|
-
): Promise<IAgenticaSelectBenchmarkResult> {
|
|
86
|
-
const started_at: Date = new Date();
|
|
87
|
-
const semaphore: Semaphore = new Semaphore(this.config_.simultaneous);
|
|
88
|
-
const experiments: IAgenticaSelectBenchmarkResult.IExperiment[]
|
|
89
|
-
= await Promise.all(
|
|
90
|
-
this.scenarios_.map(async (scenario) => {
|
|
91
|
-
const events: IAgenticaSelectBenchmarkEvent[]
|
|
92
|
-
= await Promise.all(
|
|
93
|
-
Array.from({ length: this.config_.repeat }).map(async () => {
|
|
94
|
-
await semaphore.acquire();
|
|
95
|
-
const e: IAgenticaSelectBenchmarkEvent
|
|
96
|
-
= await this.step(scenario);
|
|
97
|
-
await semaphore.release();
|
|
98
|
-
if (listener !== undefined) {
|
|
99
|
-
listener(e);
|
|
100
|
-
}
|
|
101
|
-
return e;
|
|
102
|
-
}),
|
|
103
|
-
);
|
|
104
|
-
return {
|
|
105
|
-
scenario,
|
|
106
|
-
events,
|
|
107
|
-
usage: events
|
|
108
|
-
.filter(e => e.type !== "error")
|
|
109
|
-
.map(e => e.usage)
|
|
110
|
-
.reduce((acc, cur) => AgenticaTokenUsage.plus(acc, cur), AgenticaTokenUsage.zero()),
|
|
111
|
-
};
|
|
112
|
-
}),
|
|
113
|
-
);
|
|
114
|
-
return (this.result_ = {
|
|
115
|
-
experiments,
|
|
116
|
-
started_at,
|
|
117
|
-
completed_at: new Date(),
|
|
118
|
-
usage: experiments
|
|
119
|
-
.map(p => p.usage)
|
|
120
|
-
.reduce((acc, cur) => AgenticaTokenUsage.plus(acc, cur), AgenticaTokenUsage.zero()),
|
|
121
|
-
});
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
/**
|
|
125
|
-
* Report the benchmark result as markdown files.
|
|
126
|
-
*
|
|
127
|
-
* Report the benchmark result {@link execute}d by
|
|
128
|
-
* `AgenticaSelectBenchmark` as markdown files, and returns a
|
|
129
|
-
* dictionary object of the markdown reporting files. The key of
|
|
130
|
-
* the dictionary would be file name, and the value would be the
|
|
131
|
-
* markdown content.
|
|
132
|
-
*
|
|
133
|
-
* For reference, the markdown files are composed like below:
|
|
134
|
-
*
|
|
135
|
-
* - `./README.md`
|
|
136
|
-
* - `./scenario-1/README.md`
|
|
137
|
-
* - `./scenario-1/1.success.md`
|
|
138
|
-
* - `./scenario-1/2.failure.md`
|
|
139
|
-
* - `./scenario-1/3.error.md`
|
|
140
|
-
*
|
|
141
|
-
* @returns Dictionary of markdown files.
|
|
142
|
-
*/
|
|
143
|
-
public report(): Record<string, string> {
|
|
144
|
-
if (this.result_ === null) {
|
|
145
|
-
throw new Error("Benchmark is not executed yet.");
|
|
146
|
-
}
|
|
147
|
-
return AgenticaSelectBenchmarkReporter.markdown(this.result_);
|
|
148
|
-
}
|
|
149
|
-
|
|
150
|
-
private async step(
|
|
151
|
-
scenario: IAgenticaSelectBenchmarkScenario,
|
|
152
|
-
): Promise<IAgenticaSelectBenchmarkEvent> {
|
|
153
|
-
const started_at: Date = new Date();
|
|
154
|
-
try {
|
|
155
|
-
const usage: AgenticaTokenUsage = AgenticaTokenUsage.zero();
|
|
156
|
-
const historyGetters: Array<() => Promise<AgenticaHistory>> = [];
|
|
157
|
-
const dispatch = async (event: AgenticaEvent): Promise<void> => {
|
|
158
|
-
if ("toHistory" in event) {
|
|
159
|
-
if ("join" in event) {
|
|
160
|
-
historyGetters.push(async () => {
|
|
161
|
-
await event.join();
|
|
162
|
-
return event.toHistory();
|
|
163
|
-
});
|
|
164
|
-
}
|
|
165
|
-
else {
|
|
166
|
-
historyGetters.push(async () => event.toHistory());
|
|
167
|
-
}
|
|
168
|
-
}
|
|
169
|
-
};
|
|
170
|
-
const context: AgenticaContext = this.agent_.getContext({
|
|
171
|
-
prompt: factory.createUserMessageHistory({
|
|
172
|
-
id: v4(),
|
|
173
|
-
created_at: started_at.toISOString(),
|
|
174
|
-
contents: [{
|
|
175
|
-
type: "text",
|
|
176
|
-
text: scenario.text,
|
|
177
|
-
}],
|
|
178
|
-
}),
|
|
179
|
-
usage,
|
|
180
|
-
dispatch,
|
|
181
|
-
});
|
|
182
|
-
if (typeof context.config?.executor === "function") {
|
|
183
|
-
throw new TypeError("select function is not found");
|
|
184
|
-
}
|
|
185
|
-
|
|
186
|
-
await (context.config?.executor?.select ?? orchestrate.select)({
|
|
187
|
-
...context,
|
|
188
|
-
histories: this.histories_.slice(),
|
|
189
|
-
stack: [],
|
|
190
|
-
ready: () => true,
|
|
191
|
-
});
|
|
192
|
-
const histories: AgenticaHistory[]
|
|
193
|
-
= await Promise.all(
|
|
194
|
-
historyGetters.map(async g => g()),
|
|
195
|
-
);
|
|
196
|
-
const selected: AgenticaOperationSelection[] = histories
|
|
197
|
-
.filter(p => p.type === "select")
|
|
198
|
-
.map(p => p.selection);
|
|
199
|
-
return {
|
|
200
|
-
type: AgenticaBenchmarkPredicator.success({
|
|
201
|
-
expected: scenario.expected,
|
|
202
|
-
operations: selected.map(s => s.operation),
|
|
203
|
-
})
|
|
204
|
-
? "success"
|
|
205
|
-
: "failure",
|
|
206
|
-
scenario,
|
|
207
|
-
selected,
|
|
208
|
-
usage,
|
|
209
|
-
assistantPrompts: histories
|
|
210
|
-
// Only the assistant is allowed to emit text events.
|
|
211
|
-
.filter(p => p.type === "assistantMessage"),
|
|
212
|
-
started_at,
|
|
213
|
-
completed_at: new Date(),
|
|
214
|
-
} satisfies
|
|
215
|
-
| IAgenticaSelectBenchmarkEvent.ISuccess
|
|
216
|
-
| IAgenticaSelectBenchmarkEvent.IFailure;
|
|
217
|
-
}
|
|
218
|
-
catch (error) {
|
|
219
|
-
return {
|
|
220
|
-
type: "error",
|
|
221
|
-
scenario,
|
|
222
|
-
error,
|
|
223
|
-
started_at,
|
|
224
|
-
completed_at: new Date(),
|
|
225
|
-
} satisfies IAgenticaSelectBenchmarkEvent.IError;
|
|
226
|
-
}
|
|
227
|
-
}
|
|
228
|
-
}
|
|
229
|
-
export namespace AgenticaSelectBenchmark {
|
|
230
|
-
/**
|
|
231
|
-
* Properties of the {@link AgenticaSelectBenchmark} constructor.
|
|
232
|
-
*/
|
|
233
|
-
export interface IProps {
|
|
234
|
-
/**
|
|
235
|
-
* AI agent instance.
|
|
236
|
-
*/
|
|
237
|
-
agent: Agentica;
|
|
238
|
-
|
|
239
|
-
/**
|
|
240
|
-
* List of scenarios what you expect.
|
|
241
|
-
*/
|
|
242
|
-
scenarios: IAgenticaSelectBenchmarkScenario[];
|
|
243
|
-
|
|
244
|
-
/**
|
|
245
|
-
* Configuration for the benchmark.
|
|
246
|
-
*/
|
|
247
|
-
config?: Partial<IConfig>;
|
|
248
|
-
}
|
|
249
|
-
|
|
250
|
-
/**
|
|
251
|
-
* Configuration for the benchmark.
|
|
252
|
-
*
|
|
253
|
-
* `AgenticaSelectBenchmark.IConfig` is a data structure which
|
|
254
|
-
* represents a configuration for the benchmark, especially the
|
|
255
|
-
* capacity information of the benchmark execution.
|
|
256
|
-
*/
|
|
257
|
-
export interface IConfig {
|
|
258
|
-
/**
|
|
259
|
-
* Repeat count.
|
|
260
|
-
*
|
|
261
|
-
* The number of repeating count for the benchmark execution
|
|
262
|
-
* for each scenario.
|
|
263
|
-
*
|
|
264
|
-
* @default 10
|
|
265
|
-
*/
|
|
266
|
-
repeat: number & tags.Type<"uint32"> & tags.Minimum<1>;
|
|
267
|
-
|
|
268
|
-
/**
|
|
269
|
-
* Simultaneous count.
|
|
270
|
-
*
|
|
271
|
-
* The number of simultaneous count for the parallel benchmark
|
|
272
|
-
* execution.
|
|
273
|
-
*
|
|
274
|
-
* If you configure this property greater than `1`, the benchmark
|
|
275
|
-
* for each scenario would be executed in parallel in the given
|
|
276
|
-
* count.
|
|
277
|
-
*
|
|
278
|
-
* @default 10
|
|
279
|
-
*/
|
|
280
|
-
simultaneous: number & tags.Type<"uint32"> & tags.Minimum<1>;
|
|
281
|
-
}
|
|
282
|
-
}
|
|
1
|
+
import type {
|
|
2
|
+
Agentica,
|
|
3
|
+
AgenticaContext,
|
|
4
|
+
AgenticaEvent,
|
|
5
|
+
AgenticaHistory,
|
|
6
|
+
AgenticaOperationSelection,
|
|
7
|
+
} from "@agentica/core";
|
|
8
|
+
import type { tags } from "typia";
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* @module
|
|
12
|
+
* This file contains the implementation of the AgenticaSelectBenchmark class.
|
|
13
|
+
*
|
|
14
|
+
* @author Wrtn Technologies
|
|
15
|
+
*/
|
|
16
|
+
import { AgenticaTokenUsage, factory, orchestrate } from "@agentica/core";
|
|
17
|
+
import { Semaphore } from "tstl";
|
|
18
|
+
import { v4 } from "uuid";
|
|
19
|
+
|
|
20
|
+
import type { IAgenticaSelectBenchmarkEvent } from "./structures/IAgenticaSelectBenchmarkEvent";
|
|
21
|
+
import type { IAgenticaSelectBenchmarkResult } from "./structures/IAgenticaSelectBenchmarkResult";
|
|
22
|
+
import type { IAgenticaSelectBenchmarkScenario } from "./structures/IAgenticaSelectBenchmarkScenario";
|
|
23
|
+
|
|
24
|
+
import { AgenticaBenchmarkPredicator } from "./internal/AgenticaBenchmarkPredicator";
|
|
25
|
+
import { AgenticaSelectBenchmarkReporter } from "./internal/AgenticaSelectBenchmarkReporter";
|
|
26
|
+
|
|
27
|
+
/**
|
|
28
|
+
* LLM function calling selection benchmark.
|
|
29
|
+
*
|
|
30
|
+
* `AgenticaSelectBenchmark` is a class for the benchmark of the
|
|
31
|
+
* LLM (Large Model Language) function calling's selection part.
|
|
32
|
+
* It utilizes the `selector` agent and tests whether the expected
|
|
33
|
+
* {@link IAgenticaOperation operations} are properly selected from
|
|
34
|
+
* the given {@link IAgenticaSelectBenchmarkScenario scenarios}.
|
|
35
|
+
*
|
|
36
|
+
* Note that, this `AgenticaSelectBenchmark` class measures only the
|
|
37
|
+
* selection benchmark, testing whether the `selector` agent can select
|
|
38
|
+
* candidate functions to call as expected. Therefore, it does not test
|
|
39
|
+
* about the actual function calling which is done by the `executor` agent.
|
|
40
|
+
* If you want that feature, use {@link AgenticaCallBenchmark} class instead.
|
|
41
|
+
*
|
|
42
|
+
* @author Samchon
|
|
43
|
+
*/
|
|
44
|
+
export class AgenticaSelectBenchmark {
|
|
45
|
+
private agent_: Agentica;
|
|
46
|
+
private scenarios_: IAgenticaSelectBenchmarkScenario[];
|
|
47
|
+
private config_: AgenticaSelectBenchmark.IConfig;
|
|
48
|
+
private histories_: AgenticaHistory[];
|
|
49
|
+
private result_: IAgenticaSelectBenchmarkResult | null;
|
|
50
|
+
|
|
51
|
+
/**
|
|
52
|
+
* Initializer Constructor.
|
|
53
|
+
*
|
|
54
|
+
* @param props Properties of the selection benchmark
|
|
55
|
+
*/
|
|
56
|
+
public constructor(props: AgenticaSelectBenchmark.IProps) {
|
|
57
|
+
this.agent_ = props.agent;
|
|
58
|
+
this.scenarios_ = props.scenarios.slice();
|
|
59
|
+
this.config_ = {
|
|
60
|
+
repeat: props.config?.repeat ?? 10,
|
|
61
|
+
simultaneous: props.config?.simultaneous ?? 10,
|
|
62
|
+
};
|
|
63
|
+
this.histories_ = props.agent.getHistories().slice();
|
|
64
|
+
this.result_ = null;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
/**
|
|
68
|
+
* Execute the benchmark.
|
|
69
|
+
*
|
|
70
|
+
* Execute the benchmark of the LLM function selection, and returns
|
|
71
|
+
* the result of the benchmark.
|
|
72
|
+
*
|
|
73
|
+
* If you wanna see progress of the benchmark, you can pass a callback
|
|
74
|
+
* function as the argument of the `listener`. The callback function
|
|
75
|
+
* would be called whenever a benchmark event is occurred.
|
|
76
|
+
*
|
|
77
|
+
* Also, you can publish a markdown format report by calling
|
|
78
|
+
* the {@link report} function after the benchmark execution.
|
|
79
|
+
*
|
|
80
|
+
* @param listener Callback function listening the benchmark events
|
|
81
|
+
* @returns Results of the function selection benchmark
|
|
82
|
+
*/
|
|
83
|
+
public async execute(
|
|
84
|
+
listener?: (event: IAgenticaSelectBenchmarkEvent) => void,
|
|
85
|
+
): Promise<IAgenticaSelectBenchmarkResult> {
|
|
86
|
+
const started_at: Date = new Date();
|
|
87
|
+
const semaphore: Semaphore = new Semaphore(this.config_.simultaneous);
|
|
88
|
+
const experiments: IAgenticaSelectBenchmarkResult.IExperiment[]
|
|
89
|
+
= await Promise.all(
|
|
90
|
+
this.scenarios_.map(async (scenario) => {
|
|
91
|
+
const events: IAgenticaSelectBenchmarkEvent[]
|
|
92
|
+
= await Promise.all(
|
|
93
|
+
Array.from({ length: this.config_.repeat }).map(async () => {
|
|
94
|
+
await semaphore.acquire();
|
|
95
|
+
const e: IAgenticaSelectBenchmarkEvent
|
|
96
|
+
= await this.step(scenario);
|
|
97
|
+
await semaphore.release();
|
|
98
|
+
if (listener !== undefined) {
|
|
99
|
+
listener(e);
|
|
100
|
+
}
|
|
101
|
+
return e;
|
|
102
|
+
}),
|
|
103
|
+
);
|
|
104
|
+
return {
|
|
105
|
+
scenario,
|
|
106
|
+
events,
|
|
107
|
+
usage: events
|
|
108
|
+
.filter(e => e.type !== "error")
|
|
109
|
+
.map(e => e.usage)
|
|
110
|
+
.reduce((acc, cur) => AgenticaTokenUsage.plus(acc, cur), AgenticaTokenUsage.zero()),
|
|
111
|
+
};
|
|
112
|
+
}),
|
|
113
|
+
);
|
|
114
|
+
return (this.result_ = {
|
|
115
|
+
experiments,
|
|
116
|
+
started_at,
|
|
117
|
+
completed_at: new Date(),
|
|
118
|
+
usage: experiments
|
|
119
|
+
.map(p => p.usage)
|
|
120
|
+
.reduce((acc, cur) => AgenticaTokenUsage.plus(acc, cur), AgenticaTokenUsage.zero()),
|
|
121
|
+
});
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
/**
|
|
125
|
+
* Report the benchmark result as markdown files.
|
|
126
|
+
*
|
|
127
|
+
* Report the benchmark result {@link execute}d by
|
|
128
|
+
* `AgenticaSelectBenchmark` as markdown files, and returns a
|
|
129
|
+
* dictionary object of the markdown reporting files. The key of
|
|
130
|
+
* the dictionary would be file name, and the value would be the
|
|
131
|
+
* markdown content.
|
|
132
|
+
*
|
|
133
|
+
* For reference, the markdown files are composed like below:
|
|
134
|
+
*
|
|
135
|
+
* - `./README.md`
|
|
136
|
+
* - `./scenario-1/README.md`
|
|
137
|
+
* - `./scenario-1/1.success.md`
|
|
138
|
+
* - `./scenario-1/2.failure.md`
|
|
139
|
+
* - `./scenario-1/3.error.md`
|
|
140
|
+
*
|
|
141
|
+
* @returns Dictionary of markdown files.
|
|
142
|
+
*/
|
|
143
|
+
public report(): Record<string, string> {
|
|
144
|
+
if (this.result_ === null) {
|
|
145
|
+
throw new Error("Benchmark is not executed yet.");
|
|
146
|
+
}
|
|
147
|
+
return AgenticaSelectBenchmarkReporter.markdown(this.result_);
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
private async step(
|
|
151
|
+
scenario: IAgenticaSelectBenchmarkScenario,
|
|
152
|
+
): Promise<IAgenticaSelectBenchmarkEvent> {
|
|
153
|
+
const started_at: Date = new Date();
|
|
154
|
+
try {
|
|
155
|
+
const usage: AgenticaTokenUsage = AgenticaTokenUsage.zero();
|
|
156
|
+
const historyGetters: Array<() => Promise<AgenticaHistory>> = [];
|
|
157
|
+
const dispatch = async (event: AgenticaEvent): Promise<void> => {
|
|
158
|
+
if ("toHistory" in event) {
|
|
159
|
+
if ("join" in event) {
|
|
160
|
+
historyGetters.push(async () => {
|
|
161
|
+
await event.join();
|
|
162
|
+
return event.toHistory();
|
|
163
|
+
});
|
|
164
|
+
}
|
|
165
|
+
else {
|
|
166
|
+
historyGetters.push(async () => event.toHistory());
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
};
|
|
170
|
+
const context: AgenticaContext = this.agent_.getContext({
|
|
171
|
+
prompt: factory.createUserMessageHistory({
|
|
172
|
+
id: v4(),
|
|
173
|
+
created_at: started_at.toISOString(),
|
|
174
|
+
contents: [{
|
|
175
|
+
type: "text",
|
|
176
|
+
text: scenario.text,
|
|
177
|
+
}],
|
|
178
|
+
}),
|
|
179
|
+
usage,
|
|
180
|
+
dispatch,
|
|
181
|
+
});
|
|
182
|
+
if (typeof context.config?.executor === "function") {
|
|
183
|
+
throw new TypeError("select function is not found");
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
await (context.config?.executor?.select ?? orchestrate.select)({
|
|
187
|
+
...context,
|
|
188
|
+
histories: this.histories_.slice(),
|
|
189
|
+
stack: [],
|
|
190
|
+
ready: () => true,
|
|
191
|
+
});
|
|
192
|
+
const histories: AgenticaHistory[]
|
|
193
|
+
= await Promise.all(
|
|
194
|
+
historyGetters.map(async g => g()),
|
|
195
|
+
);
|
|
196
|
+
const selected: AgenticaOperationSelection[] = histories
|
|
197
|
+
.filter(p => p.type === "select")
|
|
198
|
+
.map(p => p.selection);
|
|
199
|
+
return {
|
|
200
|
+
type: AgenticaBenchmarkPredicator.success({
|
|
201
|
+
expected: scenario.expected,
|
|
202
|
+
operations: selected.map(s => s.operation),
|
|
203
|
+
})
|
|
204
|
+
? "success"
|
|
205
|
+
: "failure",
|
|
206
|
+
scenario,
|
|
207
|
+
selected,
|
|
208
|
+
usage,
|
|
209
|
+
assistantPrompts: histories
|
|
210
|
+
// Only the assistant is allowed to emit text events.
|
|
211
|
+
.filter(p => p.type === "assistantMessage"),
|
|
212
|
+
started_at,
|
|
213
|
+
completed_at: new Date(),
|
|
214
|
+
} satisfies
|
|
215
|
+
| IAgenticaSelectBenchmarkEvent.ISuccess
|
|
216
|
+
| IAgenticaSelectBenchmarkEvent.IFailure;
|
|
217
|
+
}
|
|
218
|
+
catch (error) {
|
|
219
|
+
return {
|
|
220
|
+
type: "error",
|
|
221
|
+
scenario,
|
|
222
|
+
error,
|
|
223
|
+
started_at,
|
|
224
|
+
completed_at: new Date(),
|
|
225
|
+
} satisfies IAgenticaSelectBenchmarkEvent.IError;
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
export namespace AgenticaSelectBenchmark {
|
|
230
|
+
/**
|
|
231
|
+
* Properties of the {@link AgenticaSelectBenchmark} constructor.
|
|
232
|
+
*/
|
|
233
|
+
export interface IProps {
|
|
234
|
+
/**
|
|
235
|
+
* AI agent instance.
|
|
236
|
+
*/
|
|
237
|
+
agent: Agentica;
|
|
238
|
+
|
|
239
|
+
/**
|
|
240
|
+
* List of scenarios what you expect.
|
|
241
|
+
*/
|
|
242
|
+
scenarios: IAgenticaSelectBenchmarkScenario[];
|
|
243
|
+
|
|
244
|
+
/**
|
|
245
|
+
* Configuration for the benchmark.
|
|
246
|
+
*/
|
|
247
|
+
config?: Partial<IConfig>;
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
/**
|
|
251
|
+
* Configuration for the benchmark.
|
|
252
|
+
*
|
|
253
|
+
* `AgenticaSelectBenchmark.IConfig` is a data structure which
|
|
254
|
+
* represents a configuration for the benchmark, especially the
|
|
255
|
+
* capacity information of the benchmark execution.
|
|
256
|
+
*/
|
|
257
|
+
export interface IConfig {
|
|
258
|
+
/**
|
|
259
|
+
* Repeat count.
|
|
260
|
+
*
|
|
261
|
+
* The number of repeating count for the benchmark execution
|
|
262
|
+
* for each scenario.
|
|
263
|
+
*
|
|
264
|
+
* @default 10
|
|
265
|
+
*/
|
|
266
|
+
repeat: number & tags.Type<"uint32"> & tags.Minimum<1>;
|
|
267
|
+
|
|
268
|
+
/**
|
|
269
|
+
* Simultaneous count.
|
|
270
|
+
*
|
|
271
|
+
* The number of simultaneous count for the parallel benchmark
|
|
272
|
+
* execution.
|
|
273
|
+
*
|
|
274
|
+
* If you configure this property greater than `1`, the benchmark
|
|
275
|
+
* for each scenario would be executed in parallel in the given
|
|
276
|
+
* count.
|
|
277
|
+
*
|
|
278
|
+
* @default 10
|
|
279
|
+
*/
|
|
280
|
+
simultaneous: number & tags.Type<"uint32"> & tags.Minimum<1>;
|
|
281
|
+
}
|
|
282
|
+
}
|