@agentica/benchmark 0.12.21 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -33
- package/lib/AgenticaCallBenchmark.d.ts +12 -6
- package/lib/AgenticaCallBenchmark.js +24 -18
- package/lib/AgenticaCallBenchmark.js.map +1 -1
- package/lib/AgenticaSelectBenchmark.d.ts +12 -6
- package/lib/AgenticaSelectBenchmark.js +14 -12
- package/lib/AgenticaSelectBenchmark.js.map +1 -1
- package/lib/index.mjs +315 -236
- package/lib/index.mjs.map +1 -1
- package/lib/internal/AgenticaBenchmarkPredicator.d.ts +38 -29
- package/lib/internal/AgenticaBenchmarkPredicator.js +100 -84
- package/lib/internal/AgenticaBenchmarkPredicator.js.map +1 -1
- package/lib/internal/AgenticaBenchmarkUtil.d.ts +21 -6
- package/lib/internal/AgenticaBenchmarkUtil.js +39 -33
- package/lib/internal/AgenticaBenchmarkUtil.js.map +1 -1
- package/lib/internal/AgenticaCallBenchmarkReporter.d.ts +6 -5
- package/lib/internal/AgenticaCallBenchmarkReporter.js +130 -126
- package/lib/internal/AgenticaCallBenchmarkReporter.js.map +1 -1
- package/lib/internal/AgenticaPromptReporter.d.ts +13 -5
- package/lib/internal/AgenticaPromptReporter.js +45 -41
- package/lib/internal/AgenticaPromptReporter.js.map +1 -1
- package/lib/internal/AgenticaSelectBenchmarkReporter.d.ts +3 -1
- package/lib/internal/AgenticaSelectBenchmarkReporter.js +153 -150
- package/lib/internal/AgenticaSelectBenchmarkReporter.js.map +1 -1
- package/lib/structures/IAgenticaBenchmarkExpected.d.ts +8 -2
- package/lib/structures/IAgenticaCallBenchmarkEvent.d.ts +9 -3
- package/lib/structures/IAgenticaCallBenchmarkResult.d.ts +10 -4
- package/lib/structures/IAgenticaCallBenchmarkScenario.d.ts +8 -2
- package/lib/structures/IAgenticaSelectBenchmarkEvent.d.ts +9 -3
- package/lib/structures/IAgenticaSelectBenchmarkResult.d.ts +10 -4
- package/lib/structures/IAgenticaSelectBenchmarkScenario.d.ts +8 -2
- package/lib/utils/MathUtil.d.ts +15 -3
- package/lib/utils/MathUtil.js +15 -4
- package/lib/utils/MathUtil.js.map +1 -1
- package/package.json +12 -10
- package/src/AgenticaCallBenchmark.ts +64 -45
- package/src/AgenticaSelectBenchmark.ts +42 -30
- package/src/internal/AgenticaBenchmarkPredicator.ts +208 -186
- package/src/internal/AgenticaBenchmarkUtil.ts +58 -40
- package/src/internal/AgenticaCallBenchmarkReporter.ts +180 -182
- package/src/internal/AgenticaPromptReporter.ts +46 -33
- package/src/internal/AgenticaSelectBenchmarkReporter.ts +205 -203
- package/src/structures/IAgenticaBenchmarkExpected.ts +9 -2
- package/src/structures/IAgenticaCallBenchmarkEvent.ts +9 -3
- package/src/structures/IAgenticaCallBenchmarkResult.ts +10 -4
- package/src/structures/IAgenticaCallBenchmarkScenario.ts +8 -2
- package/src/structures/IAgenticaSelectBenchmarkEvent.ts +9 -3
- package/src/structures/IAgenticaSelectBenchmarkResult.ts +10 -4
- package/src/structures/IAgenticaSelectBenchmarkScenario.ts +8 -2
- package/src/utils/MathUtil.ts +16 -3
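
Before the file-by-file diff, a short orientation sketch. The bundled `lib/index.mjs` below shows that `AgenticaCallBenchmark.execute()` runs every scenario `config.repeat` times behind a semaphore and invokes an optional per-event `listener`, and that `report()` throws `"Benchmark is not executed yet."` until `execute()` has resolved, otherwise returning an object that maps relative file paths (e.g. `./README.md`) to markdown documents. The TypeScript sketch below shows one way a consumer might persist that report to disk; the `CallBenchmarkLike` interface, the `saveCallBenchmarkReport` helper, and the `docs/benchmarks/call` output directory are illustrative assumptions, not part of the package.

```ts
import fs from "node:fs/promises";
import path from "node:path";

// Minimal shape of the benchmark as used here; the real class exposes more.
interface CallBenchmarkLike {
  execute(listener?: (event: unknown) => void): Promise<unknown>;
  report(): Record<string, string>; // relative path (e.g. "./README.md") -> markdown
}

// Hypothetical helper: run the benchmark once and write every generated
// markdown document under the given root directory.
export async function saveCallBenchmarkReport(
  benchmark: CallBenchmarkLike,
  root: string = "docs/benchmarks/call",
): Promise<void> {
  await benchmark.execute((event) => console.log("benchmark event:", event));
  const files = benchmark.report(); // throws if execute() has not finished
  for (const [relative, markdown] of Object.entries(files)) {
    const target = path.join(root, relative);
    await fs.mkdir(path.dirname(target), { recursive: true });
    await fs.writeFile(target, markdown, "utf8");
  }
}
```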
package/lib/index.mjs
CHANGED
@@ -8,254 +8,302 @@ import "typia";
 
 import { ChatGptSelectFunctionAgent } from "@agentica/core/src/chatgpt/ChatGptSelectFunctionAgent";
 
-
+const AgenticaBenchmarkPredicator = {
+isNext,
+success
+};
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-description: "Recommended reply message for the user.\n\nThe message what AI agent wants the user to reply\naccepting the AI agent's next job suggestion.",
-type: "string"
-}
+async function isNext(agent) {
+const last = agent.getPromptHistories().at(-1);
+const llmVendor = agent.getVendor();
+const isTextPrompt = last?.type === "text" && last.role === "assistant";
+if (!isTextPrompt) {
+return null;
+}
+const consent = {
+functions: [ {
+name: "consent",
+parameters: {
+description: "Properties for asking the user's consent",
+type: "object",
+properties: {
+content: {
+description: "Reason of the message implying what the AI agent wants\nto do at the next step after the user's consent.",
+type: "string"
 },
-
-
-
+reply: {
+title: "Recommended reply message for the user",
+description: "Recommended reply message for the user.\n\nThe message what AI agent wants the user to reply\naccepting the AI agent's next job suggestion.",
+type: "string"
+}
 },
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-success: true,
+required: [ "content", "reply" ],
+additionalProperties: false,
+$defs: {}
+},
+description: "Ask user to consent for what the AI agent wants to do next.\n\nIf AI agent wants to do some function calling at next,\nbut it needs the user's consent about the function calling to do,\nthen call this tool function.",
+validate: (() => {
+const _io0 = input => "string" === typeof input.content && "string" === typeof input.reply;
+const _vo0 = (input, _path, _exceptionable = true) => [ "string" === typeof input.content || _report(_exceptionable, {
+path: _path + ".content",
+expected: "string",
+value: input.content
+}), "string" === typeof input.reply || _report(_exceptionable, {
+path: _path + ".reply",
+expected: "string",
+value: input.reply
+}) ].every((flag => flag));
+const __is = input => "object" === typeof input && null !== input && _io0(input);
+let errors;
+let _report;
+return input => {
+if (false === __is(input)) {
+errors = [];
+_report = __typia_transform__validateReport._validateReport(errors);
+((input, _path, _exceptionable = true) => ("object" === typeof input && null !== input || _report(true, {
+path: _path + "",
+expected: "IConsentProps",
+value: input
+})) && _vo0(input, _path + "", true) || _report(true, {
+path: _path + "",
+expected: "IConsentProps",
+value: input
+}))(input, "$input", true);
+const success = 0 === errors.length;
+return success ? {
+success,
+data: input
+} : {
+success,
+errors,
 data: input
 };
+}
+return {
+success: true,
+data: input
+};
+};
+})()
+} ]
+}.functions[0];
+const result = await llmVendor.api.chat.completions.create({
+model: llmVendor.model,
+messages: [ {
+role: "system",
+content: [ "You are an helpful assistant.", "", "If what the assistant said seems like to asking for", "user's consent about some function calling at the next step,", "use the tools appropriately to step to the next." ].join("\n")
+}, {
+role: "assistant",
+content: last.text
+} ],
+tools: [ {
+type: "function",
+function: {
+name: consent.name,
+description: consent.description,
+parameters: consent.parameters
+}
+} ],
+tool_choice: "required",
+parallel_tool_calls: false
+}, llmVendor.options);
+const toolCall = (result.choices[0]?.message.tool_calls ?? []).filter((tc => tc.type === "function" && tc.function.name === consent.name))?.[0];
+if (toolCall === undefined) {
+return null;
+}
+const input = (() => {
+const _io0 = input => "string" === typeof input.content && "string" === typeof input.reply;
+const __is = input => "object" === typeof input && null !== input && _io0(input);
+return input => {
+input = JSON.parse(input);
+return __is(input) ? input : null;
+};
+})()(toolCall.function.arguments);
+return input !== null ? input.reply : null;
+}
+
+function success(props) {
+return successInner(props).result;
+}
+
+function successInner(props) {
+const call = (expected, overrideOperations) => successInner({
+expected,
+operations: overrideOperations ?? props.operations,
+strict: props.strict
+});
+switch (props.expected.type) {
+case "array":
+{
+let take = 0;
+const targetIterator = props.expected.items[Symbol.iterator]();
+let targeted = targetIterator.next();
+while (true) {
+if (targeted.done === true) {
+return {
+result: true,
+take
 };
-})()
-} ]
-}.functions[0];
-const result = await agent["props"].vendor.api.chat.completions.create({
-model: agent["props"].vendor.model,
-messages: [ {
-role: "system",
-content: [ "You are an helpful assistant.", "", "If what the assistant said seems like to asking for", "user's consent about some function calling at the next step,", "use the tools appropriately to step to the next." ].join("\n")
-}, {
-role: "assistant",
-content: last.text
-} ],
-tools: [ {
-type: "function",
-function: {
-name: consent.name,
-description: consent.description,
-parameters: consent.parameters
 }
-
-
-
-
-
-
-
-
-const _io0 = input => "string" === typeof input.content && "string" === typeof input.reply;
-return input => "object" === typeof input && null !== input && _io0(input);
-})()(input) ? input.reply : null;
-};
-AgenticaBenchmarkPredicator.success = props => successInner(props).result;
-const successInner = props => {
-const call = (expected, overrideOperations) => successInner({
-expected,
-operations: overrideOperations ?? props.operations,
-strict: props.strict
-});
-switch (props.expected.type) {
-case "array":
-{
-let take = 0;
-const targetIterator = props.expected.items[Symbol.iterator]();
-let targeted = targetIterator.next();
-while (true) {
-if (targeted.done) {
-return {
-result: true,
-take
-};
-}
-if (take >= props.operations.length) {
-return {
-result: false
-};
-}
-const result = call(targeted.value, props.operations.slice(take));
-if (!result.result) {
-if (!props.strict) {
-take += 1;
-continue;
-}
+if (take >= props.operations.length) {
+return {
+result: false
+};
+}
+const result = call(targeted.value, props.operations.slice(take));
+if (!result.result) {
+if (props.strict === true) {
 return {
 result: false
 };
 }
-take +=
-
+take += 1;
+continue;
 }
+take += result.take;
+targeted = targetIterator.next();
 }
+}
 
-
-
-
-
-
-return {
-result,
-take: 1
-};
-}
+case "standalone":
+{
+const target = props.expected.operation;
+const result = props.operations.some((op => op.name === target.name));
+if (result) {
 return {
-result
+result,
+take: 1
 };
 }
-
-case "anyOf":
-for (const expected of props.expected.anyOf) {
-const callResult = call(expected);
-if (callResult.result) {
-return callResult;
-}
-}
 return {
-result
+result
 };
+}
 
-
-
-
-
-
-
-
-
-
+case "anyOf":
+for (const expected of props.expected.anyOf) {
+const callResult = call(expected);
+if (callResult.result) {
+return callResult;
+}
+}
+return {
+result: false
+};
+
+case "allOf":
+{
+const result = props.expected.allOf.map((expected => call(expected)));
+if (result.every((r => r.result))) {
 return {
-result:
+result: true,
+take: result.reduce(((acc, r) => Math.max(acc, r.take)), 0)
 };
 }
+return {
+result: false
+};
 }
-}
-}
-
-var MathUtil;
+}
+}
 
-
-
-}
+const MathUtil = {
+round: value => Math.floor(value * 100) / 100
+};
 
-
+const AgenticaBenchmarkUtil = {
+errorToJson,
+expectedToJson
+};
 
-
-
-
+function errorToJson(error) {
+if (error instanceof Error) {
+return {
 ...error,
 name: error.name,
 message: error.message,
 stack: error.stack
 };
-
-
-
-
+}
+return error;
+}
+
+function expectedToJson(expected) {
+if (expected.type === "standalone") {
+return {
 type: expected.type,
 operation: {
 name: expected.operation.name,
 description: expected.operation.function.description
 }
-};
+};
+} else if (expected.type === "array") {
+return {
 type: expected.type,
-items: expected.items.map(
-};
+items: expected.items.map(expectedToJson)
+};
+} else if (expected.type === "allOf") {
+return {
 type: expected.type,
-allOf: expected.allOf.map(
-};
+allOf: expected.allOf.map(expectedToJson)
+};
+} else {
+return {
 type: expected.type,
-anyOf: expected.anyOf.map(
+anyOf: expected.anyOf.map(expectedToJson)
 };
-}
-}
+}
+}
 
-
+const AgenticaPromptReporter = {
+markdown: markdown$2
+};
 
-
-
-
-
-
-
+function markdown$2(p) {
+if (p.type === "text") {
+return [ `### Text (${p.role})`, p.text, "" ].join("\n");
+} else if (p.type === "select" || p.type === "cancel") {
+return [ `### ${p.type === "select" ? "Select" : "Cancel"}`, ...p.selections.flatMap((s => {
+const functionDescriptionCount = s.operation.function.description?.length ?? 0;
+return [ `#### ${s.operation.name}`, ` - controller: ${s.operation.controller.name}`, ` - function: ${s.operation.function.name}`, ` - reason: ${s.reason}`, "", ...functionDescriptionCount > 0 ? [ s.operation.function.description, "" ] : [] ];
+})) ].join("\n");
+} else if (p.type === "describe") {
+return [ "### Describe", ...p.executes.map((e => ` - ${e.operation.name}`)), "", ...p.text.split("\n").map((s => `> ${s}`)), "" ].join("\n");
+}
+return [ "### Execute", ` - name: ${p.operation.name}`, ` - controller: ${p.operation.controller.name}`, ` - function: ${p.operation.function.name}`, "", "```json", JSON.stringify(p.arguments, null, 2), "```", "" ].join("\n");
+}
 
-
+const AgenticaCallBenchmarkReporter = {
+markdown: markdown$1
+};
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+function markdown$1(result) {
+return Object.fromEntries([ [ "./README.md", writeIndex$1(result) ], ...result.experiments.map((exp => [ [ `./${exp.scenario.name}/README.md`, writeExperimentIndex$1(exp) ], ...exp.events.map(((event, i) => [ `./${exp.scenario.name}/${i + 1}.${event.type}.md`, writeExperimentEvent$1(event, i) ])) ])).flat() ]);
+}
+
+function writeIndex$1(result) {
+const events = result.experiments.map((r => r.events)).flat();
+const average = events.map((e => e.completed_at.getTime() - e.started_at.getTime())).reduce(((a, b) => a + b), 0) / events.length;
+const aggregate = result.usage.aggregate;
+return [ "# LLM Function Call Benchmark", "## Summary", ` - Aggregation:`, ` - Scenarios: #${result.experiments.length.toLocaleString()}`, ` - Trial: ${events.length}`, ` - Success: ${events.filter((e => e.type === "success")).length}`, ` - Failure: ${events.filter((e => e.type === "failure")).length}`, ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`, ` - Token Usage`, ` - Total: ${aggregate.total.toLocaleString()}`, ` - Input`, ` - Total: ${aggregate.input.total.toLocaleString()}`, ` - Cached: ${aggregate.input.cached.toLocaleString()}`, ` - Output:`, ` - Total: ${aggregate.output.total.toLocaleString()}`, ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`, ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`, ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`, "", "## Experiments", " Name | Select | Call | Time/Avg ", ":-----|:-------|:-----|----------:", ...result.experiments.map((exp => [ `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`, drawStatus(exp.events, (e => e.type !== "error" && e.select === true)), drawStatus(exp.events, (e => e.type !== "error" && e.call === true)), `${MathUtil.round(exp.events.map((e => e.completed_at.getTime() - e.started_at.getTime())).reduce(((a, b) => a + b), 0) / exp.events.length).toLocaleString()} ms` ].join(" | "))) ].join("\n");
+}
+
+function writeExperimentIndex$1(exp) {
+return [ `# ${exp.scenario.name}`, "## Summary", ` - Scenarios: #${exp.events.length.toLocaleString()}`, ` - Success: ${exp.events.filter((e => e.type === "success")).length}`, ` - Failure: ${exp.events.filter((e => e.type === "failure")).length}`, ` - Average Time: ${MathUtil.round(exp.events.map((e => e.completed_at.getTime() - e.started_at.getTime())).reduce(((a, b) => a + b), 0) / exp.events.length).toLocaleString()} ms`, "", "## Events", " Name | Type | Time", ":-----|:-----|----:", ...exp.events.map(((e, i) => [ `[${i + 1}.](./${i + 1}.${e.type}.md)`, e.type, `${MathUtil.round(e.completed_at.getTime() - e.started_at.getTime())} ms` ].join(" | "))), "", "## Scenario", "### User Prompt", exp.scenario.text, "", "### Expected", "```json", JSON.stringify(AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected), null, 2), "```" ].join("\n");
+}
+
+function writeExperimentEvent$1(event, index) {
+return [ `# ${index + 1}. ${event.type}`, "## Summary", ` - Name: ${event.scenario.name}`, ` - Type: ${event.type}`, ` - Time: ${MathUtil.round(event.completed_at.getTime() - event.started_at.getTime()).toLocaleString()} ms`, ...event.type !== "error" ? [ ` - Select: ${event.select ? "✅" : "❌"}`, ` - Call: ${event.call ? "✅" : "❌"}` ] : [], ` - Token Usage:`, ` - Total: ${JSON.stringify(event.usage.aggregate.total)}`, ` - Input`, ` - Total: ${event.usage.aggregate.input.total}`, ` - Cached: ${event.usage.aggregate.input.cached}`, ` - Output:`, ` - Total: ${event.usage.aggregate.output.total}`, ` - Accepted Prediction: ${event.usage.aggregate.output.accepted_prediction}`, ` - Reasoning: ${event.usage.aggregate.output.reasoning}`, ` - Rejected Prediction: ${event.usage.aggregate.output.rejected_prediction}`, "", "## Scenario", "### User Prompt", event.scenario.text, "", "### Expected", "```json", JSON.stringify(AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected), null, 2), "```", "", "## Prompt Histories", ...event.prompts.map(AgenticaPromptReporter.markdown), "", ...event.type === "error" ? [ "## Error", "```json", JSON.stringify(AgenticaBenchmarkUtil.errorToJson(event.error), null, 2), "```" ] : [] ].join("\n");
+}
+
+function drawStatus(events, success) {
+const count = Math.floor(events.filter(success).length / events.length * 10);
+return Array.from({
+length: count
+}).fill("■").join("") + Array.from({
+length: 10 - count
+}).fill("□").join("");
+}
 
 class AgenticaCallBenchmark {
 constructor(props) {
@@ -271,29 +319,36 @@ class AgenticaCallBenchmark {
 async execute(listener) {
 const started_at = new Date;
 const semaphore = new Semaphore(this.config_.simultaneous);
-const
-const events = await Promise.all(
+const task = this.scenarios_.map((async scenario => {
+const events = await Promise.all(Array.from({
+length: this.config_.repeat
+}).map((async () => {
 await semaphore.acquire();
 const e = await this.step(scenario);
 await semaphore.release();
-if (listener !== undefined)
+if (listener !== undefined) {
+listener(e);
+}
 return e;
 })));
 return {
 scenario,
 events,
-usage: events.filter((e => e.type !== "error")).map((e => e.usage)).reduce(AgenticaTokenUsage.plus, AgenticaTokenUsage.zero())
+usage: events.filter((e => e.type !== "error")).map((e => e.usage)).reduce(((acc, cur) => AgenticaTokenUsage.plus(acc, cur)), AgenticaTokenUsage.zero())
 };
-}))
+}));
+const experiments = await Promise.all(task);
 return this.result_ = {
 experiments,
 started_at,
 completed_at: new Date,
-usage: experiments.map((p => p.usage)).reduce(AgenticaTokenUsage.plus, AgenticaTokenUsage.zero())
+usage: experiments.map((p => p.usage)).reduce(((acc, cur) => AgenticaTokenUsage.plus(acc, cur)), AgenticaTokenUsage.zero())
 };
 }
 report() {
-if (this.result_ === null)
+if (this.result_ === null) {
+throw new Error("Benchmark is not executed yet.");
+}
 return AgenticaCallBenchmarkReporter.markdown(this.result_);
 }
 async step(scenario) {
@@ -324,12 +379,18 @@ class AgenticaCallBenchmark {
 };
 try {
 await agent.conversate(scenario.text);
-if (success())
+if (success()) {
+return out();
+}
 for (let i = 0; i < this.config_.consent; ++i) {
 const next = await AgenticaBenchmarkPredicator.isNext(agent);
-if (next === null)
+if (next === null) {
+break;
+}
 await agent.conversate(next);
-if (success())
+if (success()) {
+return out();
+}
 }
 return out();
 } catch (error) {
@@ -346,25 +407,37 @@ class AgenticaCallBenchmark {
 }
 }
 
-
+const AgenticaSelectBenchmarkReporter = {
+markdown
+};
+
+function markdown(result) {
+const iterator = [ [ "./README.md", writeIndex(result) ], ...result.experiments.map((exp => [ [ `./${exp.scenario.name}/README.md`, writeExperimentIndex(exp) ], ...exp.events.map(((event, i) => [ `./${exp.scenario.name}/${i + 1}.${event.type}.md`, writeExperimentEvent(event, i) ])) ])).flat() ];
+return Object.fromEntries(iterator);
+}
+
+function writeIndex(result) {
+const events = result.experiments.map((r => r.events)).flat();
+const average = events.map((e => e.completed_at.getTime() - e.started_at.getTime())).reduce(((a, b) => a + b), 0) / events.length;
+const aggregate = result.usage.aggregate;
+return [ "# LLM Function Selection Benchmark", "## Summary", ` - Aggregation:`, ` - Scenarios: #${result.experiments.length.toLocaleString()}`, ` - Trial: ${events.length}`, ` - Success: ${events.filter((e => e.type === "success")).length}`, ` - Failure: ${events.filter((e => e.type === "failure")).length}`, ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`, ` - Token Usage`, ` - Total: ${aggregate.total.toLocaleString()}`, ` - Input`, ` - Total: ${aggregate.input.total.toLocaleString()}`, ` - Cached: ${aggregate.input.cached.toLocaleString()}`, ` - Output:`, ` - Total: ${aggregate.output.total.toLocaleString()}`, ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`, ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`, ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`, "", "## Experiments", " Name | Status | Time/Avg ", ":-----|:-------|----------:", ...result.experiments.map((exp => [ `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`, (() => {
+const success = Math.floor(exp.events.filter((e => e.type === "success")).length / exp.events.length * 10);
+return Array.from({
+length: success
+}).fill("■").join("") + Array.from({
+length: 10 - success
+}).fill("□").join("");
+})(), `${MathUtil.round(exp.events.map((event => event.completed_at.getTime() - event.started_at.getTime())).reduce(((a, b) => a + b), 0) / exp.events.length).toLocaleString()} ms` ].join(" | "))) ].join("\n");
+}
+
+function writeExperimentIndex(exp) {
+const aggregate = exp.usage.aggregate;
+return [ `# ${exp.scenario.name}`, "## Summary", " - Aggregation:", ` - Trial: ${exp.events.length}`, ` - Success: ${exp.events.filter((e => e.type === "success")).length}`, ` - Failure: ${exp.events.filter((e => e.type === "failure")).length}`, ` - Average Time: ${MathUtil.round(exp.events.map((event => event.completed_at.getTime() - event.started_at.getTime())).reduce(((a, b) => a + b), 0) / exp.events.length).toLocaleString()} ms`, ` - Token Usage`, ` - Total: ${aggregate.total.toLocaleString()}`, ` - Input`, ` - Total: ${aggregate.input.total.toLocaleString()}`, ` - Cached: ${aggregate.input.cached.toLocaleString()}`, ` - Output:`, ` - Total: ${aggregate.output.total.toLocaleString()}`, ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`, ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`, ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`, "", "## Events", " No | Type | Time", "---:|:-----|----:", ...exp.events.map(((e, i) => [ `[${i + 1}.](./${i + 1}.${e.type}.md)`, e.type, `${MathUtil.round(e.completed_at.getTime() - e.started_at.getTime())} ms` ].join(" | "))), "", "## Scenario", "### User Prompt", exp.scenario.text, "", "### Expected", "```json", JSON.stringify(AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected), null, 2), "```" ].join("\n");
+}
 
-
-
-
-const events = result.experiments.map((r => r.events)).flat();
-const average = events.map((e => e.completed_at.getTime() - e.started_at.getTime())).reduce(((a, b) => a + b), 0) / events.length;
-const aggregate = result.usage.aggregate;
-return [ "# LLM Function Selection Benchmark", "## Summary", ` - Aggregation:`, ` - Scenarios: #${result.experiments.length.toLocaleString()}`, ` - Trial: ${events.length}`, ` - Success: ${events.filter((e => e.type === "success")).length}`, ` - Failure: ${events.filter((e => e.type === "failure")).length}`, ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`, ` - Token Usage`, ` - Total: ${aggregate.total.toLocaleString()}`, ` - Input`, ` - Total: ${aggregate.input.total.toLocaleString()}`, ` - Cached: ${aggregate.input.cached.toLocaleString()}`, ` - Output:`, ` - Total: ${aggregate.output.total.toLocaleString()}`, ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`, ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`, ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`, "", "## Experiments", " Name | Status | Time/Avg ", ":-----|:-------|----------:", ...result.experiments.map((exp => [ `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`, (() => {
-const success = Math.floor(exp.events.filter((e => e.type === "success")).length / exp.events.length * 10);
-return new Array(success).fill("■").join("") + new Array(10 - success).fill("□").join("");
-})(), MathUtil.round(exp.events.map((event => event.completed_at.getTime() - event.started_at.getTime())).reduce(((a, b) => a + b), 0) / exp.events.length).toLocaleString() + " ms" ].join(" | "))) ].join("\n");
-};
-const writeExperimentIndex = exp => {
-const aggregate = exp.usage.aggregate;
-return [ `# ${exp.scenario.name}`, "## Summary", " - Aggregation:", ` - Trial: ${exp.events.length}`, ` - Success: ${exp.events.filter((e => e.type === "success")).length}`, ` - Failure: ${exp.events.filter((e => e.type === "failure")).length}`, ` - Average Time: ${MathUtil.round(exp.events.map((event => event.completed_at.getTime() - event.started_at.getTime())).reduce(((a, b) => a + b), 0) / exp.events.length).toLocaleString()} ms`, ` - Token Usage`, ` - Total: ${aggregate.total.toLocaleString()}`, ` - Input`, ` - Total: ${aggregate.input.total.toLocaleString()}`, ` - Cached: ${aggregate.input.cached.toLocaleString()}`, ` - Output:`, ` - Total: ${aggregate.output.total.toLocaleString()}`, ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`, ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`, ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`, "", "## Events", " No | Type | Time", "---:|:-----|----:", ...exp.events.map(((e, i) => [ `[${i + 1}.](./${i + 1}.${e.type}.md)`, e.type, MathUtil.round(e.completed_at.getTime() - e.started_at.getTime()) + " ms" ].join(" | "))), "", "## Scenario", "### User Prompt", exp.scenario.text, "", "### Expected", "```json", JSON.stringify(AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected), null, 2), "```" ].join("\n");
-};
-const writeExperimentEvent = (event, index) => [ `# ${index + 1}. ${event.type}`, `## Summary`, ` - Name: ${event.scenario.name}`, ` - Type: ${event.type}`, ` - Time: ${(event.completed_at.getTime() - event.started_at.getTime()).toLocaleString()} ms`, ...event.type !== "error" ? [ " - Token Usage", ` - Total: ${event.usage.aggregate.toLocaleString()}`, ` - Prompt`, ` - Total: ${event.usage.aggregate.input.total.toLocaleString()}`, ` - Cached: ${event.usage.aggregate.input.cached.toLocaleString()}`, ` - Completion:`, ` - Total: ${event.usage.aggregate.output.total.toLocaleString()}`, ` - Reasoning: ${event.usage.aggregate.output.reasoning.toLocaleString()}`, ` - Accepted Prediction: ${event.usage.aggregate.output.accepted_prediction.toLocaleString()}`, ` - Rejected Prediction: ${event.usage.aggregate.output.rejected_prediction.toLocaleString()}` ] : [], "", "## Scenario", "### User Prompt", event.scenario.text, "", "### Expected", "```json", JSON.stringify(AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected), null, 2), "```", "", ...event.type === "success" || event.type === "failure" ? [ "## Result", ...event.selected.map((s => [ `### ${s.operation.name}`, ` - Controller: \`${s.operation.controller.name}\``, ` - Function: \`${s.operation.function.name}\``, ` - Reason: ${s.reason}`, "", ...s.operation.function.description ? [ s.operation.function.description, "" ] : [] ].join("\n"))) ] : [], ...event.type === "error" ? [ "## Error", "```json", AgenticaBenchmarkUtil.errorToJson(JSON.stringify(event.error, null, 2)), "```", "" ] : [] ].join("\n");
-})(AgenticaSelectBenchmarkReporter || (AgenticaSelectBenchmarkReporter = {}));
+function writeExperimentEvent(event, index) {
+return [ `# ${index + 1}. ${event.type}`, `## Summary`, ` - Name: ${event.scenario.name}`, ` - Type: ${event.type}`, ` - Time: ${(event.completed_at.getTime() - event.started_at.getTime()).toLocaleString()} ms`, ...event.type !== "error" ? [ " - Token Usage", ` - Total: ${event.usage.aggregate.toLocaleString()}`, ` - Prompt`, ` - Total: ${event.usage.aggregate.input.total.toLocaleString()}`, ` - Cached: ${event.usage.aggregate.input.cached.toLocaleString()}`, ` - Completion:`, ` - Total: ${event.usage.aggregate.output.total.toLocaleString()}`, ` - Reasoning: ${event.usage.aggregate.output.reasoning.toLocaleString()}`, ` - Accepted Prediction: ${event.usage.aggregate.output.accepted_prediction.toLocaleString()}`, ` - Rejected Prediction: ${event.usage.aggregate.output.rejected_prediction.toLocaleString()}` ] : [], "", "## Scenario", "### User Prompt", event.scenario.text, "", "### Expected", "```json", JSON.stringify(AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected), null, 2), "```", "", ...event.type === "success" || event.type === "failure" ? [ "## Result", ...event.selected.map((s => [ `### ${s.operation.name}`, ` - Controller: \`${s.operation.controller.name}\``, ` - Function: \`${s.operation.function.name}\``, ` - Reason: ${s.reason}`, "", ...s.operation.function.description !== undefined && s.operation.function.description !== "" ? [ s.operation.function.description, "" ] : [] ].join("\n"))) ] : [], ...event.type === "error" ? [ "## Error", "```json", AgenticaBenchmarkUtil.errorToJson(JSON.stringify(event.error, null, 2)), "```", "" ] : [] ].join("\n");
+}
 
 class AgenticaSelectBenchmark {
 constructor(props) {
@@ -381,28 +454,34 @@ class AgenticaSelectBenchmark {
 const started_at = new Date;
 const semaphore = new Semaphore(this.config_.simultaneous);
 const experiments = await Promise.all(this.scenarios_.map((async scenario => {
-const events = await Promise.all(
+const events = await Promise.all(Array.from({
+length: this.config_.repeat
+}).map((async () => {
 await semaphore.acquire();
 const e = await this.step(scenario);
 await semaphore.release();
-if (listener !== undefined)
+if (listener !== undefined) {
+listener(e);
+}
 return e;
 })));
 return {
 scenario,
 events,
-usage: events.filter((e => e.type !== "error")).map((e => e.usage)).reduce(AgenticaTokenUsage.plus, AgenticaTokenUsage.zero())
+usage: events.filter((e => e.type !== "error")).map((e => e.usage)).reduce(((acc, cur) => AgenticaTokenUsage.plus(acc, cur)), AgenticaTokenUsage.zero())
 };
 })));
 return this.result_ = {
 experiments,
 started_at,
 completed_at: new Date,
-usage: experiments.map((p => p.usage)).reduce(AgenticaTokenUsage.plus, AgenticaTokenUsage.zero())
+usage: experiments.map((p => p.usage)).reduce(((acc, cur) => AgenticaTokenUsage.plus(acc, cur)), AgenticaTokenUsage.zero())
 };
 }
 report() {
-if (this.result_ === null)
+if (this.result_ === null) {
+throw new Error("Benchmark is not executed yet.");
+}
 return AgenticaSelectBenchmarkReporter.markdown(this.result_);
 }
 async step(scenario) {