@agentica/benchmark 0.12.21 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/README.md +39 -33
  2. package/lib/AgenticaCallBenchmark.d.ts +12 -6
  3. package/lib/AgenticaCallBenchmark.js +24 -18
  4. package/lib/AgenticaCallBenchmark.js.map +1 -1
  5. package/lib/AgenticaSelectBenchmark.d.ts +12 -6
  6. package/lib/AgenticaSelectBenchmark.js +14 -12
  7. package/lib/AgenticaSelectBenchmark.js.map +1 -1
  8. package/lib/index.mjs +315 -236
  9. package/lib/index.mjs.map +1 -1
  10. package/lib/internal/AgenticaBenchmarkPredicator.d.ts +38 -29
  11. package/lib/internal/AgenticaBenchmarkPredicator.js +100 -84
  12. package/lib/internal/AgenticaBenchmarkPredicator.js.map +1 -1
  13. package/lib/internal/AgenticaBenchmarkUtil.d.ts +21 -6
  14. package/lib/internal/AgenticaBenchmarkUtil.js +39 -33
  15. package/lib/internal/AgenticaBenchmarkUtil.js.map +1 -1
  16. package/lib/internal/AgenticaCallBenchmarkReporter.d.ts +6 -5
  17. package/lib/internal/AgenticaCallBenchmarkReporter.js +130 -126
  18. package/lib/internal/AgenticaCallBenchmarkReporter.js.map +1 -1
  19. package/lib/internal/AgenticaPromptReporter.d.ts +13 -5
  20. package/lib/internal/AgenticaPromptReporter.js +45 -41
  21. package/lib/internal/AgenticaPromptReporter.js.map +1 -1
  22. package/lib/internal/AgenticaSelectBenchmarkReporter.d.ts +3 -1
  23. package/lib/internal/AgenticaSelectBenchmarkReporter.js +153 -150
  24. package/lib/internal/AgenticaSelectBenchmarkReporter.js.map +1 -1
  25. package/lib/structures/IAgenticaBenchmarkExpected.d.ts +8 -2
  26. package/lib/structures/IAgenticaCallBenchmarkEvent.d.ts +9 -3
  27. package/lib/structures/IAgenticaCallBenchmarkResult.d.ts +10 -4
  28. package/lib/structures/IAgenticaCallBenchmarkScenario.d.ts +8 -2
  29. package/lib/structures/IAgenticaSelectBenchmarkEvent.d.ts +9 -3
  30. package/lib/structures/IAgenticaSelectBenchmarkResult.d.ts +10 -4
  31. package/lib/structures/IAgenticaSelectBenchmarkScenario.d.ts +8 -2
  32. package/lib/utils/MathUtil.d.ts +15 -3
  33. package/lib/utils/MathUtil.js +15 -4
  34. package/lib/utils/MathUtil.js.map +1 -1
  35. package/package.json +12 -10
  36. package/src/AgenticaCallBenchmark.ts +64 -45
  37. package/src/AgenticaSelectBenchmark.ts +42 -30
  38. package/src/internal/AgenticaBenchmarkPredicator.ts +208 -186
  39. package/src/internal/AgenticaBenchmarkUtil.ts +58 -40
  40. package/src/internal/AgenticaCallBenchmarkReporter.ts +180 -182
  41. package/src/internal/AgenticaPromptReporter.ts +46 -33
  42. package/src/internal/AgenticaSelectBenchmarkReporter.ts +205 -203
  43. package/src/structures/IAgenticaBenchmarkExpected.ts +9 -2
  44. package/src/structures/IAgenticaCallBenchmarkEvent.ts +9 -3
  45. package/src/structures/IAgenticaCallBenchmarkResult.ts +10 -4
  46. package/src/structures/IAgenticaCallBenchmarkScenario.ts +8 -2
  47. package/src/structures/IAgenticaSelectBenchmarkEvent.ts +9 -3
  48. package/src/structures/IAgenticaSelectBenchmarkResult.ts +10 -4
  49. package/src/structures/IAgenticaSelectBenchmarkScenario.ts +8 -2
  50. package/src/utils/MathUtil.ts +16 -3
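Most of the changes in the diff below follow one structural pattern: the compiled output no longer emits TypeScript-namespace IIFEs (`var X; (function (X) { ... })(X || (X = {}))`) but instead exports plain `const` objects that reference module-level functions. A minimal TypeScript sketch of that before/after shape, using a hypothetical `Example` module rather than the package's real sources (the pattern mirrors what is visible for `MathUtil`, `AgenticaBenchmarkUtil`, and the reporters):

```ts
// Before: a namespace, which compiles to a `var` plus a mutating IIFE.
namespace ExampleBefore {
  export const round = (value: number): number => Math.floor(value * 100) / 100;
}

// After: a module-level function collected into a plain const object,
// which bundlers can generally tree-shake more easily.
function round(value: number): number {
  return Math.floor(value * 100) / 100;
}

export const ExampleAfter = {
  round,
};
```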
package/lib/index.mjs CHANGED
@@ -8,254 +8,302 @@ import "typia";
 
  import { ChatGptSelectFunctionAgent } from "@agentica/core/src/chatgpt/ChatGptSelectFunctionAgent";
 
- var AgenticaBenchmarkPredicator;
+ const AgenticaBenchmarkPredicator = {
+ isNext,
+ success
+ };
 
- (function(AgenticaBenchmarkPredicator) {
- AgenticaBenchmarkPredicator.isNext = async agent => {
- const last = agent.getPromptHistories().at(-1);
- if (last?.type !== "text" || last.role !== "assistant") return null;
- const consent = {
- functions: [ {
- name: "consent",
- parameters: {
- description: "Properties for asking the user's consent",
- type: "object",
- properties: {
- content: {
- description: "Reason of the message implying what the AI agent wants\nto do at the next step after the user's consent.",
- type: "string"
- },
- reply: {
- title: "Recommended reply message for the user",
- description: "Recommended reply message for the user.\n\nThe message what AI agent wants the user to reply\naccepting the AI agent's next job suggestion.",
- type: "string"
- }
+ async function isNext(agent) {
+ const last = agent.getPromptHistories().at(-1);
+ const llmVendor = agent.getVendor();
+ const isTextPrompt = last?.type === "text" && last.role === "assistant";
+ if (!isTextPrompt) {
+ return null;
+ }
+ const consent = {
+ functions: [ {
+ name: "consent",
+ parameters: {
+ description: "Properties for asking the user's consent",
+ type: "object",
+ properties: {
+ content: {
+ description: "Reason of the message implying what the AI agent wants\nto do at the next step after the user's consent.",
+ type: "string"
  },
- required: [ "content", "reply" ],
- additionalProperties: false,
- $defs: {}
+ reply: {
+ title: "Recommended reply message for the user",
+ description: "Recommended reply message for the user.\n\nThe message what AI agent wants the user to reply\naccepting the AI agent's next job suggestion.",
+ type: "string"
+ }
  },
- description: "Ask user to consent for what the AI agent wants to do next.\n\nIf AI agent wants to do some function calling at next,\nbut it needs the user's consent about the function calling to do,\nthen call this tool function.",
- validate: (() => {
- const _io0 = input => "string" === typeof input.content && "string" === typeof input.reply;
- const _vo0 = (input, _path, _exceptionable = true) => [ "string" === typeof input.content || _report(_exceptionable, {
- path: _path + ".content",
- expected: "string",
- value: input.content
- }), "string" === typeof input.reply || _report(_exceptionable, {
- path: _path + ".reply",
- expected: "string",
- value: input.reply
- }) ].every((flag => flag));
- const __is = input => "object" === typeof input && null !== input && _io0(input);
- let errors;
- let _report;
- return input => {
- if (false === __is(input)) {
- errors = [];
- _report = __typia_transform__validateReport._validateReport(errors);
- ((input, _path, _exceptionable = true) => ("object" === typeof input && null !== input || _report(true, {
- path: _path + "",
- expected: "IConsentProps",
- value: input
- })) && _vo0(input, _path + "", true) || _report(true, {
- path: _path + "",
- expected: "IConsentProps",
- value: input
- }))(input, "$input", true);
- const success = 0 === errors.length;
- return success ? {
- success,
- data: input
- } : {
- success,
- errors,
- data: input
- };
- }
- return {
- success: true,
+ required: [ "content", "reply" ],
+ additionalProperties: false,
+ $defs: {}
+ },
+ description: "Ask user to consent for what the AI agent wants to do next.\n\nIf AI agent wants to do some function calling at next,\nbut it needs the user's consent about the function calling to do,\nthen call this tool function.",
+ validate: (() => {
+ const _io0 = input => "string" === typeof input.content && "string" === typeof input.reply;
+ const _vo0 = (input, _path, _exceptionable = true) => [ "string" === typeof input.content || _report(_exceptionable, {
+ path: _path + ".content",
+ expected: "string",
+ value: input.content
+ }), "string" === typeof input.reply || _report(_exceptionable, {
+ path: _path + ".reply",
+ expected: "string",
+ value: input.reply
+ }) ].every((flag => flag));
+ const __is = input => "object" === typeof input && null !== input && _io0(input);
+ let errors;
+ let _report;
+ return input => {
+ if (false === __is(input)) {
+ errors = [];
+ _report = __typia_transform__validateReport._validateReport(errors);
+ ((input, _path, _exceptionable = true) => ("object" === typeof input && null !== input || _report(true, {
+ path: _path + "",
+ expected: "IConsentProps",
+ value: input
+ })) && _vo0(input, _path + "", true) || _report(true, {
+ path: _path + "",
+ expected: "IConsentProps",
+ value: input
+ }))(input, "$input", true);
+ const success = 0 === errors.length;
+ return success ? {
+ success,
+ data: input
+ } : {
+ success,
+ errors,
  data: input
  };
+ }
+ return {
+ success: true,
+ data: input
+ };
+ };
+ })()
+ } ]
+ }.functions[0];
+ const result = await llmVendor.api.chat.completions.create({
+ model: llmVendor.model,
+ messages: [ {
+ role: "system",
+ content: [ "You are an helpful assistant.", "", "If what the assistant said seems like to asking for", "user's consent about some function calling at the next step,", "use the tools appropriately to step to the next." ].join("\n")
+ }, {
+ role: "assistant",
+ content: last.text
+ } ],
+ tools: [ {
+ type: "function",
+ function: {
+ name: consent.name,
+ description: consent.description,
+ parameters: consent.parameters
+ }
+ } ],
+ tool_choice: "required",
+ parallel_tool_calls: false
+ }, llmVendor.options);
+ const toolCall = (result.choices[0]?.message.tool_calls ?? []).filter((tc => tc.type === "function" && tc.function.name === consent.name))?.[0];
+ if (toolCall === undefined) {
+ return null;
+ }
+ const input = (() => {
+ const _io0 = input => "string" === typeof input.content && "string" === typeof input.reply;
+ const __is = input => "object" === typeof input && null !== input && _io0(input);
+ return input => {
+ input = JSON.parse(input);
+ return __is(input) ? input : null;
+ };
+ })()(toolCall.function.arguments);
+ return input !== null ? input.reply : null;
+ }
+
+ function success(props) {
+ return successInner(props).result;
+ }
+
+ function successInner(props) {
+ const call = (expected, overrideOperations) => successInner({
+ expected,
+ operations: overrideOperations ?? props.operations,
+ strict: props.strict
+ });
+ switch (props.expected.type) {
+ case "array":
+ {
+ let take = 0;
+ const targetIterator = props.expected.items[Symbol.iterator]();
+ let targeted = targetIterator.next();
+ while (true) {
+ if (targeted.done === true) {
+ return {
+ result: true,
+ take
  };
- })()
- } ]
- }.functions[0];
- const result = await agent["props"].vendor.api.chat.completions.create({
- model: agent["props"].vendor.model,
- messages: [ {
- role: "system",
- content: [ "You are an helpful assistant.", "", "If what the assistant said seems like to asking for", "user's consent about some function calling at the next step,", "use the tools appropriately to step to the next." ].join("\n")
- }, {
- role: "assistant",
- content: last.text
- } ],
- tools: [ {
- type: "function",
- function: {
- name: consent.name,
- description: consent.description,
- parameters: consent.parameters
  }
- } ],
- tool_choice: "required",
- parallel_tool_calls: false
- }, agent["props"].vendor.options);
- const toolCall = (result.choices[0]?.message.tool_calls ?? []).filter((tc => tc.type === "function" && tc.function.name === consent.name))?.[0];
- if (toolCall === undefined) return null;
- const input = JSON.parse(toolCall.function.arguments);
- return (() => {
- const _io0 = input => "string" === typeof input.content && "string" === typeof input.reply;
- return input => "object" === typeof input && null !== input && _io0(input);
- })()(input) ? input.reply : null;
- };
- AgenticaBenchmarkPredicator.success = props => successInner(props).result;
- const successInner = props => {
- const call = (expected, overrideOperations) => successInner({
- expected,
- operations: overrideOperations ?? props.operations,
- strict: props.strict
- });
- switch (props.expected.type) {
- case "array":
- {
- let take = 0;
- const targetIterator = props.expected.items[Symbol.iterator]();
- let targeted = targetIterator.next();
- while (true) {
- if (targeted.done) {
- return {
- result: true,
- take
- };
- }
- if (take >= props.operations.length) {
- return {
- result: false
- };
- }
- const result = call(targeted.value, props.operations.slice(take));
- if (!result.result) {
- if (!props.strict) {
- take += 1;
- continue;
- }
+ if (take >= props.operations.length) {
+ return {
+ result: false
+ };
+ }
+ const result = call(targeted.value, props.operations.slice(take));
+ if (!result.result) {
+ if (props.strict === true) {
  return {
  result: false
  };
  }
- take += result.take;
- targeted = targetIterator.next();
+ take += 1;
+ continue;
  }
+ take += result.take;
+ targeted = targetIterator.next();
  }
+ }
 
- case "standalone":
- {
- const target = props.expected.operation;
- const result = props.operations.some((op => op.name === target.name));
- if (result) {
- return {
- result,
- take: 1
- };
- }
+ case "standalone":
+ {
+ const target = props.expected.operation;
+ const result = props.operations.some((op => op.name === target.name));
+ if (result) {
  return {
- result
+ result,
+ take: 1
  };
  }
-
- case "anyOf":
- for (const expected of props.expected.anyOf) {
- const callResult = call(expected);
- if (callResult.result) {
- return callResult;
- }
- }
  return {
- result: false
+ result
  };
+ }
 
- case "allOf":
- {
- const result = props.expected.allOf.map((expected => call(expected)));
- if (result.every((r => r.result))) {
- return {
- result: true,
- take: result.reduce(((acc, r) => Math.max(acc, r.take)), 0)
- };
- }
+ case "anyOf":
+ for (const expected of props.expected.anyOf) {
+ const callResult = call(expected);
+ if (callResult.result) {
+ return callResult;
+ }
+ }
+ return {
+ result: false
+ };
+
+ case "allOf":
+ {
+ const result = props.expected.allOf.map((expected => call(expected)));
+ if (result.every((r => r.result))) {
  return {
- result: false
+ result: true,
+ take: result.reduce(((acc, r) => Math.max(acc, r.take)), 0)
  };
  }
+ return {
+ result: false
+ };
  }
- };
- })(AgenticaBenchmarkPredicator || (AgenticaBenchmarkPredicator = {}));
-
- var MathUtil;
+ }
+ }
 
- (function(MathUtil) {
- MathUtil.round = value => Math.floor(value * 100) / 100;
- })(MathUtil || (MathUtil = {}));
+ const MathUtil = {
+ round: value => Math.floor(value * 100) / 100
+ };
 
- var AgenticaBenchmarkUtil;
+ const AgenticaBenchmarkUtil = {
+ errorToJson,
+ expectedToJson
+ };
 
- (function(AgenticaBenchmarkUtil) {
- AgenticaBenchmarkUtil.errorToJson = error => {
- if (error instanceof Error) return {
+ function errorToJson(error) {
+ if (error instanceof Error) {
+ return {
  ...error,
  name: error.name,
  message: error.message,
  stack: error.stack
  };
- return error;
- };
- AgenticaBenchmarkUtil.expectedToJson = expected => {
- if (expected.type === "standalone") return {
+ }
+ return error;
+ }
+
+ function expectedToJson(expected) {
+ if (expected.type === "standalone") {
+ return {
  type: expected.type,
  operation: {
  name: expected.operation.name,
  description: expected.operation.function.description
  }
- }; else if (expected.type === "array") return {
+ };
+ } else if (expected.type === "array") {
+ return {
  type: expected.type,
- items: expected.items.map(AgenticaBenchmarkUtil.expectedToJson)
- }; else if (expected.type === "allOf") return {
+ items: expected.items.map(expectedToJson)
+ };
+ } else if (expected.type === "allOf") {
+ return {
  type: expected.type,
- allOf: expected.allOf.map(AgenticaBenchmarkUtil.expectedToJson)
- }; else return {
+ allOf: expected.allOf.map(expectedToJson)
+ };
+ } else {
+ return {
  type: expected.type,
- anyOf: expected.anyOf.map(AgenticaBenchmarkUtil.expectedToJson)
+ anyOf: expected.anyOf.map(expectedToJson)
  };
- };
- })(AgenticaBenchmarkUtil || (AgenticaBenchmarkUtil = {}));
+ }
+ }
 
- var AgenticaPromptReporter;
+ const AgenticaPromptReporter = {
+ markdown: markdown$2
+ };
 
- (function(AgenticaPromptReporter) {
- AgenticaPromptReporter.markdown = p => {
- if (p.type === "text") return [ `### Text (${p.role})`, p.text, "" ].join("\n"); else if (p.type === "select" || p.type === "cancel") return [ `### ${p.type === "select" ? "Select" : "Cancel"}`, ...p.selections.map((s => [ `#### ${s.operation.name}`, ` - controller: ${s.operation.controller.name}`, ` - function: ${s.operation.function.name}`, ` - reason: ${s.reason}`, "", ...!!s.operation.function.description?.length ? [ s.operation.function.description, "" ] : [] ])).flat() ].join("\n"); else if (p.type === "describe") return [ "### Describe", ...p.executes.map((e => ` - ${e.operation.name}`)), "", ...p.text.split("\n").map((s => `> ${s}`)), "" ].join("\n");
- return [ "### Execute", ` - name: ${p.operation.name}`, ` - controller: ${p.operation.controller.name}`, ` - function: ${p.operation.function.name}`, "", "```json", JSON.stringify(p.arguments, null, 2), "```", "" ].join("\n");
- };
- })(AgenticaPromptReporter || (AgenticaPromptReporter = {}));
+ function markdown$2(p) {
+ if (p.type === "text") {
+ return [ `### Text (${p.role})`, p.text, "" ].join("\n");
+ } else if (p.type === "select" || p.type === "cancel") {
+ return [ `### ${p.type === "select" ? "Select" : "Cancel"}`, ...p.selections.flatMap((s => {
+ const functionDescriptionCount = s.operation.function.description?.length ?? 0;
+ return [ `#### ${s.operation.name}`, ` - controller: ${s.operation.controller.name}`, ` - function: ${s.operation.function.name}`, ` - reason: ${s.reason}`, "", ...functionDescriptionCount > 0 ? [ s.operation.function.description, "" ] : [] ];
+ })) ].join("\n");
+ } else if (p.type === "describe") {
+ return [ "### Describe", ...p.executes.map((e => ` - ${e.operation.name}`)), "", ...p.text.split("\n").map((s => `> ${s}`)), "" ].join("\n");
+ }
+ return [ "### Execute", ` - name: ${p.operation.name}`, ` - controller: ${p.operation.controller.name}`, ` - function: ${p.operation.function.name}`, "", "```json", JSON.stringify(p.arguments, null, 2), "```", "" ].join("\n");
+ }
 
- var AgenticaCallBenchmarkReporter;
+ const AgenticaCallBenchmarkReporter = {
+ markdown: markdown$1
+ };
 
- (function(AgenticaCallBenchmarkReporter) {
- AgenticaCallBenchmarkReporter.markdown = result => Object.fromEntries([ [ "./README.md", writeIndex(result) ], ...result.experiments.map((exp => [ [ `./${exp.scenario.name}/README.md`, writeExperimentIndex(exp) ], ...exp.events.map(((event, i) => [ `./${exp.scenario.name}/${i + 1}.${event.type}.md`, writeExperimentEvent(event, i) ])) ])).flat() ]);
- const writeIndex = result => {
- const events = result.experiments.map((r => r.events)).flat();
- const average = events.map((e => e.completed_at.getTime() - e.started_at.getTime())).reduce(((a, b) => a + b), 0) / events.length;
- const aggregate = result.usage.aggregate;
- return [ "# LLM Function Call Benchmark", "## Summary", ` - Aggregation:`, ` - Scenarios: #${result.experiments.length.toLocaleString()}`, ` - Trial: ${events.length}`, ` - Success: ${events.filter((e => e.type === "success")).length}`, ` - Failure: ${events.filter((e => e.type === "failure")).length}`, ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`, ` - Token Usage`, ` - Total: ${aggregate.total.toLocaleString()}`, ` - Input`, ` - Total: ${aggregate.input.total.toLocaleString()}`, ` - Cached: ${aggregate.input.cached.toLocaleString()}`, ` - Output:`, ` - Total: ${aggregate.output.total.toLocaleString()}`, ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`, ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`, ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`, "", "## Experiments", " Name | Select | Call | Time/Avg ", ":-----|:-------|:-----|----------:", ...result.experiments.map((exp => [ `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`, drawStatus(exp.events, (e => e.type !== "error" && e.select === true)), drawStatus(exp.events, (e => e.type !== "error" && e.call === true)), `${MathUtil.round(exp.events.map((e => e.completed_at.getTime() - e.started_at.getTime())).reduce(((a, b) => a + b), 0) / exp.events.length).toLocaleString()} ms` ].join(" | "))) ].join("\n");
- };
- const writeExperimentIndex = exp => [ `# ${exp.scenario.name}`, "## Summary", ` - Scenarios: #${exp.events.length.toLocaleString()}`, ` - Success: ${exp.events.filter((e => e.type === "success")).length}`, ` - Failure: ${exp.events.filter((e => e.type === "failure")).length}`, ` - Average Time: ${MathUtil.round(exp.events.map((e => e.completed_at.getTime() - e.started_at.getTime())).reduce(((a, b) => a + b), 0) / exp.events.length).toLocaleString()} ms`, "", "## Events", " Name | Type | Time", ":-----|:-----|----:", ...exp.events.map(((e, i) => [ `[${i + 1}.](./${i + 1}.${e.type}.md)`, e.type, `${MathUtil.round(e.completed_at.getTime() - e.started_at.getTime())} ms` ].join(" | "))), "", "## Scenario", "### User Prompt", exp.scenario.text, "", "### Expected", "```json", JSON.stringify(AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected), null, 2), "```" ].join("\n");
- const writeExperimentEvent = (event, index) => [ `# ${index + 1}. ${event.type}`, "## Summary", ` - Name: ${event.scenario.name}`, ` - Type: ${event.type}`, ` - Time: ${MathUtil.round(event.completed_at.getTime() - event.started_at.getTime()).toLocaleString()} ms`, ...event.type !== "error" ? [ ` - Select: ${event.select ? "✅" : "❌"}`, ` - Call: ${event.call ? "✅" : "❌"}` ] : [], ` - Token Usage:`, ` - Total: ${JSON.stringify(event.usage.aggregate.total)}`, ` - Input`, ` - Total: ${event.usage.aggregate.input.total}`, ` - Cached: ${event.usage.aggregate.input.cached}`, ` - Output:`, ` - Total: ${event.usage.aggregate.output.total}`, ` - Accepted Prediction: ${event.usage.aggregate.output.accepted_prediction}`, ` - Reasoning: ${event.usage.aggregate.output.reasoning}`, ` - Rejected Prediction: ${event.usage.aggregate.output.rejected_prediction}`, "", "## Scenario", "### User Prompt", event.scenario.text, "", "### Expected", "```json", JSON.stringify(AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected), null, 2), "```", "", "## Prompt Histories", ...event.prompts.map(AgenticaPromptReporter.markdown), "", ...event.type === "error" ? [ "## Error", "```json", JSON.stringify(AgenticaBenchmarkUtil.errorToJson(event.error), null, 2), "```" ] : [] ].join("\n");
- const drawStatus = (events, success) => {
- const count = Math.floor(events.filter(success).length / events.length * 10);
- return new Array(count).fill("■").join("") + new Array(10 - count).fill("□").join("");
- };
- })(AgenticaCallBenchmarkReporter || (AgenticaCallBenchmarkReporter = {}));
+ function markdown$1(result) {
+ return Object.fromEntries([ [ "./README.md", writeIndex$1(result) ], ...result.experiments.map((exp => [ [ `./${exp.scenario.name}/README.md`, writeExperimentIndex$1(exp) ], ...exp.events.map(((event, i) => [ `./${exp.scenario.name}/${i + 1}.${event.type}.md`, writeExperimentEvent$1(event, i) ])) ])).flat() ]);
+ }
+
+ function writeIndex$1(result) {
+ const events = result.experiments.map((r => r.events)).flat();
+ const average = events.map((e => e.completed_at.getTime() - e.started_at.getTime())).reduce(((a, b) => a + b), 0) / events.length;
+ const aggregate = result.usage.aggregate;
+ return [ "# LLM Function Call Benchmark", "## Summary", ` - Aggregation:`, ` - Scenarios: #${result.experiments.length.toLocaleString()}`, ` - Trial: ${events.length}`, ` - Success: ${events.filter((e => e.type === "success")).length}`, ` - Failure: ${events.filter((e => e.type === "failure")).length}`, ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`, ` - Token Usage`, ` - Total: ${aggregate.total.toLocaleString()}`, ` - Input`, ` - Total: ${aggregate.input.total.toLocaleString()}`, ` - Cached: ${aggregate.input.cached.toLocaleString()}`, ` - Output:`, ` - Total: ${aggregate.output.total.toLocaleString()}`, ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`, ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`, ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`, "", "## Experiments", " Name | Select | Call | Time/Avg ", ":-----|:-------|:-----|----------:", ...result.experiments.map((exp => [ `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`, drawStatus(exp.events, (e => e.type !== "error" && e.select === true)), drawStatus(exp.events, (e => e.type !== "error" && e.call === true)), `${MathUtil.round(exp.events.map((e => e.completed_at.getTime() - e.started_at.getTime())).reduce(((a, b) => a + b), 0) / exp.events.length).toLocaleString()} ms` ].join(" | "))) ].join("\n");
+ }
+
+ function writeExperimentIndex$1(exp) {
+ return [ `# ${exp.scenario.name}`, "## Summary", ` - Scenarios: #${exp.events.length.toLocaleString()}`, ` - Success: ${exp.events.filter((e => e.type === "success")).length}`, ` - Failure: ${exp.events.filter((e => e.type === "failure")).length}`, ` - Average Time: ${MathUtil.round(exp.events.map((e => e.completed_at.getTime() - e.started_at.getTime())).reduce(((a, b) => a + b), 0) / exp.events.length).toLocaleString()} ms`, "", "## Events", " Name | Type | Time", ":-----|:-----|----:", ...exp.events.map(((e, i) => [ `[${i + 1}.](./${i + 1}.${e.type}.md)`, e.type, `${MathUtil.round(e.completed_at.getTime() - e.started_at.getTime())} ms` ].join(" | "))), "", "## Scenario", "### User Prompt", exp.scenario.text, "", "### Expected", "```json", JSON.stringify(AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected), null, 2), "```" ].join("\n");
+ }
+
+ function writeExperimentEvent$1(event, index) {
+ return [ `# ${index + 1}. ${event.type}`, "## Summary", ` - Name: ${event.scenario.name}`, ` - Type: ${event.type}`, ` - Time: ${MathUtil.round(event.completed_at.getTime() - event.started_at.getTime()).toLocaleString()} ms`, ...event.type !== "error" ? [ ` - Select: ${event.select ? "✅" : "❌"}`, ` - Call: ${event.call ? "✅" : "❌"}` ] : [], ` - Token Usage:`, ` - Total: ${JSON.stringify(event.usage.aggregate.total)}`, ` - Input`, ` - Total: ${event.usage.aggregate.input.total}`, ` - Cached: ${event.usage.aggregate.input.cached}`, ` - Output:`, ` - Total: ${event.usage.aggregate.output.total}`, ` - Accepted Prediction: ${event.usage.aggregate.output.accepted_prediction}`, ` - Reasoning: ${event.usage.aggregate.output.reasoning}`, ` - Rejected Prediction: ${event.usage.aggregate.output.rejected_prediction}`, "", "## Scenario", "### User Prompt", event.scenario.text, "", "### Expected", "```json", JSON.stringify(AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected), null, 2), "```", "", "## Prompt Histories", ...event.prompts.map(AgenticaPromptReporter.markdown), "", ...event.type === "error" ? [ "## Error", "```json", JSON.stringify(AgenticaBenchmarkUtil.errorToJson(event.error), null, 2), "```" ] : [] ].join("\n");
+ }
+
+ function drawStatus(events, success) {
+ const count = Math.floor(events.filter(success).length / events.length * 10);
+ return Array.from({
+ length: count
+ }).fill("■").join("") + Array.from({
+ length: 10 - count
+ }).fill("□").join("");
+ }
 
  class AgenticaCallBenchmark {
  constructor(props) {
@@ -271,29 +319,36 @@ class AgenticaCallBenchmark {
  async execute(listener) {
  const started_at = new Date;
  const semaphore = new Semaphore(this.config_.simultaneous);
- const experiments = await Promise.all(this.scenarios_.map((async scenario => {
- const events = await Promise.all(new Array(this.config_.repeat).fill(0).map((async () => {
+ const task = this.scenarios_.map((async scenario => {
+ const events = await Promise.all(Array.from({
+ length: this.config_.repeat
+ }).map((async () => {
  await semaphore.acquire();
  const e = await this.step(scenario);
  await semaphore.release();
- if (listener !== undefined) listener(e);
+ if (listener !== undefined) {
+ listener(e);
+ }
  return e;
  })));
  return {
  scenario,
  events,
- usage: events.filter((e => e.type !== "error")).map((e => e.usage)).reduce(AgenticaTokenUsage.plus, AgenticaTokenUsage.zero())
+ usage: events.filter((e => e.type !== "error")).map((e => e.usage)).reduce(((acc, cur) => AgenticaTokenUsage.plus(acc, cur)), AgenticaTokenUsage.zero())
  };
- })));
+ }));
+ const experiments = await Promise.all(task);
  return this.result_ = {
  experiments,
  started_at,
  completed_at: new Date,
- usage: experiments.map((p => p.usage)).reduce(AgenticaTokenUsage.plus, AgenticaTokenUsage.zero())
+ usage: experiments.map((p => p.usage)).reduce(((acc, cur) => AgenticaTokenUsage.plus(acc, cur)), AgenticaTokenUsage.zero())
  };
  }
  report() {
- if (this.result_ === null) throw new Error("Benchmark is not executed yet.");
+ if (this.result_ === null) {
+ throw new Error("Benchmark is not executed yet.");
+ }
  return AgenticaCallBenchmarkReporter.markdown(this.result_);
  }
  async step(scenario) {
@@ -324,12 +379,18 @@ class AgenticaCallBenchmark {
  };
  try {
  await agent.conversate(scenario.text);
- if (success()) return out();
+ if (success()) {
+ return out();
+ }
  for (let i = 0; i < this.config_.consent; ++i) {
  const next = await AgenticaBenchmarkPredicator.isNext(agent);
- if (next === null) break;
+ if (next === null) {
+ break;
+ }
  await agent.conversate(next);
- if (success()) return out();
+ if (success()) {
+ return out();
+ }
  }
  return out();
  } catch (error) {
@@ -346,25 +407,37 @@ class AgenticaCallBenchmark {
  }
  }
 
- var AgenticaSelectBenchmarkReporter;
+ const AgenticaSelectBenchmarkReporter = {
+ markdown
+ };
+
+ function markdown(result) {
+ const iterator = [ [ "./README.md", writeIndex(result) ], ...result.experiments.map((exp => [ [ `./${exp.scenario.name}/README.md`, writeExperimentIndex(exp) ], ...exp.events.map(((event, i) => [ `./${exp.scenario.name}/${i + 1}.${event.type}.md`, writeExperimentEvent(event, i) ])) ])).flat() ];
+ return Object.fromEntries(iterator);
+ }
+
+ function writeIndex(result) {
+ const events = result.experiments.map((r => r.events)).flat();
+ const average = events.map((e => e.completed_at.getTime() - e.started_at.getTime())).reduce(((a, b) => a + b), 0) / events.length;
+ const aggregate = result.usage.aggregate;
+ return [ "# LLM Function Selection Benchmark", "## Summary", ` - Aggregation:`, ` - Scenarios: #${result.experiments.length.toLocaleString()}`, ` - Trial: ${events.length}`, ` - Success: ${events.filter((e => e.type === "success")).length}`, ` - Failure: ${events.filter((e => e.type === "failure")).length}`, ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`, ` - Token Usage`, ` - Total: ${aggregate.total.toLocaleString()}`, ` - Input`, ` - Total: ${aggregate.input.total.toLocaleString()}`, ` - Cached: ${aggregate.input.cached.toLocaleString()}`, ` - Output:`, ` - Total: ${aggregate.output.total.toLocaleString()}`, ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`, ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`, ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`, "", "## Experiments", " Name | Status | Time/Avg ", ":-----|:-------|----------:", ...result.experiments.map((exp => [ `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`, (() => {
+ const success = Math.floor(exp.events.filter((e => e.type === "success")).length / exp.events.length * 10);
+ return Array.from({
+ length: success
+ }).fill("■").join("") + Array.from({
+ length: 10 - success
+ }).fill("□").join("");
+ })(), `${MathUtil.round(exp.events.map((event => event.completed_at.getTime() - event.started_at.getTime())).reduce(((a, b) => a + b), 0) / exp.events.length).toLocaleString()} ms` ].join(" | "))) ].join("\n");
+ }
+
+ function writeExperimentIndex(exp) {
+ const aggregate = exp.usage.aggregate;
+ return [ `# ${exp.scenario.name}`, "## Summary", " - Aggregation:", ` - Trial: ${exp.events.length}`, ` - Success: ${exp.events.filter((e => e.type === "success")).length}`, ` - Failure: ${exp.events.filter((e => e.type === "failure")).length}`, ` - Average Time: ${MathUtil.round(exp.events.map((event => event.completed_at.getTime() - event.started_at.getTime())).reduce(((a, b) => a + b), 0) / exp.events.length).toLocaleString()} ms`, ` - Token Usage`, ` - Total: ${aggregate.total.toLocaleString()}`, ` - Input`, ` - Total: ${aggregate.input.total.toLocaleString()}`, ` - Cached: ${aggregate.input.cached.toLocaleString()}`, ` - Output:`, ` - Total: ${aggregate.output.total.toLocaleString()}`, ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`, ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`, ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`, "", "## Events", " No | Type | Time", "---:|:-----|----:", ...exp.events.map(((e, i) => [ `[${i + 1}.](./${i + 1}.${e.type}.md)`, e.type, `${MathUtil.round(e.completed_at.getTime() - e.started_at.getTime())} ms` ].join(" | "))), "", "## Scenario", "### User Prompt", exp.scenario.text, "", "### Expected", "```json", JSON.stringify(AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected), null, 2), "```" ].join("\n");
+ }
 
- (function(AgenticaSelectBenchmarkReporter) {
- AgenticaSelectBenchmarkReporter.markdown = result => Object.fromEntries([ [ "./README.md", writeIndex(result) ], ...result.experiments.map((exp => [ [ `./${exp.scenario.name}/README.md`, writeExperimentIndex(exp) ], ...exp.events.map(((event, i) => [ `./${exp.scenario.name}/${i + 1}.${event.type}.md`, writeExperimentEvent(event, i) ])) ])).flat() ]);
- const writeIndex = result => {
- const events = result.experiments.map((r => r.events)).flat();
- const average = events.map((e => e.completed_at.getTime() - e.started_at.getTime())).reduce(((a, b) => a + b), 0) / events.length;
- const aggregate = result.usage.aggregate;
- return [ "# LLM Function Selection Benchmark", "## Summary", ` - Aggregation:`, ` - Scenarios: #${result.experiments.length.toLocaleString()}`, ` - Trial: ${events.length}`, ` - Success: ${events.filter((e => e.type === "success")).length}`, ` - Failure: ${events.filter((e => e.type === "failure")).length}`, ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`, ` - Token Usage`, ` - Total: ${aggregate.total.toLocaleString()}`, ` - Input`, ` - Total: ${aggregate.input.total.toLocaleString()}`, ` - Cached: ${aggregate.input.cached.toLocaleString()}`, ` - Output:`, ` - Total: ${aggregate.output.total.toLocaleString()}`, ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`, ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`, ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`, "", "## Experiments", " Name | Status | Time/Avg ", ":-----|:-------|----------:", ...result.experiments.map((exp => [ `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`, (() => {
- const success = Math.floor(exp.events.filter((e => e.type === "success")).length / exp.events.length * 10);
- return new Array(success).fill("■").join("") + new Array(10 - success).fill("□").join("");
- })(), MathUtil.round(exp.events.map((event => event.completed_at.getTime() - event.started_at.getTime())).reduce(((a, b) => a + b), 0) / exp.events.length).toLocaleString() + " ms" ].join(" | "))) ].join("\n");
- };
- const writeExperimentIndex = exp => {
- const aggregate = exp.usage.aggregate;
- return [ `# ${exp.scenario.name}`, "## Summary", " - Aggregation:", ` - Trial: ${exp.events.length}`, ` - Success: ${exp.events.filter((e => e.type === "success")).length}`, ` - Failure: ${exp.events.filter((e => e.type === "failure")).length}`, ` - Average Time: ${MathUtil.round(exp.events.map((event => event.completed_at.getTime() - event.started_at.getTime())).reduce(((a, b) => a + b), 0) / exp.events.length).toLocaleString()} ms`, ` - Token Usage`, ` - Total: ${aggregate.total.toLocaleString()}`, ` - Input`, ` - Total: ${aggregate.input.total.toLocaleString()}`, ` - Cached: ${aggregate.input.cached.toLocaleString()}`, ` - Output:`, ` - Total: ${aggregate.output.total.toLocaleString()}`, ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`, ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`, ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`, "", "## Events", " No | Type | Time", "---:|:-----|----:", ...exp.events.map(((e, i) => [ `[${i + 1}.](./${i + 1}.${e.type}.md)`, e.type, MathUtil.round(e.completed_at.getTime() - e.started_at.getTime()) + " ms" ].join(" | "))), "", "## Scenario", "### User Prompt", exp.scenario.text, "", "### Expected", "```json", JSON.stringify(AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected), null, 2), "```" ].join("\n");
- };
- const writeExperimentEvent = (event, index) => [ `# ${index + 1}. ${event.type}`, `## Summary`, ` - Name: ${event.scenario.name}`, ` - Type: ${event.type}`, ` - Time: ${(event.completed_at.getTime() - event.started_at.getTime()).toLocaleString()} ms`, ...event.type !== "error" ? [ " - Token Usage", ` - Total: ${event.usage.aggregate.toLocaleString()}`, ` - Prompt`, ` - Total: ${event.usage.aggregate.input.total.toLocaleString()}`, ` - Cached: ${event.usage.aggregate.input.cached.toLocaleString()}`, ` - Completion:`, ` - Total: ${event.usage.aggregate.output.total.toLocaleString()}`, ` - Reasoning: ${event.usage.aggregate.output.reasoning.toLocaleString()}`, ` - Accepted Prediction: ${event.usage.aggregate.output.accepted_prediction.toLocaleString()}`, ` - Rejected Prediction: ${event.usage.aggregate.output.rejected_prediction.toLocaleString()}` ] : [], "", "## Scenario", "### User Prompt", event.scenario.text, "", "### Expected", "```json", JSON.stringify(AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected), null, 2), "```", "", ...event.type === "success" || event.type === "failure" ? [ "## Result", ...event.selected.map((s => [ `### ${s.operation.name}`, ` - Controller: \`${s.operation.controller.name}\``, ` - Function: \`${s.operation.function.name}\``, ` - Reason: ${s.reason}`, "", ...s.operation.function.description ? [ s.operation.function.description, "" ] : [] ].join("\n"))) ] : [], ...event.type === "error" ? [ "## Error", "```json", AgenticaBenchmarkUtil.errorToJson(JSON.stringify(event.error, null, 2)), "```", "" ] : [] ].join("\n");
- })(AgenticaSelectBenchmarkReporter || (AgenticaSelectBenchmarkReporter = {}));
+ function writeExperimentEvent(event, index) {
+ return [ `# ${index + 1}. ${event.type}`, `## Summary`, ` - Name: ${event.scenario.name}`, ` - Type: ${event.type}`, ` - Time: ${(event.completed_at.getTime() - event.started_at.getTime()).toLocaleString()} ms`, ...event.type !== "error" ? [ " - Token Usage", ` - Total: ${event.usage.aggregate.toLocaleString()}`, ` - Prompt`, ` - Total: ${event.usage.aggregate.input.total.toLocaleString()}`, ` - Cached: ${event.usage.aggregate.input.cached.toLocaleString()}`, ` - Completion:`, ` - Total: ${event.usage.aggregate.output.total.toLocaleString()}`, ` - Reasoning: ${event.usage.aggregate.output.reasoning.toLocaleString()}`, ` - Accepted Prediction: ${event.usage.aggregate.output.accepted_prediction.toLocaleString()}`, ` - Rejected Prediction: ${event.usage.aggregate.output.rejected_prediction.toLocaleString()}` ] : [], "", "## Scenario", "### User Prompt", event.scenario.text, "", "### Expected", "```json", JSON.stringify(AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected), null, 2), "```", "", ...event.type === "success" || event.type === "failure" ? [ "## Result", ...event.selected.map((s => [ `### ${s.operation.name}`, ` - Controller: \`${s.operation.controller.name}\``, ` - Function: \`${s.operation.function.name}\``, ` - Reason: ${s.reason}`, "", ...s.operation.function.description !== undefined && s.operation.function.description !== "" ? [ s.operation.function.description, "" ] : [] ].join("\n"))) ] : [], ...event.type === "error" ? [ "## Error", "```json", AgenticaBenchmarkUtil.errorToJson(JSON.stringify(event.error, null, 2)), "```", "" ] : [] ].join("\n");
+ }
 
  class AgenticaSelectBenchmark {
  constructor(props) {
@@ -381,28 +454,34 @@ class AgenticaSelectBenchmark {
  const started_at = new Date;
  const semaphore = new Semaphore(this.config_.simultaneous);
  const experiments = await Promise.all(this.scenarios_.map((async scenario => {
- const events = await Promise.all(new Array(this.config_.repeat).fill(0).map((async () => {
+ const events = await Promise.all(Array.from({
+ length: this.config_.repeat
+ }).map((async () => {
  await semaphore.acquire();
  const e = await this.step(scenario);
  await semaphore.release();
- if (listener !== undefined) listener(e);
+ if (listener !== undefined) {
+ listener(e);
+ }
  return e;
  })));
  return {
  scenario,
  events,
- usage: events.filter((e => e.type !== "error")).map((e => e.usage)).reduce(AgenticaTokenUsage.plus, AgenticaTokenUsage.zero())
+ usage: events.filter((e => e.type !== "error")).map((e => e.usage)).reduce(((acc, cur) => AgenticaTokenUsage.plus(acc, cur)), AgenticaTokenUsage.zero())
  };
  })));
  return this.result_ = {
  experiments,
  started_at,
  completed_at: new Date,
- usage: experiments.map((p => p.usage)).reduce(AgenticaTokenUsage.plus, AgenticaTokenUsage.zero())
+ usage: experiments.map((p => p.usage)).reduce(((acc, cur) => AgenticaTokenUsage.plus(acc, cur)), AgenticaTokenUsage.zero())
  };
  }
  report() {
- if (this.result_ === null) throw new Error("Benchmark is not executed yet.");
+ if (this.result_ === null) {
+ throw new Error("Benchmark is not executed yet.");
+ }
  return AgenticaSelectBenchmarkReporter.markdown(this.result_);
  }
  async step(scenario) {