@agentica/benchmark 0.7.0-dev.20250224-2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/LICENSE +21 -0
  2. package/lib/AgenticaCallBenchmark.d.ts +137 -0
  3. package/lib/AgenticaCallBenchmark.js +187 -0
  4. package/lib/AgenticaCallBenchmark.js.map +1 -0
  5. package/lib/AgenticaSelectBenchmark.d.ts +123 -0
  6. package/lib/AgenticaSelectBenchmark.js +185 -0
  7. package/lib/AgenticaSelectBenchmark.js.map +1 -0
  8. package/lib/index.d.ts +2 -0
  9. package/lib/index.js +19 -0
  10. package/lib/index.js.map +1 -0
  11. package/lib/index.mjs +449 -0
  12. package/lib/index.mjs.map +1 -0
  13. package/lib/internal/AgenticaBenchmarkPredicator.d.ts +32 -0
  14. package/lib/internal/AgenticaBenchmarkPredicator.js +179 -0
  15. package/lib/internal/AgenticaBenchmarkPredicator.js.map +1 -0
  16. package/lib/internal/AgenticaBenchmarkUtil.d.ts +5 -0
  17. package/lib/internal/AgenticaBenchmarkUtil.js +37 -0
  18. package/lib/internal/AgenticaBenchmarkUtil.js.map +1 -0
  19. package/lib/internal/AgenticaCallBenchmarkReporter.d.ts +4 -0
  20. package/lib/internal/AgenticaCallBenchmarkReporter.js +136 -0
  21. package/lib/internal/AgenticaCallBenchmarkReporter.js.map +1 -0
  22. package/lib/internal/AgenticaPromptReporter.d.ts +4 -0
  23. package/lib/internal/AgenticaPromptReporter.js +49 -0
  24. package/lib/internal/AgenticaPromptReporter.js.map +1 -0
  25. package/lib/internal/AgenticaSelectBenchmarkReporter.d.ts +1 -0
  26. package/lib/internal/AgenticaSelectBenchmarkReporter.js +172 -0
  27. package/lib/internal/AgenticaSelectBenchmarkReporter.js.map +1 -0
  28. package/lib/structures/IAgenticaBenchmarkExpected.d.ts +44 -0
  29. package/lib/structures/IAgenticaBenchmarkExpected.js +3 -0
  30. package/lib/structures/IAgenticaBenchmarkExpected.js.map +1 -0
  31. package/lib/structures/IAgenticaCallBenchmarkEvent.d.ts +95 -0
  32. package/lib/structures/IAgenticaCallBenchmarkEvent.js +3 -0
  33. package/lib/structures/IAgenticaCallBenchmarkEvent.js.map +1 -0
  34. package/lib/structures/IAgenticaCallBenchmarkResult.d.ts +62 -0
  35. package/lib/structures/IAgenticaCallBenchmarkResult.js +3 -0
  36. package/lib/structures/IAgenticaCallBenchmarkResult.js.map +1 -0
  37. package/lib/structures/IAgenticaCallBenchmarkScenario.d.ts +36 -0
  38. package/lib/structures/IAgenticaCallBenchmarkScenario.js +3 -0
  39. package/lib/structures/IAgenticaCallBenchmarkScenario.js.map +1 -0
  40. package/lib/structures/IAgenticaSelectBenchmarkEvent.d.ts +92 -0
  41. package/lib/structures/IAgenticaSelectBenchmarkEvent.js +3 -0
  42. package/lib/structures/IAgenticaSelectBenchmarkEvent.js.map +1 -0
  43. package/lib/structures/IAgenticaSelectBenchmarkResult.d.ts +62 -0
  44. package/lib/structures/IAgenticaSelectBenchmarkResult.js +3 -0
  45. package/lib/structures/IAgenticaSelectBenchmarkResult.js.map +1 -0
  46. package/lib/structures/IAgenticaSelectBenchmarkScenario.d.ts +36 -0
  47. package/lib/structures/IAgenticaSelectBenchmarkScenario.js +3 -0
  48. package/lib/structures/IAgenticaSelectBenchmarkScenario.js.map +1 -0
  49. package/lib/utils/MathUtil.d.ts +3 -0
  50. package/lib/utils/MathUtil.js +8 -0
  51. package/lib/utils/MathUtil.js.map +1 -0
  52. package/lib/utils/TokenUsageComputer.d.ts +5 -0
  53. package/lib/utils/TokenUsageComputer.js +37 -0
  54. package/lib/utils/TokenUsageComputer.js.map +1 -0
  55. package/package.json +57 -0
  56. package/src/AgenticaCallBenchmark.ts +259 -0
  57. package/src/AgenticaSelectBenchmark.ts +262 -0
  58. package/src/index.ts +3 -0
  59. package/src/internal/AgenticaBenchmarkPredicator.ts +216 -0
  60. package/src/internal/AgenticaBenchmarkUtil.ts +40 -0
  61. package/src/internal/AgenticaCallBenchmarkReporter.ts +177 -0
  62. package/src/internal/AgenticaPromptReporter.ts +43 -0
  63. package/src/internal/AgenticaSelectBenchmarkReporter.ts +212 -0
  64. package/src/structures/IAgenticaBenchmarkExpected.ts +58 -0
  65. package/src/structures/IAgenticaCallBenchmarkEvent.ts +109 -0
  66. package/src/structures/IAgenticaCallBenchmarkResult.ts +69 -0
  67. package/src/structures/IAgenticaCallBenchmarkScenario.ts +39 -0
  68. package/src/structures/IAgenticaSelectBenchmarkEvent.ts +110 -0
  69. package/src/structures/IAgenticaSelectBenchmarkResult.ts +69 -0
  70. package/src/structures/IAgenticaSelectBenchmarkScenario.ts +39 -0
  71. package/src/utils/MathUtil.ts +3 -0
  72. package/src/utils/TokenUsageComputer.ts +40 -0
@@ -0,0 +1,216 @@
1
+ import { Agentica, IAgenticaOperation, IAgenticaPrompt } from "@agentica/core";
2
+ import { ILlmFunction } from "@samchon/openapi";
3
+ import OpenAI from "openai";
4
+ import typia from "typia";
5
+
6
+ import { IAgenticaBenchmarkExpected } from "../structures/IAgenticaBenchmarkExpected";
7
+
8
export namespace AgenticaBenchmarkPredicator {
  /**
   * Ask the LLM whether the agent's latest message is a request for consent.
   *
   * Only the most recent prompt history is inspected. When it is a text
   * message written by the assistant, that message is sent to the agent's
   * own provider together with a single `consent` tool whose schema comes
   * from {@link IPredicatorApplication}. If the model answers with a call to
   * that tool and the arguments pass the `typia.is` check, the suggested
   * reply is returned.
   *
   * @param agent Agent whose prompt histories and provider are used
   * @returns Reply text to send on behalf of the user, or `null` when the
   *          last prompt is not an assistant text, no `consent` tool call
   *          was made, or the tool arguments failed validation
   */
  export const isNext = async (agent: Agentica): Promise<string | null> => {
    const last: IAgenticaPrompt | undefined = agent.getPromptHistories().at(-1);
    if (last?.type !== "text" || last.role !== "assistant") return null;

    const consent: ILlmFunction<"chatgpt"> = typia.llm.application<
      IPredicatorApplication,
      "chatgpt"
    >().functions[0]!;
    const result: OpenAI.ChatCompletion = await agent[
      "props"
    ].provider.api.chat.completions.create(
      {
        model: agent["props"].provider.model,
        messages: [
          {
            role: "system",
            content: [
              "You are an helpful assistant.",
              "",
              "If what the assistant said seems like to asking for",
              "user's consent about some function calling at the next step,",
              "use the tools appropriately to step to the next.",
            ].join("\n"),
          },
          {
            role: "assistant",
            content: last.text,
          },
        ],
        tools: [
          {
            type: "function",
            function: {
              name: consent.name,
              description: consent.description,
              parameters: consent.parameters as Record<string, any>,
            },
          },
        ],
        tool_choice: "required",
        parallel_tool_calls: false,
      },
      agent["props"].provider.options,
    );

    // Only the first choice is considered; pick its `consent` call, if any.
    const toolCall: OpenAI.ChatCompletionMessageToolCall | undefined = (
      result.choices[0]?.message.tool_calls ?? []
    ).find((tc) => tc.type === "function" && tc.function.name === consent.name);
    if (toolCall === undefined) return null;

    // NOTE(review): JSON.parse throws on malformed arguments; that error is
    // propagated to the caller rather than mapped to `null`.
    const input: IConsentProps = JSON.parse(toolCall.function.arguments);
    return typia.is(input) ? input.reply : null;
  };

  /**
   * Check if the called operations match the expected operations.
   *
   * @param props Properties for checking the match of the called operations
   *              and the expected operations
   *
   * @returns `true` if the called operations match the expected operations,
   *          otherwise `false`.
   */
  export const success = (props: {
    /**
     * Expected operations to be called.
     *
     * For 'allOf' within an 'array', the next expected element starts checking from the element that follows the last called element in 'allOf'.
     */
    expected: IAgenticaBenchmarkExpected;

    /**
     * Specified operations.
     */
    operations: Array<IAgenticaOperation | IAgenticaPrompt.IExecute>;

    /**
     * If it's `false`, check the array and let it go even if there's something wrong between them.
     *
     * @default `false`
     */
    strict?: boolean;
  }): boolean => successInner(props).result;

  /**
   * Recursive matcher behind {@link success}.
   *
   * Besides the boolean outcome, a successful match reports `take`: how many
   * leading elements of `props.operations` were consumed up to and including
   * the matched call. The `array` case uses it as a cursor so that each
   * following expectation is only searched after the previous match.
   *
   * @param props Same properties as {@link success}
   * @returns `{ result: true, take }` on match, `{ result: false }` otherwise
   */
  const successInner = (
    props: Parameters<typeof success>[0],
  ):
    | {
        result: true;
        take: number;
      }
    | {
        result: false;
      } => {
    // Recurse with the same `strict` flag, optionally on a narrowed window.
    const call = (
      expected: IAgenticaBenchmarkExpected,
      overrideOperations?: Array<IAgenticaOperation | IAgenticaPrompt.IExecute>,
    ) =>
      successInner({
        expected,
        operations: overrideOperations ?? props.operations,
        strict: props.strict,
      });

    switch (props.expected.type) {
      case "array": {
        // `take` is the cursor into `props.operations`: everything before
        // it has already been consumed by earlier items.
        let take = 0;
        for (const item of props.expected.items) {
          while (true) {
            if (take >= props.operations.length) {
              return { result: false };
            }
            const matched = call(item, props.operations.slice(take));
            if (matched.result) {
              take += matched.take;
              break;
            }
            if (props.strict) {
              return { result: false };
            }
            // Non-strict: skip one operation and retry the same item.
            take += 1;
          }
        }
        return { result: true, take };
      }
      case "standalone": {
        const target = props.expected.operation;
        const index = props.operations.findIndex(
          (op) => op.name === target.name,
        );
        if (index === -1) {
          return { result: false };
        }
        // Consume everything up to and including the matched call, so that
        // an enclosing `array` keeps the expected order and never reuses
        // the same call for two expectations.
        return { result: true, take: index + 1 };
      }
      case "anyOf":
        // First alternative that matches wins, including its `take`.
        for (const expected of props.expected.anyOf) {
          const callResult = call(expected);
          if (callResult.result) {
            return callResult;
          }
        }
        return { result: false };
      case "allOf": {
        /**
         * Every member must match somewhere in the same window; the window
         * is consumed up to the furthest match.
         *
         * @example
         * expected = [4, 2];
         * called = [1, 2, 3, 4, 5];
         *
         * { result: true, take: 4 };
         */
        let take = 0;
        for (const expected of props.expected.allOf) {
          const matched = call(expected);
          if (!matched.result) {
            return { result: false };
          }
          take = Math.max(take, matched.take);
        }
        return { result: true, take };
      }
    }
  };
}

// NOTE(review): `isNext` builds its tool from this interface through
// `typia.llm.application`; presumably the JSDoc below ends up in the tool
// description sent to the model, so treat wording changes as prompt changes.
interface IPredicatorApplication {
  /**
   * Ask user to consent for what the AI agent wants to do next.
   *
   * If AI agent wants to do some function calling at next,
   * but it needs the user's consent about the function calling to do,
   * then call this tool function.
   *
   * @param props Properties for asking the user's consent
   */
  consent(props: IConsentProps): void;
}

/**
 * Properties for asking the user's consent
 */
interface IConsentProps {
  /**
   * Reason of the message implying what the AI agent wants
   * to do at the next step after the user's consent.
   */
  content: string;

  /**
   * Recommended reply message for the user.
   *
   * The message what AI agent wants the user to reply
   * accepting the AI agent's next job suggestion.
   */
  reply: string;
}
@@ -0,0 +1,40 @@
1
+ import { IAgenticaBenchmarkExpected } from "../structures/IAgenticaBenchmarkExpected";
2
+
3
export namespace AgenticaBenchmarkUtil {
  /**
   * Turn a thrown value into something `JSON.stringify` can render.
   *
   * `Error` instances are copied into a plain object that carries their own
   * enumerable properties plus `name`, `message` and `stack`. Any other
   * value is handed back untouched.
   *
   * @param error Value caught from a failed benchmark step
   * @returns Plain object for `Error` instances, otherwise `error` itself
   */
  export const errorToJson = (error: any): any => {
    if (!(error instanceof Error)) return error;

    const { name, message, stack } = error;
    return { ...error, name, message, stack };
  };

  /**
   * Reduce an expectation tree to a compact, serializable summary.
   *
   * Leaf (`standalone`) nodes keep only the operation name and its function
   * description; composite nodes (`array`, `allOf`, `anyOf`) are converted
   * recursively.
   *
   * @param expected Expectation tree of a benchmark scenario
   * @returns JSON-friendly mirror of the tree
   */
  export const expectedToJson = (expected: IAgenticaBenchmarkExpected): any => {
    switch (expected.type) {
      case "standalone": {
        const { name } = expected.operation;
        const description = expected.operation.function.description;
        return {
          type: expected.type,
          operation: { name, description },
        };
      }
      case "array":
        return {
          type: expected.type,
          items: expected.items.map((child) => expectedToJson(child)),
        };
      case "allOf":
        return {
          type: expected.type,
          allOf: expected.allOf.map((child) => expectedToJson(child)),
        };
      default:
        // Remaining variant is "anyOf".
        return {
          type: expected.type,
          anyOf: expected.anyOf.map((child) => expectedToJson(child)),
        };
    }
  };
}
@@ -0,0 +1,177 @@
1
+ import { IAgenticaCallBenchmarkEvent } from "../structures/IAgenticaCallBenchmarkEvent";
2
+ import { IAgenticaCallBenchmarkResult } from "../structures/IAgenticaCallBenchmarkResult";
3
+ import { MathUtil } from "../utils/MathUtil";
4
+ import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
5
+ import { AgenticaPromptReporter } from "./AgenticaPromptReporter";
6
+
7
export namespace AgenticaCallBenchmarkReporter {
  /**
   * Render a function-call benchmark result as a set of markdown documents.
   *
   * Produces one top-level `./README.md`, one `README.md` per scenario and
   * one `<n>.<type>.md` page per trial, where `n` starts at 1.
   *
   * @param result Benchmark result to report
   * @returns Dictionary of relative file path to markdown content
   */
  export const markdown = (
    result: IAgenticaCallBenchmarkResult,
  ): Record<string, string> => {
    const files: Array<[string, string]> = [
      ["./README.md", writeIndex(result)],
    ];
    for (const exp of result.experiments) {
      files.push([
        `./${exp.scenario.name}/README.md`,
        writeExperimentIndex(exp),
      ]);
      exp.events.forEach((event, i) => {
        // The same 1-based number is used for the file name, the link in
        // the scenario index and the heading inside the page.
        const no: number = i + 1;
        files.push([
          `./${exp.scenario.name}/${no}.${event.type}.md`,
          writeExperimentEvent(event, no),
        ]);
      });
    }
    return Object.fromEntries(files);
  };

  /**
   * Write the top-level summary: aggregate counters, token usage and one
   * table row per scenario.
   */
  const writeIndex = (result: IAgenticaCallBenchmarkResult): string => {
    const events: IAgenticaCallBenchmarkEvent[] = result.experiments
      .map((r) => r.events)
      .flat();
    return [
      "# LLM Function Call Benchmark",
      "## Summary",
      ` - Aggregation:`,
      ` - Scenarios: #${result.experiments.length.toLocaleString()}`,
      ` - Trial: ${events.length}`,
      ` - Success: ${events.filter((e) => e.type === "success").length}`,
      ` - Failure: ${events.filter((e) => e.type === "failure").length}`,
      ` - Average Time: ${MathUtil.round(averageTime(events)).toLocaleString()} ms`,
      ` - Token Usage`,
      ` - Total: ${result.usage.total.toLocaleString()}`,
      ` - Prompt`,
      ` - Total: ${result.usage.prompt.total.toLocaleString()}`,
      ` - Audio: ${result.usage.prompt.audio.toLocaleString()}`,
      ` - Cached: ${result.usage.prompt.cached.toLocaleString()}`,
      ` - Completion:`,
      ` - Total: ${result.usage.completion.total.toLocaleString()}`,
      ` - Accepted Prediction: ${result.usage.completion.accepted_prediction.toLocaleString()}`,
      ` - Audio: ${result.usage.completion.audio.toLocaleString()}`,
      ` - Reasoning: ${result.usage.completion.reasoning.toLocaleString()}`,
      ` - Rejected Prediction: ${result.usage.completion.rejected_prediction.toLocaleString()}`,
      "",
      "## Experiments",
      " Name | Select | Call | Time/Avg ",
      ":-----|:-------|:-----|----------:",
      ...result.experiments.map((exp) =>
        [
          `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
          drawStatus(
            exp.events,
            (e) => e.type !== "error" && e.select === true,
          ),
          drawStatus(exp.events, (e) => e.type !== "error" && e.call === true),
          `${MathUtil.round(averageTime(exp.events)).toLocaleString()} ms`,
        ].join(" | "),
      ),
    ].join("\n");
  };

  /**
   * Write the per-scenario page: counters, a table linking every trial page
   * and the scenario definition (user prompt and expected operations).
   */
  const writeExperimentIndex = (
    exp: IAgenticaCallBenchmarkResult.IExperiment,
  ): string => {
    return [
      `# ${exp.scenario.name}`,
      "## Summary",
      ` - Scenarios: #${exp.events.length.toLocaleString()}`,
      ` - Success: ${exp.events.filter((e) => e.type === "success").length}`,
      ` - Failure: ${exp.events.filter((e) => e.type === "failure").length}`,
      ` - Average Time: ${MathUtil.round(averageTime(exp.events)).toLocaleString()} ms`,
      "",
      "## Events",
      " Name | Type | Time",
      ":-----|:-----|----:",
      ...exp.events.map((e, i) =>
        [
          `[${i + 1}.](./${i + 1}.${e.type}.md)`,
          e.type,
          `${MathUtil.round(elapsedTime(e))} ms`,
        ].join(" | "),
      ),
      "",
      "## Scenario",
      "### User Prompt",
      exp.scenario.text,
      "",
      "### Expected",
      "```json",
      JSON.stringify(
        AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
        null,
        2,
      ),
      "```",
    ].join("\n");
  };

  /**
   * Write the page of a single trial.
   *
   * @param event Trial to describe
   * @param index 1-based trial number, matching the page's file name
   */
  const writeExperimentEvent = (
    event: IAgenticaCallBenchmarkEvent,
    index: number,
  ): string => {
    return [
      `# ${index}. ${event.type}`,
      "## Summary",
      ` - Name: ${event.scenario.name}`,
      ` - Type: ${event.type}`,
      ` - Time: ${MathUtil.round(elapsedTime(event)).toLocaleString()} ms`,
      ...(event.type !== "error"
        ? [
            ` - Select: ${event.select ? "✅" : "❌"}`,
            ` - Call: ${event.call ? "✅" : "❌"}`,
          ]
        : []),
      // NOTE(review): if `event.usage` is a structured usage object like
      // `result.usage`, this prints "[object Object]" — confirm its type.
      ` - Token Usage: ${event.usage.toLocaleString()}`,
      "",
      "## Scenario",
      "### User Prompt",
      event.scenario.text,
      "",
      "### Expected",
      "```json",
      JSON.stringify(
        AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
        null,
        2,
      ),
      "```",
      "",
      "## Prompt Histories",
      ...event.prompts.map((p) => AgenticaPromptReporter.markdown(p)),
      "",
      ...(event.type === "error"
        ? [
            "## Error",
            "```json",
            JSON.stringify(
              AgenticaBenchmarkUtil.errorToJson(event.error),
              null,
              2,
            ),
            "```",
          ]
        : []),
    ].join("\n");
  };

  /** Wall-clock duration of one trial in milliseconds. */
  const elapsedTime = (event: IAgenticaCallBenchmarkEvent): number =>
    event.completed_at.getTime() - event.started_at.getTime();

  /** Mean trial duration in milliseconds; `0` when there are no trials. */
  const averageTime = (events: IAgenticaCallBenchmarkEvent[]): number =>
    events.length === 0
      ? 0
      : events.map(elapsedTime).reduce((a, b) => a + b, 0) / events.length;

  /**
   * Draw a ten-cell bar showing the share of trials accepted by `success`.
   *
   * The bar is scaled by the number of trials, so it stays ten cells wide
   * for any repeat count; an empty trial list yields an empty bar.
   */
  const drawStatus = (
    events: IAgenticaCallBenchmarkEvent[],
    success: (e: IAgenticaCallBenchmarkEvent) => boolean,
  ): string => {
    const filled: number =
      events.length === 0
        ? 0
        : Math.floor((events.filter(success).length / events.length) * 10);
    return "■".repeat(filled) + "□".repeat(10 - filled);
  };
}
@@ -0,0 +1,43 @@
1
+ import { IAgenticaPrompt } from "@agentica/core";
2
+
3
export namespace AgenticaPromptReporter {
  /**
   * Render one prompt history as a markdown section.
   *
   * - `text`: the role in the heading followed by the message
   * - `select` / `cancel`: one sub-section per operation with its
   *   controller, function, reason and (when non-empty) description
   * - `describe`: the described execution names and the text as a quote
   * - anything else is rendered as an execution with its arguments as JSON
   *
   * @param p Prompt history to render
   * @returns Markdown fragment ending with a line break
   */
  export const markdown = (p: IAgenticaPrompt): string => {
    switch (p.type) {
      case "text": {
        const lines: string[] = [`### Text (${p.role})`, p.text, ""];
        return lines.join("\n");
      }
      case "select":
      case "cancel": {
        const lines: string[] = [
          `### ${p.type === "select" ? "Select" : "Cancel"}`,
        ];
        for (const op of p.operations) {
          lines.push(
            `#### ${op.name}`,
            ` - controller: ${op.controller.name}`,
            ` - function: ${op.function.name}`,
            ` - reason: ${op.reason}`,
            "",
          );
          // Skip missing and empty descriptions alike.
          if (!!op.function.description?.length)
            lines.push(op.function.description, "");
        }
        return lines.join("\n");
      }
      case "describe": {
        const names: string[] = p.executions.map((e) => ` - ${e.name}`);
        const quoted: string[] = p.text.split("\n").map((s) => `> ${s}`);
        return [
          "### Describe",
          ...names,
          "",
          ...quoted,
          "",
        ].join("\n");
      }
      default: {
        const argumentsJson: string = JSON.stringify(p.arguments, null, 2);
        return [
          "### Execute",
          ` - name: ${p.name}`,
          ` - controller: ${p.controller.name}`,
          ` - function: ${p.function.name}`,
          "",
          "```json",
          argumentsJson,
          "```",
          "",
        ].join("\n");
      }
    }
  };
}
@@ -0,0 +1,212 @@
1
+ import { IAgenticaSelectBenchmarkEvent } from "../structures/IAgenticaSelectBenchmarkEvent";
2
+ import { IAgenticaSelectBenchmarkResult } from "../structures/IAgenticaSelectBenchmarkResult";
3
+ import { MathUtil } from "../utils/MathUtil";
4
+ import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
5
+
6
/**
 * Markdown reporter of the function selection benchmark.
 *
 * @internal
 */
export namespace AgenticaSelectBenchmarkReporter {
  /**
   * Render a selection benchmark result as a set of markdown documents.
   *
   * Produces one top-level `./README.md`, one `README.md` per scenario and
   * one `<n>.<type>.md` page per trial, where `n` starts at 1.
   *
   * @param result Benchmark result to report
   * @returns Dictionary of relative file path to markdown content
   */
  export const markdown = (
    result: IAgenticaSelectBenchmarkResult,
  ): Record<string, string> => {
    const files: Array<[string, string]> = [
      ["./README.md", writeIndex(result)],
    ];
    for (const exp of result.experiments) {
      files.push([
        `./${exp.scenario.name}/README.md`,
        writeExperimentIndex(exp),
      ]);
      exp.events.forEach((event, i) => {
        // The same 1-based number is used for the file name, the link in
        // the scenario index and the heading inside the page.
        const no: number = i + 1;
        files.push([
          `./${exp.scenario.name}/${no}.${event.type}.md`,
          writeExperimentEvent(event, no),
        ]);
      });
    }
    return Object.fromEntries(files);
  };

  /**
   * Write the top-level summary: aggregate counters, token usage and one
   * table row per scenario.
   */
  const writeIndex = (result: IAgenticaSelectBenchmarkResult): string => {
    const events: IAgenticaSelectBenchmarkEvent[] = result.experiments
      .map((r) => r.events)
      .flat();
    return [
      "# LLM Function Selection Benchmark",
      "## Summary",
      ` - Aggregation:`,
      ` - Scenarios: #${result.experiments.length.toLocaleString()}`,
      ` - Trial: ${events.length}`,
      ` - Success: ${events.filter((e) => e.type === "success").length}`,
      ` - Failure: ${events.filter((e) => e.type === "failure").length}`,
      ` - Average Time: ${MathUtil.round(averageTime(events)).toLocaleString()} ms`,
      ...writeUsage(result.usage),
      "",
      "## Experiments",
      " Name | Status | Time/Avg ",
      ":-----|:-------|----------:",
      ...result.experiments.map((exp) =>
        [
          `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
          drawStatus(exp.events),
          MathUtil.round(averageTime(exp.events)).toLocaleString() + " ms",
        ].join(" | "),
      ),
    ].join("\n");
  };

  /**
   * Write the per-scenario page: counters, token usage, a table linking
   * every trial page and the scenario definition.
   */
  const writeExperimentIndex = (
    exp: IAgenticaSelectBenchmarkResult.IExperiment,
  ): string => {
    return [
      `# ${exp.scenario.name}`,
      "## Summary",
      " - Aggregation:",
      ` - Trial: ${exp.events.length}`,
      ` - Success: ${exp.events.filter((e) => e.type === "success").length}`,
      ` - Failure: ${exp.events.filter((e) => e.type === "failure").length}`,
      ` - Average Time: ${MathUtil.round(averageTime(exp.events)).toLocaleString()} ms`,
      ...writeUsage(exp.usage),
      "",
      "## Events",
      " No | Type | Time",
      "---:|:-----|----:",
      ...exp.events.map((e, i) =>
        [
          `[${i + 1}.](./${i + 1}.${e.type}.md)`,
          e.type,
          MathUtil.round(elapsedTime(e)) + " ms",
        ].join(" | "),
      ),
      "",
      "## Scenario",
      "### User Prompt",
      exp.scenario.text,
      "",
      "### Expected",
      "```json",
      JSON.stringify(
        AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
        null,
        2,
      ),
      "```",
    ].join("\n");
  };

  /**
   * Write the page of a single trial.
   *
   * @param event Trial to describe
   * @param index 1-based trial number, matching the page's file name
   */
  const writeExperimentEvent = (
    event: IAgenticaSelectBenchmarkEvent,
    index: number,
  ): string => {
    return [
      `# ${index}. ${event.type}`,
      `## Summary`,
      ` - Name: ${event.scenario.name}`,
      ` - Type: ${event.type}`,
      ` - Time: ${elapsedTime(event).toLocaleString()} ms`,
      // Token usage is only read from non-error trials.
      ...(event.type !== "error" ? writeUsage(event.usage) : []),
      "",
      "## Scenario",
      "### User Prompt",
      event.scenario.text,
      "",
      "### Expected",
      "```json",
      JSON.stringify(
        AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
        null,
        2,
      ),
      "```",
      "",
      ...(event.type === "success" || event.type === "failure"
        ? [
            "## Result",
            ...event.selected.map((s) =>
              [
                `### ${s.name}`,
                ` - Controller: \`${s.controller.name}\``,
                ` - Function: \`${s.function.name}\``,
                ` - Reason: ${s.reason}`,
                "",
                ...(s.function.description ? [s.function.description, ""] : []),
              ].join("\n"),
            ),
          ]
        : []),
      ...(event.type === "error"
        ? [
            "## Error",
            "```json",
            // Convert first, then serialize: stringifying an `Error`
            // directly yields "{}" and drops its name, message and stack.
            JSON.stringify(
              AgenticaBenchmarkUtil.errorToJson(event.error),
              null,
              2,
            ),
            "```",
            "",
          ]
        : []),
    ].join("\n");
  };

  /** Value that can be printed through `toLocaleString()`. */
  type Printable = { toLocaleString(): string };

  /** Fields of a token usage record that the report prints. */
  interface IUsageShape {
    total: Printable;
    prompt: { total: Printable; audio: Printable; cached: Printable };
    completion: {
      total: Printable;
      accepted_prediction: Printable;
      audio: Printable;
      reasoning: Printable;
      rejected_prediction: Printable;
    };
  }

  /** List the token usage lines shared by every page of the report. */
  const writeUsage = (usage: IUsageShape): string[] => [
    " - Token Usage",
    ` - Total: ${usage.total.toLocaleString()}`,
    ` - Prompt`,
    ` - Total: ${usage.prompt.total.toLocaleString()}`,
    ` - Audio: ${usage.prompt.audio.toLocaleString()}`,
    ` - Cached: ${usage.prompt.cached.toLocaleString()}`,
    ` - Completion:`,
    ` - Total: ${usage.completion.total.toLocaleString()}`,
    ` - Accepted Prediction: ${usage.completion.accepted_prediction.toLocaleString()}`,
    ` - Audio: ${usage.completion.audio.toLocaleString()}`,
    ` - Reasoning: ${usage.completion.reasoning.toLocaleString()}`,
    ` - Rejected Prediction: ${usage.completion.rejected_prediction.toLocaleString()}`,
  ];

  /** Wall-clock duration of one trial in milliseconds. */
  const elapsedTime = (event: IAgenticaSelectBenchmarkEvent): number =>
    event.completed_at.getTime() - event.started_at.getTime();

  /** Mean trial duration in milliseconds; `0` when there are no trials. */
  const averageTime = (events: IAgenticaSelectBenchmarkEvent[]): number =>
    events.length === 0
      ? 0
      : events.map(elapsedTime).reduce((a, b) => a + b, 0) / events.length;

  /**
   * Draw a ten-cell bar showing the share of successful trials; an empty
   * trial list yields an empty bar instead of an invalid array length.
   */
  const drawStatus = (events: IAgenticaSelectBenchmarkEvent[]): string => {
    const filled: number =
      events.length === 0
        ? 0
        : Math.floor(
            (events.filter((e) => e.type === "success").length /
              events.length) *
              10,
          );
    return "■".repeat(filled) + "□".repeat(10 - filled);
  };
}