@agentica/benchmark 0.9.0 → 0.10.0-dev.20250302

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,183 +1,183 @@
1
- import { IAgenticaTokenUsage } from "@agentica/core";
2
- import { ILlmSchema } from "@samchon/openapi";
3
-
4
- import { IAgenticaCallBenchmarkEvent } from "../structures/IAgenticaCallBenchmarkEvent";
5
- import { IAgenticaCallBenchmarkResult } from "../structures/IAgenticaCallBenchmarkResult";
6
- import { MathUtil } from "../utils/MathUtil";
7
- import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
8
- import { AgenticaPromptReporter } from "./AgenticaPromptReporter";
9
-
10
- export namespace AgenticaCallBenchmarkReporter {
11
- export const markdown = <Model extends ILlmSchema.Model>(
12
- result: IAgenticaCallBenchmarkResult<Model>,
13
- ): Record<string, string> =>
14
- Object.fromEntries([
15
- ["./README.md", writeIndex<Model>(result)],
16
- ...result.experiments
17
- .map((exp) => [
18
- [`./${exp.scenario.name}/README.md`, writeExperimentIndex(exp)],
19
- ...exp.events.map((event, i) => [
20
- `./${exp.scenario.name}/${i + 1}.${event.type}.md`,
21
- writeExperimentEvent(event, i),
22
- ]),
23
- ])
24
- .flat(),
25
- ]);
26
-
27
- const writeIndex = <Model extends ILlmSchema.Model>(
28
- result: IAgenticaCallBenchmarkResult<Model>,
29
- ): string => {
30
- const events: IAgenticaCallBenchmarkEvent<Model>[] = result.experiments
31
- .map((r) => r.events)
32
- .flat();
33
- const average: number =
34
- events
35
- .map((e) => e.completed_at.getTime() - e.started_at.getTime())
36
- .reduce((a, b) => a + b, 0) / events.length;
37
- const aggregate: IAgenticaTokenUsage.IComponent = result.usage.aggregate;
38
- return [
39
- "# LLM Function Call Benchmark",
40
- "## Summary",
41
- ` - Aggregation:`,
42
- ` - Scenarios: #${result.experiments.length.toLocaleString()}`,
43
- ` - Trial: ${events.length}`,
44
- ` - Success: ${events.filter((e) => e.type === "success").length}`,
45
- ` - Failure: ${events.filter((e) => e.type === "failure").length}`,
46
- ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`,
47
- ` - Token Usage`,
48
- ` - Total: ${aggregate.total.toLocaleString()}`,
49
- ` - Input`,
50
- ` - Total: ${aggregate.input.total.toLocaleString()}`,
51
- ` - Cached: ${aggregate.input.cached.toLocaleString()}`,
52
- ` - Output:`,
53
- ` - Total: ${aggregate.output.total.toLocaleString()}`,
54
- ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
55
- ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
56
- ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
57
- "",
58
- "## Experiments",
59
- " Name | Select | Call | Time/Avg ",
60
- ":-----|:-------|:-----|----------:",
61
- ...result.experiments.map((exp) =>
62
- [
63
- `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
64
- drawStatus(
65
- exp.events,
66
- (e) => e.type !== "error" && e.select === true,
67
- ),
68
- drawStatus(exp.events, (e) => e.type !== "error" && e.call === true),
69
- `${MathUtil.round(
70
- exp.events
71
- .map((e) => e.completed_at.getTime() - e.started_at.getTime())
72
- .reduce((a, b) => a + b, 0) / exp.events.length,
73
- ).toLocaleString()} ms`,
74
- ].join(" | "),
75
- ),
76
- ].join("\n");
77
- };
78
-
79
- const writeExperimentIndex = <Model extends ILlmSchema.Model>(
80
- exp: IAgenticaCallBenchmarkResult.IExperiment<Model>,
81
- ): string => {
82
- return [
83
- `# ${exp.scenario.name}`,
84
- "## Summary",
85
- ` - Scenarios: #${exp.events.length.toLocaleString()}`,
86
- ` - Success: ${exp.events.filter((e) => e.type === "success").length}`,
87
- ` - Failure: ${exp.events.filter((e) => e.type === "failure").length}`,
88
- ` - Average Time: ${MathUtil.round(
89
- exp.events
90
- .map((e) => e.completed_at.getTime() - e.started_at.getTime())
91
- .reduce((a, b) => a + b, 0) / exp.events.length,
92
- ).toLocaleString()} ms`,
93
- "",
94
- "## Events",
95
- " Name | Type | Time",
96
- ":-----|:-----|----:",
97
- ...exp.events.map((e, i) =>
98
- [
99
- `[${i + 1}.](./${i + 1}.${e.type}.md)`,
100
- e.type,
101
- `${MathUtil.round(e.completed_at.getTime() - e.started_at.getTime())} ms`,
102
- ].join(" | "),
103
- ),
104
- "",
105
- "## Scenario",
106
- "### User Prompt",
107
- exp.scenario.text,
108
- "",
109
- "### Expected",
110
- "```json",
111
- JSON.stringify(
112
- AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
113
- null,
114
- 2,
115
- ),
116
- "```",
117
- ].join("\n");
118
- };
119
-
120
- const writeExperimentEvent = <Model extends ILlmSchema.Model>(
121
- event: IAgenticaCallBenchmarkEvent<Model>,
122
- index: number,
123
- ): string => {
124
- return [
125
- `# ${index + 1}. ${event.type}`,
126
- "## Summary",
127
- ` - Name: ${event.scenario.name}`,
128
- ` - Type: ${event.type}`,
129
- ` - Time: ${MathUtil.round(
130
- event.completed_at.getTime() - event.started_at.getTime(),
131
- ).toLocaleString()} ms`,
132
- ...(event.type !== "error"
133
- ? [
134
- ` - Select: ${event.select ? "✅" : "❌"}`,
135
- ` - Call: ${event.call ? "✅" : "❌"}`,
136
- ]
137
- : []),
138
- ` - Token Usage: ${event.usage.toLocaleString()}`,
139
- "",
140
- "## Scenario",
141
- "### User Prompt",
142
- event.scenario.text,
143
- "",
144
- "### Expected",
145
- "```json",
146
- JSON.stringify(
147
- AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
148
- null,
149
- 2,
150
- ),
151
- "```",
152
- "",
153
- "## Prompt Histories",
154
- ...event.prompts.map(AgenticaPromptReporter.markdown),
155
- "",
156
- ...(event.type === "error"
157
- ? [
158
- "## Error",
159
- "```json",
160
- JSON.stringify(
161
- AgenticaBenchmarkUtil.errorToJson(event.error),
162
- null,
163
- 2,
164
- ),
165
- "```",
166
- ]
167
- : []),
168
- ].join("\n");
169
- };
170
-
171
- const drawStatus = <Model extends ILlmSchema.Model>(
172
- events: IAgenticaCallBenchmarkEvent<Model>[],
173
- success: (e: IAgenticaCallBenchmarkEvent<Model>) => boolean,
174
- ): string => {
175
- const count: number = Math.floor(
176
- (events.filter(success).length / events.length) * 10,
177
- );
178
- return (
179
- new Array(count).fill("■").join("") +
180
- new Array(10 - count).fill("□").join("")
181
- );
182
- };
183
- }
1
+ import { IAgenticaTokenUsage } from "@agentica/core";
2
+ import { ILlmSchema } from "@samchon/openapi";
3
+
4
+ import { IAgenticaCallBenchmarkEvent } from "../structures/IAgenticaCallBenchmarkEvent";
5
+ import { IAgenticaCallBenchmarkResult } from "../structures/IAgenticaCallBenchmarkResult";
6
+ import { MathUtil } from "../utils/MathUtil";
7
+ import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
8
+ import { AgenticaPromptReporter } from "./AgenticaPromptReporter";
9
+
10
+ export namespace AgenticaCallBenchmarkReporter {
11
+ export const markdown = <Model extends ILlmSchema.Model>(
12
+ result: IAgenticaCallBenchmarkResult<Model>,
13
+ ): Record<string, string> =>
14
+ Object.fromEntries([
15
+ ["./README.md", writeIndex<Model>(result)],
16
+ ...result.experiments
17
+ .map((exp) => [
18
+ [`./${exp.scenario.name}/README.md`, writeExperimentIndex(exp)],
19
+ ...exp.events.map((event, i) => [
20
+ `./${exp.scenario.name}/${i + 1}.${event.type}.md`,
21
+ writeExperimentEvent(event, i),
22
+ ]),
23
+ ])
24
+ .flat(),
25
+ ]);
26
+
27
+ const writeIndex = <Model extends ILlmSchema.Model>(
28
+ result: IAgenticaCallBenchmarkResult<Model>,
29
+ ): string => {
30
+ const events: IAgenticaCallBenchmarkEvent<Model>[] = result.experiments
31
+ .map((r) => r.events)
32
+ .flat();
33
+ const average: number =
34
+ events
35
+ .map((e) => e.completed_at.getTime() - e.started_at.getTime())
36
+ .reduce((a, b) => a + b, 0) / events.length;
37
+ const aggregate: IAgenticaTokenUsage.IComponent = result.usage.aggregate;
38
+ return [
39
+ "# LLM Function Call Benchmark",
40
+ "## Summary",
41
+ ` - Aggregation:`,
42
+ ` - Scenarios: #${result.experiments.length.toLocaleString()}`,
43
+ ` - Trial: ${events.length}`,
44
+ ` - Success: ${events.filter((e) => e.type === "success").length}`,
45
+ ` - Failure: ${events.filter((e) => e.type === "failure").length}`,
46
+ ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`,
47
+ ` - Token Usage`,
48
+ ` - Total: ${aggregate.total.toLocaleString()}`,
49
+ ` - Input`,
50
+ ` - Total: ${aggregate.input.total.toLocaleString()}`,
51
+ ` - Cached: ${aggregate.input.cached.toLocaleString()}`,
52
+ ` - Output:`,
53
+ ` - Total: ${aggregate.output.total.toLocaleString()}`,
54
+ ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
55
+ ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
56
+ ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
57
+ "",
58
+ "## Experiments",
59
+ " Name | Select | Call | Time/Avg ",
60
+ ":-----|:-------|:-----|----------:",
61
+ ...result.experiments.map((exp) =>
62
+ [
63
+ `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
64
+ drawStatus(
65
+ exp.events,
66
+ (e) => e.type !== "error" && e.select === true,
67
+ ),
68
+ drawStatus(exp.events, (e) => e.type !== "error" && e.call === true),
69
+ `${MathUtil.round(
70
+ exp.events
71
+ .map((e) => e.completed_at.getTime() - e.started_at.getTime())
72
+ .reduce((a, b) => a + b, 0) / exp.events.length,
73
+ ).toLocaleString()} ms`,
74
+ ].join(" | "),
75
+ ),
76
+ ].join("\n");
77
+ };
78
+
79
+ const writeExperimentIndex = <Model extends ILlmSchema.Model>(
80
+ exp: IAgenticaCallBenchmarkResult.IExperiment<Model>,
81
+ ): string => {
82
+ return [
83
+ `# ${exp.scenario.name}`,
84
+ "## Summary",
85
+ ` - Scenarios: #${exp.events.length.toLocaleString()}`,
86
+ ` - Success: ${exp.events.filter((e) => e.type === "success").length}`,
87
+ ` - Failure: ${exp.events.filter((e) => e.type === "failure").length}`,
88
+ ` - Average Time: ${MathUtil.round(
89
+ exp.events
90
+ .map((e) => e.completed_at.getTime() - e.started_at.getTime())
91
+ .reduce((a, b) => a + b, 0) / exp.events.length,
92
+ ).toLocaleString()} ms`,
93
+ "",
94
+ "## Events",
95
+ " Name | Type | Time",
96
+ ":-----|:-----|----:",
97
+ ...exp.events.map((e, i) =>
98
+ [
99
+ `[${i + 1}.](./${i + 1}.${e.type}.md)`,
100
+ e.type,
101
+ `${MathUtil.round(e.completed_at.getTime() - e.started_at.getTime())} ms`,
102
+ ].join(" | "),
103
+ ),
104
+ "",
105
+ "## Scenario",
106
+ "### User Prompt",
107
+ exp.scenario.text,
108
+ "",
109
+ "### Expected",
110
+ "```json",
111
+ JSON.stringify(
112
+ AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
113
+ null,
114
+ 2,
115
+ ),
116
+ "```",
117
+ ].join("\n");
118
+ };
119
+
120
+ const writeExperimentEvent = <Model extends ILlmSchema.Model>(
121
+ event: IAgenticaCallBenchmarkEvent<Model>,
122
+ index: number,
123
+ ): string => {
124
+ return [
125
+ `# ${index + 1}. ${event.type}`,
126
+ "## Summary",
127
+ ` - Name: ${event.scenario.name}`,
128
+ ` - Type: ${event.type}`,
129
+ ` - Time: ${MathUtil.round(
130
+ event.completed_at.getTime() - event.started_at.getTime(),
131
+ ).toLocaleString()} ms`,
132
+ ...(event.type !== "error"
133
+ ? [
134
+ ` - Select: ${event.select ? "✅" : "❌"}`,
135
+ ` - Call: ${event.call ? "✅" : "❌"}`,
136
+ ]
137
+ : []),
138
+ ` - Token Usage: ${event.usage.toLocaleString()}`,
139
+ "",
140
+ "## Scenario",
141
+ "### User Prompt",
142
+ event.scenario.text,
143
+ "",
144
+ "### Expected",
145
+ "```json",
146
+ JSON.stringify(
147
+ AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
148
+ null,
149
+ 2,
150
+ ),
151
+ "```",
152
+ "",
153
+ "## Prompt Histories",
154
+ ...event.prompts.map(AgenticaPromptReporter.markdown),
155
+ "",
156
+ ...(event.type === "error"
157
+ ? [
158
+ "## Error",
159
+ "```json",
160
+ JSON.stringify(
161
+ AgenticaBenchmarkUtil.errorToJson(event.error),
162
+ null,
163
+ 2,
164
+ ),
165
+ "```",
166
+ ]
167
+ : []),
168
+ ].join("\n");
169
+ };
170
+
171
+ const drawStatus = <Model extends ILlmSchema.Model>(
172
+ events: IAgenticaCallBenchmarkEvent<Model>[],
173
+ success: (e: IAgenticaCallBenchmarkEvent<Model>) => boolean,
174
+ ): string => {
175
+ const count: number = Math.floor(
176
+ (events.filter(success).length / events.length) * 10,
177
+ );
178
+ return (
179
+ new Array(count).fill("■").join("") +
180
+ new Array(10 - count).fill("□").join("")
181
+ );
182
+ };
183
+ }
@@ -1,46 +1,46 @@
1
- import { IAgenticaPrompt } from "@agentica/core";
2
- import { ILlmSchema } from "@samchon/openapi";
3
-
4
- export namespace AgenticaPromptReporter {
5
- export const markdown = <Model extends ILlmSchema.Model>(
6
- p: IAgenticaPrompt<Model>,
7
- ): string => {
8
- if (p.type === "text")
9
- return [`### Text (${p.role})`, p.text, ""].join("\n");
10
- else if (p.type === "select" || p.type === "cancel")
11
- return [
12
- `### ${p.type === "select" ? "Select" : "Cancel"}`,
13
- ...p.operations
14
- .map((op) => [
15
- `#### ${op.name}`,
16
- ` - controller: ${op.controller.name}`,
17
- ` - function: ${op.function.name}`,
18
- ` - reason: ${op.reason}`,
19
- "",
20
- ...(!!op.function.description?.length
21
- ? [op.function.description, ""]
22
- : []),
23
- ])
24
- .flat(),
25
- ].join("\n");
26
- else if (p.type === "describe")
27
- return [
28
- "### Describe",
29
- ...p.executions.map((e) => ` - ${e.name}`),
30
- "",
31
- ...p.text.split("\n").map((s) => `> ${s}`),
32
- "",
33
- ].join("\n");
34
- return [
35
- "### Execute",
36
- ` - name: ${p.name}`,
37
- ` - controller: ${p.controller.name}`,
38
- ` - function: ${p.function.name}`,
39
- "",
40
- "```json",
41
- JSON.stringify(p.arguments, null, 2),
42
- "```",
43
- "",
44
- ].join("\n");
45
- };
46
- }
1
+ import { IAgenticaPrompt } from "@agentica/core";
2
+ import { ILlmSchema } from "@samchon/openapi";
3
+
4
+ export namespace AgenticaPromptReporter {
5
+ export const markdown = <Model extends ILlmSchema.Model>(
6
+ p: IAgenticaPrompt<Model>,
7
+ ): string => {
8
+ if (p.type === "text")
9
+ return [`### Text (${p.role})`, p.text, ""].join("\n");
10
+ else if (p.type === "select" || p.type === "cancel")
11
+ return [
12
+ `### ${p.type === "select" ? "Select" : "Cancel"}`,
13
+ ...p.operations
14
+ .map((op) => [
15
+ `#### ${op.name}`,
16
+ ` - controller: ${op.controller.name}`,
17
+ ` - function: ${op.function.name}`,
18
+ ` - reason: ${op.reason}`,
19
+ "",
20
+ ...(!!op.function.description?.length
21
+ ? [op.function.description, ""]
22
+ : []),
23
+ ])
24
+ .flat(),
25
+ ].join("\n");
26
+ else if (p.type === "describe")
27
+ return [
28
+ "### Describe",
29
+ ...p.executions.map((e) => ` - ${e.name}`),
30
+ "",
31
+ ...p.text.split("\n").map((s) => `> ${s}`),
32
+ "",
33
+ ].join("\n");
34
+ return [
35
+ "### Execute",
36
+ ` - name: ${p.name}`,
37
+ ` - controller: ${p.controller.name}`,
38
+ ` - function: ${p.function.name}`,
39
+ "",
40
+ "```json",
41
+ JSON.stringify(p.arguments, null, 2),
42
+ "```",
43
+ "",
44
+ ].join("\n");
45
+ };
46
+ }