@agentica/benchmark 0.8.3 → 0.9.0-dev.20250302

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. package/LICENSE +21 -21
  2. package/README.md +326 -324
  3. package/lib/AgenticaCallBenchmark.d.ts +7 -6
  4. package/lib/AgenticaCallBenchmark.js.map +1 -1
  5. package/lib/AgenticaSelectBenchmark.d.ts +7 -6
  6. package/lib/AgenticaSelectBenchmark.js.map +1 -1
  7. package/lib/index.mjs +46 -1
  8. package/lib/index.mjs.map +1 -1
  9. package/lib/internal/AgenticaBenchmarkPredicator.d.ts +5 -4
  10. package/lib/internal/AgenticaBenchmarkPredicator.js +74 -2
  11. package/lib/internal/AgenticaBenchmarkPredicator.js.map +1 -1
  12. package/lib/internal/AgenticaBenchmarkUtil.d.ts +2 -1
  13. package/lib/internal/AgenticaBenchmarkUtil.js.map +1 -1
  14. package/lib/internal/AgenticaCallBenchmarkReporter.d.ts +2 -1
  15. package/lib/internal/AgenticaCallBenchmarkReporter.js.map +1 -1
  16. package/lib/internal/AgenticaPromptReporter.d.ts +2 -1
  17. package/lib/internal/AgenticaPromptReporter.js.map +1 -1
  18. package/lib/internal/AgenticaSelectBenchmarkReporter.js.map +1 -1
  19. package/lib/structures/IAgenticaBenchmarkExpected.d.ts +10 -9
  20. package/lib/structures/IAgenticaCallBenchmarkEvent.d.ts +8 -7
  21. package/lib/structures/IAgenticaCallBenchmarkResult.d.ts +6 -5
  22. package/lib/structures/IAgenticaCallBenchmarkScenario.d.ts +3 -2
  23. package/lib/structures/IAgenticaSelectBenchmarkEvent.d.ts +9 -8
  24. package/lib/structures/IAgenticaSelectBenchmarkResult.d.ts +6 -5
  25. package/lib/structures/IAgenticaSelectBenchmarkScenario.d.ts +3 -2
  26. package/package.json +5 -5
  27. package/src/AgenticaCallBenchmark.ts +268 -265
  28. package/src/AgenticaSelectBenchmark.ts +256 -254
  29. package/src/index.ts +3 -3
  30. package/src/internal/AgenticaBenchmarkPredicator.ts +224 -216
  31. package/src/internal/AgenticaBenchmarkUtil.ts +44 -40
  32. package/src/internal/AgenticaCallBenchmarkReporter.ts +183 -180
  33. package/src/internal/AgenticaPromptReporter.ts +46 -43
  34. package/src/internal/AgenticaSelectBenchmarkReporter.ts +213 -210
  35. package/src/structures/IAgenticaBenchmarkExpected.ts +68 -58
  36. package/src/structures/IAgenticaCallBenchmarkEvent.ts +113 -109
  37. package/src/structures/IAgenticaCallBenchmarkResult.ts +70 -69
  38. package/src/structures/IAgenticaCallBenchmarkScenario.ts +43 -39
  39. package/src/structures/IAgenticaSelectBenchmarkEvent.ts +114 -110
  40. package/src/structures/IAgenticaSelectBenchmarkResult.ts +72 -69
  41. package/src/structures/IAgenticaSelectBenchmarkScenario.ts +43 -39
  42. package/src/utils/MathUtil.ts +3 -3
@@ -1,180 +1,183 @@
1
- import { IAgenticaTokenUsage } from "@agentica/core";
2
-
3
- import { IAgenticaCallBenchmarkEvent } from "../structures/IAgenticaCallBenchmarkEvent";
4
- import { IAgenticaCallBenchmarkResult } from "../structures/IAgenticaCallBenchmarkResult";
5
- import { MathUtil } from "../utils/MathUtil";
6
- import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
7
- import { AgenticaPromptReporter } from "./AgenticaPromptReporter";
8
-
9
- export namespace AgenticaCallBenchmarkReporter {
10
- export const markdown = (
11
- result: IAgenticaCallBenchmarkResult,
12
- ): Record<string, string> =>
13
- Object.fromEntries([
14
- ["./README.md", writeIndex(result)],
15
- ...result.experiments
16
- .map((exp) => [
17
- [`./${exp.scenario.name}/README.md`, writeExperimentIndex(exp)],
18
- ...exp.events.map((event, i) => [
19
- `./${exp.scenario.name}/${i + 1}.${event.type}.md`,
20
- writeExperimentEvent(event, i),
21
- ]),
22
- ])
23
- .flat(),
24
- ]);
25
-
26
- const writeIndex = (result: IAgenticaCallBenchmarkResult): string => {
27
- const events: IAgenticaCallBenchmarkEvent[] = result.experiments
28
- .map((r) => r.events)
29
- .flat();
30
- const average: number =
31
- events
32
- .map((e) => e.completed_at.getTime() - e.started_at.getTime())
33
- .reduce((a, b) => a + b, 0) / events.length;
34
- const aggregate: IAgenticaTokenUsage.IComponent = result.usage.aggregate;
35
- return [
36
- "# LLM Function Call Benchmark",
37
- "## Summary",
38
- ` - Aggregation:`,
39
- ` - Scenarios: #${result.experiments.length.toLocaleString()}`,
40
- ` - Trial: ${events.length}`,
41
- ` - Success: ${events.filter((e) => e.type === "success").length}`,
42
- ` - Failure: ${events.filter((e) => e.type === "failure").length}`,
43
- ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`,
44
- ` - Token Usage`,
45
- ` - Total: ${aggregate.total.toLocaleString()}`,
46
- ` - Input`,
47
- ` - Total: ${aggregate.input.total.toLocaleString()}`,
48
- ` - Cached: ${aggregate.input.cached.toLocaleString()}`,
49
- ` - Output:`,
50
- ` - Total: ${aggregate.output.total.toLocaleString()}`,
51
- ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
52
- ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
53
- ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
54
- "",
55
- "## Experiments",
56
- " Name | Select | Call | Time/Avg ",
57
- ":-----|:-------|:-----|----------:",
58
- ...result.experiments.map((exp) =>
59
- [
60
- `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
61
- drawStatus(
62
- exp.events,
63
- (e) => e.type !== "error" && e.select === true,
64
- ),
65
- drawStatus(exp.events, (e) => e.type !== "error" && e.call === true),
66
- `${MathUtil.round(
67
- exp.events
68
- .map((e) => e.completed_at.getTime() - e.started_at.getTime())
69
- .reduce((a, b) => a + b, 0) / exp.events.length,
70
- ).toLocaleString()} ms`,
71
- ].join(" | "),
72
- ),
73
- ].join("\n");
74
- };
75
-
76
- const writeExperimentIndex = (
77
- exp: IAgenticaCallBenchmarkResult.IExperiment,
78
- ): string => {
79
- return [
80
- `# ${exp.scenario.name}`,
81
- "## Summary",
82
- ` - Scenarios: #${exp.events.length.toLocaleString()}`,
83
- ` - Success: ${exp.events.filter((e) => e.type === "success").length}`,
84
- ` - Failure: ${exp.events.filter((e) => e.type === "failure").length}`,
85
- ` - Average Time: ${MathUtil.round(
86
- exp.events
87
- .map((e) => e.completed_at.getTime() - e.started_at.getTime())
88
- .reduce((a, b) => a + b, 0) / exp.events.length,
89
- ).toLocaleString()} ms`,
90
- "",
91
- "## Events",
92
- " Name | Type | Time",
93
- ":-----|:-----|----:",
94
- ...exp.events.map((e, i) =>
95
- [
96
- `[${i + 1}.](./${i + 1}.${e.type}.md)`,
97
- e.type,
98
- `${MathUtil.round(e.completed_at.getTime() - e.started_at.getTime())} ms`,
99
- ].join(" | "),
100
- ),
101
- "",
102
- "## Scenario",
103
- "### User Prompt",
104
- exp.scenario.text,
105
- "",
106
- "### Expected",
107
- "```json",
108
- JSON.stringify(
109
- AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
110
- null,
111
- 2,
112
- ),
113
- "```",
114
- ].join("\n");
115
- };
116
-
117
- const writeExperimentEvent = (
118
- event: IAgenticaCallBenchmarkEvent,
119
- index: number,
120
- ): string => {
121
- return [
122
- `# ${index + 1}. ${event.type}`,
123
- "## Summary",
124
- ` - Name: ${event.scenario.name}`,
125
- ` - Type: ${event.type}`,
126
- ` - Time: ${MathUtil.round(
127
- event.completed_at.getTime() - event.started_at.getTime(),
128
- ).toLocaleString()} ms`,
129
- ...(event.type !== "error"
130
- ? [
131
- ` - Select: ${event.select ? "✅" : "❌"}`,
132
- ` - Call: ${event.call ? "" : "❌"}`,
133
- ]
134
- : []),
135
- ` - Token Usage: ${event.usage.toLocaleString()}`,
136
- "",
137
- "## Scenario",
138
- "### User Prompt",
139
- event.scenario.text,
140
- "",
141
- "### Expected",
142
- "```json",
143
- JSON.stringify(
144
- AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
145
- null,
146
- 2,
147
- ),
148
- "```",
149
- "",
150
- "## Prompt Histories",
151
- ...event.prompts.map(AgenticaPromptReporter.markdown),
152
- "",
153
- ...(event.type === "error"
154
- ? [
155
- "## Error",
156
- "```json",
157
- JSON.stringify(
158
- AgenticaBenchmarkUtil.errorToJson(event.error),
159
- null,
160
- 2,
161
- ),
162
- "```",
163
- ]
164
- : []),
165
- ].join("\n");
166
- };
167
-
168
- const drawStatus = (
169
- events: IAgenticaCallBenchmarkEvent[],
170
- success: (e: IAgenticaCallBenchmarkEvent) => boolean,
171
- ): string => {
172
- const count: number = Math.floor(
173
- (events.filter(success).length / events.length) * 10,
174
- );
175
- return (
176
- new Array(count).fill("■").join("") +
177
- new Array(10 - count).fill("□").join("")
178
- );
179
- };
180
- }
1
+ import { IAgenticaTokenUsage } from "@agentica/core";
2
+ import { ILlmSchema } from "@samchon/openapi";
3
+
4
+ import { IAgenticaCallBenchmarkEvent } from "../structures/IAgenticaCallBenchmarkEvent";
5
+ import { IAgenticaCallBenchmarkResult } from "../structures/IAgenticaCallBenchmarkResult";
6
+ import { MathUtil } from "../utils/MathUtil";
7
+ import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
8
+ import { AgenticaPromptReporter } from "./AgenticaPromptReporter";
9
+
10
+ export namespace AgenticaCallBenchmarkReporter {
11
+ export const markdown = <Model extends ILlmSchema.Model>(
12
+ result: IAgenticaCallBenchmarkResult<Model>,
13
+ ): Record<string, string> =>
14
+ Object.fromEntries([
15
+ ["./README.md", writeIndex<Model>(result)],
16
+ ...result.experiments
17
+ .map((exp) => [
18
+ [`./${exp.scenario.name}/README.md`, writeExperimentIndex(exp)],
19
+ ...exp.events.map((event, i) => [
20
+ `./${exp.scenario.name}/${i + 1}.${event.type}.md`,
21
+ writeExperimentEvent(event, i),
22
+ ]),
23
+ ])
24
+ .flat(),
25
+ ]);
26
+
27
+ const writeIndex = <Model extends ILlmSchema.Model>(
28
+ result: IAgenticaCallBenchmarkResult<Model>,
29
+ ): string => {
30
+ const events: IAgenticaCallBenchmarkEvent<Model>[] = result.experiments
31
+ .map((r) => r.events)
32
+ .flat();
33
+ const average: number =
34
+ events
35
+ .map((e) => e.completed_at.getTime() - e.started_at.getTime())
36
+ .reduce((a, b) => a + b, 0) / events.length;
37
+ const aggregate: IAgenticaTokenUsage.IComponent = result.usage.aggregate;
38
+ return [
39
+ "# LLM Function Call Benchmark",
40
+ "## Summary",
41
+ ` - Aggregation:`,
42
+ ` - Scenarios: #${result.experiments.length.toLocaleString()}`,
43
+ ` - Trial: ${events.length}`,
44
+ ` - Success: ${events.filter((e) => e.type === "success").length}`,
45
+ ` - Failure: ${events.filter((e) => e.type === "failure").length}`,
46
+ ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`,
47
+ ` - Token Usage`,
48
+ ` - Total: ${aggregate.total.toLocaleString()}`,
49
+ ` - Input`,
50
+ ` - Total: ${aggregate.input.total.toLocaleString()}`,
51
+ ` - Cached: ${aggregate.input.cached.toLocaleString()}`,
52
+ ` - Output:`,
53
+ ` - Total: ${aggregate.output.total.toLocaleString()}`,
54
+ ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
55
+ ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
56
+ ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
57
+ "",
58
+ "## Experiments",
59
+ " Name | Select | Call | Time/Avg ",
60
+ ":-----|:-------|:-----|----------:",
61
+ ...result.experiments.map((exp) =>
62
+ [
63
+ `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
64
+ drawStatus(
65
+ exp.events,
66
+ (e) => e.type !== "error" && e.select === true,
67
+ ),
68
+ drawStatus(exp.events, (e) => e.type !== "error" && e.call === true),
69
+ `${MathUtil.round(
70
+ exp.events
71
+ .map((e) => e.completed_at.getTime() - e.started_at.getTime())
72
+ .reduce((a, b) => a + b, 0) / exp.events.length,
73
+ ).toLocaleString()} ms`,
74
+ ].join(" | "),
75
+ ),
76
+ ].join("\n");
77
+ };
78
+
79
+ const writeExperimentIndex = <Model extends ILlmSchema.Model>(
80
+ exp: IAgenticaCallBenchmarkResult.IExperiment<Model>,
81
+ ): string => {
82
+ return [
83
+ `# ${exp.scenario.name}`,
84
+ "## Summary",
85
+ ` - Scenarios: #${exp.events.length.toLocaleString()}`,
86
+ ` - Success: ${exp.events.filter((e) => e.type === "success").length}`,
87
+ ` - Failure: ${exp.events.filter((e) => e.type === "failure").length}`,
88
+ ` - Average Time: ${MathUtil.round(
89
+ exp.events
90
+ .map((e) => e.completed_at.getTime() - e.started_at.getTime())
91
+ .reduce((a, b) => a + b, 0) / exp.events.length,
92
+ ).toLocaleString()} ms`,
93
+ "",
94
+ "## Events",
95
+ " Name | Type | Time",
96
+ ":-----|:-----|----:",
97
+ ...exp.events.map((e, i) =>
98
+ [
99
+ `[${i + 1}.](./${i + 1}.${e.type}.md)`,
100
+ e.type,
101
+ `${MathUtil.round(e.completed_at.getTime() - e.started_at.getTime())} ms`,
102
+ ].join(" | "),
103
+ ),
104
+ "",
105
+ "## Scenario",
106
+ "### User Prompt",
107
+ exp.scenario.text,
108
+ "",
109
+ "### Expected",
110
+ "```json",
111
+ JSON.stringify(
112
+ AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
113
+ null,
114
+ 2,
115
+ ),
116
+ "```",
117
+ ].join("\n");
118
+ };
119
+
120
+ const writeExperimentEvent = <Model extends ILlmSchema.Model>(
121
+ event: IAgenticaCallBenchmarkEvent<Model>,
122
+ index: number,
123
+ ): string => {
124
+ return [
125
+ `# ${index + 1}. ${event.type}`,
126
+ "## Summary",
127
+ ` - Name: ${event.scenario.name}`,
128
+ ` - Type: ${event.type}`,
129
+ ` - Time: ${MathUtil.round(
130
+ event.completed_at.getTime() - event.started_at.getTime(),
131
+ ).toLocaleString()} ms`,
132
+ ...(event.type !== "error"
133
+ ? [
134
+ ` - Select: ${event.select ? "✅" : "❌"}`,
135
+ ` - Call: ${event.call ? "✅" : "❌"}`,
136
+ ]
137
+ : []),
138
+ ` - Token Usage: ${event.usage.toLocaleString()}`,
139
+ "",
140
+ "## Scenario",
141
+ "### User Prompt",
142
+ event.scenario.text,
143
+ "",
144
+ "### Expected",
145
+ "```json",
146
+ JSON.stringify(
147
+ AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
148
+ null,
149
+ 2,
150
+ ),
151
+ "```",
152
+ "",
153
+ "## Prompt Histories",
154
+ ...event.prompts.map(AgenticaPromptReporter.markdown),
155
+ "",
156
+ ...(event.type === "error"
157
+ ? [
158
+ "## Error",
159
+ "```json",
160
+ JSON.stringify(
161
+ AgenticaBenchmarkUtil.errorToJson(event.error),
162
+ null,
163
+ 2,
164
+ ),
165
+ "```",
166
+ ]
167
+ : []),
168
+ ].join("\n");
169
+ };
170
+
171
+ const drawStatus = <Model extends ILlmSchema.Model>(
172
+ events: IAgenticaCallBenchmarkEvent<Model>[],
173
+ success: (e: IAgenticaCallBenchmarkEvent<Model>) => boolean,
174
+ ): string => {
175
+ const count: number = Math.floor(
176
+ (events.filter(success).length / events.length) * 10,
177
+ );
178
+ return (
179
+ new Array(count).fill("■").join("") +
180
+ new Array(10 - count).fill("□").join("")
181
+ );
182
+ };
183
+ }
@@ -1,43 +1,46 @@
1
- import { IAgenticaPrompt } from "@agentica/core";
2
-
3
- export namespace AgenticaPromptReporter {
4
- export const markdown = (p: IAgenticaPrompt): string => {
5
- if (p.type === "text")
6
- return [`### Text (${p.role})`, p.text, ""].join("\n");
7
- else if (p.type === "select" || p.type === "cancel")
8
- return [
9
- `### ${p.type === "select" ? "Select" : "Cancel"}`,
10
- ...p.operations
11
- .map((op) => [
12
- `#### ${op.name}`,
13
- ` - controller: ${op.controller.name}`,
14
- ` - function: ${op.function.name}`,
15
- ` - reason: ${op.reason}`,
16
- "",
17
- ...(!!op.function.description?.length
18
- ? [op.function.description, ""]
19
- : []),
20
- ])
21
- .flat(),
22
- ].join("\n");
23
- else if (p.type === "describe")
24
- return [
25
- "### Describe",
26
- ...p.executions.map((e) => ` - ${e.name}`),
27
- "",
28
- ...p.text.split("\n").map((s) => `> ${s}`),
29
- "",
30
- ].join("\n");
31
- return [
32
- "### Execute",
33
- ` - name: ${p.name}`,
34
- ` - controller: ${p.controller.name}`,
35
- ` - function: ${p.function.name}`,
36
- "",
37
- "```json",
38
- JSON.stringify(p.arguments, null, 2),
39
- "```",
40
- "",
41
- ].join("\n");
42
- };
43
- }
1
+ import { IAgenticaPrompt } from "@agentica/core";
2
+ import { ILlmSchema } from "@samchon/openapi";
3
+
4
+ export namespace AgenticaPromptReporter {
5
+ export const markdown = <Model extends ILlmSchema.Model>(
6
+ p: IAgenticaPrompt<Model>,
7
+ ): string => {
8
+ if (p.type === "text")
9
+ return [`### Text (${p.role})`, p.text, ""].join("\n");
10
+ else if (p.type === "select" || p.type === "cancel")
11
+ return [
12
+ `### ${p.type === "select" ? "Select" : "Cancel"}`,
13
+ ...p.operations
14
+ .map((op) => [
15
+ `#### ${op.name}`,
16
+ ` - controller: ${op.controller.name}`,
17
+ ` - function: ${op.function.name}`,
18
+ ` - reason: ${op.reason}`,
19
+ "",
20
+ ...(!!op.function.description?.length
21
+ ? [op.function.description, ""]
22
+ : []),
23
+ ])
24
+ .flat(),
25
+ ].join("\n");
26
+ else if (p.type === "describe")
27
+ return [
28
+ "### Describe",
29
+ ...p.executions.map((e) => ` - ${e.name}`),
30
+ "",
31
+ ...p.text.split("\n").map((s) => `> ${s}`),
32
+ "",
33
+ ].join("\n");
34
+ return [
35
+ "### Execute",
36
+ ` - name: ${p.name}`,
37
+ ` - controller: ${p.controller.name}`,
38
+ ` - function: ${p.function.name}`,
39
+ "",
40
+ "```json",
41
+ JSON.stringify(p.arguments, null, 2),
42
+ "```",
43
+ "",
44
+ ].join("\n");
45
+ };
46
+ }