@agentica/benchmark 0.12.21 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/README.md +39 -33
  2. package/lib/AgenticaCallBenchmark.d.ts +12 -6
  3. package/lib/AgenticaCallBenchmark.js +24 -18
  4. package/lib/AgenticaCallBenchmark.js.map +1 -1
  5. package/lib/AgenticaSelectBenchmark.d.ts +12 -6
  6. package/lib/AgenticaSelectBenchmark.js +14 -12
  7. package/lib/AgenticaSelectBenchmark.js.map +1 -1
  8. package/lib/index.mjs +315 -236
  9. package/lib/index.mjs.map +1 -1
  10. package/lib/internal/AgenticaBenchmarkPredicator.d.ts +38 -29
  11. package/lib/internal/AgenticaBenchmarkPredicator.js +100 -84
  12. package/lib/internal/AgenticaBenchmarkPredicator.js.map +1 -1
  13. package/lib/internal/AgenticaBenchmarkUtil.d.ts +21 -6
  14. package/lib/internal/AgenticaBenchmarkUtil.js +39 -33
  15. package/lib/internal/AgenticaBenchmarkUtil.js.map +1 -1
  16. package/lib/internal/AgenticaCallBenchmarkReporter.d.ts +6 -5
  17. package/lib/internal/AgenticaCallBenchmarkReporter.js +130 -126
  18. package/lib/internal/AgenticaCallBenchmarkReporter.js.map +1 -1
  19. package/lib/internal/AgenticaPromptReporter.d.ts +13 -5
  20. package/lib/internal/AgenticaPromptReporter.js +45 -41
  21. package/lib/internal/AgenticaPromptReporter.js.map +1 -1
  22. package/lib/internal/AgenticaSelectBenchmarkReporter.d.ts +3 -1
  23. package/lib/internal/AgenticaSelectBenchmarkReporter.js +153 -150
  24. package/lib/internal/AgenticaSelectBenchmarkReporter.js.map +1 -1
  25. package/lib/structures/IAgenticaBenchmarkExpected.d.ts +8 -2
  26. package/lib/structures/IAgenticaCallBenchmarkEvent.d.ts +9 -3
  27. package/lib/structures/IAgenticaCallBenchmarkResult.d.ts +10 -4
  28. package/lib/structures/IAgenticaCallBenchmarkScenario.d.ts +8 -2
  29. package/lib/structures/IAgenticaSelectBenchmarkEvent.d.ts +9 -3
  30. package/lib/structures/IAgenticaSelectBenchmarkResult.d.ts +10 -4
  31. package/lib/structures/IAgenticaSelectBenchmarkScenario.d.ts +8 -2
  32. package/lib/utils/MathUtil.d.ts +15 -3
  33. package/lib/utils/MathUtil.js +15 -4
  34. package/lib/utils/MathUtil.js.map +1 -1
  35. package/package.json +12 -10
  36. package/src/AgenticaCallBenchmark.ts +64 -45
  37. package/src/AgenticaSelectBenchmark.ts +42 -30
  38. package/src/internal/AgenticaBenchmarkPredicator.ts +208 -186
  39. package/src/internal/AgenticaBenchmarkUtil.ts +58 -40
  40. package/src/internal/AgenticaCallBenchmarkReporter.ts +180 -182
  41. package/src/internal/AgenticaPromptReporter.ts +46 -33
  42. package/src/internal/AgenticaSelectBenchmarkReporter.ts +205 -203
  43. package/src/structures/IAgenticaBenchmarkExpected.ts +9 -2
  44. package/src/structures/IAgenticaCallBenchmarkEvent.ts +9 -3
  45. package/src/structures/IAgenticaCallBenchmarkResult.ts +10 -4
  46. package/src/structures/IAgenticaCallBenchmarkScenario.ts +8 -2
  47. package/src/structures/IAgenticaSelectBenchmarkEvent.ts +9 -3
  48. package/src/structures/IAgenticaSelectBenchmarkResult.ts +10 -4
  49. package/src/structures/IAgenticaSelectBenchmarkScenario.ts +8 -2
  50. package/src/utils/MathUtil.ts +16 -3
@@ -1,193 +1,191 @@
1
- import { AgenticaTokenUsage } from "@agentica/core";
2
- import { ILlmSchema } from "@samchon/openapi";
1
+ /**
2
+ * @module
3
+ * This file contains functions to work with AgenticaCallBenchmarkReporter.
4
+ *
5
+ * @author Wrtn Technologies
6
+ */
7
+ import type { AgenticaTokenUsage } from "@agentica/core";
8
+ import type { ILlmSchema } from "@samchon/openapi";
3
9
 
4
- import { IAgenticaCallBenchmarkEvent } from "../structures/IAgenticaCallBenchmarkEvent";
5
- import { IAgenticaCallBenchmarkResult } from "../structures/IAgenticaCallBenchmarkResult";
10
+ import type { IAgenticaCallBenchmarkEvent } from "../structures/IAgenticaCallBenchmarkEvent";
11
+ import type { IAgenticaCallBenchmarkResult } from "../structures/IAgenticaCallBenchmarkResult";
6
12
  import { MathUtil } from "../utils/MathUtil";
7
13
  import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
8
14
  import { AgenticaPromptReporter } from "./AgenticaPromptReporter";
9
15
 
10
- export namespace AgenticaCallBenchmarkReporter {
11
- export const markdown = <Model extends ILlmSchema.Model>(
12
- result: IAgenticaCallBenchmarkResult<Model>,
13
- ): Record<string, string> =>
14
- Object.fromEntries([
15
- ["./README.md", writeIndex<Model>(result)],
16
- ...result.experiments
17
- .map((exp) => [
18
- [`./${exp.scenario.name}/README.md`, writeExperimentIndex(exp)],
19
- ...exp.events.map((event, i) => [
20
- `./${exp.scenario.name}/${i + 1}.${event.type}.md`,
21
- writeExperimentEvent(event, i),
22
- ]),
23
- ])
24
- .flat(),
25
- ]);
16
/**
 * Facade object of the call benchmark reporter.
 *
 * Exposes the `markdown` function of this module under a single name.
 */
export const AgenticaCallBenchmarkReporter = {
  markdown: markdown,
};
26
19
 
27
- const writeIndex = <Model extends ILlmSchema.Model>(
28
- result: IAgenticaCallBenchmarkResult<Model>,
29
- ): string => {
30
- const events: IAgenticaCallBenchmarkEvent<Model>[] = result.experiments
31
- .map((r) => r.events)
32
- .flat();
33
- const average: number =
34
- events
35
- .map((e) => e.completed_at.getTime() - e.started_at.getTime())
36
- .reduce((a, b) => a + b, 0) / events.length;
37
- const aggregate: AgenticaTokenUsage.IComponent = result.usage.aggregate;
38
- return [
39
- "# LLM Function Call Benchmark",
40
- "## Summary",
41
- ` - Aggregation:`,
42
- ` - Scenarios: #${result.experiments.length.toLocaleString()}`,
43
- ` - Trial: ${events.length}`,
44
- ` - Success: ${events.filter((e) => e.type === "success").length}`,
45
- ` - Failure: ${events.filter((e) => e.type === "failure").length}`,
46
- ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`,
47
- ` - Token Usage`,
48
- ` - Total: ${aggregate.total.toLocaleString()}`,
49
- ` - Input`,
50
- ` - Total: ${aggregate.input.total.toLocaleString()}`,
51
- ` - Cached: ${aggregate.input.cached.toLocaleString()}`,
52
- ` - Output:`,
53
- ` - Total: ${aggregate.output.total.toLocaleString()}`,
54
- ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
55
- ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
56
- ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
57
- "",
58
- "## Experiments",
59
- " Name | Select | Call | Time/Avg ",
60
- ":-----|:-------|:-----|----------:",
61
- ...result.experiments.map((exp) =>
62
- [
63
- `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
64
- drawStatus(
65
- exp.events,
66
- (e) => e.type !== "error" && e.select === true,
67
- ),
68
- drawStatus(exp.events, (e) => e.type !== "error" && e.call === true),
69
- `${MathUtil.round(
70
- exp.events
71
- .map((e) => e.completed_at.getTime() - e.started_at.getTime())
72
- .reduce((a, b) => a + b, 0) / exp.events.length,
73
- ).toLocaleString()} ms`,
74
- ].join(" | "),
75
- ),
76
- ].join("\n");
77
- };
20
/**
 * Builds every markdown file of a call benchmark report.
 *
 * The returned record maps a relative file path to its markdown content:
 * `./README.md` for the overall index, `./<scenario>/README.md` for each
 * experiment, and `./<scenario>/<n>.<type>.md` for each event (1-based `n`).
 *
 * @param result Benchmark result to report.
 * @returns Map of relative file path to markdown content.
 */
export function markdown<Model extends ILlmSchema.Model>(result: IAgenticaCallBenchmarkResult<Model>): Record<string, string> {
  const files: Record<string, string> = {};
  files["./README.md"] = writeIndex<Model>(result);

  for (const exp of result.experiments) {
    // Each experiment gets its own directory named after the scenario.
    files[`./${exp.scenario.name}/README.md`] = writeExperimentIndex(exp);
    exp.events.forEach((event, i) => {
      files[`./${exp.scenario.name}/${i + 1}.${event.type}.md`] = writeExperimentEvent(event, i);
    });
  }
  return files;
}
78
34
 
79
- const writeExperimentIndex = <Model extends ILlmSchema.Model>(
80
- exp: IAgenticaCallBenchmarkResult.IExperiment<Model>,
81
- ): string => {
82
- return [
83
- `# ${exp.scenario.name}`,
84
- "## Summary",
85
- ` - Scenarios: #${exp.events.length.toLocaleString()}`,
86
- ` - Success: ${exp.events.filter((e) => e.type === "success").length}`,
87
- ` - Failure: ${exp.events.filter((e) => e.type === "failure").length}`,
88
- ` - Average Time: ${MathUtil.round(
89
- exp.events
90
- .map((e) => e.completed_at.getTime() - e.started_at.getTime())
91
- .reduce((a, b) => a + b, 0) / exp.events.length,
92
- ).toLocaleString()} ms`,
93
- "",
94
- "## Events",
95
- " Name | Type | Time",
96
- ":-----|:-----|----:",
97
- ...exp.events.map((e, i) =>
98
- [
99
- `[${i + 1}.](./${i + 1}.${e.type}.md)`,
100
- e.type,
101
- `${MathUtil.round(e.completed_at.getTime() - e.started_at.getTime())} ms`,
102
- ].join(" | "),
103
- ),
104
- "",
105
- "## Scenario",
106
- "### User Prompt",
107
- exp.scenario.text,
108
- "",
109
- "### Expected",
110
- "```json",
111
- JSON.stringify(
112
- AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
113
- null,
114
- 2,
115
- ),
116
- "```",
117
- ].join("\n");
118
- };
35
/**
 * Renders the top-level index page of a call benchmark report.
 *
 * The page contains a summary (scenario and trial counts, success/failure
 * counts, average elapsed time, aggregated token usage) followed by a table
 * with one row per experiment.
 *
 * @param result Benchmark result to summarize.
 * @returns Markdown content, lines joined with `\n`.
 */
function writeIndex<Model extends ILlmSchema.Model>(result: IAgenticaCallBenchmarkResult<Model>): string {
  // Mean of (completed_at - started_at); 0 for an empty list, because
  // dividing by a zero length would otherwise print "NaN ms".
  const averageTime = (list: IAgenticaCallBenchmarkEvent<Model>[]): number =>
    list.length === 0
      ? 0
      : list
        .map(e => e.completed_at.getTime() - e.started_at.getTime())
        .reduce((a, b) => a + b, 0) / list.length;

  const events: IAgenticaCallBenchmarkEvent<Model>[] = result.experiments.flatMap(r => r.events);
  const aggregate: AgenticaTokenUsage.IComponent = result.usage.aggregate;
  return [
    "# LLM Function Call Benchmark",
    "## Summary",
    ` - Aggregation:`,
    ` - Scenarios: #${result.experiments.length.toLocaleString()}`,
    ` - Trial: ${events.length}`,
    ` - Success: ${events.filter(e => e.type === "success").length}`,
    ` - Failure: ${events.filter(e => e.type === "failure").length}`,
    ` - Average Time: ${MathUtil.round(averageTime(events)).toLocaleString()} ms`,
    ` - Token Usage`,
    ` - Total: ${aggregate.total.toLocaleString()}`,
    ` - Input`,
    ` - Total: ${aggregate.input.total.toLocaleString()}`,
    ` - Cached: ${aggregate.input.cached.toLocaleString()}`,
    ` - Output:`,
    ` - Total: ${aggregate.output.total.toLocaleString()}`,
    ` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
    ` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
    ` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
    "",
    "## Experiments",
    " Name | Select | Call | Time/Avg ",
    ":-----|:-------|:-----|----------:",
    // One table row per experiment: link, select gauge, call gauge, mean time.
    ...result.experiments.map(exp =>
      [
        `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
        drawStatus(
          exp.events,
          e => e.type !== "error" && e.select === true,
        ),
        drawStatus(exp.events, e => e.type !== "error" && e.call === true),
        `${MathUtil.round(averageTime(exp.events)).toLocaleString()} ms`,
      ].join(" | "),
    ),
  ].join("\n");
}
119
84
 
120
- const writeExperimentEvent = <Model extends ILlmSchema.Model>(
121
- event: IAgenticaCallBenchmarkEvent<Model>,
122
- index: number,
123
- ): string => {
124
- return [
125
- `# ${index + 1}. ${event.type}`,
126
- "## Summary",
127
- ` - Name: ${event.scenario.name}`,
128
- ` - Type: ${event.type}`,
129
- ` - Time: ${MathUtil.round(
130
- event.completed_at.getTime() - event.started_at.getTime(),
131
- ).toLocaleString()} ms`,
132
- ...(event.type !== "error"
133
- ? [
134
- ` - Select: ${event.select ? "✅" : "❌"}`,
135
- ` - Call: ${event.call ? "" : "❌"}`,
136
- ]
137
- : []),
138
- ` - Token Usage:`,
139
- ` - Total: ${JSON.stringify(event.usage.aggregate.total)}`,
140
- ` - Input`,
141
- ` - Total: ${event.usage.aggregate.input.total}`,
142
- ` - Cached: ${event.usage.aggregate.input.cached}`,
143
- ` - Output:`,
144
- ` - Total: ${event.usage.aggregate.output.total}`,
145
- ` - Accepted Prediction: ${event.usage.aggregate.output.accepted_prediction}`,
146
- ` - Reasoning: ${event.usage.aggregate.output.reasoning}`,
147
- ` - Rejected Prediction: ${event.usage.aggregate.output.rejected_prediction}`,
85
/**
 * Renders the index page of a single experiment (one scenario).
 *
 * The page contains a summary (event count, success/failure counts, average
 * elapsed time), a table linking to each event page, and the scenario's user
 * prompt together with its expected result as JSON.
 *
 * @param exp Experiment to describe.
 * @returns Markdown content, lines joined with `\n`.
 */
function writeExperimentIndex<Model extends ILlmSchema.Model>(exp: IAgenticaCallBenchmarkResult.IExperiment<Model>): string {
  // Guard the mean against an empty event list, which would otherwise
  // divide by zero and print "NaN ms".
  const average: number = exp.events.length === 0
    ? 0
    : exp.events
      .map(e => e.completed_at.getTime() - e.started_at.getTime())
      .reduce((a, b) => a + b, 0) / exp.events.length;
  return [
    `# ${exp.scenario.name}`,
    "## Summary",
    ` - Scenarios: #${exp.events.length.toLocaleString()}`,
    ` - Success: ${exp.events.filter(e => e.type === "success").length}`,
    ` - Failure: ${exp.events.filter(e => e.type === "failure").length}`,
    ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`,
    "",
    "## Events",
    " Name | Type | Time",
    ":-----|:-----|----:",
    // Link targets must match the per-event file names: "<n>.<type>.md".
    ...exp.events.map((e, i) =>
      [
        `[${i + 1}.](./${i + 1}.${e.type}.md)`,
        e.type,
        `${MathUtil.round(e.completed_at.getTime() - e.started_at.getTime())} ms`,
      ].join(" | "),
    ),
    "",
    "## Scenario",
    "### User Prompt",
    exp.scenario.text,
    "",
    "### Expected",
    "```json",
    JSON.stringify(
      AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
      null,
      2,
    ),
    "```",
  ].join("\n");
}
148
123
 
149
- "",
150
- "## Scenario",
151
- "### User Prompt",
152
- event.scenario.text,
153
- "",
154
- "### Expected",
155
- "```json",
156
- JSON.stringify(
157
- AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
158
- null,
159
- 2,
160
- ),
161
- "```",
162
- "",
163
- "## Prompt Histories",
164
- ...event.prompts.map(AgenticaPromptReporter.markdown),
165
- "",
166
- ...(event.type === "error"
167
- ? [
168
- "## Error",
169
- "```json",
170
- JSON.stringify(
171
- AgenticaBenchmarkUtil.errorToJson(event.error),
172
- null,
173
- 2,
174
- ),
175
- "```",
176
- ]
177
- : []),
178
- ].join("\n");
179
- };
124
/**
 * Renders the page of a single benchmark event (one trial of a scenario).
 *
 * The page contains a summary (name, type, elapsed time, token usage and,
 * for non-error events, the select/call outcome), the scenario's prompt and
 * expected result, the prompt histories, and the error as JSON when the
 * event type is `"error"`.
 *
 * @param event Event to describe.
 * @param index Zero-based position of the event; shown 1-based in the title.
 * @returns Markdown content, lines joined with `\n`.
 */
function writeExperimentEvent<Model extends ILlmSchema.Model>(event: IAgenticaCallBenchmarkEvent<Model>, index: number): string {
  return [
    `# ${index + 1}. ${event.type}`,
    "## Summary",
    ` - Name: ${event.scenario.name}`,
    ` - Type: ${event.type}`,
    ` - Time: ${MathUtil.round(
      event.completed_at.getTime() - event.started_at.getTime(),
    ).toLocaleString()} ms`,
    // Select/call outcomes are only read from non-error events.
    ...(event.type !== "error"
      ? [
          ` - Select: ${event.select ? "✅" : "❌"}`,
          // Success previously rendered as an empty string; use the same
          // mark as the Select line.
          ` - Call: ${event.call ? "✅" : "❌"}`,
        ]
      : []),
    ` - Token Usage:`,
    ` - Total: ${JSON.stringify(event.usage.aggregate.total)}`,
    ` - Input`,
    ` - Total: ${event.usage.aggregate.input.total}`,
    ` - Cached: ${event.usage.aggregate.input.cached}`,
    ` - Output:`,
    ` - Total: ${event.usage.aggregate.output.total}`,
    ` - Accepted Prediction: ${event.usage.aggregate.output.accepted_prediction}`,
    ` - Reasoning: ${event.usage.aggregate.output.reasoning}`,
    ` - Rejected Prediction: ${event.usage.aggregate.output.rejected_prediction}`,

    "",
    "## Scenario",
    "### User Prompt",
    event.scenario.text,
    "",
    "### Expected",
    "```json",
    JSON.stringify(
      AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
      null,
      2,
    ),
    "```",
    "",
    "## Prompt Histories",
    ...event.prompts.map(AgenticaPromptReporter.markdown),
    "",
    // The error section exists only for events of type "error".
    ...(event.type === "error"
      ? [
          "## Error",
          "```json",
          JSON.stringify(
            AgenticaBenchmarkUtil.errorToJson(event.error),
            null,
            2,
          ),
          "```",
        ]
      : []),
  ].join("\n");
}
180
181
 
181
- const drawStatus = <Model extends ILlmSchema.Model>(
182
- events: IAgenticaCallBenchmarkEvent<Model>[],
183
- success: (e: IAgenticaCallBenchmarkEvent<Model>) => boolean,
184
- ): string => {
185
- const count: number = Math.floor(
186
- (events.filter(success).length / events.length) * 10,
187
- );
188
- return (
189
- new Array(count).fill("■").join("") +
190
- new Array(10 - count).fill("□").join("")
191
- );
192
- };
182
/**
 * Draws a ten-cell text gauge of how many events satisfy a predicate.
 *
 * The ratio of matching events is scaled to ten cells and floored; matching
 * cells are drawn as "■" and the remainder as "□".
 *
 * @param events Events to inspect.
 * @param success Predicate deciding whether an event counts as a success.
 * @returns A string of exactly ten gauge characters.
 */
function drawStatus<Model extends ILlmSchema.Model>(events: IAgenticaCallBenchmarkEvent<Model>[], success: (e: IAgenticaCallBenchmarkEvent<Model>) => boolean): string {
  const cells = 10;
  // With no events the ratio is 0/0 (NaN), which used to yield an empty
  // gauge; draw it as fully empty instead so the table cell keeps its width.
  const filled: number = events.length === 0
    ? 0
    : Math.floor((events.filter(success).length / events.length) * cells);
  return "■".repeat(filled) + "□".repeat(cells - filled);
}
@@ -1,46 +1,59 @@
1
- import { AgenticaPrompt } from "@agentica/core";
2
- import { ILlmSchema } from "@samchon/openapi";
1
+ /**
2
+ * @module
3
+ * This file contains functions to work with AgenticaPromptReporter.
4
+ *
5
+ * @author Wrtn Technologies
6
+ */
7
+ import type { AgenticaPrompt } from "@agentica/core";
8
+ import type { ILlmSchema } from "@samchon/openapi";
3
9
 
4
- export namespace AgenticaPromptReporter {
5
- export const markdown = <Model extends ILlmSchema.Model>(
6
- p: AgenticaPrompt<Model>,
7
- ): string => {
8
- if (p.type === "text")
9
- return [`### Text (${p.role})`, p.text, ""].join("\n");
10
- else if (p.type === "select" || p.type === "cancel")
11
- return [
12
- `### ${p.type === "select" ? "Select" : "Cancel"}`,
13
- ...p.selections
14
- .map((s) => [
10
/**
 * Facade object of the prompt reporter.
 *
 * Exposes the `markdown` function of this module under a single name.
 */
export const AgenticaPromptReporter = {
  markdown: markdown,
};
13
+
14
/**
 * Renders one prompt history entry as a markdown section.
 *
 * The section layout depends on `p.type`:
 * - `"text"`: heading with the role, followed by the text.
 * - `"select"` / `"cancel"`: one sub-section per selection with its
 *   controller, function, reason and, when non-empty, function description.
 * - `"describe"`: list of executed operation names, then the text as a
 *   block quote.
 * - anything else: treated as an execute prompt; operation names followed
 *   by the arguments as a JSON code block.
 *
 * @param p Prompt to render.
 * @returns Markdown content, lines joined with `\n`.
 */
function markdown<Model extends ILlmSchema.Model>(p: AgenticaPrompt<Model>): string {
  switch (p.type) {
    case "text": {
      const lines = [`### Text (${p.role})`, p.text, ""];
      return lines.join("\n");
    }
    case "select":
    case "cancel": {
      const title = p.type === "select" ? "Select" : "Cancel";
      const sections = p.selections.flatMap((s) => {
        const description = s.operation.function.description;
        // Only emit the description paragraph when it is a non-empty string.
        const hasDescription = (description?.length ?? 0) > 0;
        return [
          `#### ${s.operation.name}`,
          ` - controller: ${s.operation.controller.name}`,
          ` - function: ${s.operation.function.name}`,
          ` - reason: ${s.reason}`,
          "",
          ...(hasDescription ? [description, ""] : []),
        ];
      });
      return [`### ${title}`, ...sections].join("\n");
    }
    case "describe": {
      const executed = p.executes.map(e => ` - ${e.operation.name}`);
      const quoted = p.text.split("\n").map(s => `> ${s}`);
      return ["### Describe", ...executed, "", ...quoted, ""].join("\n");
    }
    default:
      // Every remaining prompt kind is rendered as an execution record.
      return [
        "### Execute",
        ` - name: ${p.operation.name}`,
        ` - controller: ${p.operation.controller.name}`,
        ` - function: ${p.operation.function.name}`,
        "",
        "```json",
        JSON.stringify(p.arguments, null, 2),
        "```",
        "",
      ].join("\n");
  }
}