@agentica/benchmark 0.8.2 → 0.8.3-dev.20250227
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +324 -324
- package/lib/index.mjs.map +1 -1
- package/lib/internal/AgenticaCallBenchmarkReporter.js.map +1 -1
- package/lib/internal/AgenticaSelectBenchmarkReporter.js.map +1 -1
- package/package.json +2 -2
- package/src/AgenticaCallBenchmark.ts +265 -265
- package/src/AgenticaSelectBenchmark.ts +254 -254
- package/src/index.ts +3 -3
- package/src/internal/AgenticaBenchmarkPredicator.ts +216 -216
- package/src/internal/AgenticaBenchmarkUtil.ts +40 -40
- package/src/internal/AgenticaCallBenchmarkReporter.ts +180 -181
- package/src/internal/AgenticaPromptReporter.ts +43 -43
- package/src/internal/AgenticaSelectBenchmarkReporter.ts +210 -212
- package/src/structures/IAgenticaBenchmarkExpected.ts +58 -58
- package/src/structures/IAgenticaCallBenchmarkEvent.ts +109 -109
- package/src/structures/IAgenticaCallBenchmarkResult.ts +69 -69
- package/src/structures/IAgenticaCallBenchmarkScenario.ts +39 -39
- package/src/structures/IAgenticaSelectBenchmarkEvent.ts +110 -110
- package/src/structures/IAgenticaSelectBenchmarkResult.ts +69 -69
- package/src/structures/IAgenticaSelectBenchmarkScenario.ts +39 -39
- package/src/utils/MathUtil.ts +3 -3
|
@@ -1,181 +1,180 @@
|
|
|
1
|
-
import { IAgenticaTokenUsage } from "@agentica/core";
|
|
2
|
-
|
|
3
|
-
import { IAgenticaCallBenchmarkEvent } from "../structures/IAgenticaCallBenchmarkEvent";
|
|
4
|
-
import { IAgenticaCallBenchmarkResult } from "../structures/IAgenticaCallBenchmarkResult";
|
|
5
|
-
import { MathUtil } from "../utils/MathUtil";
|
|
6
|
-
import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
|
|
7
|
-
import { AgenticaPromptReporter } from "./AgenticaPromptReporter";
|
|
8
|
-
|
|
9
|
-
export namespace AgenticaCallBenchmarkReporter {
|
|
10
|
-
export const markdown = (
|
|
11
|
-
result: IAgenticaCallBenchmarkResult,
|
|
12
|
-
): Record<string, string> =>
|
|
13
|
-
Object.fromEntries([
|
|
14
|
-
["./README.md", writeIndex(result)],
|
|
15
|
-
...result.experiments
|
|
16
|
-
.map((exp) => [
|
|
17
|
-
[`./${exp.scenario.name}/README.md`, writeExperimentIndex(exp)],
|
|
18
|
-
...exp.events.map((event, i) => [
|
|
19
|
-
`./${exp.scenario.name}/${i + 1}.${event.type}.md`,
|
|
20
|
-
writeExperimentEvent(event, i),
|
|
21
|
-
]),
|
|
22
|
-
])
|
|
23
|
-
.flat(),
|
|
24
|
-
]);
|
|
25
|
-
|
|
26
|
-
const writeIndex = (result: IAgenticaCallBenchmarkResult): string => {
|
|
27
|
-
const events: IAgenticaCallBenchmarkEvent[] = result.experiments
|
|
28
|
-
.map((r) => r.events)
|
|
29
|
-
.flat();
|
|
30
|
-
const average: number =
|
|
31
|
-
events
|
|
32
|
-
.map((e) => e.completed_at.getTime() - e.started_at.getTime())
|
|
33
|
-
.reduce((a, b) => a + b, 0) / events.length;
|
|
34
|
-
const aggregate: IAgenticaTokenUsage.IComponent
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
"
|
|
38
|
-
|
|
39
|
-
`
|
|
40
|
-
` -
|
|
41
|
-
` -
|
|
42
|
-
` -
|
|
43
|
-
` -
|
|
44
|
-
`
|
|
45
|
-
`
|
|
46
|
-
` -
|
|
47
|
-
`
|
|
48
|
-
` -
|
|
49
|
-
`
|
|
50
|
-
`
|
|
51
|
-
` -
|
|
52
|
-
` -
|
|
53
|
-
` -
|
|
54
|
-
|
|
55
|
-
"",
|
|
56
|
-
"
|
|
57
|
-
"
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
),
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
.
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
` -
|
|
84
|
-
` -
|
|
85
|
-
` -
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
.
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
"",
|
|
92
|
-
"
|
|
93
|
-
"
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
e.
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
"",
|
|
103
|
-
"
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
"",
|
|
107
|
-
"
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
` -
|
|
126
|
-
` -
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
` -
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
"",
|
|
138
|
-
"
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
"",
|
|
142
|
-
"
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
"
|
|
150
|
-
"",
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
""
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
"
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
new Array(count).fill("
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
}
|
|
1
|
+
import { IAgenticaTokenUsage } from "@agentica/core";
|
|
2
|
+
|
|
3
|
+
import { IAgenticaCallBenchmarkEvent } from "../structures/IAgenticaCallBenchmarkEvent";
|
|
4
|
+
import { IAgenticaCallBenchmarkResult } from "../structures/IAgenticaCallBenchmarkResult";
|
|
5
|
+
import { MathUtil } from "../utils/MathUtil";
|
|
6
|
+
import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
|
|
7
|
+
import { AgenticaPromptReporter } from "./AgenticaPromptReporter";
|
|
8
|
+
|
|
9
|
+
export namespace AgenticaCallBenchmarkReporter {
|
|
10
|
+
export const markdown = (
|
|
11
|
+
result: IAgenticaCallBenchmarkResult,
|
|
12
|
+
): Record<string, string> =>
|
|
13
|
+
Object.fromEntries([
|
|
14
|
+
["./README.md", writeIndex(result)],
|
|
15
|
+
...result.experiments
|
|
16
|
+
.map((exp) => [
|
|
17
|
+
[`./${exp.scenario.name}/README.md`, writeExperimentIndex(exp)],
|
|
18
|
+
...exp.events.map((event, i) => [
|
|
19
|
+
`./${exp.scenario.name}/${i + 1}.${event.type}.md`,
|
|
20
|
+
writeExperimentEvent(event, i),
|
|
21
|
+
]),
|
|
22
|
+
])
|
|
23
|
+
.flat(),
|
|
24
|
+
]);
|
|
25
|
+
|
|
26
|
+
const writeIndex = (result: IAgenticaCallBenchmarkResult): string => {
|
|
27
|
+
const events: IAgenticaCallBenchmarkEvent[] = result.experiments
|
|
28
|
+
.map((r) => r.events)
|
|
29
|
+
.flat();
|
|
30
|
+
const average: number =
|
|
31
|
+
events
|
|
32
|
+
.map((e) => e.completed_at.getTime() - e.started_at.getTime())
|
|
33
|
+
.reduce((a, b) => a + b, 0) / events.length;
|
|
34
|
+
const aggregate: IAgenticaTokenUsage.IComponent = result.usage.aggregate;
|
|
35
|
+
return [
|
|
36
|
+
"# LLM Function Call Benchmark",
|
|
37
|
+
"## Summary",
|
|
38
|
+
` - Aggregation:`,
|
|
39
|
+
` - Scenarios: #${result.experiments.length.toLocaleString()}`,
|
|
40
|
+
` - Trial: ${events.length}`,
|
|
41
|
+
` - Success: ${events.filter((e) => e.type === "success").length}`,
|
|
42
|
+
` - Failure: ${events.filter((e) => e.type === "failure").length}`,
|
|
43
|
+
` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`,
|
|
44
|
+
` - Token Usage`,
|
|
45
|
+
` - Total: ${aggregate.total.toLocaleString()}`,
|
|
46
|
+
` - Input`,
|
|
47
|
+
` - Total: ${aggregate.input.total.toLocaleString()}`,
|
|
48
|
+
` - Cached: ${aggregate.input.cached.toLocaleString()}`,
|
|
49
|
+
` - Output:`,
|
|
50
|
+
` - Total: ${aggregate.output.total.toLocaleString()}`,
|
|
51
|
+
` - Reasoning: ${aggregate.output.reasoning.toLocaleString()}`,
|
|
52
|
+
` - Accepted Prediction: ${aggregate.output.accepted_prediction.toLocaleString()}`,
|
|
53
|
+
` - Rejected Prediction: ${aggregate.output.rejected_prediction.toLocaleString()}`,
|
|
54
|
+
"",
|
|
55
|
+
"## Experiments",
|
|
56
|
+
" Name | Select | Call | Time/Avg ",
|
|
57
|
+
":-----|:-------|:-----|----------:",
|
|
58
|
+
...result.experiments.map((exp) =>
|
|
59
|
+
[
|
|
60
|
+
`[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
|
|
61
|
+
drawStatus(
|
|
62
|
+
exp.events,
|
|
63
|
+
(e) => e.type !== "error" && e.select === true,
|
|
64
|
+
),
|
|
65
|
+
drawStatus(exp.events, (e) => e.type !== "error" && e.call === true),
|
|
66
|
+
`${MathUtil.round(
|
|
67
|
+
exp.events
|
|
68
|
+
.map((e) => e.completed_at.getTime() - e.started_at.getTime())
|
|
69
|
+
.reduce((a, b) => a + b, 0) / exp.events.length,
|
|
70
|
+
).toLocaleString()} ms`,
|
|
71
|
+
].join(" | "),
|
|
72
|
+
),
|
|
73
|
+
].join("\n");
|
|
74
|
+
};
|
|
75
|
+
|
|
76
|
+
const writeExperimentIndex = (
|
|
77
|
+
exp: IAgenticaCallBenchmarkResult.IExperiment,
|
|
78
|
+
): string => {
|
|
79
|
+
return [
|
|
80
|
+
`# ${exp.scenario.name}`,
|
|
81
|
+
"## Summary",
|
|
82
|
+
` - Scenarios: #${exp.events.length.toLocaleString()}`,
|
|
83
|
+
` - Success: ${exp.events.filter((e) => e.type === "success").length}`,
|
|
84
|
+
` - Failure: ${exp.events.filter((e) => e.type === "failure").length}`,
|
|
85
|
+
` - Average Time: ${MathUtil.round(
|
|
86
|
+
exp.events
|
|
87
|
+
.map((e) => e.completed_at.getTime() - e.started_at.getTime())
|
|
88
|
+
.reduce((a, b) => a + b, 0) / exp.events.length,
|
|
89
|
+
).toLocaleString()} ms`,
|
|
90
|
+
"",
|
|
91
|
+
"## Events",
|
|
92
|
+
" Name | Type | Time",
|
|
93
|
+
":-----|:-----|----:",
|
|
94
|
+
...exp.events.map((e, i) =>
|
|
95
|
+
[
|
|
96
|
+
`[${i + 1}.](./${i + 1}.${e.type}.md)`,
|
|
97
|
+
e.type,
|
|
98
|
+
`${MathUtil.round(e.completed_at.getTime() - e.started_at.getTime())} ms`,
|
|
99
|
+
].join(" | "),
|
|
100
|
+
),
|
|
101
|
+
"",
|
|
102
|
+
"## Scenario",
|
|
103
|
+
"### User Prompt",
|
|
104
|
+
exp.scenario.text,
|
|
105
|
+
"",
|
|
106
|
+
"### Expected",
|
|
107
|
+
"```json",
|
|
108
|
+
JSON.stringify(
|
|
109
|
+
AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
|
|
110
|
+
null,
|
|
111
|
+
2,
|
|
112
|
+
),
|
|
113
|
+
"```",
|
|
114
|
+
].join("\n");
|
|
115
|
+
};
|
|
116
|
+
|
|
117
|
+
const writeExperimentEvent = (
|
|
118
|
+
event: IAgenticaCallBenchmarkEvent,
|
|
119
|
+
index: number,
|
|
120
|
+
): string => {
|
|
121
|
+
return [
|
|
122
|
+
`# ${index + 1}. ${event.type}`,
|
|
123
|
+
"## Summary",
|
|
124
|
+
` - Name: ${event.scenario.name}`,
|
|
125
|
+
` - Type: ${event.type}`,
|
|
126
|
+
` - Time: ${MathUtil.round(
|
|
127
|
+
event.completed_at.getTime() - event.started_at.getTime(),
|
|
128
|
+
).toLocaleString()} ms`,
|
|
129
|
+
...(event.type !== "error"
|
|
130
|
+
? [
|
|
131
|
+
` - Select: ${event.select ? "✅" : "❌"}`,
|
|
132
|
+
` - Call: ${event.call ? "✅" : "❌"}`,
|
|
133
|
+
]
|
|
134
|
+
: []),
|
|
135
|
+
` - Token Usage: ${event.usage.toLocaleString()}`,
|
|
136
|
+
"",
|
|
137
|
+
"## Scenario",
|
|
138
|
+
"### User Prompt",
|
|
139
|
+
event.scenario.text,
|
|
140
|
+
"",
|
|
141
|
+
"### Expected",
|
|
142
|
+
"```json",
|
|
143
|
+
JSON.stringify(
|
|
144
|
+
AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
|
|
145
|
+
null,
|
|
146
|
+
2,
|
|
147
|
+
),
|
|
148
|
+
"```",
|
|
149
|
+
"",
|
|
150
|
+
"## Prompt Histories",
|
|
151
|
+
...event.prompts.map(AgenticaPromptReporter.markdown),
|
|
152
|
+
"",
|
|
153
|
+
...(event.type === "error"
|
|
154
|
+
? [
|
|
155
|
+
"## Error",
|
|
156
|
+
"```json",
|
|
157
|
+
JSON.stringify(
|
|
158
|
+
AgenticaBenchmarkUtil.errorToJson(event.error),
|
|
159
|
+
null,
|
|
160
|
+
2,
|
|
161
|
+
),
|
|
162
|
+
"```",
|
|
163
|
+
]
|
|
164
|
+
: []),
|
|
165
|
+
].join("\n");
|
|
166
|
+
};
|
|
167
|
+
|
|
168
|
+
const drawStatus = (
|
|
169
|
+
events: IAgenticaCallBenchmarkEvent[],
|
|
170
|
+
success: (e: IAgenticaCallBenchmarkEvent) => boolean,
|
|
171
|
+
): string => {
|
|
172
|
+
const count: number = Math.floor(
|
|
173
|
+
(events.filter(success).length / events.length) * 10,
|
|
174
|
+
);
|
|
175
|
+
return (
|
|
176
|
+
new Array(count).fill("■").join("") +
|
|
177
|
+
new Array(10 - count).fill("□").join("")
|
|
178
|
+
);
|
|
179
|
+
};
|
|
180
|
+
}
|
|
@@ -1,43 +1,43 @@
|
|
|
1
|
-
import { IAgenticaPrompt } from "@agentica/core";
|
|
2
|
-
|
|
3
|
-
export namespace AgenticaPromptReporter {
|
|
4
|
-
export const markdown = (p: IAgenticaPrompt): string => {
|
|
5
|
-
if (p.type === "text")
|
|
6
|
-
return [`### Text (${p.role})`, p.text, ""].join("\n");
|
|
7
|
-
else if (p.type === "select" || p.type === "cancel")
|
|
8
|
-
return [
|
|
9
|
-
`### ${p.type === "select" ? "Select" : "Cancel"}`,
|
|
10
|
-
...p.operations
|
|
11
|
-
.map((op) => [
|
|
12
|
-
`#### ${op.name}`,
|
|
13
|
-
` - controller: ${op.controller.name}`,
|
|
14
|
-
` - function: ${op.function.name}`,
|
|
15
|
-
` - reason: ${op.reason}`,
|
|
16
|
-
"",
|
|
17
|
-
...(!!op.function.description?.length
|
|
18
|
-
? [op.function.description, ""]
|
|
19
|
-
: []),
|
|
20
|
-
])
|
|
21
|
-
.flat(),
|
|
22
|
-
].join("\n");
|
|
23
|
-
else if (p.type === "describe")
|
|
24
|
-
return [
|
|
25
|
-
"### Describe",
|
|
26
|
-
...p.executions.map((e) => ` - ${e.name}`),
|
|
27
|
-
"",
|
|
28
|
-
...p.text.split("\n").map((s) => `> ${s}`),
|
|
29
|
-
"",
|
|
30
|
-
].join("\n");
|
|
31
|
-
return [
|
|
32
|
-
"### Execute",
|
|
33
|
-
` - name: ${p.name}`,
|
|
34
|
-
` - controller: ${p.controller.name}`,
|
|
35
|
-
` - function: ${p.function.name}`,
|
|
36
|
-
"",
|
|
37
|
-
"```json",
|
|
38
|
-
JSON.stringify(p.arguments, null, 2),
|
|
39
|
-
"```",
|
|
40
|
-
"",
|
|
41
|
-
].join("\n");
|
|
42
|
-
};
|
|
43
|
-
}
|
|
1
|
+
import { IAgenticaPrompt } from "@agentica/core";
|
|
2
|
+
|
|
3
|
+
export namespace AgenticaPromptReporter {
|
|
4
|
+
export const markdown = (p: IAgenticaPrompt): string => {
|
|
5
|
+
if (p.type === "text")
|
|
6
|
+
return [`### Text (${p.role})`, p.text, ""].join("\n");
|
|
7
|
+
else if (p.type === "select" || p.type === "cancel")
|
|
8
|
+
return [
|
|
9
|
+
`### ${p.type === "select" ? "Select" : "Cancel"}`,
|
|
10
|
+
...p.operations
|
|
11
|
+
.map((op) => [
|
|
12
|
+
`#### ${op.name}`,
|
|
13
|
+
` - controller: ${op.controller.name}`,
|
|
14
|
+
` - function: ${op.function.name}`,
|
|
15
|
+
` - reason: ${op.reason}`,
|
|
16
|
+
"",
|
|
17
|
+
...(!!op.function.description?.length
|
|
18
|
+
? [op.function.description, ""]
|
|
19
|
+
: []),
|
|
20
|
+
])
|
|
21
|
+
.flat(),
|
|
22
|
+
].join("\n");
|
|
23
|
+
else if (p.type === "describe")
|
|
24
|
+
return [
|
|
25
|
+
"### Describe",
|
|
26
|
+
...p.executions.map((e) => ` - ${e.name}`),
|
|
27
|
+
"",
|
|
28
|
+
...p.text.split("\n").map((s) => `> ${s}`),
|
|
29
|
+
"",
|
|
30
|
+
].join("\n");
|
|
31
|
+
return [
|
|
32
|
+
"### Execute",
|
|
33
|
+
` - name: ${p.name}`,
|
|
34
|
+
` - controller: ${p.controller.name}`,
|
|
35
|
+
` - function: ${p.function.name}`,
|
|
36
|
+
"",
|
|
37
|
+
"```json",
|
|
38
|
+
JSON.stringify(p.arguments, null, 2),
|
|
39
|
+
"```",
|
|
40
|
+
"",
|
|
41
|
+
].join("\n");
|
|
42
|
+
};
|
|
43
|
+
}
|