@agentica/benchmark 0.7.0-dev.20250224-2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/lib/AgenticaCallBenchmark.d.ts +137 -0
- package/lib/AgenticaCallBenchmark.js +187 -0
- package/lib/AgenticaCallBenchmark.js.map +1 -0
- package/lib/AgenticaSelectBenchmark.d.ts +123 -0
- package/lib/AgenticaSelectBenchmark.js +185 -0
- package/lib/AgenticaSelectBenchmark.js.map +1 -0
- package/lib/index.d.ts +2 -0
- package/lib/index.js +19 -0
- package/lib/index.js.map +1 -0
- package/lib/index.mjs +449 -0
- package/lib/index.mjs.map +1 -0
- package/lib/internal/AgenticaBenchmarkPredicator.d.ts +32 -0
- package/lib/internal/AgenticaBenchmarkPredicator.js +179 -0
- package/lib/internal/AgenticaBenchmarkPredicator.js.map +1 -0
- package/lib/internal/AgenticaBenchmarkUtil.d.ts +5 -0
- package/lib/internal/AgenticaBenchmarkUtil.js +37 -0
- package/lib/internal/AgenticaBenchmarkUtil.js.map +1 -0
- package/lib/internal/AgenticaCallBenchmarkReporter.d.ts +4 -0
- package/lib/internal/AgenticaCallBenchmarkReporter.js +136 -0
- package/lib/internal/AgenticaCallBenchmarkReporter.js.map +1 -0
- package/lib/internal/AgenticaPromptReporter.d.ts +4 -0
- package/lib/internal/AgenticaPromptReporter.js +49 -0
- package/lib/internal/AgenticaPromptReporter.js.map +1 -0
- package/lib/internal/AgenticaSelectBenchmarkReporter.d.ts +1 -0
- package/lib/internal/AgenticaSelectBenchmarkReporter.js +172 -0
- package/lib/internal/AgenticaSelectBenchmarkReporter.js.map +1 -0
- package/lib/structures/IAgenticaBenchmarkExpected.d.ts +44 -0
- package/lib/structures/IAgenticaBenchmarkExpected.js +3 -0
- package/lib/structures/IAgenticaBenchmarkExpected.js.map +1 -0
- package/lib/structures/IAgenticaCallBenchmarkEvent.d.ts +95 -0
- package/lib/structures/IAgenticaCallBenchmarkEvent.js +3 -0
- package/lib/structures/IAgenticaCallBenchmarkEvent.js.map +1 -0
- package/lib/structures/IAgenticaCallBenchmarkResult.d.ts +62 -0
- package/lib/structures/IAgenticaCallBenchmarkResult.js +3 -0
- package/lib/structures/IAgenticaCallBenchmarkResult.js.map +1 -0
- package/lib/structures/IAgenticaCallBenchmarkScenario.d.ts +36 -0
- package/lib/structures/IAgenticaCallBenchmarkScenario.js +3 -0
- package/lib/structures/IAgenticaCallBenchmarkScenario.js.map +1 -0
- package/lib/structures/IAgenticaSelectBenchmarkEvent.d.ts +92 -0
- package/lib/structures/IAgenticaSelectBenchmarkEvent.js +3 -0
- package/lib/structures/IAgenticaSelectBenchmarkEvent.js.map +1 -0
- package/lib/structures/IAgenticaSelectBenchmarkResult.d.ts +62 -0
- package/lib/structures/IAgenticaSelectBenchmarkResult.js +3 -0
- package/lib/structures/IAgenticaSelectBenchmarkResult.js.map +1 -0
- package/lib/structures/IAgenticaSelectBenchmarkScenario.d.ts +36 -0
- package/lib/structures/IAgenticaSelectBenchmarkScenario.js +3 -0
- package/lib/structures/IAgenticaSelectBenchmarkScenario.js.map +1 -0
- package/lib/utils/MathUtil.d.ts +3 -0
- package/lib/utils/MathUtil.js +8 -0
- package/lib/utils/MathUtil.js.map +1 -0
- package/lib/utils/TokenUsageComputer.d.ts +5 -0
- package/lib/utils/TokenUsageComputer.js +37 -0
- package/lib/utils/TokenUsageComputer.js.map +1 -0
- package/package.json +57 -0
- package/src/AgenticaCallBenchmark.ts +259 -0
- package/src/AgenticaSelectBenchmark.ts +262 -0
- package/src/index.ts +3 -0
- package/src/internal/AgenticaBenchmarkPredicator.ts +216 -0
- package/src/internal/AgenticaBenchmarkUtil.ts +40 -0
- package/src/internal/AgenticaCallBenchmarkReporter.ts +177 -0
- package/src/internal/AgenticaPromptReporter.ts +43 -0
- package/src/internal/AgenticaSelectBenchmarkReporter.ts +212 -0
- package/src/structures/IAgenticaBenchmarkExpected.ts +58 -0
- package/src/structures/IAgenticaCallBenchmarkEvent.ts +109 -0
- package/src/structures/IAgenticaCallBenchmarkResult.ts +69 -0
- package/src/structures/IAgenticaCallBenchmarkScenario.ts +39 -0
- package/src/structures/IAgenticaSelectBenchmarkEvent.ts +110 -0
- package/src/structures/IAgenticaSelectBenchmarkResult.ts +69 -0
- package/src/structures/IAgenticaSelectBenchmarkScenario.ts +39 -0
- package/src/utils/MathUtil.ts +3 -0
- package/src/utils/TokenUsageComputer.ts +40 -0
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
import { Agentica, IAgenticaOperation, IAgenticaPrompt } from "@agentica/core";
|
|
2
|
+
import { ILlmFunction } from "@samchon/openapi";
|
|
3
|
+
import OpenAI from "openai";
|
|
4
|
+
import typia from "typia";
|
|
5
|
+
|
|
6
|
+
import { IAgenticaBenchmarkExpected } from "../structures/IAgenticaBenchmarkExpected";
|
|
7
|
+
|
|
8
|
+
export namespace AgenticaBenchmarkPredicator {
  /**
   * Ask the LLM for a reply that lets the agent step to its next job.
   *
   * Reads the last entry of the agent's prompt histories. When that entry is
   * an assistant text message, its text is sent to the agent's own provider
   * together with a single `consent` tool (built from
   * `IPredicatorApplication`) and `tool_choice: "required"`.
   *
   * @param agent Agent whose last prompt history is inspected and whose
   *              provider (API client, model, request options) is reused
   * @returns The `reply` field of the `consent` tool call when its arguments
   *          validate as `IConsentProps`; `null` when the last history is not
   *          an assistant text, when no `consent` tool call is present, or
   *          when the arguments do not validate
   * @throws SyntaxError when the tool-call arguments are not valid JSON;
   *         errors of the completion request itself are not caught here
   */
  export const isNext = async (agent: Agentica): Promise<string | null> => {
    const last: IAgenticaPrompt | undefined = agent.getPromptHistories().at(-1);
    if (last?.type !== "text" || last.role !== "assistant") return null;

    const consent: ILlmFunction<"chatgpt"> = typia.llm.application<
      IPredicatorApplication,
      "chatgpt"
    >().functions[0]!;
    const result: OpenAI.ChatCompletion = await agent[
      "props"
    ].provider.api.chat.completions.create(
      {
        model: agent["props"].provider.model,
        messages: [
          {
            role: "system",
            content: [
              "You are an helpful assistant.",
              "",
              "If what the assistant said seems like to asking for",
              "user's consent about some function calling at the next step,",
              "use the tools appropriately to step to the next.",
            ].join("\n"),
          },
          {
            role: "assistant",
            content: last.text,
          },
        ],
        tools: [
          {
            type: "function",
            function: {
              name: consent.name,
              description: consent.description,
              parameters: consent.parameters as Record<string, any>,
            },
          },
        ],
        tool_choice: "required",
        parallel_tool_calls: false,
      },
      agent["props"].provider.options,
    );

    // Only the first tool call that targets the consent function matters.
    const toolCall: OpenAI.ChatCompletionMessageToolCall | undefined = (
      result.choices[0]?.message.tool_calls ?? []
    ).find((tc) => tc.type === "function" && tc.function.name === consent.name);
    if (toolCall === undefined) return null;

    // The arguments come from the LLM: keep them `unknown` until validated.
    const input: unknown = JSON.parse(toolCall.function.arguments);
    return typia.is<IConsentProps>(input) ? input.reply : null;
  };

  /**
   * Check if the called operations match the expected operations.
   *
   * @param props Properties for checking the match of the called operations
   *              and the expected operations
   *
   * @returns `true` if the called operations match the expected operations,
   *          otherwise `false`.
   */
  export const success = (props: {
    /**
     * Expected operations to be called.
     *
     * For 'allOf' within an 'array', the next expected element starts checking from the element that follows the last called element in 'allOf'.
     */
    expected: IAgenticaBenchmarkExpected;

    /**
     * Specified operations.
     */
    operations: Array<IAgenticaOperation | IAgenticaPrompt.IExecute>;

    /**
     * If it's `false`, check the array and let it go even if there's something wrong between them.
     *
     * @default `false`
     */
    strict?: boolean;
  }): boolean => successInner(props).result;

  /**
   * Recursive matcher behind {@link success}.
   *
   * On success it also reports `take`: how many leading elements of
   * `props.operations` were consumed, i.e. the 1-based position of the last
   * operation that took part in the match. The `array` branch uses it to
   * start matching the next item right after that position, which is what
   * makes the order of `array` items significant.
   *
   * `strict` is read by the `array` branch only: when an item does not match,
   * strict mode fails immediately, otherwise one operation is skipped and the
   * item is retried.
   */
  const successInner = (
    props: Parameters<typeof success>[0],
  ):
    | {
        result: true;
        take: number;
      }
    | {
        result: false;
      } => {
    // Recurse with another expectation, optionally on a narrowed operation list.
    const call = (
      expected: IAgenticaBenchmarkExpected,
      overrideOperations?: Array<IAgenticaOperation | IAgenticaPrompt.IExecute>,
    ) =>
      successInner({
        expected,
        operations: overrideOperations ?? props.operations,
        strict: props.strict,
      });

    switch (props.expected.type) {
      case "array": {
        let take = 0;
        const targetIterator = props.expected.items[Symbol.iterator]();
        let targeted = targetIterator.next();

        while (true) {
          // Every expected item has been matched, in order.
          if (targeted.done) {
            return {
              result: true,
              take,
            };
          }
          // Operations ran out while an expected item is still pending.
          if (take >= props.operations.length) {
            return { result: false };
          }

          const result = call(targeted.value, props.operations.slice(take));
          if (!result.result) {
            if (!props.strict) {
              take += 1;
              continue;
            }
            return { result: false };
          }

          take += result.take;
          targeted = targetIterator.next();
        }
      }
      case "standalone": {
        const target = props.expected.operation;
        const index = props.operations.findIndex(
          (op) => op.name === target.name,
        );
        if (index === -1) {
          return { result: false };
        }
        // Consume everything up to and including the match. A constant `1`
        // here would let a following `array` item match an operation that
        // was called before this one.
        return { result: true, take: index + 1 };
      }
      case "anyOf":
        for (const expected of props.expected.anyOf) {
          const callResult = call(expected);
          if (callResult.result) {
            return callResult;
          }
        }

        return { result: false };
      case "allOf": {
        /**
         * @example
         * expected = [4, 2];
         * called = [1, 2, 3, 4, 5];
         *
         * { result: true, take: 4 };
         */
        let take = 0;
        for (const expected of props.expected.allOf) {
          const callResult = call(expected);
          if (!callResult.result) {
            return { result: false };
          }
          // Order is free inside `allOf`; remember the furthest match.
          take = Math.max(take, callResult.take);
        }
        return {
          result: true,
          take,
        };
      }
    }
  };
}
|
|
185
|
+
|
|
186
|
+
// Tool surface handed to the LLM by `AgenticaBenchmarkPredicator.isNext`:
// `typia.llm.application<IPredicatorApplication, "chatgpt">()` is evaluated there
// and its first function (`consent`) is sent as the only tool of the request.
// NOTE(review): typia presumably derives the tool description from the JSDoc on
// `consent` - if so, rewording that comment changes the prompt; confirm before editing.
interface IPredicatorApplication {
|
|
187
|
+
/**
|
|
188
|
+
* Ask user to consent for what the AI agent wants to do next.
|
|
189
|
+
*
|
|
190
|
+
* If AI agent wants to do some function calling at next,
|
|
191
|
+
* but it needs the user's consent about the function calling to do,
|
|
192
|
+
* then call this tool function.
|
|
193
|
+
*
|
|
194
|
+
* @param props Properties for asking the user's consent
|
|
195
|
+
*/
|
|
196
|
+
consent(props: IConsentProps): void;
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
// Argument shape of `IPredicatorApplication.consent`. `isNext` parses the
// tool-call arguments, validates them against this type with `typia.is`, and
// returns only `reply`; `content` is not read by the code visible in this file.
// NOTE(review): the property JSDoc presumably ends up in the generated parameter
// schema sent to the LLM - confirm before rewording it.
/**
|
|
200
|
+
* Properties for asking the user's consent
|
|
201
|
+
*/
|
|
202
|
+
interface IConsentProps {
|
|
203
|
+
/**
|
|
204
|
+
* Reason of the message implying what the AI agent wants
|
|
205
|
+
* to do at the next step after the user's consent.
|
|
206
|
+
*/
|
|
207
|
+
content: string;
|
|
208
|
+
|
|
209
|
+
/**
|
|
210
|
+
* Recommended reply message for the user.
|
|
211
|
+
*
|
|
212
|
+
* The message what AI agent wants the user to reply
|
|
213
|
+
* accepting the AI agent's next job suggestion.
|
|
214
|
+
*/
|
|
215
|
+
reply: string;
|
|
216
|
+
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { IAgenticaBenchmarkExpected } from "../structures/IAgenticaBenchmarkExpected";
|
|
2
|
+
|
|
3
|
+
export namespace AgenticaBenchmarkUtil {
  /**
   * Convert a thrown value into something `JSON.stringify` can render.
   *
   * `name`, `message` and `stack` of an `Error` are not enumerable, so a
   * plain `JSON.stringify(error)` loses them. This copies the error's own
   * enumerable properties and adds those three explicitly. A `cause` that is
   * itself an `Error` is converted the same way, so the chain stays readable
   * in the report instead of collapsing to `{}`.
   *
   * @param error Any thrown value
   * @returns A plain object for `Error` instances; every other value is
   *          returned unchanged
   */
  export const errorToJson = (error: any): any => {
    // Guards against an error whose cause chain loops back onto itself.
    const visited = new WeakSet<object>();

    const convert = (value: any): any => {
      if (!(value instanceof Error)) return value;
      if (visited.has(value))
        return { name: value.name, message: value.message };
      visited.add(value);

      const cause: unknown = (value as { cause?: unknown }).cause;
      return {
        ...value,
        name: value.name,
        message: value.message,
        stack: value.stack,
        ...(cause === undefined ? {} : { cause: convert(cause) }),
      };
    };
    return convert(error);
  };

  /**
   * Reduce an expectation tree to the fields shown in reports.
   *
   * A `standalone` node keeps only the operation's name and its function
   * description; `array`, `allOf` and `anyOf` nodes are converted
   * recursively.
   *
   * @param expected Expectation of a benchmark scenario
   * @returns JSON-serializable summary of the expectation
   */
  export const expectedToJson = (expected: IAgenticaBenchmarkExpected): any => {
    switch (expected.type) {
      case "standalone":
        return {
          type: expected.type,
          operation: {
            name: expected.operation.name,
            description: expected.operation.function.description,
          },
        };
      case "array":
        return {
          type: expected.type,
          items: expected.items.map(expectedToJson),
        };
      case "allOf":
        return {
          type: expected.type,
          allOf: expected.allOf.map(expectedToJson),
        };
      default:
        // Remaining member of the union: "anyOf".
        return {
          type: expected.type,
          anyOf: expected.anyOf.map(expectedToJson),
        };
    }
  };
}
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
import { IAgenticaCallBenchmarkEvent } from "../structures/IAgenticaCallBenchmarkEvent";
|
|
2
|
+
import { IAgenticaCallBenchmarkResult } from "../structures/IAgenticaCallBenchmarkResult";
|
|
3
|
+
import { MathUtil } from "../utils/MathUtil";
|
|
4
|
+
import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
|
|
5
|
+
import { AgenticaPromptReporter } from "./AgenticaPromptReporter";
|
|
6
|
+
|
|
7
|
+
export namespace AgenticaCallBenchmarkReporter {
  /** Width, in cells, of the status bars drawn in the experiments table. */
  const STATUS_CELLS = 10;

  /**
   * Render a call benchmark result as a set of markdown documents.
   *
   * @param result Benchmark result to report
   * @returns Dictionary of relative file path to markdown content: one
   *          `./README.md` index, one `README.md` per scenario, and one
   *          `{n}.{type}.md` per event where `n` is the 1-based trial number
   */
  export const markdown = (
    result: IAgenticaCallBenchmarkResult,
  ): Record<string, string> =>
    Object.fromEntries([
      ["./README.md", writeIndex(result)],
      ...result.experiments.flatMap((exp) => [
        [`./${exp.scenario.name}/README.md`, writeExperimentIndex(exp)],
        ...exp.events.map((event, i) => [
          `./${exp.scenario.name}/${i + 1}.${event.type}.md`,
          // 1-based, so the page heading agrees with its file name and with
          // the link text in the scenario index.
          writeExperimentEvent(event, i + 1),
        ]),
      ]),
    ]);

  /** Elapsed time of one event in milliseconds. */
  const elapsed = (event: IAgenticaCallBenchmarkEvent): number =>
    event.completed_at.getTime() - event.started_at.getTime();

  /** Mean elapsed time in milliseconds; `0` when there are no events. */
  const averageElapsed = (events: IAgenticaCallBenchmarkEvent[]): number =>
    events.length === 0
      ? 0
      : events.map(elapsed).reduce((a, b) => a + b, 0) / events.length;

  /** Top-level index: aggregated summary plus one table row per scenario. */
  const writeIndex = (result: IAgenticaCallBenchmarkResult): string => {
    const events: IAgenticaCallBenchmarkEvent[] = result.experiments.flatMap(
      (r) => r.events,
    );
    const average: number = averageElapsed(events);
    return [
      "# LLM Function Call Benchmark",
      "## Summary",
      ` - Aggregation:`,
      ` - Scenarios: #${result.experiments.length.toLocaleString()}`,
      ` - Trial: ${events.length}`,
      ` - Success: ${events.filter((e) => e.type === "success").length}`,
      ` - Failure: ${events.filter((e) => e.type === "failure").length}`,
      ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`,
      ` - Token Usage`,
      ` - Total: ${result.usage.total.toLocaleString()}`,
      ` - Prompt`,
      ` - Total: ${result.usage.prompt.total.toLocaleString()}`,
      ` - Audio: ${result.usage.prompt.audio.toLocaleString()}`,
      ` - Cached: ${result.usage.prompt.cached.toLocaleString()}`,
      ` - Completion:`,
      ` - Total: ${result.usage.completion.total.toLocaleString()}`,
      ` - Accepted Prediction: ${result.usage.completion.accepted_prediction.toLocaleString()}`,
      ` - Audio: ${result.usage.completion.audio.toLocaleString()}`,
      ` - Reasoning: ${result.usage.completion.reasoning.toLocaleString()}`,
      ` - Rejected Prediction: ${result.usage.completion.rejected_prediction.toLocaleString()}`,
      "",
      "## Experiments",
      " Name | Select | Call | Time/Avg ",
      ":-----|:-------|:-----|----------:",
      ...result.experiments.map((exp) =>
        [
          `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
          drawStatus(
            exp.events,
            (e) => e.type !== "error" && e.select === true,
          ),
          drawStatus(exp.events, (e) => e.type !== "error" && e.call === true),
          `${MathUtil.round(averageElapsed(exp.events)).toLocaleString()} ms`,
        ].join(" | "),
      ),
    ].join("\n");
  };

  /** Per-scenario index: summary, table of trials, and the scenario itself. */
  const writeExperimentIndex = (
    exp: IAgenticaCallBenchmarkResult.IExperiment,
  ): string => {
    return [
      `# ${exp.scenario.name}`,
      "## Summary",
      ` - Scenarios: #${exp.events.length.toLocaleString()}`,
      ` - Success: ${exp.events.filter((e) => e.type === "success").length}`,
      ` - Failure: ${exp.events.filter((e) => e.type === "failure").length}`,
      ` - Average Time: ${MathUtil.round(averageElapsed(exp.events)).toLocaleString()} ms`,
      "",
      "## Events",
      " Name | Type | Time",
      ":-----|:-----|----:",
      ...exp.events.map((e, i) =>
        [
          `[${i + 1}.](./${i + 1}.${e.type}.md)`,
          e.type,
          `${MathUtil.round(elapsed(e))} ms`,
        ].join(" | "),
      ),
      "",
      "## Scenario",
      "### User Prompt",
      exp.scenario.text,
      "",
      "### Expected",
      "```json",
      JSON.stringify(
        AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
        null,
        2,
      ),
      "```",
    ].join("\n");
  };

  /**
   * Page of a single trial.
   *
   * @param event Event of the trial
   * @param index Number shown in the heading (the caller passes the 1-based
   *              trial number)
   */
  const writeExperimentEvent = (
    event: IAgenticaCallBenchmarkEvent,
    index: number,
  ): string => {
    return [
      `# ${index}. ${event.type}`,
      "## Summary",
      ` - Name: ${event.scenario.name}`,
      ` - Type: ${event.type}`,
      ` - Time: ${MathUtil.round(elapsed(event)).toLocaleString()} ms`,
      ...(event.type !== "error"
        ? [
            ` - Select: ${event.select ? "✅" : "❌"}`,
            ` - Call: ${event.call ? "✅" : "❌"}`,
          ]
        : []),
      // NOTE(review): the type of `event.usage` is not visible here. If it is
      // the same token-usage object as `result.usage`, this prints
      // "[object Object]" and should list the fields instead - confirm.
      ` - Token Usage: ${event.usage.toLocaleString()}`,
      "",
      "## Scenario",
      "### User Prompt",
      event.scenario.text,
      "",
      "### Expected",
      "```json",
      JSON.stringify(
        AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
        null,
        2,
      ),
      "```",
      "",
      "## Prompt Histories",
      ...event.prompts.map(AgenticaPromptReporter.markdown),
      "",
      ...(event.type === "error"
        ? [
            "## Error",
            "```json",
            JSON.stringify(
              AgenticaBenchmarkUtil.errorToJson(event.error),
              null,
              2,
            ),
            "```",
          ]
        : []),
    ].join("\n");
  };

  /**
   * Draw a fixed-width bar showing the share of events that satisfy
   * `success`.
   *
   * The share is scaled to {@link STATUS_CELLS} cells and rounded down, so
   * the bar stays valid for any number of trials. With exactly ten trials it
   * is one cell per successful trial.
   */
  const drawStatus = (
    events: IAgenticaCallBenchmarkEvent[],
    success: (e: IAgenticaCallBenchmarkEvent) => boolean,
  ): string => {
    if (events.length === 0) return "□".repeat(STATUS_CELLS);
    const count: number = events.filter(success).length;
    // Multiply before dividing to keep exact ratios exact.
    const filled: number = Math.floor((count * STATUS_CELLS) / events.length);
    return "■".repeat(filled) + "□".repeat(STATUS_CELLS - filled);
  };
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import { IAgenticaPrompt } from "@agentica/core";
|
|
2
|
+
|
|
3
|
+
export namespace AgenticaPromptReporter {
  /**
   * Format one prompt history entry as a markdown section.
   *
   * `text`, `select`, `cancel` and `describe` entries each get their own
   * layout; any other entry is rendered as an "Execute" section with its
   * arguments as a JSON code block.
   *
   * @param p Prompt history entry
   * @returns Markdown text ending with a line break
   */
  export const markdown = (p: IAgenticaPrompt): string => {
    switch (p.type) {
      case "text": {
        const section = [`### Text (${p.role})`, p.text, ""];
        return section.join("\n");
      }
      case "select":
      case "cancel": {
        const heading = p.type === "select" ? "Select" : "Cancel";
        const lines = [`### ${heading}`];
        for (const op of p.operations) {
          lines.push(
            `#### ${op.name}`,
            ` - controller: ${op.controller.name}`,
            ` - function: ${op.function.name}`,
            ` - reason: ${op.reason}`,
            "",
          );
          // The description paragraph is emitted only when it is non-empty.
          if (op.function.description?.length) {
            lines.push(op.function.description, "");
          }
        }
        return lines.join("\n");
      }
      case "describe": {
        const executed = p.executions.map((e) => ` - ${e.name}`);
        // The description text is shown as a markdown block quote.
        const quoted = p.text.split("\n").map((s) => `> ${s}`);
        return ["### Describe", ...executed, "", ...quoted, ""].join("\n");
      }
      default: {
        const body = JSON.stringify(p.arguments, null, 2);
        return [
          "### Execute",
          ` - name: ${p.name}`,
          ` - controller: ${p.controller.name}`,
          ` - function: ${p.function.name}`,
          "",
          "```json",
          body,
          "```",
          "",
        ].join("\n");
      }
    }
  };
}
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
import { IAgenticaSelectBenchmarkEvent } from "../structures/IAgenticaSelectBenchmarkEvent";
|
|
2
|
+
import { IAgenticaSelectBenchmarkResult } from "../structures/IAgenticaSelectBenchmarkResult";
|
|
3
|
+
import { MathUtil } from "../utils/MathUtil";
|
|
4
|
+
import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
|
|
5
|
+
|
|
6
|
+
/**
 * @internal
 */
export namespace AgenticaSelectBenchmarkReporter {
  /** Width, in cells, of the status bar drawn in the experiments table. */
  const STATUS_CELLS = 10;

  /**
   * Render a select benchmark result as a set of markdown documents.
   *
   * @param result Benchmark result to report
   * @returns Dictionary of relative file path to markdown content: one
   *          `./README.md` index, one `README.md` per scenario, and one
   *          `{n}.{type}.md` per event where `n` is the 1-based trial number
   */
  export const markdown = (
    result: IAgenticaSelectBenchmarkResult,
  ): Record<string, string> =>
    Object.fromEntries([
      ["./README.md", writeIndex(result)],
      ...result.experiments.flatMap((exp) => [
        [`./${exp.scenario.name}/README.md`, writeExperimentIndex(exp)],
        ...exp.events.map((event, i) => [
          `./${exp.scenario.name}/${i + 1}.${event.type}.md`,
          // 1-based, so the page heading agrees with its file name and with
          // the link text in the scenario index.
          writeExperimentEvent(event, i + 1),
        ]),
      ]),
    ]);

  /** Elapsed time of one event in milliseconds. */
  const elapsed = (event: IAgenticaSelectBenchmarkEvent): number =>
    event.completed_at.getTime() - event.started_at.getTime();

  /** Mean elapsed time in milliseconds; `0` when there are no events. */
  const averageElapsed = (events: IAgenticaSelectBenchmarkEvent[]): number =>
    events.length === 0
      ? 0
      : events.map(elapsed).reduce((a, b) => a + b, 0) / events.length;

  /**
   * Bar of {@link STATUS_CELLS} cells showing the share of successful
   * events, rounded down. An empty event list yields an all-empty bar.
   */
  const drawStatus = (events: IAgenticaSelectBenchmarkEvent[]): string => {
    if (events.length === 0) return "□".repeat(STATUS_CELLS);
    const succeeded: number = events.filter((e) => e.type === "success").length;
    // Multiply before dividing to keep exact ratios exact.
    const filled: number = Math.floor(
      (succeeded * STATUS_CELLS) / events.length,
    );
    return "■".repeat(filled) + "□".repeat(STATUS_CELLS - filled);
  };

  /** Top-level index: aggregated summary plus one table row per scenario. */
  const writeIndex = (result: IAgenticaSelectBenchmarkResult): string => {
    const events: IAgenticaSelectBenchmarkEvent[] = result.experiments.flatMap(
      (r) => r.events,
    );
    const average: number = averageElapsed(events);
    return [
      "# LLM Function Selection Benchmark",
      "## Summary",
      ` - Aggregation:`,
      ` - Scenarios: #${result.experiments.length.toLocaleString()}`,
      ` - Trial: ${events.length}`,
      ` - Success: ${events.filter((e) => e.type === "success").length}`,
      ` - Failure: ${events.filter((e) => e.type === "failure").length}`,
      ` - Average Time: ${MathUtil.round(average).toLocaleString()} ms`,
      ` - Token Usage`,
      ` - Total: ${result.usage.total.toLocaleString()}`,
      ` - Prompt`,
      ` - Total: ${result.usage.prompt.total.toLocaleString()}`,
      ` - Audio: ${result.usage.prompt.audio.toLocaleString()}`,
      ` - Cached: ${result.usage.prompt.cached.toLocaleString()}`,
      ` - Completion:`,
      ` - Total: ${result.usage.completion.total.toLocaleString()}`,
      ` - Accepted Prediction: ${result.usage.completion.accepted_prediction.toLocaleString()}`,
      ` - Audio: ${result.usage.completion.audio.toLocaleString()}`,
      ` - Reasoning: ${result.usage.completion.reasoning.toLocaleString()}`,
      ` - Rejected Prediction: ${result.usage.completion.rejected_prediction.toLocaleString()}`,
      "",
      "## Experiments",
      " Name | Status | Time/Avg ",
      ":-----|:-------|----------:",
      ...result.experiments.map((exp) =>
        [
          `[${exp.scenario.name}](./${exp.scenario.name}/README.md)`,
          drawStatus(exp.events),
          MathUtil.round(averageElapsed(exp.events)).toLocaleString() + " ms",
        ].join(" | "),
      ),
    ].join("\n");
  };

  /** Per-scenario index: summary, table of trials, and the scenario itself. */
  const writeExperimentIndex = (
    exp: IAgenticaSelectBenchmarkResult.IExperiment,
  ): string => {
    return [
      `# ${exp.scenario.name}`,
      "## Summary",
      " - Aggregation:",
      ` - Trial: ${exp.events.length}`,
      ` - Success: ${exp.events.filter((e) => e.type === "success").length}`,
      ` - Failure: ${exp.events.filter((e) => e.type === "failure").length}`,
      ` - Average Time: ${MathUtil.round(averageElapsed(exp.events)).toLocaleString()} ms`,
      " - Token Usage",
      ` - Total: ${exp.usage.total.toLocaleString()}`,
      ` - Prompt`,
      ` - Total: ${exp.usage.prompt.total.toLocaleString()}`,
      ` - Audio: ${exp.usage.prompt.audio.toLocaleString()}`,
      ` - Cached: ${exp.usage.prompt.cached.toLocaleString()}`,
      ` - Completion:`,
      ` - Total: ${exp.usage.completion.total.toLocaleString()}`,
      ` - Accepted Prediction: ${exp.usage.completion.accepted_prediction.toLocaleString()}`,
      ` - Audio: ${exp.usage.completion.audio.toLocaleString()}`,
      ` - Reasoning: ${exp.usage.completion.reasoning.toLocaleString()}`,
      ` - Rejected Prediction: ${exp.usage.completion.rejected_prediction.toLocaleString()}`,
      "",
      "## Events",
      " No | Type | Time",
      "---:|:-----|----:",
      ...exp.events.map((e, i) =>
        [
          `[${i + 1}.](./${i + 1}.${e.type}.md)`,
          e.type,
          MathUtil.round(elapsed(e)) + " ms",
        ].join(" | "),
      ),
      "",
      "## Scenario",
      "### User Prompt",
      exp.scenario.text,
      "",
      "### Expected",
      "```json",
      JSON.stringify(
        AgenticaBenchmarkUtil.expectedToJson(exp.scenario.expected),
        null,
        2,
      ),
      "```",
    ].join("\n");
  };

  /**
   * Page of a single trial.
   *
   * @param event Event of the trial
   * @param index Number shown in the heading (the caller passes the 1-based
   *              trial number)
   */
  const writeExperimentEvent = (
    event: IAgenticaSelectBenchmarkEvent,
    index: number,
  ): string => {
    return [
      `# ${index}. ${event.type}`,
      `## Summary`,
      ` - Name: ${event.scenario.name}`,
      ` - Type: ${event.type}`,
      ` - Time: ${elapsed(event).toLocaleString()} ms`,
      ...(event.type !== "error"
        ? [
            " - Token Usage",
            ` - Total: ${event.usage.total.toLocaleString()}`,
            ` - Prompt`,
            ` - Total: ${event.usage.prompt.total.toLocaleString()}`,
            ` - Audio: ${event.usage.prompt.audio.toLocaleString()}`,
            ` - Cached: ${event.usage.prompt.cached.toLocaleString()}`,
            ` - Completion:`,
            ` - Total: ${event.usage.completion.total.toLocaleString()}`,
            ` - Accepted Prediction: ${event.usage.completion.accepted_prediction.toLocaleString()}`,
            ` - Audio: ${event.usage.completion.audio.toLocaleString()}`,
            ` - Reasoning: ${event.usage.completion.reasoning.toLocaleString()}`,
            ` - Rejected Prediction: ${event.usage.completion.rejected_prediction.toLocaleString()}`,
          ]
        : []),
      "",
      "## Scenario",
      "### User Prompt",
      event.scenario.text,
      "",
      "### Expected",
      "```json",
      JSON.stringify(
        AgenticaBenchmarkUtil.expectedToJson(event.scenario.expected),
        null,
        2,
      ),
      "```",
      "",
      ...(event.type === "success" || event.type === "failure"
        ? [
            "## Result",
            ...event.selected.map((s) =>
              [
                `### ${s.name}`,
                ` - Controller: \`${s.controller.name}\``,
                ` - Function: \`${s.function.name}\``,
                ` - Reason: ${s.reason}`,
                "",
                ...(s.function.description ? [s.function.description, ""] : []),
              ].join("\n"),
            ),
          ]
        : []),
      ...(event.type === "error"
        ? [
            "## Error",
            "```json",
            // Convert first, stringify second: stringifying an Error
            // directly yields "{}" because its fields are not enumerable.
            JSON.stringify(
              AgenticaBenchmarkUtil.errorToJson(event.error),
              null,
              2,
            ),
            "```",
            "",
          ]
        : []),
    ].join("\n");
  };
}
|