@agentica/benchmark 0.8.3 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +124 -122
- package/lib/AgenticaCallBenchmark.d.ts +7 -6
- package/lib/AgenticaCallBenchmark.js.map +1 -1
- package/lib/AgenticaSelectBenchmark.d.ts +7 -6
- package/lib/AgenticaSelectBenchmark.js.map +1 -1
- package/lib/index.mjs +46 -1
- package/lib/index.mjs.map +1 -1
- package/lib/internal/AgenticaBenchmarkPredicator.d.ts +5 -4
- package/lib/internal/AgenticaBenchmarkPredicator.js +74 -2
- package/lib/internal/AgenticaBenchmarkPredicator.js.map +1 -1
- package/lib/internal/AgenticaBenchmarkUtil.d.ts +2 -1
- package/lib/internal/AgenticaBenchmarkUtil.js.map +1 -1
- package/lib/internal/AgenticaCallBenchmarkReporter.d.ts +2 -1
- package/lib/internal/AgenticaCallBenchmarkReporter.js.map +1 -1
- package/lib/internal/AgenticaPromptReporter.d.ts +2 -1
- package/lib/internal/AgenticaPromptReporter.js.map +1 -1
- package/lib/internal/AgenticaSelectBenchmarkReporter.js.map +1 -1
- package/lib/structures/IAgenticaBenchmarkExpected.d.ts +10 -9
- package/lib/structures/IAgenticaCallBenchmarkEvent.d.ts +8 -7
- package/lib/structures/IAgenticaCallBenchmarkResult.d.ts +6 -5
- package/lib/structures/IAgenticaCallBenchmarkScenario.d.ts +3 -2
- package/lib/structures/IAgenticaSelectBenchmarkEvent.d.ts +9 -8
- package/lib/structures/IAgenticaSelectBenchmarkResult.d.ts +6 -5
- package/lib/structures/IAgenticaSelectBenchmarkScenario.d.ts +3 -2
- package/package.json +5 -5
- package/src/AgenticaCallBenchmark.ts +28 -25
- package/src/AgenticaSelectBenchmark.ts +32 -30
- package/src/internal/AgenticaBenchmarkPredicator.ts +18 -10
- package/src/internal/AgenticaBenchmarkUtil.ts +5 -1
- package/src/internal/AgenticaCallBenchmarkReporter.ts +15 -12
- package/src/internal/AgenticaPromptReporter.ts +4 -1
- package/src/internal/AgenticaSelectBenchmarkReporter.ts +11 -8
- package/src/structures/IAgenticaBenchmarkExpected.ts +23 -13
- package/src/structures/IAgenticaCallBenchmarkEvent.ts +14 -10
- package/src/structures/IAgenticaCallBenchmarkResult.ts +6 -5
- package/src/structures/IAgenticaCallBenchmarkScenario.ts +6 -2
- package/src/structures/IAgenticaSelectBenchmarkEvent.ts +15 -11
- package/src/structures/IAgenticaSelectBenchmarkResult.ts +8 -5
- package/src/structures/IAgenticaSelectBenchmarkScenario.ts +6 -2
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { Agentica } from "@agentica/core";
|
|
2
2
|
import { AgenticaTokenUsageAggregator } from "@agentica/core/src/internal/AgenticaTokenUsageAggregator";
|
|
3
|
+
import { ILlmSchema } from "@samchon/openapi";
|
|
3
4
|
import { Semaphore } from "tstl";
|
|
4
5
|
import { tags } from "typia";
|
|
5
6
|
|
|
@@ -29,18 +30,18 @@ import { IAgenticaCallBenchmarkScenario } from "./structures/IAgenticaCallBenchm
|
|
|
29
30
|
*
|
|
30
31
|
* @author Samchon
|
|
31
32
|
*/
|
|
32
|
-
export class AgenticaCallBenchmark {
|
|
33
|
-
private agent_: Agentica
|
|
34
|
-
private scenarios_: IAgenticaCallBenchmarkScenario[];
|
|
33
|
+
export class AgenticaCallBenchmark<Model extends ILlmSchema.Model> {
|
|
34
|
+
private agent_: Agentica<Model>;
|
|
35
|
+
private scenarios_: IAgenticaCallBenchmarkScenario<Model>[];
|
|
35
36
|
private config_: AgenticaCallBenchmark.IConfig;
|
|
36
|
-
private result_: IAgenticaCallBenchmarkResult | null;
|
|
37
|
+
private result_: IAgenticaCallBenchmarkResult<Model> | null;
|
|
37
38
|
|
|
38
39
|
/**
|
|
39
40
|
* Initializer Constructor.
|
|
40
41
|
*
|
|
41
42
|
* @param props Properties of the selection benchmark
|
|
42
43
|
*/
|
|
43
|
-
public constructor(props: AgenticaCallBenchmark.IProps) {
|
|
44
|
+
public constructor(props: AgenticaCallBenchmark.IProps<Model>) {
|
|
44
45
|
this.agent_ = props.agent;
|
|
45
46
|
this.scenarios_ = props.scenarios.slice();
|
|
46
47
|
this.config_ = {
|
|
@@ -68,22 +69,24 @@ export class AgenticaCallBenchmark {
|
|
|
68
69
|
* @returns Results of the function calling benchmark
|
|
69
70
|
*/
|
|
70
71
|
public async execute(
|
|
71
|
-
listener?: (event: IAgenticaCallBenchmarkEvent) => void,
|
|
72
|
-
): Promise<IAgenticaCallBenchmarkResult
|
|
72
|
+
listener?: (event: IAgenticaCallBenchmarkEvent<Model>) => void,
|
|
73
|
+
): Promise<IAgenticaCallBenchmarkResult<Model>> {
|
|
73
74
|
const started_at: Date = new Date();
|
|
74
75
|
const semaphore: Semaphore = new Semaphore(this.config_.simultaneous);
|
|
75
|
-
const experiments: IAgenticaCallBenchmarkResult.IExperiment[] =
|
|
76
|
+
const experiments: IAgenticaCallBenchmarkResult.IExperiment<Model>[] =
|
|
76
77
|
await Promise.all(
|
|
77
78
|
this.scenarios_.map(async (scenario) => {
|
|
78
|
-
const events: IAgenticaCallBenchmarkEvent[] =
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
79
|
+
const events: IAgenticaCallBenchmarkEvent<Model>[] =
|
|
80
|
+
await Promise.all(
|
|
81
|
+
new Array(this.config_.repeat).fill(0).map(async () => {
|
|
82
|
+
await semaphore.acquire();
|
|
83
|
+
const e: IAgenticaCallBenchmarkEvent<Model> =
|
|
84
|
+
await this.step(scenario);
|
|
85
|
+
await semaphore.release();
|
|
86
|
+
if (listener !== undefined) listener(e);
|
|
87
|
+
return e;
|
|
88
|
+
}),
|
|
89
|
+
);
|
|
87
90
|
return {
|
|
88
91
|
scenario,
|
|
89
92
|
events,
|
|
@@ -135,9 +138,9 @@ export class AgenticaCallBenchmark {
|
|
|
135
138
|
}
|
|
136
139
|
|
|
137
140
|
private async step(
|
|
138
|
-
scenario: IAgenticaCallBenchmarkScenario
|
|
139
|
-
): Promise<IAgenticaCallBenchmarkEvent
|
|
140
|
-
const agent: Agentica = this.agent_.clone();
|
|
141
|
+
scenario: IAgenticaCallBenchmarkScenario<Model>,
|
|
142
|
+
): Promise<IAgenticaCallBenchmarkEvent<Model>> {
|
|
143
|
+
const agent: Agentica<Model> = this.agent_.clone();
|
|
141
144
|
const started_at: Date = new Date();
|
|
142
145
|
const success = () =>
|
|
143
146
|
AgenticaBenchmarkPredicator.success({
|
|
@@ -147,7 +150,7 @@ export class AgenticaCallBenchmark {
|
|
|
147
150
|
.filter((p) => p.type === "execute"),
|
|
148
151
|
strict: false,
|
|
149
152
|
});
|
|
150
|
-
const out = (): IAgenticaCallBenchmarkEvent => {
|
|
153
|
+
const out = (): IAgenticaCallBenchmarkEvent<Model> => {
|
|
151
154
|
const select = AgenticaBenchmarkPredicator.success({
|
|
152
155
|
expected: scenario.expected,
|
|
153
156
|
operations: agent
|
|
@@ -167,7 +170,7 @@ export class AgenticaCallBenchmark {
|
|
|
167
170
|
usage: agent.getTokenUsage(),
|
|
168
171
|
started_at,
|
|
169
172
|
completed_at: new Date(),
|
|
170
|
-
} satisfies IAgenticaCallBenchmarkEvent.IFailure
|
|
173
|
+
} satisfies IAgenticaCallBenchmarkEvent.IFailure<Model>;
|
|
171
174
|
};
|
|
172
175
|
|
|
173
176
|
try {
|
|
@@ -199,16 +202,16 @@ export namespace AgenticaCallBenchmark {
|
|
|
199
202
|
/**
|
|
200
203
|
* Properties of the {@link AgenticaCallBenchmark} constructor.
|
|
201
204
|
*/
|
|
202
|
-
export interface IProps {
|
|
205
|
+
export interface IProps<Model extends ILlmSchema.Model> {
|
|
203
206
|
/**
|
|
204
207
|
* AI agent instance.
|
|
205
208
|
*/
|
|
206
|
-
agent: Agentica
|
|
209
|
+
agent: Agentica<Model>;
|
|
207
210
|
|
|
208
211
|
/**
|
|
209
212
|
* List of scenarios what you expect.
|
|
210
213
|
*/
|
|
211
|
-
scenarios: IAgenticaCallBenchmarkScenario[];
|
|
214
|
+
scenarios: IAgenticaCallBenchmarkScenario<Model>[];
|
|
212
215
|
|
|
213
216
|
/**
|
|
214
217
|
* Configuration for the benchmark.
|
|
@@ -7,6 +7,7 @@ import {
|
|
|
7
7
|
} from "@agentica/core";
|
|
8
8
|
import { ChatGptSelectFunctionAgent } from "@agentica/core/src/chatgpt/ChatGptSelectFunctionAgent";
|
|
9
9
|
import { AgenticaTokenUsageAggregator } from "@agentica/core/src/internal/AgenticaTokenUsageAggregator";
|
|
10
|
+
import { ILlmSchema } from "@samchon/openapi";
|
|
10
11
|
import { Semaphore } from "tstl";
|
|
11
12
|
import { tags } from "typia";
|
|
12
13
|
|
|
@@ -33,19 +34,19 @@ import { IAgenticaSelectBenchmarkScenario } from "./structures/IAgenticaSelectBe
|
|
|
33
34
|
*
|
|
34
35
|
* @author Samchon
|
|
35
36
|
*/
|
|
36
|
-
export class AgenticaSelectBenchmark {
|
|
37
|
-
private agent_: Agentica
|
|
38
|
-
private scenarios_: IAgenticaSelectBenchmarkScenario[];
|
|
37
|
+
export class AgenticaSelectBenchmark<Model extends ILlmSchema.Model> {
|
|
38
|
+
private agent_: Agentica<Model>;
|
|
39
|
+
private scenarios_: IAgenticaSelectBenchmarkScenario<Model>[];
|
|
39
40
|
private config_: AgenticaSelectBenchmark.IConfig;
|
|
40
|
-
private histories_: IAgenticaPrompt[];
|
|
41
|
-
private result_: IAgenticaSelectBenchmarkResult | null;
|
|
41
|
+
private histories_: IAgenticaPrompt<Model>[];
|
|
42
|
+
private result_: IAgenticaSelectBenchmarkResult<Model> | null;
|
|
42
43
|
|
|
43
44
|
/**
|
|
44
45
|
* Initializer Constructor.
|
|
45
46
|
*
|
|
46
47
|
* @param props Properties of the selection benchmark
|
|
47
48
|
*/
|
|
48
|
-
public constructor(props: AgenticaSelectBenchmark.IProps) {
|
|
49
|
+
public constructor(props: AgenticaSelectBenchmark.IProps<Model>) {
|
|
49
50
|
this.agent_ = props.agent;
|
|
50
51
|
this.scenarios_ = props.scenarios.slice();
|
|
51
52
|
this.config_ = {
|
|
@@ -73,23 +74,24 @@ export class AgenticaSelectBenchmark {
|
|
|
73
74
|
* @returns Results of the function selection benchmark
|
|
74
75
|
*/
|
|
75
76
|
public async execute(
|
|
76
|
-
listener?: (event: IAgenticaSelectBenchmarkEvent) => void,
|
|
77
|
-
): Promise<IAgenticaSelectBenchmarkResult
|
|
77
|
+
listener?: (event: IAgenticaSelectBenchmarkEvent<Model>) => void,
|
|
78
|
+
): Promise<IAgenticaSelectBenchmarkResult<Model>> {
|
|
78
79
|
const started_at: Date = new Date();
|
|
79
80
|
const semaphore: Semaphore = new Semaphore(this.config_.simultaneous);
|
|
80
|
-
const experiments: IAgenticaSelectBenchmarkResult.IExperiment[] =
|
|
81
|
+
const experiments: IAgenticaSelectBenchmarkResult.IExperiment<Model>[] =
|
|
81
82
|
await Promise.all(
|
|
82
83
|
this.scenarios_.map(async (scenario) => {
|
|
83
|
-
const events: IAgenticaSelectBenchmarkEvent[] =
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
84
|
+
const events: IAgenticaSelectBenchmarkEvent<Model>[] =
|
|
85
|
+
await Promise.all(
|
|
86
|
+
new Array(this.config_.repeat).fill(0).map(async () => {
|
|
87
|
+
await semaphore.acquire();
|
|
88
|
+
const e: IAgenticaSelectBenchmarkEvent<Model> =
|
|
89
|
+
await this.step(scenario);
|
|
90
|
+
await semaphore.release();
|
|
91
|
+
if (listener !== undefined) listener(e);
|
|
92
|
+
return e;
|
|
93
|
+
}),
|
|
94
|
+
);
|
|
93
95
|
return {
|
|
94
96
|
scenario,
|
|
95
97
|
events,
|
|
@@ -142,12 +144,12 @@ export class AgenticaSelectBenchmark {
|
|
|
142
144
|
}
|
|
143
145
|
|
|
144
146
|
private async step(
|
|
145
|
-
scenario: IAgenticaSelectBenchmarkScenario
|
|
146
|
-
): Promise<IAgenticaSelectBenchmarkEvent
|
|
147
|
+
scenario: IAgenticaSelectBenchmarkScenario<Model>,
|
|
148
|
+
): Promise<IAgenticaSelectBenchmarkEvent<Model>> {
|
|
147
149
|
const started_at: Date = new Date();
|
|
148
150
|
try {
|
|
149
151
|
const usage: IAgenticaTokenUsage = AgenticaTokenUsageAggregator.zero();
|
|
150
|
-
const prompts: IAgenticaPrompt[] =
|
|
152
|
+
const prompts: IAgenticaPrompt<Model>[] =
|
|
151
153
|
await ChatGptSelectFunctionAgent.execute({
|
|
152
154
|
...this.agent_.getContext({
|
|
153
155
|
prompt: {
|
|
@@ -161,8 +163,8 @@ export class AgenticaSelectBenchmark {
|
|
|
161
163
|
stack: [],
|
|
162
164
|
ready: () => true,
|
|
163
165
|
dispatch: async () => {},
|
|
164
|
-
} satisfies IAgenticaContext);
|
|
165
|
-
const selected: IAgenticaOperationSelection[] = prompts
|
|
166
|
+
} satisfies IAgenticaContext<Model>);
|
|
167
|
+
const selected: IAgenticaOperationSelection<Model>[] = prompts
|
|
166
168
|
.filter((p) => p.type === "select")
|
|
167
169
|
.map((p) => p.operations)
|
|
168
170
|
.flat();
|
|
@@ -185,8 +187,8 @@ export class AgenticaSelectBenchmark {
|
|
|
185
187
|
started_at,
|
|
186
188
|
completed_at: new Date(),
|
|
187
189
|
} satisfies
|
|
188
|
-
| IAgenticaSelectBenchmarkEvent.ISuccess
|
|
189
|
-
| IAgenticaSelectBenchmarkEvent.IFailure
|
|
190
|
+
| IAgenticaSelectBenchmarkEvent.ISuccess<Model>
|
|
191
|
+
| IAgenticaSelectBenchmarkEvent.IFailure<Model>;
|
|
190
192
|
} catch (error) {
|
|
191
193
|
return {
|
|
192
194
|
type: "error",
|
|
@@ -194,7 +196,7 @@ export class AgenticaSelectBenchmark {
|
|
|
194
196
|
error,
|
|
195
197
|
started_at,
|
|
196
198
|
completed_at: new Date(),
|
|
197
|
-
} satisfies IAgenticaSelectBenchmarkEvent.IError
|
|
199
|
+
} satisfies IAgenticaSelectBenchmarkEvent.IError<Model>;
|
|
198
200
|
}
|
|
199
201
|
}
|
|
200
202
|
}
|
|
@@ -202,16 +204,16 @@ export namespace AgenticaSelectBenchmark {
|
|
|
202
204
|
/**
|
|
203
205
|
* Properties of the {@link AgenticaSelectBenchmark} constructor.
|
|
204
206
|
*/
|
|
205
|
-
export interface IProps {
|
|
207
|
+
export interface IProps<Model extends ILlmSchema.Model> {
|
|
206
208
|
/**
|
|
207
209
|
* AI agent instance.
|
|
208
210
|
*/
|
|
209
|
-
agent: Agentica
|
|
211
|
+
agent: Agentica<Model>;
|
|
210
212
|
|
|
211
213
|
/**
|
|
212
214
|
* List of scenarios what you expect.
|
|
213
215
|
*/
|
|
214
|
-
scenarios: IAgenticaSelectBenchmarkScenario[];
|
|
216
|
+
scenarios: IAgenticaSelectBenchmarkScenario<Model>[];
|
|
215
217
|
|
|
216
218
|
/**
|
|
217
219
|
* Configuration for the benchmark.
|
|
@@ -1,13 +1,17 @@
|
|
|
1
1
|
import { Agentica, IAgenticaOperation, IAgenticaPrompt } from "@agentica/core";
|
|
2
|
-
import { ILlmFunction } from "@samchon/openapi";
|
|
2
|
+
import { ILlmFunction, ILlmSchema } from "@samchon/openapi";
|
|
3
3
|
import OpenAI from "openai";
|
|
4
4
|
import typia from "typia";
|
|
5
5
|
|
|
6
6
|
import { IAgenticaBenchmarkExpected } from "../structures/IAgenticaBenchmarkExpected";
|
|
7
7
|
|
|
8
8
|
export namespace AgenticaBenchmarkPredicator {
|
|
9
|
-
export const isNext = async
|
|
10
|
-
|
|
9
|
+
export const isNext = async <Model extends ILlmSchema.Model>(
|
|
10
|
+
agent: Agentica<Model>,
|
|
11
|
+
): Promise<string | null> => {
|
|
12
|
+
const last: IAgenticaPrompt<Model> | undefined = agent
|
|
13
|
+
.getPromptHistories()
|
|
14
|
+
.at(-1);
|
|
11
15
|
if (last?.type !== "text" || last.role !== "assistant") return null;
|
|
12
16
|
|
|
13
17
|
const consent: ILlmFunction<"chatgpt"> = typia.llm.application<
|
|
@@ -69,18 +73,20 @@ export namespace AgenticaBenchmarkPredicator {
|
|
|
69
73
|
* @returns `true` if the called operations match the expected operations,
|
|
70
74
|
* otherwise `false`.
|
|
71
75
|
*/
|
|
72
|
-
export const success = (props: {
|
|
76
|
+
export const success = <Model extends ILlmSchema.Model>(props: {
|
|
73
77
|
/**
|
|
74
78
|
* Expected operations to be called.
|
|
75
79
|
*
|
|
76
80
|
* For 'allOf' within an 'array', the next expected element starts checking from the element that follows the last called element in 'allOf'.
|
|
77
81
|
*/
|
|
78
|
-
expected: IAgenticaBenchmarkExpected
|
|
82
|
+
expected: IAgenticaBenchmarkExpected<Model>;
|
|
79
83
|
|
|
80
84
|
/**
|
|
81
85
|
* Specified operations.
|
|
82
86
|
*/
|
|
83
|
-
operations: Array<
|
|
87
|
+
operations: Array<
|
|
88
|
+
IAgenticaOperation<Model> | IAgenticaPrompt.IExecute<Model>
|
|
89
|
+
>;
|
|
84
90
|
|
|
85
91
|
/**
|
|
86
92
|
* If it's `false`, check the array and let it go even if there's something wrong between them.
|
|
@@ -90,8 +96,8 @@ export namespace AgenticaBenchmarkPredicator {
|
|
|
90
96
|
strict?: boolean;
|
|
91
97
|
}): boolean => successInner(props).result;
|
|
92
98
|
|
|
93
|
-
const successInner = (
|
|
94
|
-
props: Parameters<typeof success
|
|
99
|
+
const successInner = <Model extends ILlmSchema.Model>(
|
|
100
|
+
props: Parameters<typeof success<Model>>[0],
|
|
95
101
|
):
|
|
96
102
|
| {
|
|
97
103
|
result: true;
|
|
@@ -101,8 +107,10 @@ export namespace AgenticaBenchmarkPredicator {
|
|
|
101
107
|
result: false;
|
|
102
108
|
} => {
|
|
103
109
|
const call = (
|
|
104
|
-
expected: IAgenticaBenchmarkExpected
|
|
105
|
-
overrideOperations?: Array<
|
|
110
|
+
expected: IAgenticaBenchmarkExpected<Model>,
|
|
111
|
+
overrideOperations?: Array<
|
|
112
|
+
IAgenticaOperation<Model> | IAgenticaPrompt.IExecute<Model>
|
|
113
|
+
>,
|
|
106
114
|
) =>
|
|
107
115
|
successInner({
|
|
108
116
|
expected,
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { ILlmSchema } from "@samchon/openapi";
|
|
2
|
+
|
|
1
3
|
import { IAgenticaBenchmarkExpected } from "../structures/IAgenticaBenchmarkExpected";
|
|
2
4
|
|
|
3
5
|
export namespace AgenticaBenchmarkUtil {
|
|
@@ -12,7 +14,9 @@ export namespace AgenticaBenchmarkUtil {
|
|
|
12
14
|
return error;
|
|
13
15
|
};
|
|
14
16
|
|
|
15
|
-
export const expectedToJson =
|
|
17
|
+
export const expectedToJson = <Model extends ILlmSchema.Model>(
|
|
18
|
+
expected: IAgenticaBenchmarkExpected<Model>,
|
|
19
|
+
): any => {
|
|
16
20
|
if (expected.type === "standalone")
|
|
17
21
|
return {
|
|
18
22
|
type: expected.type,
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { IAgenticaTokenUsage } from "@agentica/core";
|
|
2
|
+
import { ILlmSchema } from "@samchon/openapi";
|
|
2
3
|
|
|
3
4
|
import { IAgenticaCallBenchmarkEvent } from "../structures/IAgenticaCallBenchmarkEvent";
|
|
4
5
|
import { IAgenticaCallBenchmarkResult } from "../structures/IAgenticaCallBenchmarkResult";
|
|
@@ -7,11 +8,11 @@ import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
|
|
|
7
8
|
import { AgenticaPromptReporter } from "./AgenticaPromptReporter";
|
|
8
9
|
|
|
9
10
|
export namespace AgenticaCallBenchmarkReporter {
|
|
10
|
-
export const markdown = (
|
|
11
|
-
result: IAgenticaCallBenchmarkResult
|
|
11
|
+
export const markdown = <Model extends ILlmSchema.Model>(
|
|
12
|
+
result: IAgenticaCallBenchmarkResult<Model>,
|
|
12
13
|
): Record<string, string> =>
|
|
13
14
|
Object.fromEntries([
|
|
14
|
-
["./README.md", writeIndex(result)],
|
|
15
|
+
["./README.md", writeIndex<Model>(result)],
|
|
15
16
|
...result.experiments
|
|
16
17
|
.map((exp) => [
|
|
17
18
|
[`./${exp.scenario.name}/README.md`, writeExperimentIndex(exp)],
|
|
@@ -23,8 +24,10 @@ export namespace AgenticaCallBenchmarkReporter {
|
|
|
23
24
|
.flat(),
|
|
24
25
|
]);
|
|
25
26
|
|
|
26
|
-
const writeIndex =
|
|
27
|
-
|
|
27
|
+
const writeIndex = <Model extends ILlmSchema.Model>(
|
|
28
|
+
result: IAgenticaCallBenchmarkResult<Model>,
|
|
29
|
+
): string => {
|
|
30
|
+
const events: IAgenticaCallBenchmarkEvent<Model>[] = result.experiments
|
|
28
31
|
.map((r) => r.events)
|
|
29
32
|
.flat();
|
|
30
33
|
const average: number =
|
|
@@ -73,8 +76,8 @@ export namespace AgenticaCallBenchmarkReporter {
|
|
|
73
76
|
].join("\n");
|
|
74
77
|
};
|
|
75
78
|
|
|
76
|
-
const writeExperimentIndex = (
|
|
77
|
-
exp: IAgenticaCallBenchmarkResult.IExperiment
|
|
79
|
+
const writeExperimentIndex = <Model extends ILlmSchema.Model>(
|
|
80
|
+
exp: IAgenticaCallBenchmarkResult.IExperiment<Model>,
|
|
78
81
|
): string => {
|
|
79
82
|
return [
|
|
80
83
|
`# ${exp.scenario.name}`,
|
|
@@ -114,8 +117,8 @@ export namespace AgenticaCallBenchmarkReporter {
|
|
|
114
117
|
].join("\n");
|
|
115
118
|
};
|
|
116
119
|
|
|
117
|
-
const writeExperimentEvent = (
|
|
118
|
-
event: IAgenticaCallBenchmarkEvent
|
|
120
|
+
const writeExperimentEvent = <Model extends ILlmSchema.Model>(
|
|
121
|
+
event: IAgenticaCallBenchmarkEvent<Model>,
|
|
119
122
|
index: number,
|
|
120
123
|
): string => {
|
|
121
124
|
return [
|
|
@@ -165,9 +168,9 @@ export namespace AgenticaCallBenchmarkReporter {
|
|
|
165
168
|
].join("\n");
|
|
166
169
|
};
|
|
167
170
|
|
|
168
|
-
const drawStatus = (
|
|
169
|
-
events: IAgenticaCallBenchmarkEvent[],
|
|
170
|
-
success: (e: IAgenticaCallBenchmarkEvent) => boolean,
|
|
171
|
+
const drawStatus = <Model extends ILlmSchema.Model>(
|
|
172
|
+
events: IAgenticaCallBenchmarkEvent<Model>[],
|
|
173
|
+
success: (e: IAgenticaCallBenchmarkEvent<Model>) => boolean,
|
|
171
174
|
): string => {
|
|
172
175
|
const count: number = Math.floor(
|
|
173
176
|
(events.filter(success).length / events.length) * 10,
|
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
import { IAgenticaPrompt } from "@agentica/core";
|
|
2
|
+
import { ILlmSchema } from "@samchon/openapi";
|
|
2
3
|
|
|
3
4
|
export namespace AgenticaPromptReporter {
|
|
4
|
-
export const markdown =
|
|
5
|
+
export const markdown = <Model extends ILlmSchema.Model>(
|
|
6
|
+
p: IAgenticaPrompt<Model>,
|
|
7
|
+
): string => {
|
|
5
8
|
if (p.type === "text")
|
|
6
9
|
return [`### Text (${p.role})`, p.text, ""].join("\n");
|
|
7
10
|
else if (p.type === "select" || p.type === "cancel")
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { IAgenticaTokenUsage } from "@agentica/core";
|
|
2
|
+
import { ILlmSchema } from "@samchon/openapi";
|
|
2
3
|
|
|
3
4
|
import { IAgenticaSelectBenchmarkEvent } from "../structures/IAgenticaSelectBenchmarkEvent";
|
|
4
5
|
import { IAgenticaSelectBenchmarkResult } from "../structures/IAgenticaSelectBenchmarkResult";
|
|
@@ -9,8 +10,8 @@ import { AgenticaBenchmarkUtil } from "./AgenticaBenchmarkUtil";
|
|
|
9
10
|
* @internal
|
|
10
11
|
*/
|
|
11
12
|
export namespace AgenticaSelectBenchmarkReporter {
|
|
12
|
-
export const markdown = (
|
|
13
|
-
result: IAgenticaSelectBenchmarkResult
|
|
13
|
+
export const markdown = <Model extends ILlmSchema.Model>(
|
|
14
|
+
result: IAgenticaSelectBenchmarkResult<Model>,
|
|
14
15
|
): Record<string, string> =>
|
|
15
16
|
Object.fromEntries([
|
|
16
17
|
["./README.md", writeIndex(result)],
|
|
@@ -25,8 +26,10 @@ export namespace AgenticaSelectBenchmarkReporter {
|
|
|
25
26
|
.flat(),
|
|
26
27
|
]);
|
|
27
28
|
|
|
28
|
-
const writeIndex =
|
|
29
|
-
|
|
29
|
+
const writeIndex = <Model extends ILlmSchema.Model>(
|
|
30
|
+
result: IAgenticaSelectBenchmarkResult<Model>,
|
|
31
|
+
): string => {
|
|
32
|
+
const events: IAgenticaSelectBenchmarkEvent<Model>[] = result.experiments
|
|
30
33
|
.map((r) => r.events)
|
|
31
34
|
.flat();
|
|
32
35
|
const average: number =
|
|
@@ -84,8 +87,8 @@ export namespace AgenticaSelectBenchmarkReporter {
|
|
|
84
87
|
].join("\n");
|
|
85
88
|
};
|
|
86
89
|
|
|
87
|
-
const writeExperimentIndex = (
|
|
88
|
-
exp: IAgenticaSelectBenchmarkResult.IExperiment
|
|
90
|
+
const writeExperimentIndex = <Model extends ILlmSchema.Model>(
|
|
91
|
+
exp: IAgenticaSelectBenchmarkResult.IExperiment<Model>,
|
|
89
92
|
): string => {
|
|
90
93
|
const aggregate: IAgenticaTokenUsage.IComponent = exp.usage.aggregate;
|
|
91
94
|
return [
|
|
@@ -141,8 +144,8 @@ export namespace AgenticaSelectBenchmarkReporter {
|
|
|
141
144
|
].join("\n");
|
|
142
145
|
};
|
|
143
146
|
|
|
144
|
-
const writeExperimentEvent = (
|
|
145
|
-
event: IAgenticaSelectBenchmarkEvent
|
|
147
|
+
const writeExperimentEvent = <Model extends ILlmSchema.Model>(
|
|
148
|
+
event: IAgenticaSelectBenchmarkEvent<Model>,
|
|
146
149
|
index: number,
|
|
147
150
|
): string => {
|
|
148
151
|
return [
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { IAgenticaOperation } from "@agentica/core";
|
|
2
|
+
import { ILlmSchema } from "@samchon/openapi";
|
|
2
3
|
|
|
3
4
|
/**
|
|
4
5
|
* Expected operation determinant.
|
|
@@ -12,47 +13,56 @@ import { IAgenticaOperation } from "@agentica/core";
|
|
|
12
13
|
*
|
|
13
14
|
* @author Samchon
|
|
14
15
|
*/
|
|
15
|
-
export type IAgenticaBenchmarkExpected =
|
|
16
|
-
| IAgenticaBenchmarkExpected.IAllOf
|
|
17
|
-
| IAgenticaBenchmarkExpected.IAnyOf
|
|
18
|
-
| IAgenticaBenchmarkExpected.IArray
|
|
19
|
-
| IAgenticaBenchmarkExpected.IStandalone
|
|
16
|
+
export type IAgenticaBenchmarkExpected<Model extends ILlmSchema.Model> =
|
|
17
|
+
| IAgenticaBenchmarkExpected.IAllOf<Model>
|
|
18
|
+
| IAgenticaBenchmarkExpected.IAnyOf<Model>
|
|
19
|
+
| IAgenticaBenchmarkExpected.IArray<Model>
|
|
20
|
+
| IAgenticaBenchmarkExpected.IStandalone<Model>;
|
|
20
21
|
export namespace IAgenticaBenchmarkExpected {
|
|
21
22
|
/**
|
|
22
23
|
* All of them must meet the condition, but sequence is not important.
|
|
23
24
|
*/
|
|
24
|
-
export interface IAllOf {
|
|
25
|
+
export interface IAllOf<Model extends ILlmSchema.Model> {
|
|
25
26
|
type: "allOf";
|
|
26
27
|
allOf: Array<
|
|
27
|
-
Exclude<
|
|
28
|
+
Exclude<
|
|
29
|
+
IAgenticaBenchmarkExpected<Model>,
|
|
30
|
+
IAgenticaBenchmarkExpected.IAllOf<Model>
|
|
31
|
+
>
|
|
28
32
|
>;
|
|
29
33
|
}
|
|
30
34
|
|
|
31
35
|
/**
|
|
32
36
|
* At least one of them must meet the condition.
|
|
33
37
|
*/
|
|
34
|
-
export interface IAnyOf {
|
|
38
|
+
export interface IAnyOf<Model extends ILlmSchema.Model> {
|
|
35
39
|
type: "anyOf";
|
|
36
40
|
anyOf: Array<
|
|
37
|
-
Exclude<
|
|
41
|
+
Exclude<
|
|
42
|
+
IAgenticaBenchmarkExpected<Model>,
|
|
43
|
+
IAgenticaBenchmarkExpected.IAnyOf<Model>
|
|
44
|
+
>
|
|
38
45
|
>;
|
|
39
46
|
}
|
|
40
47
|
|
|
41
48
|
/**
|
|
42
49
|
* All of them must meet the condition, and sequence is important.
|
|
43
50
|
*/
|
|
44
|
-
export interface IArray {
|
|
51
|
+
export interface IArray<Model extends ILlmSchema.Model> {
|
|
45
52
|
type: "array";
|
|
46
53
|
items: Array<
|
|
47
|
-
Exclude<
|
|
54
|
+
Exclude<
|
|
55
|
+
IAgenticaBenchmarkExpected<Model>,
|
|
56
|
+
IAgenticaBenchmarkExpected.IArray<Model>
|
|
57
|
+
>
|
|
48
58
|
>;
|
|
49
59
|
}
|
|
50
60
|
|
|
51
61
|
/**
|
|
52
62
|
* Standalone operation.
|
|
53
63
|
*/
|
|
54
|
-
export interface IStandalone {
|
|
64
|
+
export interface IStandalone<Model extends ILlmSchema.Model> {
|
|
55
65
|
type: "standalone";
|
|
56
|
-
operation: IAgenticaOperation
|
|
66
|
+
operation: IAgenticaOperation<Model>;
|
|
57
67
|
}
|
|
58
68
|
}
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { IAgenticaPrompt, IAgenticaTokenUsage } from "@agentica/core";
|
|
2
|
+
import { ILlmSchema } from "@samchon/openapi";
|
|
2
3
|
|
|
3
4
|
import { IAgenticaCallBenchmarkScenario } from "./IAgenticaCallBenchmarkScenario";
|
|
4
5
|
|
|
@@ -24,10 +25,10 @@ import { IAgenticaCallBenchmarkScenario } from "./IAgenticaCallBenchmarkScenario
|
|
|
24
25
|
*
|
|
25
26
|
* @author Samchon
|
|
26
27
|
*/
|
|
27
|
-
export type IAgenticaCallBenchmarkEvent =
|
|
28
|
-
| IAgenticaCallBenchmarkEvent.ISuccess
|
|
29
|
-
| IAgenticaCallBenchmarkEvent.IFailure
|
|
30
|
-
| IAgenticaCallBenchmarkEvent.IError
|
|
28
|
+
export type IAgenticaCallBenchmarkEvent<Model extends ILlmSchema.Model> =
|
|
29
|
+
| IAgenticaCallBenchmarkEvent.ISuccess<Model>
|
|
30
|
+
| IAgenticaCallBenchmarkEvent.IFailure<Model>
|
|
31
|
+
| IAgenticaCallBenchmarkEvent.IError<Model>;
|
|
31
32
|
export namespace IAgenticaCallBenchmarkEvent {
|
|
32
33
|
/**
|
|
33
34
|
* Success event type.
|
|
@@ -35,7 +36,8 @@ export namespace IAgenticaCallBenchmarkEvent {
|
|
|
35
36
|
* The `success` event type represents that the benchmark
|
|
36
37
|
* testing is fully meet the expected scenario.
|
|
37
38
|
*/
|
|
38
|
-
export interface ISuccess extends
|
|
39
|
+
export interface ISuccess<Model extends ILlmSchema.Model>
|
|
40
|
+
extends IEventBase<"success", Model> {
|
|
39
41
|
/**
|
|
40
42
|
* Whether succeeded to function selection.
|
|
41
43
|
*/
|
|
@@ -54,7 +56,8 @@ export namespace IAgenticaCallBenchmarkEvent {
|
|
|
54
56
|
* or `caller` agents have not selected or called following the
|
|
55
57
|
* expected scenario in the benchmark testing.
|
|
56
58
|
*/
|
|
57
|
-
export interface IFailure extends
|
|
59
|
+
export interface IFailure<Model extends ILlmSchema.Model>
|
|
60
|
+
extends IEventBase<"failure", Model> {
|
|
58
61
|
/**
|
|
59
62
|
* Whether succeeded to function selection.
|
|
60
63
|
*/
|
|
@@ -66,14 +69,15 @@ export namespace IAgenticaCallBenchmarkEvent {
|
|
|
66
69
|
call: boolean;
|
|
67
70
|
}
|
|
68
71
|
|
|
69
|
-
export interface IError extends
|
|
72
|
+
export interface IError<Model extends ILlmSchema.Model>
|
|
73
|
+
extends IEventBase<"error", Model> {
|
|
70
74
|
/**
|
|
71
75
|
* Error occurred during the benchmark.
|
|
72
76
|
*/
|
|
73
77
|
error: unknown;
|
|
74
78
|
}
|
|
75
79
|
|
|
76
|
-
interface IEventBase<Type extends string> {
|
|
80
|
+
interface IEventBase<Type extends string, Model extends ILlmSchema.Model> {
|
|
77
81
|
/**
|
|
78
82
|
* Discriminant type.
|
|
79
83
|
*/
|
|
@@ -82,14 +86,14 @@ export namespace IAgenticaCallBenchmarkEvent {
|
|
|
82
86
|
/**
|
|
83
87
|
* Expected scenario.
|
|
84
88
|
*/
|
|
85
|
-
scenario: IAgenticaCallBenchmarkScenario
|
|
89
|
+
scenario: IAgenticaCallBenchmarkScenario<Model>;
|
|
86
90
|
|
|
87
91
|
/**
|
|
88
92
|
* Prompt histories.
|
|
89
93
|
*
|
|
90
94
|
* List of prompts occurred during the benchmark testing.
|
|
91
95
|
*/
|
|
92
|
-
prompts: IAgenticaPrompt[];
|
|
96
|
+
prompts: IAgenticaPrompt<Model>[];
|
|
93
97
|
|
|
94
98
|
/**
|
|
95
99
|
* Usage of the token during the benchmark.
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { IAgenticaTokenUsage } from "@agentica/core";
|
|
2
|
+
import { ILlmSchema } from "@samchon/openapi";
|
|
2
3
|
|
|
3
4
|
import { IAgenticaCallBenchmarkEvent } from "./IAgenticaCallBenchmarkEvent";
|
|
4
5
|
import { IAgenticaCallBenchmarkScenario } from "./IAgenticaCallBenchmarkScenario";
|
|
@@ -20,11 +21,11 @@ import { IAgenticaCallBenchmarkScenario } from "./IAgenticaCallBenchmarkScenario
|
|
|
20
21
|
*
|
|
21
22
|
* @author Samchon
|
|
22
23
|
*/
|
|
23
|
-
export interface IAgenticaCallBenchmarkResult {
|
|
24
|
+
export interface IAgenticaCallBenchmarkResult<Model extends ILlmSchema.Model> {
|
|
24
25
|
/**
|
|
25
26
|
* Experiments for each scenario.
|
|
26
27
|
*/
|
|
27
|
-
experiments: IAgenticaCallBenchmarkResult.IExperiment[];
|
|
28
|
+
experiments: IAgenticaCallBenchmarkResult.IExperiment<Model>[];
|
|
28
29
|
|
|
29
30
|
/**
|
|
30
31
|
* Aggregated token usage information.
|
|
@@ -45,11 +46,11 @@ export namespace IAgenticaCallBenchmarkResult {
|
|
|
45
46
|
/**
|
|
46
47
|
* Experiment result about a scenario.
|
|
47
48
|
*/
|
|
48
|
-
export interface IExperiment {
|
|
49
|
+
export interface IExperiment<Model extends ILlmSchema.Model> {
|
|
49
50
|
/**
|
|
50
51
|
* Scenario of the experiment.
|
|
51
52
|
*/
|
|
52
|
-
scenario: IAgenticaCallBenchmarkScenario
|
|
53
|
+
scenario: IAgenticaCallBenchmarkScenario<Model>;
|
|
53
54
|
|
|
54
55
|
/**
|
|
55
56
|
* Events occurred during the benchmark in the scenario.
|
|
@@ -59,7 +60,7 @@ export namespace IAgenticaCallBenchmarkResult {
|
|
|
59
60
|
* {@link AgenticaCallBenchmark.IConfig.repeat repeat} count.
|
|
60
61
|
* And the event is one of the repeated benchmark results.
|
|
61
62
|
*/
|
|
62
|
-
events: IAgenticaCallBenchmarkEvent[];
|
|
63
|
+
events: IAgenticaCallBenchmarkEvent<Model>[];
|
|
63
64
|
|
|
64
65
|
/**
|
|
65
66
|
* LLM token usage information.
|