@agentica/benchmark 0.7.0-dev.20250224-2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/lib/AgenticaCallBenchmark.d.ts +137 -0
- package/lib/AgenticaCallBenchmark.js +187 -0
- package/lib/AgenticaCallBenchmark.js.map +1 -0
- package/lib/AgenticaSelectBenchmark.d.ts +123 -0
- package/lib/AgenticaSelectBenchmark.js +185 -0
- package/lib/AgenticaSelectBenchmark.js.map +1 -0
- package/lib/index.d.ts +2 -0
- package/lib/index.js +19 -0
- package/lib/index.js.map +1 -0
- package/lib/index.mjs +449 -0
- package/lib/index.mjs.map +1 -0
- package/lib/internal/AgenticaBenchmarkPredicator.d.ts +32 -0
- package/lib/internal/AgenticaBenchmarkPredicator.js +179 -0
- package/lib/internal/AgenticaBenchmarkPredicator.js.map +1 -0
- package/lib/internal/AgenticaBenchmarkUtil.d.ts +5 -0
- package/lib/internal/AgenticaBenchmarkUtil.js +37 -0
- package/lib/internal/AgenticaBenchmarkUtil.js.map +1 -0
- package/lib/internal/AgenticaCallBenchmarkReporter.d.ts +4 -0
- package/lib/internal/AgenticaCallBenchmarkReporter.js +136 -0
- package/lib/internal/AgenticaCallBenchmarkReporter.js.map +1 -0
- package/lib/internal/AgenticaPromptReporter.d.ts +4 -0
- package/lib/internal/AgenticaPromptReporter.js +49 -0
- package/lib/internal/AgenticaPromptReporter.js.map +1 -0
- package/lib/internal/AgenticaSelectBenchmarkReporter.d.ts +1 -0
- package/lib/internal/AgenticaSelectBenchmarkReporter.js +172 -0
- package/lib/internal/AgenticaSelectBenchmarkReporter.js.map +1 -0
- package/lib/structures/IAgenticaBenchmarkExpected.d.ts +44 -0
- package/lib/structures/IAgenticaBenchmarkExpected.js +3 -0
- package/lib/structures/IAgenticaBenchmarkExpected.js.map +1 -0
- package/lib/structures/IAgenticaCallBenchmarkEvent.d.ts +95 -0
- package/lib/structures/IAgenticaCallBenchmarkEvent.js +3 -0
- package/lib/structures/IAgenticaCallBenchmarkEvent.js.map +1 -0
- package/lib/structures/IAgenticaCallBenchmarkResult.d.ts +62 -0
- package/lib/structures/IAgenticaCallBenchmarkResult.js +3 -0
- package/lib/structures/IAgenticaCallBenchmarkResult.js.map +1 -0
- package/lib/structures/IAgenticaCallBenchmarkScenario.d.ts +36 -0
- package/lib/structures/IAgenticaCallBenchmarkScenario.js +3 -0
- package/lib/structures/IAgenticaCallBenchmarkScenario.js.map +1 -0
- package/lib/structures/IAgenticaSelectBenchmarkEvent.d.ts +92 -0
- package/lib/structures/IAgenticaSelectBenchmarkEvent.js +3 -0
- package/lib/structures/IAgenticaSelectBenchmarkEvent.js.map +1 -0
- package/lib/structures/IAgenticaSelectBenchmarkResult.d.ts +62 -0
- package/lib/structures/IAgenticaSelectBenchmarkResult.js +3 -0
- package/lib/structures/IAgenticaSelectBenchmarkResult.js.map +1 -0
- package/lib/structures/IAgenticaSelectBenchmarkScenario.d.ts +36 -0
- package/lib/structures/IAgenticaSelectBenchmarkScenario.js +3 -0
- package/lib/structures/IAgenticaSelectBenchmarkScenario.js.map +1 -0
- package/lib/utils/MathUtil.d.ts +3 -0
- package/lib/utils/MathUtil.js +8 -0
- package/lib/utils/MathUtil.js.map +1 -0
- package/lib/utils/TokenUsageComputer.d.ts +5 -0
- package/lib/utils/TokenUsageComputer.js +37 -0
- package/lib/utils/TokenUsageComputer.js.map +1 -0
- package/package.json +57 -0
- package/src/AgenticaCallBenchmark.ts +259 -0
- package/src/AgenticaSelectBenchmark.ts +262 -0
- package/src/index.ts +3 -0
- package/src/internal/AgenticaBenchmarkPredicator.ts +216 -0
- package/src/internal/AgenticaBenchmarkUtil.ts +40 -0
- package/src/internal/AgenticaCallBenchmarkReporter.ts +177 -0
- package/src/internal/AgenticaPromptReporter.ts +43 -0
- package/src/internal/AgenticaSelectBenchmarkReporter.ts +212 -0
- package/src/structures/IAgenticaBenchmarkExpected.ts +58 -0
- package/src/structures/IAgenticaCallBenchmarkEvent.ts +109 -0
- package/src/structures/IAgenticaCallBenchmarkResult.ts +69 -0
- package/src/structures/IAgenticaCallBenchmarkScenario.ts +39 -0
- package/src/structures/IAgenticaSelectBenchmarkEvent.ts +110 -0
- package/src/structures/IAgenticaSelectBenchmarkResult.ts +69 -0
- package/src/structures/IAgenticaSelectBenchmarkScenario.ts +39 -0
- package/src/utils/MathUtil.ts +3 -0
- package/src/utils/TokenUsageComputer.ts +40 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Wrtn Technologies
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import { Agentica } from "@agentica/core";
|
|
2
|
+
import { tags } from "typia";
|
|
3
|
+
import { IAgenticaCallBenchmarkEvent } from "./structures/IAgenticaCallBenchmarkEvent";
|
|
4
|
+
import { IAgenticaCallBenchmarkResult } from "./structures/IAgenticaCallBenchmarkResult";
|
|
5
|
+
import { IAgenticaCallBenchmarkScenario } from "./structures/IAgenticaCallBenchmarkScenario";
|
|
6
|
+
/**
|
|
7
|
+
* LLM function calling benchmark.
|
|
8
|
+
*
|
|
9
|
+
* `AgenticaCallBenchmark` is a class for the benchmark of the
|
|
10
|
+
* LLM (Large Language Model) function calling part. It utilizes both
|
|
11
|
+
* `selector` and `caller` agents and tests whether the expected
|
|
12
|
+
* {@link IAgenticaOperation operations} are properly selected and
|
|
13
|
+
* called from the given
|
|
14
|
+
* {@link IAgenticaCallBenchmarkScenario scenarios}.
|
|
15
|
+
*
|
|
16
|
+
* Note that, this `AgenticaCallBenchmark` consumes a lot of time and
|
|
17
|
+
* LLM token costs because it needs the whole process of the
|
|
18
|
+
* {@link Agentica} class with a lot of repetitions. If you don't want
|
|
19
|
+
* such a heavy benchmark, consider using
|
|
20
|
+
* {@link AgenticaSelectBenchmark} instead. In my experience,
|
|
21
|
+
* {@link Agentica} does not fail at function calling, so the function
|
|
22
|
+
* selection benchmark is much more economical.
|
|
23
|
+
*
|
|
24
|
+
* @author Samchon
|
|
25
|
+
*/
|
|
26
|
+
export declare class AgenticaCallBenchmark {
|
|
27
|
+
private agent_;
|
|
28
|
+
private scenarios_;
|
|
29
|
+
private config_;
|
|
30
|
+
private result_;
|
|
31
|
+
/**
|
|
32
|
+
* Initializer Constructor.
|
|
33
|
+
*
|
|
34
|
+
* @param props Properties of the function calling benchmark
|
|
35
|
+
*/
|
|
36
|
+
constructor(props: AgenticaCallBenchmark.IProps);
|
|
37
|
+
/**
|
|
38
|
+
* Execute the benchmark.
|
|
39
|
+
*
|
|
40
|
+
* Execute the benchmark of the LLM function calling, and returns
|
|
41
|
+
* the result of the benchmark.
|
|
42
|
+
*
|
|
43
|
+
* If you want to see the progress of the benchmark, you can pass a callback
|
|
44
|
+
* function as the argument of the `listener`. The callback function
|
|
45
|
+
* would be called whenever a benchmark event occurs.
|
|
46
|
+
*
|
|
47
|
+
* Also, you can publish a markdown format report by calling
|
|
48
|
+
* the {@link report} function after the benchmark execution.
|
|
49
|
+
*
|
|
50
|
+
* @param listener Callback function listening to the benchmark events
|
|
51
|
+
* @returns Results of the function calling benchmark
|
|
52
|
+
*/
|
|
53
|
+
execute(listener?: (event: IAgenticaCallBenchmarkEvent) => void): Promise<IAgenticaCallBenchmarkResult>;
|
|
54
|
+
/**
|
|
55
|
+
* Report the benchmark result as markdown files.
|
|
56
|
+
*
|
|
57
|
+
* Report the benchmark result {@link execute}d by
|
|
58
|
+
* `AgenticaCallBenchmark` as markdown files, and returns a dictionary
|
|
59
|
+
* object of the markdown reporting files. The key of the dictionary
|
|
60
|
+
* would be file name, and the value would be the markdown content.
|
|
61
|
+
*
|
|
62
|
+
* For reference, the markdown files are composed like below:
|
|
63
|
+
*
|
|
64
|
+
* - `./README.md`
|
|
65
|
+
* - `./scenario-1/README.md`
|
|
66
|
+
* - `./scenario-1/1.success.md`
|
|
67
|
+
* - `./scenario-1/2.failure.md`
|
|
68
|
+
* - `./scenario-1/3.error.md`
|
|
69
|
+
*
|
|
70
|
+
* @returns Dictionary of markdown files.
|
|
71
|
+
*/
|
|
72
|
+
report(): Record<string, string>;
|
|
73
|
+
private step;
|
|
74
|
+
}
|
|
75
|
+
export declare namespace AgenticaCallBenchmark {
|
|
76
|
+
/**
|
|
77
|
+
* Properties of the {@link AgenticaCallBenchmark} constructor.
|
|
78
|
+
*/
|
|
79
|
+
interface IProps {
|
|
80
|
+
/**
|
|
81
|
+
* AI agent instance.
|
|
82
|
+
*/
|
|
83
|
+
agent: Agentica;
|
|
84
|
+
/**
|
|
85
|
+
* List of scenarios that you expect.
|
|
86
|
+
*/
|
|
87
|
+
scenarios: IAgenticaCallBenchmarkScenario[];
|
|
88
|
+
/**
|
|
89
|
+
* Configuration for the benchmark.
|
|
90
|
+
*/
|
|
91
|
+
config?: Partial<IConfig>;
|
|
92
|
+
}
|
|
93
|
+
/**
|
|
94
|
+
* Configuration for the benchmark.
|
|
95
|
+
*
|
|
96
|
+
* `AgenticaCallBenchmark.IConfig` is a data structure which
|
|
97
|
+
* represents a configuration for the benchmark, especially the
|
|
98
|
+
* capacity information of the benchmark execution.
|
|
99
|
+
*/
|
|
100
|
+
interface IConfig {
|
|
101
|
+
/**
|
|
102
|
+
* Repeat count.
|
|
103
|
+
*
|
|
104
|
+
* The number of repetitions of the benchmark execution
|
|
105
|
+
* for each scenario.
|
|
106
|
+
*
|
|
107
|
+
* @default 10
|
|
108
|
+
*/
|
|
109
|
+
repeat: number & tags.Type<"uint32"> & tags.Minimum<1>;
|
|
110
|
+
/**
|
|
111
|
+
* Simultaneous count.
|
|
112
|
+
*
|
|
113
|
+
* The number of simultaneous executions for the parallel benchmark
|
|
114
|
+
* execution.
|
|
115
|
+
*
|
|
116
|
+
* If you configure this property greater than `1`, the benchmark
|
|
117
|
+
* for each scenario would be executed in parallel in the given
|
|
118
|
+
* count.
|
|
119
|
+
*
|
|
120
|
+
* @default 10
|
|
121
|
+
*/
|
|
122
|
+
simultaneous: number & tags.Type<"uint32"> & tags.Minimum<1>;
|
|
123
|
+
/**
|
|
124
|
+
* Number of consents.
|
|
125
|
+
*
|
|
126
|
+
* AI agent sometimes asks user to consent to the function
|
|
127
|
+
* calling, and perform it at the next step.
|
|
128
|
+
*
|
|
129
|
+
* This property represents the number of consents to allow.
|
|
130
|
+
* If the number of consents from the AI agent exceeds the
|
|
131
|
+
* configured value, the benchmark will fail.
|
|
132
|
+
*
|
|
133
|
+
* @default 3
|
|
134
|
+
*/
|
|
135
|
+
consent: number;
|
|
136
|
+
}
|
|
137
|
+
}
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
+
});
|
|
10
|
+
};
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.AgenticaCallBenchmark = void 0;
|
|
13
|
+
const tstl_1 = require("tstl");
|
|
14
|
+
const AgenticaBenchmarkPredicator_1 = require("./internal/AgenticaBenchmarkPredicator");
|
|
15
|
+
const AgenticaCallBenchmarkReporter_1 = require("./internal/AgenticaCallBenchmarkReporter");
|
|
16
|
+
const TokenUsageComputer_1 = require("./utils/TokenUsageComputer");
|
|
17
|
+
/**
|
|
18
|
+
* LLM function calling benchmark.
|
|
19
|
+
*
|
|
20
|
+
* `AgenticaCallBenchmark` is a class for the benchmark of the
|
|
21
|
+
* LLM (Large Language Model) function calling part. It utilizes both
|
|
22
|
+
* `selector` and `caller` agents and tests whether the expected
|
|
23
|
+
* {@link IAgenticaOperation operations} are properly selected and
|
|
24
|
+
* called from the given
|
|
25
|
+
* {@link IAgenticaCallBenchmarkScenario scenarios}.
|
|
26
|
+
*
|
|
27
|
+
* Note that, this `AgenticaCallBenchmark` consumes a lot of time and
|
|
28
|
+
* LLM token costs because it needs the whole process of the
|
|
29
|
+
* {@link Agentica} class with a lot of repetitions. If you don't want
|
|
30
|
+
* such a heavy benchmark, consider using
|
|
31
|
+
* {@link AgenticaSelectBenchmark} instead. In my experience,
|
|
32
|
+
* {@link Agentica} does not fail at function calling, so the function
|
|
33
|
+
* selection benchmark is much more economical.
|
|
34
|
+
*
|
|
35
|
+
* @author Samchon
|
|
36
|
+
*/
|
|
37
|
+
class AgenticaCallBenchmark {
|
|
38
|
+
/**
|
|
39
|
+
* Initializer Constructor.
|
|
40
|
+
*
|
|
41
|
+
* @param props Properties of the function calling benchmark
|
|
42
|
+
*/
|
|
43
|
+
constructor(props) {
|
|
44
|
+
var _a, _b, _c, _d, _e, _f;
|
|
45
|
+
this.agent_ = props.agent;
|
|
46
|
+
this.scenarios_ = props.scenarios.slice();
|
|
47
|
+
this.config_ = {
|
|
48
|
+
repeat: (_b = (_a = props.config) === null || _a === void 0 ? void 0 : _a.repeat) !== null && _b !== void 0 ? _b : 10,
|
|
49
|
+
simultaneous: (_d = (_c = props.config) === null || _c === void 0 ? void 0 : _c.simultaneous) !== null && _d !== void 0 ? _d : 10,
|
|
50
|
+
consent: (_f = (_e = props.config) === null || _e === void 0 ? void 0 : _e.consent) !== null && _f !== void 0 ? _f : 3,
|
|
51
|
+
};
|
|
52
|
+
this.result_ = null;
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* Execute the benchmark.
|
|
56
|
+
*
|
|
57
|
+
* Execute the benchmark of the LLM function calling, and returns
|
|
58
|
+
* the result of the benchmark.
|
|
59
|
+
*
|
|
60
|
+
* If you want to see the progress of the benchmark, you can pass a callback
|
|
61
|
+
* function as the argument of the `listener`. The callback function
|
|
62
|
+
* would be called whenever a benchmark event occurs.
|
|
63
|
+
*
|
|
64
|
+
* Also, you can publish a markdown format report by calling
|
|
65
|
+
* the {@link report} function after the benchmark execution.
|
|
66
|
+
*
|
|
67
|
+
* @param listener Callback function listening to the benchmark events
|
|
68
|
+
* @returns Results of the function calling benchmark
|
|
69
|
+
*/
|
|
70
|
+
execute(listener) {
|
|
71
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
72
|
+
const started_at = new Date();
|
|
73
|
+
const semaphore = new tstl_1.Semaphore(this.config_.simultaneous);
|
|
74
|
+
const experiments = yield Promise.all(this.scenarios_.map((scenario) => __awaiter(this, void 0, void 0, function* () {
|
|
75
|
+
const events = yield Promise.all(new Array(this.config_.repeat).fill(0).map(() => __awaiter(this, void 0, void 0, function* () {
|
|
76
|
+
yield semaphore.acquire();
|
|
77
|
+
const e = yield this.step(scenario);
|
|
78
|
+
yield semaphore.release();
|
|
79
|
+
if (listener !== undefined)
|
|
80
|
+
listener(e);
|
|
81
|
+
return e;
|
|
82
|
+
})));
|
|
83
|
+
return {
|
|
84
|
+
scenario,
|
|
85
|
+
events,
|
|
86
|
+
usage: events
|
|
87
|
+
.filter((e) => e.type !== "error")
|
|
88
|
+
.map((e) => e.usage)
|
|
89
|
+
.reduce(TokenUsageComputer_1.TokenUsageComputer.plus, TokenUsageComputer_1.TokenUsageComputer.zero()),
|
|
90
|
+
};
|
|
91
|
+
})));
|
|
92
|
+
return (this.result_ = {
|
|
93
|
+
experiments,
|
|
94
|
+
started_at,
|
|
95
|
+
completed_at: new Date(),
|
|
96
|
+
usage: experiments
|
|
97
|
+
.map((p) => p.usage)
|
|
98
|
+
.reduce(TokenUsageComputer_1.TokenUsageComputer.plus, TokenUsageComputer_1.TokenUsageComputer.zero()),
|
|
99
|
+
});
|
|
100
|
+
});
|
|
101
|
+
}
|
|
102
|
+
/**
|
|
103
|
+
* Report the benchmark result as markdown files.
|
|
104
|
+
*
|
|
105
|
+
* Report the benchmark result {@link execute}d by
|
|
106
|
+
* `AgenticaCallBenchmark` as markdown files, and returns a dictionary
|
|
107
|
+
* object of the markdown reporting files. The key of the dictionary
|
|
108
|
+
* would be file name, and the value would be the markdown content.
|
|
109
|
+
*
|
|
110
|
+
* For reference, the markdown files are composed like below:
|
|
111
|
+
*
|
|
112
|
+
* - `./README.md`
|
|
113
|
+
* - `./scenario-1/README.md`
|
|
114
|
+
* - `./scenario-1/1.success.md`
|
|
115
|
+
* - `./scenario-1/2.failure.md`
|
|
116
|
+
* - `./scenario-1/3.error.md`
|
|
117
|
+
*
|
|
118
|
+
* @returns Dictionary of markdown files.
|
|
119
|
+
*/
|
|
120
|
+
report() {
|
|
121
|
+
if (this.result_ === null)
|
|
122
|
+
throw new Error("Benchmark is not executed yet.");
|
|
123
|
+
return AgenticaCallBenchmarkReporter_1.AgenticaCallBenchmarkReporter.markdown(this.result_);
|
|
124
|
+
}
|
|
125
|
+
step(scenario) {
|
|
126
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
127
|
+
const agent = this.agent_.clone();
|
|
128
|
+
const started_at = new Date();
|
|
129
|
+
const success = () => AgenticaBenchmarkPredicator_1.AgenticaBenchmarkPredicator.success({
|
|
130
|
+
expected: scenario.expected,
|
|
131
|
+
operations: agent
|
|
132
|
+
.getPromptHistories()
|
|
133
|
+
.filter((p) => p.type === "execute"),
|
|
134
|
+
strict: false,
|
|
135
|
+
});
|
|
136
|
+
const out = () => {
|
|
137
|
+
const select = AgenticaBenchmarkPredicator_1.AgenticaBenchmarkPredicator.success({
|
|
138
|
+
expected: scenario.expected,
|
|
139
|
+
operations: agent
|
|
140
|
+
.getPromptHistories()
|
|
141
|
+
.filter((p) => p.type === "select")
|
|
142
|
+
.map((p) => p.operations)
|
|
143
|
+
.flat(),
|
|
144
|
+
strict: false,
|
|
145
|
+
});
|
|
146
|
+
const call = success();
|
|
147
|
+
return {
|
|
148
|
+
type: (call ? "success" : "failure"),
|
|
149
|
+
scenario,
|
|
150
|
+
select,
|
|
151
|
+
call,
|
|
152
|
+
prompts: agent.getPromptHistories(),
|
|
153
|
+
usage: agent.getTokenUsage(),
|
|
154
|
+
started_at,
|
|
155
|
+
completed_at: new Date(),
|
|
156
|
+
};
|
|
157
|
+
};
|
|
158
|
+
try {
|
|
159
|
+
yield agent.conversate(scenario.text);
|
|
160
|
+
if (success())
|
|
161
|
+
return out();
|
|
162
|
+
for (let i = 0; i < this.config_.consent; ++i) {
|
|
163
|
+
const next = yield AgenticaBenchmarkPredicator_1.AgenticaBenchmarkPredicator.isNext(agent);
|
|
164
|
+
if (next === null)
|
|
165
|
+
break;
|
|
166
|
+
yield agent.conversate(next);
|
|
167
|
+
if (success())
|
|
168
|
+
return out();
|
|
169
|
+
}
|
|
170
|
+
return out();
|
|
171
|
+
}
|
|
172
|
+
catch (error) {
|
|
173
|
+
return {
|
|
174
|
+
type: "error",
|
|
175
|
+
scenario,
|
|
176
|
+
prompts: agent.getPromptHistories(),
|
|
177
|
+
usage: agent.getTokenUsage(),
|
|
178
|
+
error,
|
|
179
|
+
started_at,
|
|
180
|
+
completed_at: new Date(),
|
|
181
|
+
};
|
|
182
|
+
}
|
|
183
|
+
});
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
exports.AgenticaCallBenchmark = AgenticaCallBenchmark;
|
|
187
|
+
//# sourceMappingURL=AgenticaCallBenchmark.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"AgenticaCallBenchmark.js","sourceRoot":"","sources":["../src/AgenticaCallBenchmark.ts"],"names":[],"mappings":";;;;;;;;;;;;AACA,+BAAiC;AAGjC,wFAAqF;AACrF,4FAAyF;AAIzF,mEAAgE;AAEhE;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAa,qBAAqB;IAMhC;;;;OAIG;IACH,YAAmB,KAAmC;;QACpD,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC,KAAK,CAAC;QAC1B,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC,SAAS,CAAC,KAAK,EAAE,CAAC;QAC1C,IAAI,CAAC,OAAO,GAAG;YACb,MAAM,EAAE,MAAA,MAAA,KAAK,CAAC,MAAM,0CAAE,MAAM,mCAAI,EAAE;YAClC,YAAY,EAAE,MAAA,MAAA,KAAK,CAAC,MAAM,0CAAE,YAAY,mCAAI,EAAE;YAC9C,OAAO,EAAE,MAAA,MAAA,KAAK,CAAC,MAAM,0CAAE,OAAO,mCAAI,CAAC;SACpC,CAAC;QACF,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;IACtB,CAAC;IAED;;;;;;;;;;;;;;;OAeG;IACU,OAAO,CAClB,QAAuD;;YAEvD,MAAM,UAAU,GAAS,IAAI,IAAI,EAAE,CAAC;YACpC,MAAM,SAAS,GAAc,IAAI,gBAAS,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;YACtE,MAAM,WAAW,GACf,MAAM,OAAO,CAAC,GAAG,CACf,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAO,QAAQ,EAAE,EAAE;gBACrC,MAAM,MAAM,GAAkC,MAAM,OAAO,CAAC,GAAG,CAC7D,IAAI,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAS,EAAE;oBACpD,MAAM,SAAS,CAAC,OAAO,EAAE,CAAC;oBAC1B,MAAM,CAAC,GAAgC,MAAM,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;oBACjE,MAAM,SAAS,CAAC,OAAO,EAAE,CAAC;oBAC1B,IAAI,QAAQ,KAAK,SAAS;wBAAE,QAAQ,CAAC,CAAC,CAAC,CAAC;oBACxC,OAAO,CAAC,CAAC;gBACX,CAAC,CAAA,CAAC,CACH,CAAC;gBACF,OAAO;oBACL,QAAQ;oBACR,MAAM;oBACN,KAAK,EAAE,MAAM;yBACV,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,OAAO,CAAC;yBACjC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;yBACnB,MAAM,CAAC,uCAAkB,CAAC,IAAI,EAAE,uCAAkB,CAAC,IAAI,EAAE,CAAC;iBAC9D,CAAC;YACJ,CAAC,CAAA,CAAC,CACH,CAAC;YACJ,OAAO,CAAC,IAAI,CAAC,OAAO,GAAG;gBACrB,WAAW;gBACX,UAAU;gBACV,YAAY,EAAE,IAAI,IAAI,EAAE;gBACxB,KAAK,EAAE,WAAW;qBACf,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;qBACnB,MAAM,CAAC,uCAAkB,CAAC,IAAI,EAAE,uCAAkB,CAAC,IAAI,EAAE,CAAC;aAC9D,CAAC,CAAC;QACL,CAAC;KAAA;IAED;;;;;;;;;;;;;;;;;OAiBG;IACI,MAAM;QACX,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI;YACvB,MAAM,IAAI,KAAK,CAAC,gCAAgC,CAAC,CAAC;QACpD,OAAO,6DAA6B,CAA
C,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAC9D,CAAC;IAEa,IAAI,CAChB,QAAwC;;YAExC,MAAM,KAAK,GAAa,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC;YAC5C,MAAM,UAAU,GAAS,IAAI,IAAI,EAAE,CAAC;YACpC,MAAM,OAAO,GAAG,GAAG,EAAE,CACnB,yDAA2B,CAAC,OAAO,CAAC;gBAClC,QAAQ,EAAE,QAAQ,CAAC,QAAQ;gBAC3B,UAAU,EAAE,KAAK;qBACd,kBAAkB,EAAE;qBACpB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,SAAS,CAAC;gBACtC,MAAM,EAAE,KAAK;aACd,CAAC,CAAC;YACL,MAAM,GAAG,GAAG,GAAgC,EAAE;gBAC5C,MAAM,MAAM,GAAG,yDAA2B,CAAC,OAAO,CAAC;oBACjD,QAAQ,EAAE,QAAQ,CAAC,QAAQ;oBAC3B,UAAU,EAAE,KAAK;yBACd,kBAAkB,EAAE;yBACpB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC;yBAClC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC;yBACxB,IAAI,EAAE;oBACT,MAAM,EAAE,KAAK;iBACd,CAAC,CAAC;gBACH,MAAM,IAAI,GAAG,OAAO,EAAE,CAAC;gBACvB,OAAO;oBACL,IAAI,EAAE,CAAC,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,SAAS,CAAc;oBACjD,QAAQ;oBACR,MAAM;oBACN,IAAI;oBACJ,OAAO,EAAE,KAAK,CAAC,kBAAkB,EAAE;oBACnC,KAAK,EAAE,KAAK,CAAC,aAAa,EAAE;oBAC5B,UAAU;oBACV,YAAY,EAAE,IAAI,IAAI,EAAE;iBACsB,CAAC;YACnD,CAAC,CAAC;YAEF,IAAI,CAAC;gBACH,MAAM,KAAK,CAAC,UAAU,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;gBACtC,IAAI,OAAO,EAAE;oBAAE,OAAO,GAAG,EAAE,CAAC;gBAC5B,KAAK,IAAI,CAAC,GAAW,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,EAAE,CAAC;oBACtD,MAAM,IAAI,GACR,MAAM,yDAA2B,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;oBAClD,IAAI,IAAI,KAAK,IAAI;wBAAE,MAAM;oBAEzB,MAAM,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;oBAC7B,IAAI,OAAO,EAAE;wBAAE,OAAO,GAAG,EAAE,CAAC;gBAC9B,CAAC;gBACD,OAAO,GAAG,EAAE,CAAC;YACf,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,OAAO;oBACL,IAAI,EAAE,OAAO;oBACb,QAAQ;oBACR,OAAO,EAAE,KAAK,CAAC,kBAAkB,EAAE;oBACnC,KAAK,EAAE,KAAK,CAAC,aAAa,EAAE;oBAC5B,KAAK;oBACL,UAAU;oBACV,YAAY,EAAE,IAAI,IAAI,EAAE;iBACzB,CAAC;YACJ,CAAC;QACH,CAAC;KAAA;CACF;AA/JD,sDA+JC"}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import { Agentica } from "@agentica/core";
|
|
2
|
+
import { tags } from "typia";
|
|
3
|
+
import { IAgenticaSelectBenchmarkEvent } from "./structures/IAgenticaSelectBenchmarkEvent";
|
|
4
|
+
import { IAgenticaSelectBenchmarkResult } from "./structures/IAgenticaSelectBenchmarkResult";
|
|
5
|
+
import { IAgenticaSelectBenchmarkScenario } from "./structures/IAgenticaSelectBenchmarkScenario";
|
|
6
|
+
/**
|
|
7
|
+
* LLM function calling selection benchmark.
|
|
8
|
+
*
|
|
9
|
+
* `AgenticaSelectBenchmark` is a class for the benchmark of the
|
|
10
|
+
* LLM (Large Language Model) function calling's selection part.
|
|
11
|
+
* It utilizes the `selector` agent and tests whether the expected
|
|
12
|
+
* {@link IAgenticaOperation operations} are properly selected from
|
|
13
|
+
* the given {@link IAgenticaSelectBenchmarkScenario scenarios}.
|
|
14
|
+
*
|
|
15
|
+
* Note that, this `AgenticaSelectBenchmark` class measures only the
|
|
16
|
+
* selection benchmark, testing whether the `selector` agent can select
|
|
17
|
+
* candidate functions to call as expected. Therefore, it does not test
|
|
18
|
+
* the actual function calling, which is done by the `executor` agent.
|
|
19
|
+
* If you want that feature, use {@link AgenticaCallBenchmark} class instead.
|
|
20
|
+
*
|
|
21
|
+
* @author Samchon
|
|
22
|
+
*/
|
|
23
|
+
export declare class AgenticaSelectBenchmark {
|
|
24
|
+
private agent_;
|
|
25
|
+
private scenarios_;
|
|
26
|
+
private config_;
|
|
27
|
+
private histories_;
|
|
28
|
+
private result_;
|
|
29
|
+
/**
|
|
30
|
+
* Initializer Constructor.
|
|
31
|
+
*
|
|
32
|
+
* @param props Properties of the selection benchmark
|
|
33
|
+
*/
|
|
34
|
+
constructor(props: AgenticaSelectBenchmark.IProps);
|
|
35
|
+
/**
|
|
36
|
+
* Execute the benchmark.
|
|
37
|
+
*
|
|
38
|
+
* Execute the benchmark of the LLM function selection, and returns
|
|
39
|
+
* the result of the benchmark.
|
|
40
|
+
*
|
|
41
|
+
* If you want to see the progress of the benchmark, you can pass a callback
|
|
42
|
+
* function as the argument of the `listener`. The callback function
|
|
43
|
+
* would be called whenever a benchmark event occurs.
|
|
44
|
+
*
|
|
45
|
+
* Also, you can publish a markdown format report by calling
|
|
46
|
+
* the {@link report} function after the benchmark execution.
|
|
47
|
+
*
|
|
48
|
+
* @param listener Callback function listening to the benchmark events
|
|
49
|
+
* @returns Results of the function selection benchmark
|
|
50
|
+
*/
|
|
51
|
+
execute(listener?: (event: IAgenticaSelectBenchmarkEvent) => void): Promise<IAgenticaSelectBenchmarkResult>;
|
|
52
|
+
/**
|
|
53
|
+
* Report the benchmark result as markdown files.
|
|
54
|
+
*
|
|
55
|
+
* Report the benchmark result {@link execute}d by
|
|
56
|
+
* `AgenticaSelectBenchmark` as markdown files, and returns a
|
|
57
|
+
* dictionary object of the markdown reporting files. The key of
|
|
58
|
+
* the dictionary would be file name, and the value would be the
|
|
59
|
+
* markdown content.
|
|
60
|
+
*
|
|
61
|
+
* For reference, the markdown files are composed like below:
|
|
62
|
+
*
|
|
63
|
+
* - `./README.md`
|
|
64
|
+
* - `./scenario-1/README.md`
|
|
65
|
+
* - `./scenario-1/1.success.md`
|
|
66
|
+
* - `./scenario-1/2.failure.md`
|
|
67
|
+
* - `./scenario-1/3.error.md`
|
|
68
|
+
*
|
|
69
|
+
* @returns Dictionary of markdown files.
|
|
70
|
+
*/
|
|
71
|
+
report(): Record<string, string>;
|
|
72
|
+
private step;
|
|
73
|
+
}
|
|
74
|
+
export declare namespace AgenticaSelectBenchmark {
|
|
75
|
+
/**
|
|
76
|
+
* Properties of the {@link AgenticaSelectBenchmark} constructor.
|
|
77
|
+
*/
|
|
78
|
+
interface IProps {
|
|
79
|
+
/**
|
|
80
|
+
* AI agent instance.
|
|
81
|
+
*/
|
|
82
|
+
agent: Agentica;
|
|
83
|
+
/**
|
|
84
|
+
* List of scenarios that you expect.
|
|
85
|
+
*/
|
|
86
|
+
scenarios: IAgenticaSelectBenchmarkScenario[];
|
|
87
|
+
/**
|
|
88
|
+
* Configuration for the benchmark.
|
|
89
|
+
*/
|
|
90
|
+
config?: Partial<IConfig>;
|
|
91
|
+
}
|
|
92
|
+
/**
|
|
93
|
+
* Configuration for the benchmark.
|
|
94
|
+
*
|
|
95
|
+
* `AgenticaSelectBenchmark.IConfig` is a data structure which
|
|
96
|
+
* represents a configuration for the benchmark, especially the
|
|
97
|
+
* capacity information of the benchmark execution.
|
|
98
|
+
*/
|
|
99
|
+
interface IConfig {
|
|
100
|
+
/**
|
|
101
|
+
* Repeat count.
|
|
102
|
+
*
|
|
103
|
+
* The number of repetitions of the benchmark execution
|
|
104
|
+
* for each scenario.
|
|
105
|
+
*
|
|
106
|
+
* @default 10
|
|
107
|
+
*/
|
|
108
|
+
repeat: number & tags.Type<"uint32"> & tags.Minimum<1>;
|
|
109
|
+
/**
|
|
110
|
+
* Simultaneous count.
|
|
111
|
+
*
|
|
112
|
+
* The number of simultaneous executions for the parallel benchmark
|
|
113
|
+
* execution.
|
|
114
|
+
*
|
|
115
|
+
* If you configure this property greater than `1`, the benchmark
|
|
116
|
+
* for each scenario would be executed in parallel in the given
|
|
117
|
+
* count.
|
|
118
|
+
*
|
|
119
|
+
* @default 10
|
|
120
|
+
*/
|
|
121
|
+
simultaneous: number & tags.Type<"uint32"> & tags.Minimum<1>;
|
|
122
|
+
}
|
|
123
|
+
}
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
+
});
|
|
10
|
+
};
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.AgenticaSelectBenchmark = void 0;
|
|
13
|
+
const ChatGptSelectFunctionAgent_1 = require("@agentica/core/src/chatgpt/ChatGptSelectFunctionAgent");
|
|
14
|
+
const tstl_1 = require("tstl");
|
|
15
|
+
const AgenticaBenchmarkPredicator_1 = require("./internal/AgenticaBenchmarkPredicator");
|
|
16
|
+
const AgenticaSelectBenchmarkReporter_1 = require("./internal/AgenticaSelectBenchmarkReporter");
|
|
17
|
+
const TokenUsageComputer_1 = require("./utils/TokenUsageComputer");
|
|
18
|
+
/**
|
|
19
|
+
* LLM function calling selection benchmark.
|
|
20
|
+
*
|
|
21
|
+
* `AgenticaSelectBenchmark` is a class for the benchmark of the
|
|
22
|
+
* LLM (Large Language Model) function calling's selection part.
|
|
23
|
+
* It utilizes the `selector` agent and tests whether the expected
|
|
24
|
+
* {@link IAgenticaOperation operations} are properly selected from
|
|
25
|
+
* the given {@link IAgenticaSelectBenchmarkScenario scenarios}.
|
|
26
|
+
*
|
|
27
|
+
* Note that, this `AgenticaSelectBenchmark` class measures only the
|
|
28
|
+
* selection benchmark, testing whether the `selector` agent can select
|
|
29
|
+
* candidate functions to call as expected. Therefore, it does not test
|
|
30
|
+
* the actual function calling, which is done by the `executor` agent.
|
|
31
|
+
* If you want that feature, use {@link AgenticaCallBenchmark} class instead.
|
|
32
|
+
*
|
|
33
|
+
* @author Samchon
|
|
34
|
+
*/
|
|
35
|
+
class AgenticaSelectBenchmark {
    /**
     * Initializer Constructor.
     *
     * Keeps a reference to the agent, takes shallow copies of the scenario
     * list and of the agent's current prompt histories, and resolves the
     * benchmark configuration (both `repeat` and `simultaneous` fall back
     * to 10 when they are `null` or `undefined`).
     *
     * @param props Properties of the selection benchmark
     */
    constructor(props) {
        const config = props.config;
        this.agent_ = props.agent;
        this.scenarios_ = props.scenarios.slice();
        this.config_ = {
            repeat: config != null && config.repeat != null ? config.repeat : 10,
            simultaneous: config != null && config.simultaneous != null
                ? config.simultaneous
                : 10,
        };
        this.histories_ = props.agent.getPromptHistories().slice();
        this.result_ = null;
    }
    /**
     * Execute the benchmark.
     *
     * Runs every scenario `config.repeat` times, with all trials sharing
     * one semaphore sized by `config.simultaneous`, and returns the
     * aggregated result. The result is also stored on the instance so
     * that {@link report} can be called afterwards.
     *
     * If you want to see the progress of the benchmark, pass a callback
     * function as the `listener` argument. It is called with the event of
     * each trial as soon as that trial has finished.
     *
     * Token usage is summed over the events whose type is not `"error"`.
     *
     * @param listener Callback function listening to the benchmark events
     * @returns Results of the function selection benchmark
     */
    execute(listener) {
        return __awaiter(this, void 0, void 0, function* () {
            const started_at = new Date();
            const semaphore = new tstl_1.Semaphore(this.config_.simultaneous);
            const experiments = yield Promise.all(this.scenarios_.map((scenario) => __awaiter(this, void 0, void 0, function* () {
                const events = yield Promise.all(new Array(this.config_.repeat).fill(0).map(() => __awaiter(this, void 0, void 0, function* () {
                    yield semaphore.acquire();
                    let event;
                    try {
                        event = yield this.step(scenario);
                    }
                    finally {
                        // Always give the permit back, even when the trial
                        // rejects; otherwise the remaining trials waiting on
                        // the semaphore could never start.
                        yield semaphore.release();
                    }
                    if (listener !== undefined)
                        listener(event);
                    return event;
                })));
                return {
                    scenario,
                    events,
                    // Error events carry no usage record, so skip them.
                    usage: events
                        .filter((e) => e.type !== "error")
                        .map((e) => e.usage)
                        .reduce(TokenUsageComputer_1.TokenUsageComputer.plus, TokenUsageComputer_1.TokenUsageComputer.zero()),
                };
            })));
            return (this.result_ = {
                experiments,
                started_at,
                completed_at: new Date(),
                usage: experiments
                    .map((p) => p.usage)
                    .reduce(TokenUsageComputer_1.TokenUsageComputer.plus, TokenUsageComputer_1.TokenUsageComputer.zero()),
            });
        });
    }
    /**
     * Report the benchmark result as markdown files.
     *
     * Report the benchmark result {@link execute}d by
     * `AgenticaSelectBenchmark` as markdown files, and returns a
     * dictionary object of the markdown reporting files. The key of
     * the dictionary would be file name, and the value would be the
     * markdown content.
     *
     * For reference, the markdown files are composed like below:
     *
     * - `./README.md`
     * - `./scenario-1/README.md`
     * - `./scenario-1/1.success.md`
     * - `./scenario-1/2.failure.md`
     * - `./scenario-1/3.error.md`
     *
     * @returns Dictionary of markdown files.
     * @throws Error when {@link execute} has not produced a result yet.
     */
    report() {
        if (this.result_ === null)
            throw new Error("Benchmark is not executed yet.");
        return AgenticaSelectBenchmarkReporter_1.AgenticaSelectBenchmarkReporter.markdown(this.result_);
    }
    /**
     * Run a single selection trial for one scenario.
     *
     * Calls the selector agent with the scenario text as a user prompt,
     * a copy of the histories captured at construction time, an empty
     * stack, an always-true `ready` and a no-op `dispatch`. The operations
     * of every `"select"` prompt are gathered and checked against
     * `scenario.expected` by the benchmark predicator.
     *
     * Anything thrown inside the trial is caught and returned as an
     * `"error"` event instead of being propagated.
     *
     * @param scenario Scenario to test
     * @returns A `"success"`, `"failure"` or `"error"` event of the trial
     */
    step(scenario) {
        return __awaiter(this, void 0, void 0, function* () {
            const started_at = new Date();
            try {
                // Fresh counter per trial; it is handed to the agent context
                // and returned with the event.
                const usage = {
                    total: 0,
                    prompt: {
                        total: 0,
                        audio: 0,
                        cached: 0,
                    },
                    completion: {
                        total: 0,
                        accepted_prediction: 0,
                        audio: 0,
                        reasoning: 0,
                        rejected_prediction: 0,
                    },
                };
                const context = this.agent_.getContext({
                    prompt: {
                        type: "text",
                        role: "user",
                        text: scenario.text,
                    },
                    usage,
                });
                const prompts = yield ChatGptSelectFunctionAgent_1.ChatGptSelectFunctionAgent.execute(Object.assign({}, context, {
                    histories: this.histories_.slice(),
                    stack: [],
                    ready: () => true,
                    dispatch: () => __awaiter(this, void 0, void 0, function* () { }),
                }));
                const selected = prompts
                    .filter((p) => p.type === "select")
                    .map((p) => p.operations)
                    .flat();
                const matched = AgenticaBenchmarkPredicator_1.AgenticaBenchmarkPredicator.success({
                    expected: scenario.expected,
                    operations: selected,
                });
                return {
                    type: matched ? "success" : "failure",
                    scenario,
                    selected,
                    usage,
                    assistantPrompts: prompts
                        .filter((p) => p.type === "text")
                        .filter((p) => p.role === "assistant"),
                    started_at,
                    completed_at: new Date(),
                };
            }
            catch (error) {
                return {
                    type: "error",
                    scenario,
                    error,
                    started_at,
                    completed_at: new Date(),
                };
            }
        });
    }
}
|
|
184
|
+
exports.AgenticaSelectBenchmark = AgenticaSelectBenchmark;
|
|
185
|
+
//# sourceMappingURL=AgenticaSelectBenchmark.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"AgenticaSelectBenchmark.js","sourceRoot":"","sources":["../src/AgenticaSelectBenchmark.ts"],"names":[],"mappings":";;;;;;;;;;;;AAOA,sGAAmG;AACnG,+BAAiC;AAGjC,wFAAqF;AACrF,gGAA6F;AAI7F,mEAAgE;AAEhE;;;;;;;;;;;;;;;;GAgBG;AACH,MAAa,uBAAuB;IAOlC;;;;OAIG;IACH,YAAmB,KAAqC;;QACtD,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC,KAAK,CAAC;QAC1B,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC,SAAS,CAAC,KAAK,EAAE,CAAC;QAC1C,IAAI,CAAC,OAAO,GAAG;YACb,MAAM,EAAE,MAAA,MAAA,KAAK,CAAC,MAAM,0CAAE,MAAM,mCAAI,EAAE;YAClC,YAAY,EAAE,MAAA,MAAA,KAAK,CAAC,MAAM,0CAAE,YAAY,mCAAI,EAAE;SAC/C,CAAC;QACF,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC,KAAK,CAAC,kBAAkB,EAAE,CAAC,KAAK,EAAE,CAAC;QAC3D,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;IACtB,CAAC;IAED;;;;;;;;;;;;;;;OAeG;IACU,OAAO,CAClB,QAAyD;;YAEzD,MAAM,UAAU,GAAS,IAAI,IAAI,EAAE,CAAC;YACpC,MAAM,SAAS,GAAc,IAAI,gBAAS,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;YACtE,MAAM,WAAW,GACf,MAAM,OAAO,CAAC,GAAG,CACf,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAO,QAAQ,EAAE,EAAE;gBACrC,MAAM,MAAM,GAAoC,MAAM,OAAO,CAAC,GAAG,CAC/D,IAAI,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAS,EAAE;oBACpD,MAAM,SAAS,CAAC,OAAO,EAAE,CAAC;oBAC1B,MAAM,CAAC,GACL,MAAM,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;oBAC5B,MAAM,SAAS,CAAC,OAAO,EAAE,CAAC;oBAC1B,IAAI,QAAQ,KAAK,SAAS;wBAAE,QAAQ,CAAC,CAAC,CAAC,CAAC;oBACxC,OAAO,CAAC,CAAC;gBACX,CAAC,CAAA,CAAC,CACH,CAAC;gBACF,OAAO;oBACL,QAAQ;oBACR,MAAM;oBACN,KAAK,EAAE,MAAM;yBACV,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,OAAO,CAAC;yBACjC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;yBACnB,MAAM,CAAC,uCAAkB,CAAC,IAAI,EAAE,uCAAkB,CAAC,IAAI,EAAE,CAAC;iBAC9D,CAAC;YACJ,CAAC,CAAA,CAAC,CACH,CAAC;YACJ,OAAO,CAAC,IAAI,CAAC,OAAO,GAAG;gBACrB,WAAW;gBACX,UAAU;gBACV,YAAY,EAAE,IAAI,IAAI,EAAE;gBACxB,KAAK,EAAE,WAAW;qBACf,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;qBACnB,MAAM,CAAC,uCAAkB,CAAC,IAAI,EAAE,uCAAkB,CAAC,IAAI,EAAE,CAAC;aAC9D,CAAC,CAAC;QACL,CAAC;KAAA;IAED;;;;;;;;;;;;;;;;;;OAkBG;IACI,MAAM;QACX,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI;YACvB,MAAM,IAAI,KAAK,CAAC,gCAAgC,CA
AC,CAAC;QACpD,OAAO,iEAA+B,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAChE,CAAC;IAEa,IAAI,CAChB,QAA0C;;YAE1C,MAAM,UAAU,GAAS,IAAI,IAAI,EAAE,CAAC;YACpC,IAAI,CAAC;gBACH,MAAM,KAAK,GAAwB;oBACjC,KAAK,EAAE,CAAC;oBACR,MAAM,EAAE;wBACN,KAAK,EAAE,CAAC;wBACR,KAAK,EAAE,CAAC;wBACR,MAAM,EAAE,CAAC;qBACV;oBACD,UAAU,EAAE;wBACV,KAAK,EAAE,CAAC;wBACR,mBAAmB,EAAE,CAAC;wBACtB,KAAK,EAAE,CAAC;wBACR,SAAS,EAAE,CAAC;wBACZ,mBAAmB,EAAE,CAAC;qBACvB;iBACF,CAAC;gBACF,MAAM,OAAO,GACX,MAAM,uDAA0B,CAAC,OAAO,CAAC,gCACpC,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC;oBACxB,MAAM,EAAE;wBACN,IAAI,EAAE,MAAM;wBACZ,IAAI,EAAE,MAAM;wBACZ,IAAI,EAAE,QAAQ,CAAC,IAAI;qBACpB;oBACD,KAAK;iBACN,CAAC,KACF,SAAS,EAAE,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE,EAClC,KAAK,EAAE,EAAE,EACT,KAAK,EAAE,GAAG,EAAE,CAAC,IAAI,EACjB,QAAQ,EAAE,GAAS,EAAE,gDAAE,CAAC,CAAA,GACE,CAAC,CAAC;gBAChC,MAAM,QAAQ,GAAkC,OAAO;qBACpD,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC;qBAClC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC;qBACxB,IAAI,EAAE,CAAC;gBACV,OAAO;oBACL,IAAI,EAAE,yDAA2B,CAAC,OAAO,CAAC;wBACxC,QAAQ,EAAE,QAAQ,CAAC,QAAQ;wBAC3B,UAAU,EAAE,QAAQ;qBACrB,CAAC;wBACA,CAAC,CAAC,SAAS;wBACX,CAAC,CAAC,SAAS;oBACb,QAAQ;oBACR,QAAQ;oBACR,KAAK;oBACL,gBAAgB,EAAE,OAAO;yBACtB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,MAAM,CAAC;yBAChC,MAAM,CACL,CAAC,CAAC,EAA2C,EAAE,CAC7C,CAAC,CAAC,IAAI,KAAK,WAAW,CACzB;oBACH,UAAU;oBACV,YAAY,EAAE,IAAI,IAAI,EAAE;iBAGgB,CAAC;YAC7C,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,OAAO;oBACL,IAAI,EAAE,OAAO;oBACb,QAAQ;oBACR,KAAK;oBACL,UAAU;oBACV,YAAY,EAAE,IAAI,IAAI,EAAE;iBACsB,CAAC;YACnD,CAAC;QACH,CAAC;KAAA;CACF;AA5KD,0DA4KC"}
|
package/lib/index.d.ts
ADDED