llmtester 1.0.7 → 1.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/config.d.ts +17 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +58 -0
- package/dist/config.js.map +1 -0
- package/dist/index.d.ts +1 -1
- package/dist/index.js +12 -560
- package/dist/index.js.map +1 -1
- package/dist/menu.d.ts +17 -0
- package/dist/menu.d.ts.map +1 -0
- package/dist/menu.js +166 -0
- package/dist/menu.js.map +1 -0
- package/dist/runner.d.ts +28 -0
- package/dist/runner.d.ts.map +1 -0
- package/dist/runner.js +359 -0
- package/dist/runner.js.map +1 -0
- package/package.json +1 -1
package/dist/config.d.ts
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { ProviderType } from './client.js';
|
|
2
|
+
export interface Config {
|
|
3
|
+
provider: ProviderType;
|
|
4
|
+
mode?: 'openai' | 'anthropic';
|
|
5
|
+
apiKey: string;
|
|
6
|
+
baseUrl: string;
|
|
7
|
+
modelName: string;
|
|
8
|
+
judgeProvider?: ProviderType;
|
|
9
|
+
judgeApiKey?: string;
|
|
10
|
+
judgeBaseUrl?: string;
|
|
11
|
+
judgeModelName?: string;
|
|
12
|
+
}
|
|
13
|
+
export declare function getConfigPath(): string;
|
|
14
|
+
export declare function loadConfig(): Promise<Partial<Config> | null>;
|
|
15
|
+
export declare function saveConfig(config: Config): Promise<void>;
|
|
16
|
+
export declare function prompt(message: string): Promise<string>;
|
|
17
|
+
//# sourceMappingURL=config.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"config.d.ts","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":"AAKA,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAE3C,MAAM,WAAW,MAAM;IACrB,QAAQ,EAAE,YAAY,CAAC;IACvB,IAAI,CAAC,EAAE,QAAQ,GAAG,WAAW,CAAC;IAC9B,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,CAAC,EAAE,YAAY,CAAC;IAC7B,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,wBAAgB,aAAa,IAAI,MAAM,CAEtC;AAED,wBAAsB,UAAU,IAAI,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,CAUlE;AAED,wBAAsB,UAAU,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAS9D;AAED,wBAAsB,MAAM,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAmB7D"}
|
package/dist/config.js
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.getConfigPath = getConfigPath;
|
|
7
|
+
exports.loadConfig = loadConfig;
|
|
8
|
+
exports.saveConfig = saveConfig;
|
|
9
|
+
exports.prompt = prompt;
|
|
10
|
+
const chalk_1 = __importDefault(require("chalk"));
|
|
11
|
+
const fs_extra_1 = __importDefault(require("fs-extra"));
|
|
12
|
+
const path_1 = __importDefault(require("path"));
|
|
13
|
+
const readline_1 = __importDefault(require("readline"));
|
|
14
|
+
const paths_js_1 = require("./paths.js");
|
|
15
|
+
function getConfigPath() {
|
|
16
|
+
return path_1.default.join((0, paths_js_1.getConfigDir)(), 'config.json');
|
|
17
|
+
}
|
|
18
|
+
async function loadConfig() {
|
|
19
|
+
const configPath = getConfigPath();
|
|
20
|
+
try {
|
|
21
|
+
if (await fs_extra_1.default.pathExists(configPath)) {
|
|
22
|
+
return await fs_extra_1.default.readJson(configPath);
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
catch (e) {
|
|
26
|
+
console.log(chalk_1.default.yellow(`Warning: Could not load config: ${e}`));
|
|
27
|
+
}
|
|
28
|
+
return null;
|
|
29
|
+
}
|
|
30
|
+
async function saveConfig(config) {
|
|
31
|
+
const configPath = getConfigPath();
|
|
32
|
+
try {
|
|
33
|
+
await fs_extra_1.default.ensureDir(path_1.default.dirname(configPath));
|
|
34
|
+
await fs_extra_1.default.writeJson(configPath, config, { spaces: 2 });
|
|
35
|
+
console.log(chalk_1.default.green(`Config saved to: ${configPath}`));
|
|
36
|
+
}
|
|
37
|
+
catch (e) {
|
|
38
|
+
console.log(chalk_1.default.yellow(`Warning: Could not save config: ${e}`));
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
async function prompt(message) {
|
|
42
|
+
return new Promise((resolve) => {
|
|
43
|
+
process.stdin.removeAllListeners('keypress');
|
|
44
|
+
process.stdin.setRawMode?.(false);
|
|
45
|
+
const rl = readline_1.default.createInterface({
|
|
46
|
+
input: process.stdin,
|
|
47
|
+
output: process.stdout,
|
|
48
|
+
});
|
|
49
|
+
rl.question(message, (answer) => {
|
|
50
|
+
rl.close();
|
|
51
|
+
resolve(answer);
|
|
52
|
+
});
|
|
53
|
+
if (process.stdin.isTTY) {
|
|
54
|
+
process.stdin.setRawMode(false);
|
|
55
|
+
}
|
|
56
|
+
});
|
|
57
|
+
}
|
|
58
|
+
//# sourceMappingURL=config.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"config.js","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":";;;;;AAmBA,sCAEC;AAED,gCAUC;AAED,gCASC;AAED,wBAmBC;AAjED,kDAA0B;AAC1B,wDAA0B;AAC1B,gDAAwB;AACxB,wDAAgC;AAChC,yCAA0C;AAe1C,SAAgB,aAAa;IAC3B,OAAO,cAAI,CAAC,IAAI,CAAC,IAAA,uBAAY,GAAE,EAAE,aAAa,CAAC,CAAC;AAClD,CAAC;AAEM,KAAK,UAAU,UAAU;IAC9B,MAAM,UAAU,GAAG,aAAa,EAAE,CAAC;IACnC,IAAI,CAAC;QACH,IAAI,MAAM,kBAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;YACpC,OAAO,MAAM,kBAAE,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC;QACvC,CAAC;IACH,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,OAAO,CAAC,GAAG,CAAC,eAAK,CAAC,MAAM,CAAC,mCAAmC,CAAC,EAAE,CAAC,CAAC,CAAC;IACpE,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAEM,KAAK,UAAU,UAAU,CAAC,MAAc;IAC7C,MAAM,UAAU,GAAG,aAAa,EAAE,CAAC;IACnC,IAAI,CAAC;QACH,MAAM,kBAAE,CAAC,SAAS,CAAC,cAAI,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC;QAC7C,MAAM,kBAAE,CAAC,SAAS,CAAC,UAAU,EAAE,MAAM,EAAE,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC,CAAC;QACtD,OAAO,CAAC,GAAG,CAAC,eAAK,CAAC,KAAK,CAAC,oBAAoB,UAAU,EAAE,CAAC,CAAC,CAAC;IAC7D,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,OAAO,CAAC,GAAG,CAAC,eAAK,CAAC,MAAM,CAAC,mCAAmC,CAAC,EAAE,CAAC,CAAC,CAAC;IACpE,CAAC;AACH,CAAC;AAEM,KAAK,UAAU,MAAM,CAAC,OAAe;IAC1C,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;QAC7B,OAAO,CAAC,KAAK,CAAC,kBAAkB,CAAC,UAAU,CAAC,CAAC;QAC7C,OAAO,CAAC,KAAK,CAAC,UAAU,EAAE,CAAC,KAAK,CAAC,CAAC;QAElC,MAAM,EAAE,GAAG,kBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;SACvB,CAAC,CAAC;QAEH,EAAE,CAAC,QAAQ,CAAC,OAAO,EAAE,CAAC,MAAc,EAAE,EAAE;YACtC,EAAE,CAAC,KAAK,EAAE,CAAC;YACX,OAAO,CAAC,MAAM,CAAC,CAAC;QAClB,CAAC,CAAC,CAAC;QAEH,IAAI,OAAO,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;YACxB,OAAO,CAAC,KAAK,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC;QAClC,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC"}
|
package/dist/index.d.ts
CHANGED
package/dist/index.js
CHANGED
|
@@ -1,572 +1,24 @@
|
|
|
1
|
-
#!/usr/bin
|
|
1
|
+
#!/usr/bin//env node
|
|
2
2
|
"use strict";
|
|
3
3
|
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
4
4
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
5
5
|
};
|
|
6
6
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
-
const dotenv_1 = __importDefault(require("dotenv"));
|
|
8
|
-
const chalk_1 = __importDefault(require("chalk"));
|
|
9
|
-
const fs_extra_1 = __importDefault(require("fs-extra"));
|
|
10
|
-
const path_1 = __importDefault(require("path"));
|
|
11
|
-
const readline_1 = __importDefault(require("readline"));
|
|
12
7
|
const child_process_1 = require("child_process");
|
|
13
|
-
const
|
|
14
|
-
const
|
|
15
|
-
const
|
|
16
|
-
|
|
17
|
-
const logger_js_1 = require("./logger.js");
|
|
18
|
-
const paths_js_1 = require("./paths.js");
|
|
19
|
-
dotenv_1.default.config();
|
|
20
|
-
function getConfigPath() {
|
|
21
|
-
return path_1.default.join((0, paths_js_1.getConfigDir)(), 'config.json');
|
|
22
|
-
}
|
|
23
|
-
async function loadConfig() {
|
|
24
|
-
const configPath = getConfigPath();
|
|
25
|
-
try {
|
|
26
|
-
if (await fs_extra_1.default.pathExists(configPath)) {
|
|
27
|
-
return await fs_extra_1.default.readJson(configPath);
|
|
28
|
-
}
|
|
29
|
-
}
|
|
30
|
-
catch (e) {
|
|
31
|
-
console.log(chalk_1.default.yellow(`Warning: Could not load config: ${e}`));
|
|
32
|
-
}
|
|
33
|
-
return null;
|
|
34
|
-
}
|
|
35
|
-
async function saveConfig(config) {
|
|
36
|
-
const configPath = getConfigPath();
|
|
37
|
-
try {
|
|
38
|
-
await fs_extra_1.default.ensureDir(path_1.default.dirname(configPath));
|
|
39
|
-
await fs_extra_1.default.writeJson(configPath, config, { spaces: 2 });
|
|
40
|
-
console.log(chalk_1.default.green(`Config saved to: ${configPath}`));
|
|
41
|
-
}
|
|
42
|
-
catch (e) {
|
|
43
|
-
console.log(chalk_1.default.yellow(`Warning: Could not save config: ${e}`));
|
|
44
|
-
}
|
|
45
|
-
}
|
|
46
|
-
const PROVIDERS = [
|
|
47
|
-
{ id: 'openai', name: 'OpenAI', defaultUrl: 'https://api.openai.com/v1' },
|
|
48
|
-
{ id: 'anthropic', name: 'Anthropic', defaultUrl: 'https://api.anthropic.com' },
|
|
49
|
-
{ id: 'custom', name: 'Custom (OpenAI-compatible)', defaultUrl: '' },
|
|
50
|
-
];
|
|
51
|
-
const COMMON_OPENAI_ENDPOINTS = [
|
|
52
|
-
{ name: 'OpenAI', url: 'https://api.openai.com/v1' },
|
|
53
|
-
{ name: 'Together.ai', url: 'https://api.together.xyz/v1' },
|
|
54
|
-
{ name: 'Groq', url: 'https://api.groq.com/openai/v1' },
|
|
55
|
-
{ name: 'Fireworks AI', url: 'https://api.fireworks.ai/inference/v1' },
|
|
56
|
-
{ name: 'Perplexity', url: 'https://api.perplexity.ai' },
|
|
57
|
-
{ name: 'OpenRouter', url: 'https://openrouter.ai/api/v1' },
|
|
58
|
-
{ name: 'Custom URL', url: '' },
|
|
59
|
-
];
|
|
60
|
-
async function prompt(message) {
|
|
61
|
-
return new Promise((resolve) => {
|
|
62
|
-
process.stdin.removeAllListeners('keypress');
|
|
63
|
-
process.stdin.setRawMode?.(false);
|
|
64
|
-
const rl = readline_1.default.createInterface({
|
|
65
|
-
input: process.stdin,
|
|
66
|
-
output: process.stdout,
|
|
67
|
-
});
|
|
68
|
-
rl.question(message, (answer) => {
|
|
69
|
-
rl.close();
|
|
70
|
-
resolve(answer);
|
|
71
|
-
});
|
|
72
|
-
if (process.stdin.isTTY) {
|
|
73
|
-
process.stdin.setRawMode(false);
|
|
74
|
-
}
|
|
75
|
-
});
|
|
76
|
-
}
|
|
77
|
-
async function selectOption(items, message) {
|
|
78
|
-
// Remove all existing keypress listeners first
|
|
79
|
-
process.stdin.removeAllListeners('keypress');
|
|
80
|
-
readline_1.default.emitKeypressEvents(process.stdin);
|
|
81
|
-
if (process.stdin.isTTY) {
|
|
82
|
-
process.stdin.setRawMode(true);
|
|
83
|
-
}
|
|
84
|
-
let cursor = 0;
|
|
85
|
-
const render = () => {
|
|
86
|
-
process.stdout.write('\x1b[H\x1b[2J');
|
|
87
|
-
console.log(chalk_1.default.cyan(`${message}\n`));
|
|
88
|
-
items.forEach((item, i) => {
|
|
89
|
-
const prefix = i === cursor ? chalk_1.default.cyan('> ') : ' ';
|
|
90
|
-
console.log(`${prefix}${item.name}`);
|
|
91
|
-
});
|
|
92
|
-
console.log(chalk_1.default.gray('\nUse arrow keys, Enter to select'));
|
|
93
|
-
};
|
|
94
|
-
render();
|
|
95
|
-
return new Promise((resolve) => {
|
|
96
|
-
const handleKeypress = (str, key) => {
|
|
97
|
-
if (key.ctrl && key.name === 'c') {
|
|
98
|
-
if (process.stdin.isTTY)
|
|
99
|
-
process.stdin.setRawMode(false);
|
|
100
|
-
process.exit(0);
|
|
101
|
-
}
|
|
102
|
-
if (key.name === 'up') {
|
|
103
|
-
cursor = Math.max(0, cursor - 1);
|
|
104
|
-
render();
|
|
105
|
-
}
|
|
106
|
-
else if (key.name === 'down') {
|
|
107
|
-
cursor = Math.min(items.length - 1, cursor + 1);
|
|
108
|
-
render();
|
|
109
|
-
}
|
|
110
|
-
else if (key.name === 'return') {
|
|
111
|
-
process.stdin.removeListener('keypress', handleKeypress);
|
|
112
|
-
if (process.stdin.isTTY)
|
|
113
|
-
process.stdin.setRawMode(false);
|
|
114
|
-
resolve(items[cursor]);
|
|
115
|
-
}
|
|
116
|
-
};
|
|
117
|
-
process.stdin.on('keypress', handleKeypress);
|
|
118
|
-
});
|
|
119
|
-
}
|
|
120
|
-
async function selectMulti(items, message, formatItem) {
|
|
121
|
-
// Remove all existing keypress listeners first
|
|
122
|
-
process.stdin.removeAllListeners('keypress');
|
|
123
|
-
readline_1.default.emitKeypressEvents(process.stdin);
|
|
124
|
-
if (process.stdin.isTTY) {
|
|
125
|
-
process.stdin.setRawMode(true);
|
|
126
|
-
}
|
|
127
|
-
process.stdout.write('\x1b[H\x1b[2J');
|
|
128
|
-
console.log(chalk_1.default.cyan(`\n${message}`));
|
|
129
|
-
console.log(chalk_1.default.gray('Space to toggle, Enter to confirm\n'));
|
|
130
|
-
const selected = [];
|
|
131
|
-
let cursor = 0;
|
|
132
|
-
const render = () => {
|
|
133
|
-
process.stdout.write('\x1b[H\x1b[2J');
|
|
134
|
-
console.log(chalk_1.default.cyan(`${message}\n`));
|
|
135
|
-
items.forEach((item, i) => {
|
|
136
|
-
const mark = selected.includes(item) ? chalk_1.default.green('[x]') : '[ ]';
|
|
137
|
-
const prefix = i === cursor ? chalk_1.default.cyan('> ') : ' ';
|
|
138
|
-
const extra = formatItem ? ` ${chalk_1.default.gray(formatItem(item))}` : '';
|
|
139
|
-
console.log(`${prefix}${mark} ${item.name}${extra}`);
|
|
140
|
-
});
|
|
141
|
-
console.log(chalk_1.default.gray('\nSpace to toggle, Enter to confirm'));
|
|
142
|
-
};
|
|
143
|
-
render();
|
|
144
|
-
return new Promise((resolve) => {
|
|
145
|
-
const handleKeypress = (str, key) => {
|
|
146
|
-
if (key.ctrl && key.name === 'c') {
|
|
147
|
-
if (process.stdin.isTTY)
|
|
148
|
-
process.stdin.setRawMode(false);
|
|
149
|
-
process.exit(0);
|
|
150
|
-
}
|
|
151
|
-
if (key.name === 'up') {
|
|
152
|
-
cursor = Math.max(0, cursor - 1);
|
|
153
|
-
render();
|
|
154
|
-
}
|
|
155
|
-
else if (key.name === 'down') {
|
|
156
|
-
cursor = Math.min(items.length - 1, cursor + 1);
|
|
157
|
-
render();
|
|
158
|
-
}
|
|
159
|
-
else if (key.name === 'space' || str === ' ') {
|
|
160
|
-
const item = items[cursor];
|
|
161
|
-
if (selected.includes(item)) {
|
|
162
|
-
selected.splice(selected.indexOf(item), 1);
|
|
163
|
-
}
|
|
164
|
-
else {
|
|
165
|
-
selected.push(item);
|
|
166
|
-
}
|
|
167
|
-
render();
|
|
168
|
-
}
|
|
169
|
-
else if (key.name === 'return') {
|
|
170
|
-
process.stdin.removeListener('keypress', handleKeypress);
|
|
171
|
-
if (process.stdin.isTTY)
|
|
172
|
-
process.stdin.setRawMode(false);
|
|
173
|
-
resolve(selected);
|
|
174
|
-
}
|
|
175
|
-
};
|
|
176
|
-
process.stdin.on('keypress', handleKeypress);
|
|
177
|
-
});
|
|
178
|
-
}
|
|
179
|
-
async function selectProvider() {
|
|
180
|
-
return selectOption(PROVIDERS, 'Select your provider');
|
|
181
|
-
}
|
|
182
|
-
async function selectProviderWithPrompt(message) {
|
|
183
|
-
console.log(chalk_1.default.cyan('\n' + message));
|
|
184
|
-
PROVIDERS.forEach((p, i) => { console.log(String(i + 1) + '. ' + p.name); });
|
|
185
|
-
const providerIds = ['openai', 'anthropic', 'custom'];
|
|
8
|
+
const path_1 = __importDefault(require("path"));
|
|
9
|
+
const runner_js_1 = require("./runner.js");
|
|
10
|
+
const menu_js_1 = require("./menu.js");
|
|
11
|
+
async function main() {
|
|
186
12
|
while (true) {
|
|
187
|
-
const
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
if (num >= 1 && num <= PROVIDERS.length) {
|
|
191
|
-
return PROVIDERS[num - 1];
|
|
192
|
-
}
|
|
193
|
-
if (providerIds.includes(trimmed)) {
|
|
194
|
-
return PROVIDERS.find(p => p.id === trimmed);
|
|
195
|
-
}
|
|
196
|
-
console.log(chalk_1.default.yellow('Invalid selection. Please try again.'));
|
|
197
|
-
}
|
|
198
|
-
}
|
|
199
|
-
async function selectEndpoint(provider) {
|
|
200
|
-
if (provider === 'anthropic') {
|
|
201
|
-
return 'https://api.anthropic.com';
|
|
202
|
-
}
|
|
203
|
-
if (provider === 'custom' || provider === 'openai') {
|
|
204
|
-
const endpoints = COMMON_OPENAI_ENDPOINTS.map(e => ({ id: e.url || 'custom', name: e.name, url: e.url }));
|
|
205
|
-
const selection = await selectOption(endpoints, 'Select or enter endpoint');
|
|
206
|
-
if (selection.url === '') {
|
|
207
|
-
return (await prompt('Enter custom endpoint URL: ')).trim();
|
|
13
|
+
const choice = await (0, menu_js_1.showMainMenu)();
|
|
14
|
+
if (choice === 0) {
|
|
15
|
+
await (0, runner_js_1.runBenchmarks)();
|
|
208
16
|
}
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
}
|
|
213
|
-
async function selectEndpointWithPrompt(provider) {
|
|
214
|
-
if (provider === 'anthropic') {
|
|
215
|
-
return 'https://api.anthropic.com';
|
|
216
|
-
}
|
|
217
|
-
if (provider === 'custom') {
|
|
218
|
-
const answer = await prompt('Enter custom endpoint URL: ');
|
|
219
|
-
return answer.trim();
|
|
220
|
-
}
|
|
221
|
-
return PROVIDERS.find(p => p.id === provider)?.defaultUrl || '';
|
|
222
|
-
}
|
|
223
|
-
async function getConfig() {
|
|
224
|
-
const envProvider = process.env.LLM_PROVIDER || '';
|
|
225
|
-
const envApiKey = process.env.LLM_API_KEY || process.env.OPENAI_API_KEY || '';
|
|
226
|
-
const envBaseUrl = process.env.LLM_BASE_URL || process.env.OPENAI_BASE_URL || '';
|
|
227
|
-
const envModelName = process.env.LLM_MODEL || process.env.MODEL_NAME || '';
|
|
228
|
-
// Load saved config
|
|
229
|
-
const savedConfig = await loadConfig();
|
|
230
|
-
let provider;
|
|
231
|
-
let providerMode = savedConfig?.mode || 'openai';
|
|
232
|
-
let providerInfo;
|
|
233
|
-
if (envProvider && PROVIDERS.some(p => p.id === envProvider)) {
|
|
234
|
-
provider = envProvider;
|
|
235
|
-
providerInfo = PROVIDERS.find(p => p.id === provider);
|
|
236
|
-
console.log(chalk_1.default.green(`Provider: ${providerInfo.name} (from env)`));
|
|
237
|
-
}
|
|
238
|
-
else if (savedConfig?.provider) {
|
|
239
|
-
provider = savedConfig.provider;
|
|
240
|
-
providerInfo = PROVIDERS.find(p => p.id === provider);
|
|
241
|
-
console.log(chalk_1.default.green(`Provider: ${providerInfo.name} (from config)`));
|
|
242
|
-
}
|
|
243
|
-
else {
|
|
244
|
-
providerInfo = await selectProvider();
|
|
245
|
-
provider = providerInfo.id;
|
|
246
|
-
}
|
|
247
|
-
// Ask for mode when custom provider
|
|
248
|
-
if (provider === 'custom') {
|
|
249
|
-
const envMode = process.env.LLM_MODE || '';
|
|
250
|
-
if (envMode === 'anthropic') {
|
|
251
|
-
providerMode = 'anthropic';
|
|
17
|
+
else if (choice === 1) {
|
|
18
|
+
const tuiPath = path_1.default.join(__dirname, '..', 'bin', 'tui.js');
|
|
19
|
+
(0, child_process_1.execFileSync)('node', [tuiPath], { stdio: 'inherit' });
|
|
252
20
|
}
|
|
253
|
-
else if (savedConfig?.mode) {
|
|
254
|
-
providerMode = savedConfig.mode;
|
|
255
|
-
}
|
|
256
|
-
else {
|
|
257
|
-
console.log(chalk_1.default.cyan(''));
|
|
258
|
-
const modeAnswer = (await prompt('API mode - openai (OpenAI-compatible) or anthropic (Anthropic-compatible)? (o/a): ')).trim().toLowerCase();
|
|
259
|
-
providerMode = modeAnswer === 'a' ? 'anthropic' : 'openai';
|
|
260
|
-
}
|
|
261
|
-
console.log(chalk_1.default.green('Mode: ' + providerMode));
|
|
262
|
-
}
|
|
263
|
-
let endpoint = envBaseUrl || savedConfig?.baseUrl || '';
|
|
264
|
-
if (!endpoint) {
|
|
265
|
-
endpoint = await selectEndpoint(provider);
|
|
266
|
-
}
|
|
267
|
-
else {
|
|
268
|
-
console.log(chalk_1.default.green(`Endpoint: ${endpoint}`));
|
|
269
|
-
}
|
|
270
|
-
let key = envApiKey || savedConfig?.apiKey || '';
|
|
271
|
-
if (!key) {
|
|
272
|
-
const keyPrompt = provider === 'anthropic'
|
|
273
|
-
? 'Enter your Anthropic API key: '
|
|
274
|
-
: 'Enter your API key: ';
|
|
275
|
-
key = (await prompt(keyPrompt)).trim();
|
|
276
|
-
}
|
|
277
|
-
else {
|
|
278
|
-
const maskedKey = key.length > 8 ? `${key.slice(0, 4)}...${key.slice(-4)}` : '***';
|
|
279
|
-
console.log(chalk_1.default.green(`API key: ${maskedKey}`));
|
|
280
|
-
}
|
|
281
|
-
let model = envModelName || savedConfig?.modelName || '';
|
|
282
|
-
if (!model) {
|
|
283
|
-
model = (await prompt('Enter model name (e.g., gpt-4o, claude-3-opus): ')).trim();
|
|
284
|
-
}
|
|
285
|
-
else {
|
|
286
|
-
console.log(chalk_1.default.green(`Model: ${model}`));
|
|
287
|
-
}
|
|
288
|
-
const config = { provider, apiKey: key, baseUrl: endpoint, modelName: model };
|
|
289
|
-
if (provider === 'custom' && providerMode)
|
|
290
|
-
config.mode = providerMode;
|
|
291
|
-
// Preserve existing judge config when saving
|
|
292
|
-
if (savedConfig) {
|
|
293
|
-
if (savedConfig.judgeProvider)
|
|
294
|
-
config.judgeProvider = savedConfig.judgeProvider;
|
|
295
|
-
if (savedConfig.judgeApiKey)
|
|
296
|
-
config.judgeApiKey = savedConfig.judgeApiKey;
|
|
297
|
-
if (savedConfig.judgeBaseUrl)
|
|
298
|
-
config.judgeBaseUrl = savedConfig.judgeBaseUrl;
|
|
299
|
-
if (savedConfig.judgeModelName)
|
|
300
|
-
config.judgeModelName = savedConfig.judgeModelName;
|
|
301
|
-
}
|
|
302
|
-
// Save config if not from env
|
|
303
|
-
if (!envProvider && !envApiKey && !envModelName) {
|
|
304
|
-
await saveConfig(config);
|
|
305
|
-
}
|
|
306
|
-
return config;
|
|
307
|
-
}
|
|
308
|
-
async function selectBenchmarks() {
|
|
309
|
-
const available = Object.values(benchmarks_js_1.BENCHMARK_DEFINITIONS);
|
|
310
|
-
const selection = await selectMulti(available, 'Select benchmarks to run', (bench) => `(${bench.defaultSamples.toLocaleString()} tests) - ${bench.description}`);
|
|
311
|
-
if (selection.length === 0) {
|
|
312
|
-
console.log(chalk_1.default.yellow('No benchmarks selected, exiting'));
|
|
313
|
-
process.exit(0);
|
|
314
|
-
}
|
|
315
|
-
console.log(chalk_1.default.green(`\nSelected ${selection.length} benchmark(s)\n`));
|
|
316
|
-
const benchmarks = [];
|
|
317
|
-
for (const bench of selection) {
|
|
318
|
-
const pctStr = await prompt(`${bench.name}: Enter % to run (1-100, default 100): `);
|
|
319
|
-
const pct = parseInt(pctStr) || 100;
|
|
320
|
-
benchmarks.push({ ...bench, percentage: Math.min(100, Math.max(1, pct)) });
|
|
321
|
-
}
|
|
322
|
-
return benchmarks;
|
|
323
|
-
}
|
|
324
|
-
async function runBenchmarks() {
|
|
325
|
-
const config = await getConfig();
|
|
326
|
-
const client = (0, client_js_1.createLLMClient)(config.provider, config.apiKey, config.baseUrl, config.modelName, config.mode);
|
|
327
|
-
const benchmarks = await selectBenchmarks();
|
|
328
|
-
// Ask about shuffling
|
|
329
|
-
const shuffleAnswer = (await prompt('Shuffle samples for diverse distribution? (Y/n): ')).trim().toLowerCase();
|
|
330
|
-
const shouldShuffle = shuffleAnswer !== 'n';
|
|
331
|
-
benchmarks.forEach((b) => b.shuffle = shouldShuffle);
|
|
332
|
-
// Check if any selected benchmark supports judge
|
|
333
|
-
const judgeBenchmarks = benchmarks.filter(b => b.useJudge);
|
|
334
|
-
let useJudge = false;
|
|
335
|
-
let judgeClient;
|
|
336
|
-
let judgeProvider = config.judgeProvider || 'openai';
|
|
337
|
-
let judgeBaseUrl = config.judgeBaseUrl || '';
|
|
338
|
-
let judgeModel = config.judgeModelName || '';
|
|
339
|
-
if (judgeBenchmarks.length > 0) {
|
|
340
|
-
const hasCritical = judgeBenchmarks.some(b => b.id === 'truthfulqa' || b.id === 'spider' || b.id === 'math');
|
|
341
|
-
const label = hasCritical ? '(highly recommended)' : '(recommended)';
|
|
342
|
-
const judgeAnswer = (await prompt(`Use judge for evaluation? (y/N) - ${label}: `)).trim().toLowerCase();
|
|
343
|
-
useJudge = judgeAnswer === 'y';
|
|
344
|
-
if (useJudge) {
|
|
345
|
-
const useJudgeEnv = process.env.JUDGE_PROVIDER || config.judgeProvider || '';
|
|
346
|
-
if (useJudgeEnv && PROVIDERS.some(p => p.id === useJudgeEnv)) {
|
|
347
|
-
judgeProvider = useJudgeEnv;
|
|
348
|
-
}
|
|
349
|
-
else {
|
|
350
|
-
const providerInfo = await selectProviderWithPrompt('Select judge provider:');
|
|
351
|
-
judgeProvider = providerInfo.id;
|
|
352
|
-
}
|
|
353
|
-
judgeBaseUrl = process.env.JUDGE_BASE_URL || config.judgeBaseUrl || '';
|
|
354
|
-
if (!judgeBaseUrl) {
|
|
355
|
-
judgeBaseUrl = await selectEndpointWithPrompt(judgeProvider);
|
|
356
|
-
}
|
|
357
|
-
const judgeApiKey = process.env.JUDGE_API_KEY || config.judgeApiKey || '';
|
|
358
|
-
const finalJudgeApiKey = judgeApiKey || (await prompt('Enter judge API key: ')).trim();
|
|
359
|
-
judgeModel = process.env.JUDGE_MODEL || config.judgeModelName || '';
|
|
360
|
-
if (!judgeModel) {
|
|
361
|
-
const isOpenAI = judgeProvider === 'openai' || judgeBaseUrl.includes('openai.com');
|
|
362
|
-
const isOpenRouter = judgeBaseUrl.includes('openrouter.ai') || judgeBaseUrl.includes('openrouter');
|
|
363
|
-
const modelRecommendation = (isOpenAI || isOpenRouter) ? 'gpt-4o-mini' : '';
|
|
364
|
-
const promptText = modelRecommendation
|
|
365
|
-
? `Enter judge model name (recommended: ${modelRecommendation}): `
|
|
366
|
-
: 'Enter judge model name: ';
|
|
367
|
-
judgeModel = (await prompt(promptText)).trim() || modelRecommendation;
|
|
368
|
-
}
|
|
369
|
-
judgeClient = (0, client_js_1.createLLMClient)(judgeProvider, finalJudgeApiKey, judgeBaseUrl, judgeModel);
|
|
370
|
-
// Save judge config if not from env
|
|
371
|
-
if (!process.env.JUDGE_PROVIDER && !process.env.JUDGE_API_KEY && !process.env.JUDGE_MODEL) {
|
|
372
|
-
const fullConfig = {
|
|
373
|
-
...config,
|
|
374
|
-
judgeProvider,
|
|
375
|
-
judgeApiKey: finalJudgeApiKey,
|
|
376
|
-
judgeBaseUrl,
|
|
377
|
-
judgeModelName: judgeModel
|
|
378
|
-
};
|
|
379
|
-
await saveConfig(fullConfig);
|
|
380
|
-
}
|
|
381
|
-
}
|
|
382
|
-
}
|
|
383
|
-
console.log(chalk_1.default.bold('\n=== Configuration ==='));
|
|
384
|
-
console.log(` Model: ${config.modelName}`);
|
|
385
|
-
console.log(` Provider: ${config.provider}`);
|
|
386
|
-
console.log(` Endpoint: ${config.baseUrl}`);
|
|
387
|
-
if (useJudge) {
|
|
388
|
-
console.log(chalk_1.default.cyan(' Judge: enabled'));
|
|
389
|
-
console.log(` Provider: ${judgeProvider}`);
|
|
390
|
-
console.log(` Endpoint: ${judgeBaseUrl}`);
|
|
391
|
-
console.log(` Model: ${judgeModel}`);
|
|
392
|
-
}
|
|
393
|
-
else {
|
|
394
|
-
console.log(chalk_1.default.gray(' Judge: disabled'));
|
|
395
|
-
}
|
|
396
|
-
if (shouldShuffle) {
|
|
397
|
-
console.log(chalk_1.default.green(' Shuffle: enabled'));
|
|
398
|
-
}
|
|
399
|
-
else {
|
|
400
|
-
console.log(chalk_1.default.gray(' Shuffle: disabled'));
|
|
401
|
-
}
|
|
402
|
-
const proceed = (await prompt('\nProceed with running benchmarks? (Y/n): ')).trim().toLowerCase();
|
|
403
|
-
if (proceed === 'n') {
|
|
404
|
-
console.log(chalk_1.default.yellow('Exiting...'));
|
|
405
|
-
process.exit(0);
|
|
406
|
-
}
|
|
407
|
-
const detailedLogsDir = (0, paths_js_1.getDetailedLogsDir)();
|
|
408
|
-
const progressDir = (0, paths_js_1.getProgressDir)();
|
|
409
|
-
await fs_extra_1.default.ensureDir(detailedLogsDir);
|
|
410
|
-
await fs_extra_1.default.ensureDir(progressDir);
|
|
411
|
-
console.log(chalk_1.default.gray(`\nResults: ${detailedLogsDir}`));
|
|
412
|
-
const logger = new logger_js_1.Logger(detailedLogsDir);
|
|
413
|
-
const progress = new progress_js_1.ProgressTracker(progressDir);
|
|
414
|
-
const evaluator = new evaluator_js_1.Evaluator(client, { timeout: 120, retries: 3, temperature: 0 }, judgeClient);
|
|
415
|
-
const results = [];
|
|
416
|
-
for (const benchmark of benchmarks) {
|
|
417
|
-
console.log(chalk_1.default.bold(`\n=== Running ${benchmark.name} ===`));
|
|
418
|
-
const safeModelName = config.modelName.replace(/[^a-zA-Z0-9]/g, '_');
|
|
419
|
-
const progressFile = `${safeModelName}_${benchmark.id}_progress.json`;
|
|
420
|
-
const previousProgress = progress.load(progressFile);
|
|
421
|
-
let startIdx = 0;
|
|
422
|
-
let runSeed = Date.now();
|
|
423
|
-
if (previousProgress && previousProgress.completed > 0) {
|
|
424
|
-
console.log(chalk_1.default.yellow(`Found existing progress: ${previousProgress.completed}/${previousProgress.total} completed`));
|
|
425
|
-
const resume = await prompt('Resume from where you left off? (y/N): ');
|
|
426
|
-
if (resume.trim().toLowerCase() === 'y') {
|
|
427
|
-
startIdx = previousProgress.completed;
|
|
428
|
-
runSeed = previousProgress.seed || runSeed;
|
|
429
|
-
console.log(chalk_1.default.cyan('Resuming...\n'));
|
|
430
|
-
}
|
|
431
|
-
else {
|
|
432
|
-
console.log(chalk_1.default.cyan('Starting fresh...\n'));
|
|
433
|
-
progress.clear(progressFile);
|
|
434
|
-
}
|
|
435
|
-
}
|
|
436
|
-
let data;
|
|
437
|
-
try {
|
|
438
|
-
data = await (0, benchmarks_js_1.fetchBenchmark)(benchmark.id, benchmark.percentage, benchmark.shuffle, runSeed);
|
|
439
|
-
}
|
|
440
|
-
catch (error) {
|
|
441
|
-
console.log(chalk_1.default.red(`Failed to fetch ${benchmark.name}: ${error}`));
|
|
442
|
-
continue;
|
|
443
|
-
}
|
|
444
|
-
let correct = 0;
|
|
445
|
-
const runTimestamp = Date.now();
|
|
446
|
-
const runLogEntries = [];
|
|
447
|
-
for (let i = startIdx; i < data.length; i++) {
|
|
448
|
-
const item = data[i];
|
|
449
|
-
try {
|
|
450
|
-
const pct = i === 0 ? '0.0' : ((correct / i) * 100).toFixed(1);
|
|
451
|
-
process.stdout.write(`\r${benchmark.name}: [solving... ${i + 1}/${data.length}] ${pct}% correct`);
|
|
452
|
-
const response = await evaluator.evaluate(benchmark, item);
|
|
453
|
-
let isCorrect;
|
|
454
|
-
let judgeResponse;
|
|
455
|
-
if (benchmark.useJudge && judgeClient) {
|
|
456
|
-
process.stdout.write(`\r${benchmark.name}: [judging... ${i + 1}/${data.length}] ${pct}% correct`);
|
|
457
|
-
const result = await evaluator.evaluateAndCheckWithJudge(benchmark, item, response);
|
|
458
|
-
isCorrect = result.correct;
|
|
459
|
-
judgeResponse = result.judgeResponse;
|
|
460
|
-
}
|
|
461
|
-
else {
|
|
462
|
-
isCorrect = evaluator.checkAnswer(benchmark, item, response);
|
|
463
|
-
}
|
|
464
|
-
if (isCorrect)
|
|
465
|
-
correct++;
|
|
466
|
-
runLogEntries.push({
|
|
467
|
-
benchmark: benchmark.id,
|
|
468
|
-
model: config.modelName,
|
|
469
|
-
question: item,
|
|
470
|
-
response,
|
|
471
|
-
isCorrect,
|
|
472
|
-
judgeResponse,
|
|
473
|
-
timestamp: new Date().toISOString(),
|
|
474
|
-
index: i,
|
|
475
|
-
});
|
|
476
|
-
progress.save(progressFile, { completed: i + 1, total: data.length, seed: runSeed });
|
|
477
|
-
}
|
|
478
|
-
catch (error) {
|
|
479
|
-
console.log(chalk_1.default.red(`\nError on item ${i}: ${error}`));
|
|
480
|
-
}
|
|
481
|
-
}
|
|
482
|
-
// Batch write all entries for this run
|
|
483
|
-
logger.logBatch(runLogEntries, `${benchmark.id}_${config.modelName}_${runTimestamp}`);
|
|
484
|
-
const result = {
|
|
485
|
-
benchmark: benchmark.id,
|
|
486
|
-
model: config.modelName,
|
|
487
|
-
total: data.length,
|
|
488
|
-
correct,
|
|
489
|
-
accuracy: (correct / data.length) * 100,
|
|
490
|
-
timestamp: new Date().toISOString(),
|
|
491
|
-
seed: runSeed,
|
|
492
|
-
judge: useJudge && judgeModel ? judgeModel : undefined,
|
|
493
|
-
};
|
|
494
|
-
results.push(result);
|
|
495
|
-
console.log(chalk_1.default.green(`\n${benchmark.name}: ${correct}/${data.length} (${result.accuracy.toFixed(2)}%)`));
|
|
496
|
-
progress.clear(progressFile);
|
|
497
|
-
}
|
|
498
|
-
const resultsFile = path_1.default.join((0, paths_js_1.getResultsDir)(), `eval_results_${Date.now()}.json`);
|
|
499
|
-
await fs_extra_1.default.ensureDir((0, paths_js_1.getResultsDir)());
|
|
500
|
-
await fs_extra_1.default.writeJson(resultsFile, { results, timestamp: new Date().toISOString() }, { spaces: 2 });
|
|
501
|
-
console.log(chalk_1.default.bold(`\n=== Results saved to ${resultsFile} ===`));
|
|
502
|
-
console.log(chalk_1.default.bold('\n=== Summary ==='));
|
|
503
|
-
for (const r of results) {
|
|
504
|
-
console.log(`${r.benchmark}: ${r.correct}/${r.total} (${r.accuracy.toFixed(2)}%)`);
|
|
505
|
-
}
|
|
506
|
-
}
|
|
507
|
-
async function showMenu() {
|
|
508
|
-
// Remove all existing keypress listeners first
|
|
509
|
-
process.stdin.removeAllListeners('keypress');
|
|
510
|
-
readline_1.default.emitKeypressEvents(process.stdin);
|
|
511
|
-
if (process.stdin.isTTY) {
|
|
512
|
-
process.stdin.setRawMode(true);
|
|
513
21
|
}
|
|
514
|
-
let cursor = 0;
|
|
515
|
-
const items = ['Run benchmarks', 'Explore past results'];
|
|
516
|
-
const render = () => {
|
|
517
|
-
process.stdout.write('\x1b[H\x1b[2J');
|
|
518
|
-
console.log(chalk_1.default.bold.cyan('\n=== LLM Benchmark Runner ===\n'));
|
|
519
|
-
items.forEach((item, i) => {
|
|
520
|
-
const prefix = i === cursor ? chalk_1.default.cyan('> ') : ' ';
|
|
521
|
-
console.log(`${prefix}${item}`);
|
|
522
|
-
});
|
|
523
|
-
console.log(chalk_1.default.gray('\nArrow keys, Enter to select, Esc to quit'));
|
|
524
|
-
};
|
|
525
|
-
render();
|
|
526
|
-
return new Promise((resolve) => {
|
|
527
|
-
const handleKeypress = async (str, key) => {
|
|
528
|
-
if (key.ctrl && key.name === 'c') {
|
|
529
|
-
process.stdin.removeListener('keypress', handleKeypress);
|
|
530
|
-
if (process.stdin.isTTY)
|
|
531
|
-
process.stdin.setRawMode(false);
|
|
532
|
-
console.log(chalk_1.default.green('\nGoodbye!'));
|
|
533
|
-
process.exit(0);
|
|
534
|
-
}
|
|
535
|
-
if (key.name === 'up') {
|
|
536
|
-
cursor = Math.max(0, cursor - 1);
|
|
537
|
-
render();
|
|
538
|
-
}
|
|
539
|
-
else if (key.name === 'down') {
|
|
540
|
-
cursor = Math.min(items.length - 1, cursor + 1);
|
|
541
|
-
render();
|
|
542
|
-
}
|
|
543
|
-
else if (key.name === 'escape') {
|
|
544
|
-
process.stdin.removeListener('keypress', handleKeypress);
|
|
545
|
-
if (process.stdin.isTTY)
|
|
546
|
-
process.stdin.setRawMode(false);
|
|
547
|
-
console.log(chalk_1.default.green('\nGoodbye!'));
|
|
548
|
-
process.exit(0);
|
|
549
|
-
}
|
|
550
|
-
else if (key.name === 'return') {
|
|
551
|
-
process.stdin.removeListener('keypress', handleKeypress);
|
|
552
|
-
if (process.stdin.isTTY)
|
|
553
|
-
process.stdin.setRawMode(false);
|
|
554
|
-
if (cursor === 0) {
|
|
555
|
-
runBenchmarks().then(() => showMenu()).then(() => resolve());
|
|
556
|
-
}
|
|
557
|
-
else if (cursor === 1) {
|
|
558
|
-
if (process.stdin.isTTY)
|
|
559
|
-
process.stdin.setRawMode(false);
|
|
560
|
-
process.stdin.removeAllListeners('keypress');
|
|
561
|
-
process.stdout.write('\x1b[H\x1b[2J');
|
|
562
|
-
const tuiPath = path_1.default.join(__dirname, '..', 'bin', 'tui.js');
|
|
563
|
-
(0, child_process_1.execFileSync)('node', [tuiPath], { stdio: 'inherit' });
|
|
564
|
-
showMenu().then(() => resolve());
|
|
565
|
-
}
|
|
566
|
-
}
|
|
567
|
-
};
|
|
568
|
-
process.stdin.on('keypress', handleKeypress);
|
|
569
|
-
});
|
|
570
22
|
}
|
|
571
|
-
|
|
23
|
+
main().catch(console.error);
|
|
572
24
|
//# sourceMappingURL=index.js.map
|