llmtester 1.0.7 → 1.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ import { ProviderType } from './client.js';
2
/**
 * Shape of the settings object that `saveConfig()` serializes to `config.json`
 * and that `loadConfig()` returns (as a `Partial<Config>`).
 * The four `judge*` fields are optional and describe a second, separate client.
 */
+ export interface Config {
3
/** Provider identifier of the model under test. */
+ provider: ProviderType;
4
/**
 * Optional API dialect.
 * NOTE(review): the 1.0.7 index.js shown in this diff only sets this when
 * `provider` is 'custom' — confirm the extracted runner keeps that rule.
 */
+ mode?: 'openai' | 'anthropic';
5
/** API key for the provider; stored as-is in the config file. */
+ apiKey: string;
6
/** Endpoint base URL passed to the client factory. */
+ baseUrl: string;
7
/** Name of the model to evaluate. */
+ modelName: string;
8
/** Provider identifier for the judge client, when one has been configured. */
+ judgeProvider?: ProviderType;
9
/** API key for the judge client; stored as-is in the config file. */
+ judgeApiKey?: string;
10
/** Endpoint base URL for the judge client. */
+ judgeBaseUrl?: string;
11
/** Model name used by the judge client. */
+ judgeModelName?: string;
12
+ }
13
/** Returns the config file location: `config.json` joined onto the directory reported by `getConfigDir()`. */
+ export declare function getConfigPath(): string;
14
/**
 * Reads and parses the config file if it exists.
 * Resolves to `null` when the file is missing or when reading/parsing fails
 * (the failure is logged as a warning instead of being thrown).
 * The parsed JSON is returned without validation.
 */
+ export declare function loadConfig(): Promise<Partial<Config> | null>;
15
/**
 * Writes `config` as 2-space-indented JSON to the config file, creating the
 * parent directory first. Failures are logged as a warning and not rethrown,
 * so the returned promise resolves either way.
 */
+ export declare function saveConfig(config: Config): Promise<void>;
16
/**
 * Asks `message` on stdin/stdout through a readline interface and resolves
 * with the line the user entered. The promise has no rejection path.
 */
+ export declare function prompt(message: string): Promise<string>;
17
+ //# sourceMappingURL=config.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"config.d.ts","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":"AAKA,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAE3C,MAAM,WAAW,MAAM;IACrB,QAAQ,EAAE,YAAY,CAAC;IACvB,IAAI,CAAC,EAAE,QAAQ,GAAG,WAAW,CAAC;IAC9B,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,CAAC,EAAE,YAAY,CAAC;IAC7B,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,wBAAgB,aAAa,IAAI,MAAM,CAEtC;AAED,wBAAsB,UAAU,IAAI,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,CAUlE;AAED,wBAAsB,UAAU,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAS9D;AAED,wBAAsB,MAAM,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAmB7D"}
package/dist/config.js ADDED
@@ -0,0 +1,58 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.getConfigPath = getConfigPath;
7
+ exports.loadConfig = loadConfig;
8
+ exports.saveConfig = saveConfig;
9
+ exports.prompt = prompt;
10
+ const chalk_1 = __importDefault(require("chalk"));
11
+ const fs_extra_1 = __importDefault(require("fs-extra"));
12
+ const path_1 = __importDefault(require("path"));
13
+ const readline_1 = __importDefault(require("readline"));
14
+ const paths_js_1 = require("./paths.js");
15
/**
 * Resolves where the CLI configuration file lives.
 *
 * @returns `config.json` joined onto the directory reported by `getConfigDir()`.
 */
function getConfigPath() {
    const configDir = (0, paths_js_1.getConfigDir)();
    return path_1.default.join(configDir, 'config.json');
}
18
/**
 * Loads the saved configuration, if there is one.
 *
 * @returns The parsed JSON content of the config file, or `null` when the file
 *   does not exist or could not be read/parsed. Read failures are reported as a
 *   yellow warning on stdout rather than thrown.
 */
async function loadConfig() {
    const file = getConfigPath();
    try {
        const present = await fs_extra_1.default.pathExists(file);
        if (!present) {
            return null;
        }
        return await fs_extra_1.default.readJson(file);
    }
    catch (e) {
        console.log(chalk_1.default.yellow(`Warning: Could not load config: ${e}`));
        return null;
    }
}
30
/**
 * Persists the given configuration as 2-space-indented JSON at `getConfigPath()`.
 *
 * The configuration carries API keys (`apiKey`, `judgeApiKey`), so the file is
 * created with owner-only permissions (0o600), and a file that already exists
 * is tightened to the same mode on a best-effort basis.
 *
 * Never rejects: any failure is reported as a yellow warning on stdout, in the
 * same way `loadConfig()` reports read failures.
 *
 * @param config Configuration object to serialize.
 * @returns Promise that resolves once the write attempt has finished.
 */
async function saveConfig(config) {
    const configPath = getConfigPath();
    try {
        await fs_extra_1.default.ensureDir(path_1.default.dirname(configPath));
        // `mode` is forwarded to fs.writeFile and only takes effect when the file is created.
        await fs_extra_1.default.writeJson(configPath, config, { spaces: 2, mode: 0o600 });
        // Deliberately best-effort: tighten files left by earlier versions, but do not
        // report a successful save as failed on platforms where chmod is unsupported.
        await fs_extra_1.default.chmod(configPath, 0o600).catch(() => undefined);
        console.log(chalk_1.default.green(`Config saved to: ${configPath}`));
    }
    catch (e) {
        console.log(chalk_1.default.yellow(`Warning: Could not save config: ${e}`));
    }
}
41
/**
 * Asks a single question on the terminal and resolves with the typed answer.
 *
 * Before asking, every 'keypress' listener is removed from stdin and raw mode
 * is switched off where `setRawMode` exists. The returned promise has no
 * rejection path: it settles only when the readline question callback fires.
 *
 * @param message Text shown to the user as the question.
 * @returns Promise resolving to the entered line.
 */
+ async function prompt(message) {
42
+ return new Promise((resolve) => {
43
// Drop keypress handlers registered earlier on stdin before readline takes over.
+ process.stdin.removeAllListeners('keypress');
44
// Optional call: `setRawMode` is invoked only if stdin provides it.
+ process.stdin.setRawMode?.(false);
45
+ const rl = readline_1.default.createInterface({
46
+ input: process.stdin,
47
+ output: process.stdout,
48
+ });
49
+ rl.question(message, (answer) => {
50
// Close the interface before resolving with the answer.
+ rl.close();
51
+ resolve(answer);
52
+ });
53
// NOTE(review): raw mode is switched off a second time, after the interface
// exists — presumably to undo raw mode enabled by readline itself; confirm
// before reordering or removing either call.
+ if (process.stdin.isTTY) {
54
+ process.stdin.setRawMode(false);
55
+ }
56
+ });
57
+ }
58
+ //# sourceMappingURL=config.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"config.js","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":";;;;;AAmBA,sCAEC;AAED,gCAUC;AAED,gCASC;AAED,wBAmBC;AAjED,kDAA0B;AAC1B,wDAA0B;AAC1B,gDAAwB;AACxB,wDAAgC;AAChC,yCAA0C;AAe1C,SAAgB,aAAa;IAC3B,OAAO,cAAI,CAAC,IAAI,CAAC,IAAA,uBAAY,GAAE,EAAE,aAAa,CAAC,CAAC;AAClD,CAAC;AAEM,KAAK,UAAU,UAAU;IAC9B,MAAM,UAAU,GAAG,aAAa,EAAE,CAAC;IACnC,IAAI,CAAC;QACH,IAAI,MAAM,kBAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;YACpC,OAAO,MAAM,kBAAE,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC;QACvC,CAAC;IACH,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,OAAO,CAAC,GAAG,CAAC,eAAK,CAAC,MAAM,CAAC,mCAAmC,CAAC,EAAE,CAAC,CAAC,CAAC;IACpE,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAEM,KAAK,UAAU,UAAU,CAAC,MAAc;IAC7C,MAAM,UAAU,GAAG,aAAa,EAAE,CAAC;IACnC,IAAI,CAAC;QACH,MAAM,kBAAE,CAAC,SAAS,CAAC,cAAI,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC;QAC7C,MAAM,kBAAE,CAAC,SAAS,CAAC,UAAU,EAAE,MAAM,EAAE,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC,CAAC;QACtD,OAAO,CAAC,GAAG,CAAC,eAAK,CAAC,KAAK,CAAC,oBAAoB,UAAU,EAAE,CAAC,CAAC,CAAC;IAC7D,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,OAAO,CAAC,GAAG,CAAC,eAAK,CAAC,MAAM,CAAC,mCAAmC,CAAC,EAAE,CAAC,CAAC,CAAC;IACpE,CAAC;AACH,CAAC;AAEM,KAAK,UAAU,MAAM,CAAC,OAAe;IAC1C,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;QAC7B,OAAO,CAAC,KAAK,CAAC,kBAAkB,CAAC,UAAU,CAAC,CAAC;QAC7C,OAAO,CAAC,KAAK,CAAC,UAAU,EAAE,CAAC,KAAK,CAAC,CAAC;QAElC,MAAM,EAAE,GAAG,kBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;SACvB,CAAC,CAAC;QAEH,EAAE,CAAC,QAAQ,CAAC,OAAO,EAAE,CAAC,MAAc,EAAE,EAAE;YACtC,EAAE,CAAC,KAAK,EAAE,CAAC;YACX,OAAO,CAAC,MAAM,CAAC,CAAC;QAClB,CAAC,CAAC,CAAC;QAEH,IAAI,OAAO,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;YACxB,OAAO,CAAC,KAAK,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC;QAClC,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC"}
package/dist/index.d.ts CHANGED
@@ -1,3 +1,3 @@
1
- #!/usr/bin/env node
1
+ #!/usr/bin/env node
2
2
  export {};
3
3
  //# sourceMappingURL=index.d.ts.map
package/dist/index.js CHANGED
@@ -1,572 +1,24 @@
1
- #!/usr/bin/env node
1
+ #!/usr/bin//env node
2
2
  "use strict";
3
3
  var __importDefault = (this && this.__importDefault) || function (mod) {
4
4
  return (mod && mod.__esModule) ? mod : { "default": mod };
5
5
  };
6
6
  Object.defineProperty(exports, "__esModule", { value: true });
7
- const dotenv_1 = __importDefault(require("dotenv"));
8
- const chalk_1 = __importDefault(require("chalk"));
9
- const fs_extra_1 = __importDefault(require("fs-extra"));
10
- const path_1 = __importDefault(require("path"));
11
- const readline_1 = __importDefault(require("readline"));
12
7
  const child_process_1 = require("child_process");
13
- const client_js_1 = require("./client.js");
14
- const benchmarks_js_1 = require("./benchmarks.js");
15
- const evaluator_js_1 = require("./evaluator.js");
16
- const progress_js_1 = require("./progress.js");
17
- const logger_js_1 = require("./logger.js");
18
- const paths_js_1 = require("./paths.js");
19
- dotenv_1.default.config();
20
- function getConfigPath() {
21
- return path_1.default.join((0, paths_js_1.getConfigDir)(), 'config.json');
22
- }
23
- async function loadConfig() {
24
- const configPath = getConfigPath();
25
- try {
26
- if (await fs_extra_1.default.pathExists(configPath)) {
27
- return await fs_extra_1.default.readJson(configPath);
28
- }
29
- }
30
- catch (e) {
31
- console.log(chalk_1.default.yellow(`Warning: Could not load config: ${e}`));
32
- }
33
- return null;
34
- }
35
- async function saveConfig(config) {
36
- const configPath = getConfigPath();
37
- try {
38
- await fs_extra_1.default.ensureDir(path_1.default.dirname(configPath));
39
- await fs_extra_1.default.writeJson(configPath, config, { spaces: 2 });
40
- console.log(chalk_1.default.green(`Config saved to: ${configPath}`));
41
- }
42
- catch (e) {
43
- console.log(chalk_1.default.yellow(`Warning: Could not save config: ${e}`));
44
- }
45
- }
46
- const PROVIDERS = [
47
- { id: 'openai', name: 'OpenAI', defaultUrl: 'https://api.openai.com/v1' },
48
- { id: 'anthropic', name: 'Anthropic', defaultUrl: 'https://api.anthropic.com' },
49
- { id: 'custom', name: 'Custom (OpenAI-compatible)', defaultUrl: '' },
50
- ];
51
- const COMMON_OPENAI_ENDPOINTS = [
52
- { name: 'OpenAI', url: 'https://api.openai.com/v1' },
53
- { name: 'Together.ai', url: 'https://api.together.xyz/v1' },
54
- { name: 'Groq', url: 'https://api.groq.com/openai/v1' },
55
- { name: 'Fireworks AI', url: 'https://api.fireworks.ai/inference/v1' },
56
- { name: 'Perplexity', url: 'https://api.perplexity.ai' },
57
- { name: 'OpenRouter', url: 'https://openrouter.ai/api/v1' },
58
- { name: 'Custom URL', url: '' },
59
- ];
60
- async function prompt(message) {
61
- return new Promise((resolve) => {
62
- process.stdin.removeAllListeners('keypress');
63
- process.stdin.setRawMode?.(false);
64
- const rl = readline_1.default.createInterface({
65
- input: process.stdin,
66
- output: process.stdout,
67
- });
68
- rl.question(message, (answer) => {
69
- rl.close();
70
- resolve(answer);
71
- });
72
- if (process.stdin.isTTY) {
73
- process.stdin.setRawMode(false);
74
- }
75
- });
76
- }
77
- async function selectOption(items, message) {
78
- // Remove all existing keypress listeners first
79
- process.stdin.removeAllListeners('keypress');
80
- readline_1.default.emitKeypressEvents(process.stdin);
81
- if (process.stdin.isTTY) {
82
- process.stdin.setRawMode(true);
83
- }
84
- let cursor = 0;
85
- const render = () => {
86
- process.stdout.write('\x1b[H\x1b[2J');
87
- console.log(chalk_1.default.cyan(`${message}\n`));
88
- items.forEach((item, i) => {
89
- const prefix = i === cursor ? chalk_1.default.cyan('> ') : ' ';
90
- console.log(`${prefix}${item.name}`);
91
- });
92
- console.log(chalk_1.default.gray('\nUse arrow keys, Enter to select'));
93
- };
94
- render();
95
- return new Promise((resolve) => {
96
- const handleKeypress = (str, key) => {
97
- if (key.ctrl && key.name === 'c') {
98
- if (process.stdin.isTTY)
99
- process.stdin.setRawMode(false);
100
- process.exit(0);
101
- }
102
- if (key.name === 'up') {
103
- cursor = Math.max(0, cursor - 1);
104
- render();
105
- }
106
- else if (key.name === 'down') {
107
- cursor = Math.min(items.length - 1, cursor + 1);
108
- render();
109
- }
110
- else if (key.name === 'return') {
111
- process.stdin.removeListener('keypress', handleKeypress);
112
- if (process.stdin.isTTY)
113
- process.stdin.setRawMode(false);
114
- resolve(items[cursor]);
115
- }
116
- };
117
- process.stdin.on('keypress', handleKeypress);
118
- });
119
- }
120
- async function selectMulti(items, message, formatItem) {
121
- // Remove all existing keypress listeners first
122
- process.stdin.removeAllListeners('keypress');
123
- readline_1.default.emitKeypressEvents(process.stdin);
124
- if (process.stdin.isTTY) {
125
- process.stdin.setRawMode(true);
126
- }
127
- process.stdout.write('\x1b[H\x1b[2J');
128
- console.log(chalk_1.default.cyan(`\n${message}`));
129
- console.log(chalk_1.default.gray('Space to toggle, Enter to confirm\n'));
130
- const selected = [];
131
- let cursor = 0;
132
- const render = () => {
133
- process.stdout.write('\x1b[H\x1b[2J');
134
- console.log(chalk_1.default.cyan(`${message}\n`));
135
- items.forEach((item, i) => {
136
- const mark = selected.includes(item) ? chalk_1.default.green('[x]') : '[ ]';
137
- const prefix = i === cursor ? chalk_1.default.cyan('> ') : ' ';
138
- const extra = formatItem ? ` ${chalk_1.default.gray(formatItem(item))}` : '';
139
- console.log(`${prefix}${mark} ${item.name}${extra}`);
140
- });
141
- console.log(chalk_1.default.gray('\nSpace to toggle, Enter to confirm'));
142
- };
143
- render();
144
- return new Promise((resolve) => {
145
- const handleKeypress = (str, key) => {
146
- if (key.ctrl && key.name === 'c') {
147
- if (process.stdin.isTTY)
148
- process.stdin.setRawMode(false);
149
- process.exit(0);
150
- }
151
- if (key.name === 'up') {
152
- cursor = Math.max(0, cursor - 1);
153
- render();
154
- }
155
- else if (key.name === 'down') {
156
- cursor = Math.min(items.length - 1, cursor + 1);
157
- render();
158
- }
159
- else if (key.name === 'space' || str === ' ') {
160
- const item = items[cursor];
161
- if (selected.includes(item)) {
162
- selected.splice(selected.indexOf(item), 1);
163
- }
164
- else {
165
- selected.push(item);
166
- }
167
- render();
168
- }
169
- else if (key.name === 'return') {
170
- process.stdin.removeListener('keypress', handleKeypress);
171
- if (process.stdin.isTTY)
172
- process.stdin.setRawMode(false);
173
- resolve(selected);
174
- }
175
- };
176
- process.stdin.on('keypress', handleKeypress);
177
- });
178
- }
179
- async function selectProvider() {
180
- return selectOption(PROVIDERS, 'Select your provider');
181
- }
182
- async function selectProviderWithPrompt(message) {
183
- console.log(chalk_1.default.cyan('\n' + message));
184
- PROVIDERS.forEach((p, i) => { console.log(String(i + 1) + '. ' + p.name); });
185
- const providerIds = ['openai', 'anthropic', 'custom'];
8
+ const path_1 = __importDefault(require("path"));
9
+ const runner_js_1 = require("./runner.js");
10
+ const menu_js_1 = require("./menu.js");
11
+ async function main() {
186
12
  while (true) {
187
- const answer = await prompt('Enter a number (1-3) or name: ');
188
- const trimmed = answer.trim().toLowerCase();
189
- const num = parseInt(trimmed, 10);
190
- if (num >= 1 && num <= PROVIDERS.length) {
191
- return PROVIDERS[num - 1];
192
- }
193
- if (providerIds.includes(trimmed)) {
194
- return PROVIDERS.find(p => p.id === trimmed);
195
- }
196
- console.log(chalk_1.default.yellow('Invalid selection. Please try again.'));
197
- }
198
- }
199
- async function selectEndpoint(provider) {
200
- if (provider === 'anthropic') {
201
- return 'https://api.anthropic.com';
202
- }
203
- if (provider === 'custom' || provider === 'openai') {
204
- const endpoints = COMMON_OPENAI_ENDPOINTS.map(e => ({ id: e.url || 'custom', name: e.name, url: e.url }));
205
- const selection = await selectOption(endpoints, 'Select or enter endpoint');
206
- if (selection.url === '') {
207
- return (await prompt('Enter custom endpoint URL: ')).trim();
13
+ const choice = await (0, menu_js_1.showMainMenu)();
14
+ if (choice === 0) {
15
+ await (0, runner_js_1.runBenchmarks)();
208
16
  }
209
- return selection.url;
210
- }
211
- return PROVIDERS.find(p => p.id === provider)?.defaultUrl || '';
212
- }
213
- async function selectEndpointWithPrompt(provider) {
214
- if (provider === 'anthropic') {
215
- return 'https://api.anthropic.com';
216
- }
217
- if (provider === 'custom') {
218
- const answer = await prompt('Enter custom endpoint URL: ');
219
- return answer.trim();
220
- }
221
- return PROVIDERS.find(p => p.id === provider)?.defaultUrl || '';
222
- }
223
- async function getConfig() {
224
- const envProvider = process.env.LLM_PROVIDER || '';
225
- const envApiKey = process.env.LLM_API_KEY || process.env.OPENAI_API_KEY || '';
226
- const envBaseUrl = process.env.LLM_BASE_URL || process.env.OPENAI_BASE_URL || '';
227
- const envModelName = process.env.LLM_MODEL || process.env.MODEL_NAME || '';
228
- // Load saved config
229
- const savedConfig = await loadConfig();
230
- let provider;
231
- let providerMode = savedConfig?.mode || 'openai';
232
- let providerInfo;
233
- if (envProvider && PROVIDERS.some(p => p.id === envProvider)) {
234
- provider = envProvider;
235
- providerInfo = PROVIDERS.find(p => p.id === provider);
236
- console.log(chalk_1.default.green(`Provider: ${providerInfo.name} (from env)`));
237
- }
238
- else if (savedConfig?.provider) {
239
- provider = savedConfig.provider;
240
- providerInfo = PROVIDERS.find(p => p.id === provider);
241
- console.log(chalk_1.default.green(`Provider: ${providerInfo.name} (from config)`));
242
- }
243
- else {
244
- providerInfo = await selectProvider();
245
- provider = providerInfo.id;
246
- }
247
- // Ask for mode when custom provider
248
- if (provider === 'custom') {
249
- const envMode = process.env.LLM_MODE || '';
250
- if (envMode === 'anthropic') {
251
- providerMode = 'anthropic';
17
+ else if (choice === 1) {
18
+ const tuiPath = path_1.default.join(__dirname, '..', 'bin', 'tui.js');
19
+ (0, child_process_1.execFileSync)('node', [tuiPath], { stdio: 'inherit' });
252
20
  }
253
- else if (savedConfig?.mode) {
254
- providerMode = savedConfig.mode;
255
- }
256
- else {
257
- console.log(chalk_1.default.cyan(''));
258
- const modeAnswer = (await prompt('API mode - openai (OpenAI-compatible) or anthropic (Anthropic-compatible)? (o/a): ')).trim().toLowerCase();
259
- providerMode = modeAnswer === 'a' ? 'anthropic' : 'openai';
260
- }
261
- console.log(chalk_1.default.green('Mode: ' + providerMode));
262
- }
263
- let endpoint = envBaseUrl || savedConfig?.baseUrl || '';
264
- if (!endpoint) {
265
- endpoint = await selectEndpoint(provider);
266
- }
267
- else {
268
- console.log(chalk_1.default.green(`Endpoint: ${endpoint}`));
269
- }
270
- let key = envApiKey || savedConfig?.apiKey || '';
271
- if (!key) {
272
- const keyPrompt = provider === 'anthropic'
273
- ? 'Enter your Anthropic API key: '
274
- : 'Enter your API key: ';
275
- key = (await prompt(keyPrompt)).trim();
276
- }
277
- else {
278
- const maskedKey = key.length > 8 ? `${key.slice(0, 4)}...${key.slice(-4)}` : '***';
279
- console.log(chalk_1.default.green(`API key: ${maskedKey}`));
280
- }
281
- let model = envModelName || savedConfig?.modelName || '';
282
- if (!model) {
283
- model = (await prompt('Enter model name (e.g., gpt-4o, claude-3-opus): ')).trim();
284
- }
285
- else {
286
- console.log(chalk_1.default.green(`Model: ${model}`));
287
- }
288
- const config = { provider, apiKey: key, baseUrl: endpoint, modelName: model };
289
- if (provider === 'custom' && providerMode)
290
- config.mode = providerMode;
291
- // Preserve existing judge config when saving
292
- if (savedConfig) {
293
- if (savedConfig.judgeProvider)
294
- config.judgeProvider = savedConfig.judgeProvider;
295
- if (savedConfig.judgeApiKey)
296
- config.judgeApiKey = savedConfig.judgeApiKey;
297
- if (savedConfig.judgeBaseUrl)
298
- config.judgeBaseUrl = savedConfig.judgeBaseUrl;
299
- if (savedConfig.judgeModelName)
300
- config.judgeModelName = savedConfig.judgeModelName;
301
- }
302
- // Save config if not from env
303
- if (!envProvider && !envApiKey && !envModelName) {
304
- await saveConfig(config);
305
- }
306
- return config;
307
- }
308
- async function selectBenchmarks() {
309
- const available = Object.values(benchmarks_js_1.BENCHMARK_DEFINITIONS);
310
- const selection = await selectMulti(available, 'Select benchmarks to run', (bench) => `(${bench.defaultSamples.toLocaleString()} tests) - ${bench.description}`);
311
- if (selection.length === 0) {
312
- console.log(chalk_1.default.yellow('No benchmarks selected, exiting'));
313
- process.exit(0);
314
- }
315
- console.log(chalk_1.default.green(`\nSelected ${selection.length} benchmark(s)\n`));
316
- const benchmarks = [];
317
- for (const bench of selection) {
318
- const pctStr = await prompt(`${bench.name}: Enter % to run (1-100, default 100): `);
319
- const pct = parseInt(pctStr) || 100;
320
- benchmarks.push({ ...bench, percentage: Math.min(100, Math.max(1, pct)) });
321
- }
322
- return benchmarks;
323
- }
324
- async function runBenchmarks() {
325
- const config = await getConfig();
326
- const client = (0, client_js_1.createLLMClient)(config.provider, config.apiKey, config.baseUrl, config.modelName, config.mode);
327
- const benchmarks = await selectBenchmarks();
328
- // Ask about shuffling
329
- const shuffleAnswer = (await prompt('Shuffle samples for diverse distribution? (Y/n): ')).trim().toLowerCase();
330
- const shouldShuffle = shuffleAnswer !== 'n';
331
- benchmarks.forEach((b) => b.shuffle = shouldShuffle);
332
- // Check if any selected benchmark supports judge
333
- const judgeBenchmarks = benchmarks.filter(b => b.useJudge);
334
- let useJudge = false;
335
- let judgeClient;
336
- let judgeProvider = config.judgeProvider || 'openai';
337
- let judgeBaseUrl = config.judgeBaseUrl || '';
338
- let judgeModel = config.judgeModelName || '';
339
- if (judgeBenchmarks.length > 0) {
340
- const hasCritical = judgeBenchmarks.some(b => b.id === 'truthfulqa' || b.id === 'spider' || b.id === 'math');
341
- const label = hasCritical ? '(highly recommended)' : '(recommended)';
342
- const judgeAnswer = (await prompt(`Use judge for evaluation? (y/N) - ${label}: `)).trim().toLowerCase();
343
- useJudge = judgeAnswer === 'y';
344
- if (useJudge) {
345
- const useJudgeEnv = process.env.JUDGE_PROVIDER || config.judgeProvider || '';
346
- if (useJudgeEnv && PROVIDERS.some(p => p.id === useJudgeEnv)) {
347
- judgeProvider = useJudgeEnv;
348
- }
349
- else {
350
- const providerInfo = await selectProviderWithPrompt('Select judge provider:');
351
- judgeProvider = providerInfo.id;
352
- }
353
- judgeBaseUrl = process.env.JUDGE_BASE_URL || config.judgeBaseUrl || '';
354
- if (!judgeBaseUrl) {
355
- judgeBaseUrl = await selectEndpointWithPrompt(judgeProvider);
356
- }
357
- const judgeApiKey = process.env.JUDGE_API_KEY || config.judgeApiKey || '';
358
- const finalJudgeApiKey = judgeApiKey || (await prompt('Enter judge API key: ')).trim();
359
- judgeModel = process.env.JUDGE_MODEL || config.judgeModelName || '';
360
- if (!judgeModel) {
361
- const isOpenAI = judgeProvider === 'openai' || judgeBaseUrl.includes('openai.com');
362
- const isOpenRouter = judgeBaseUrl.includes('openrouter.ai') || judgeBaseUrl.includes('openrouter');
363
- const modelRecommendation = (isOpenAI || isOpenRouter) ? 'gpt-4o-mini' : '';
364
- const promptText = modelRecommendation
365
- ? `Enter judge model name (recommended: ${modelRecommendation}): `
366
- : 'Enter judge model name: ';
367
- judgeModel = (await prompt(promptText)).trim() || modelRecommendation;
368
- }
369
- judgeClient = (0, client_js_1.createLLMClient)(judgeProvider, finalJudgeApiKey, judgeBaseUrl, judgeModel);
370
- // Save judge config if not from env
371
- if (!process.env.JUDGE_PROVIDER && !process.env.JUDGE_API_KEY && !process.env.JUDGE_MODEL) {
372
- const fullConfig = {
373
- ...config,
374
- judgeProvider,
375
- judgeApiKey: finalJudgeApiKey,
376
- judgeBaseUrl,
377
- judgeModelName: judgeModel
378
- };
379
- await saveConfig(fullConfig);
380
- }
381
- }
382
- }
383
- console.log(chalk_1.default.bold('\n=== Configuration ==='));
384
- console.log(` Model: ${config.modelName}`);
385
- console.log(` Provider: ${config.provider}`);
386
- console.log(` Endpoint: ${config.baseUrl}`);
387
- if (useJudge) {
388
- console.log(chalk_1.default.cyan(' Judge: enabled'));
389
- console.log(` Provider: ${judgeProvider}`);
390
- console.log(` Endpoint: ${judgeBaseUrl}`);
391
- console.log(` Model: ${judgeModel}`);
392
- }
393
- else {
394
- console.log(chalk_1.default.gray(' Judge: disabled'));
395
- }
396
- if (shouldShuffle) {
397
- console.log(chalk_1.default.green(' Shuffle: enabled'));
398
- }
399
- else {
400
- console.log(chalk_1.default.gray(' Shuffle: disabled'));
401
- }
402
- const proceed = (await prompt('\nProceed with running benchmarks? (Y/n): ')).trim().toLowerCase();
403
- if (proceed === 'n') {
404
- console.log(chalk_1.default.yellow('Exiting...'));
405
- process.exit(0);
406
- }
407
- const detailedLogsDir = (0, paths_js_1.getDetailedLogsDir)();
408
- const progressDir = (0, paths_js_1.getProgressDir)();
409
- await fs_extra_1.default.ensureDir(detailedLogsDir);
410
- await fs_extra_1.default.ensureDir(progressDir);
411
- console.log(chalk_1.default.gray(`\nResults: ${detailedLogsDir}`));
412
- const logger = new logger_js_1.Logger(detailedLogsDir);
413
- const progress = new progress_js_1.ProgressTracker(progressDir);
414
- const evaluator = new evaluator_js_1.Evaluator(client, { timeout: 120, retries: 3, temperature: 0 }, judgeClient);
415
- const results = [];
416
- for (const benchmark of benchmarks) {
417
- console.log(chalk_1.default.bold(`\n=== Running ${benchmark.name} ===`));
418
- const safeModelName = config.modelName.replace(/[^a-zA-Z0-9]/g, '_');
419
- const progressFile = `${safeModelName}_${benchmark.id}_progress.json`;
420
- const previousProgress = progress.load(progressFile);
421
- let startIdx = 0;
422
- let runSeed = Date.now();
423
- if (previousProgress && previousProgress.completed > 0) {
424
- console.log(chalk_1.default.yellow(`Found existing progress: ${previousProgress.completed}/${previousProgress.total} completed`));
425
- const resume = await prompt('Resume from where you left off? (y/N): ');
426
- if (resume.trim().toLowerCase() === 'y') {
427
- startIdx = previousProgress.completed;
428
- runSeed = previousProgress.seed || runSeed;
429
- console.log(chalk_1.default.cyan('Resuming...\n'));
430
- }
431
- else {
432
- console.log(chalk_1.default.cyan('Starting fresh...\n'));
433
- progress.clear(progressFile);
434
- }
435
- }
436
- let data;
437
- try {
438
- data = await (0, benchmarks_js_1.fetchBenchmark)(benchmark.id, benchmark.percentage, benchmark.shuffle, runSeed);
439
- }
440
- catch (error) {
441
- console.log(chalk_1.default.red(`Failed to fetch ${benchmark.name}: ${error}`));
442
- continue;
443
- }
444
- let correct = 0;
445
- const runTimestamp = Date.now();
446
- const runLogEntries = [];
447
- for (let i = startIdx; i < data.length; i++) {
448
- const item = data[i];
449
- try {
450
- const pct = i === 0 ? '0.0' : ((correct / i) * 100).toFixed(1);
451
- process.stdout.write(`\r${benchmark.name}: [solving... ${i + 1}/${data.length}] ${pct}% correct`);
452
- const response = await evaluator.evaluate(benchmark, item);
453
- let isCorrect;
454
- let judgeResponse;
455
- if (benchmark.useJudge && judgeClient) {
456
- process.stdout.write(`\r${benchmark.name}: [judging... ${i + 1}/${data.length}] ${pct}% correct`);
457
- const result = await evaluator.evaluateAndCheckWithJudge(benchmark, item, response);
458
- isCorrect = result.correct;
459
- judgeResponse = result.judgeResponse;
460
- }
461
- else {
462
- isCorrect = evaluator.checkAnswer(benchmark, item, response);
463
- }
464
- if (isCorrect)
465
- correct++;
466
- runLogEntries.push({
467
- benchmark: benchmark.id,
468
- model: config.modelName,
469
- question: item,
470
- response,
471
- isCorrect,
472
- judgeResponse,
473
- timestamp: new Date().toISOString(),
474
- index: i,
475
- });
476
- progress.save(progressFile, { completed: i + 1, total: data.length, seed: runSeed });
477
- }
478
- catch (error) {
479
- console.log(chalk_1.default.red(`\nError on item ${i}: ${error}`));
480
- }
481
- }
482
- // Batch write all entries for this run
483
- logger.logBatch(runLogEntries, `${benchmark.id}_${config.modelName}_${runTimestamp}`);
484
- const result = {
485
- benchmark: benchmark.id,
486
- model: config.modelName,
487
- total: data.length,
488
- correct,
489
- accuracy: (correct / data.length) * 100,
490
- timestamp: new Date().toISOString(),
491
- seed: runSeed,
492
- judge: useJudge && judgeModel ? judgeModel : undefined,
493
- };
494
- results.push(result);
495
- console.log(chalk_1.default.green(`\n${benchmark.name}: ${correct}/${data.length} (${result.accuracy.toFixed(2)}%)`));
496
- progress.clear(progressFile);
497
- }
498
- const resultsFile = path_1.default.join((0, paths_js_1.getResultsDir)(), `eval_results_${Date.now()}.json`);
499
- await fs_extra_1.default.ensureDir((0, paths_js_1.getResultsDir)());
500
- await fs_extra_1.default.writeJson(resultsFile, { results, timestamp: new Date().toISOString() }, { spaces: 2 });
501
- console.log(chalk_1.default.bold(`\n=== Results saved to ${resultsFile} ===`));
502
- console.log(chalk_1.default.bold('\n=== Summary ==='));
503
- for (const r of results) {
504
- console.log(`${r.benchmark}: ${r.correct}/${r.total} (${r.accuracy.toFixed(2)}%)`);
505
- }
506
- }
507
- async function showMenu() {
508
- // Remove all existing keypress listeners first
509
- process.stdin.removeAllListeners('keypress');
510
- readline_1.default.emitKeypressEvents(process.stdin);
511
- if (process.stdin.isTTY) {
512
- process.stdin.setRawMode(true);
513
21
  }
514
- let cursor = 0;
515
- const items = ['Run benchmarks', 'Explore past results'];
516
- const render = () => {
517
- process.stdout.write('\x1b[H\x1b[2J');
518
- console.log(chalk_1.default.bold.cyan('\n=== LLM Benchmark Runner ===\n'));
519
- items.forEach((item, i) => {
520
- const prefix = i === cursor ? chalk_1.default.cyan('> ') : ' ';
521
- console.log(`${prefix}${item}`);
522
- });
523
- console.log(chalk_1.default.gray('\nArrow keys, Enter to select, Esc to quit'));
524
- };
525
- render();
526
- return new Promise((resolve) => {
527
- const handleKeypress = async (str, key) => {
528
- if (key.ctrl && key.name === 'c') {
529
- process.stdin.removeListener('keypress', handleKeypress);
530
- if (process.stdin.isTTY)
531
- process.stdin.setRawMode(false);
532
- console.log(chalk_1.default.green('\nGoodbye!'));
533
- process.exit(0);
534
- }
535
- if (key.name === 'up') {
536
- cursor = Math.max(0, cursor - 1);
537
- render();
538
- }
539
- else if (key.name === 'down') {
540
- cursor = Math.min(items.length - 1, cursor + 1);
541
- render();
542
- }
543
- else if (key.name === 'escape') {
544
- process.stdin.removeListener('keypress', handleKeypress);
545
- if (process.stdin.isTTY)
546
- process.stdin.setRawMode(false);
547
- console.log(chalk_1.default.green('\nGoodbye!'));
548
- process.exit(0);
549
- }
550
- else if (key.name === 'return') {
551
- process.stdin.removeListener('keypress', handleKeypress);
552
- if (process.stdin.isTTY)
553
- process.stdin.setRawMode(false);
554
- if (cursor === 0) {
555
- runBenchmarks().then(() => showMenu()).then(() => resolve());
556
- }
557
- else if (cursor === 1) {
558
- if (process.stdin.isTTY)
559
- process.stdin.setRawMode(false);
560
- process.stdin.removeAllListeners('keypress');
561
- process.stdout.write('\x1b[H\x1b[2J');
562
- const tuiPath = path_1.default.join(__dirname, '..', 'bin', 'tui.js');
563
- (0, child_process_1.execFileSync)('node', [tuiPath], { stdio: 'inherit' });
564
- showMenu().then(() => resolve());
565
- }
566
- }
567
- };
568
- process.stdin.on('keypress', handleKeypress);
569
- });
570
22
  }
571
- showMenu().catch(console.error);
23
+ main().catch(console.error);
572
24
  //# sourceMappingURL=index.js.map