llmtester 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +156 -0
- package/bin/cli.js +2 -0
- package/bin/tui.js +2 -0
- package/dist/benchmarks.d.ts +17 -0
- package/dist/benchmarks.d.ts.map +1 -0
- package/dist/benchmarks.js +612 -0
- package/dist/benchmarks.js.map +1 -0
- package/dist/client.d.ts +69 -0
- package/dist/client.d.ts.map +1 -0
- package/dist/client.js +103 -0
- package/dist/client.js.map +1 -0
- package/dist/evaluator.d.ts +57 -0
- package/dist/evaluator.d.ts.map +1 -0
- package/dist/evaluator.js +410 -0
- package/dist/evaluator.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +515 -0
- package/dist/index.js.map +1 -0
- package/dist/logger.d.ts +16 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/logger.js +30 -0
- package/dist/logger.js.map +1 -0
- package/dist/paths.d.ts +6 -0
- package/dist/paths.d.ts.map +1 -0
- package/dist/paths.js +49 -0
- package/dist/paths.js.map +1 -0
- package/dist/progress.d.ts +13 -0
- package/dist/progress.d.ts.map +1 -0
- package/dist/progress.js +47 -0
- package/dist/progress.js.map +1 -0
- package/dist/tui.d.ts +3 -0
- package/dist/tui.d.ts.map +1 -0
- package/dist/tui.js +326 -0
- package/dist/tui.js.map +1 -0
- package/package.json +45 -0
package/README.md
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# llmtester
|
|
2
|
+
|
|
3
|
+
An interactive CLI tool for benchmarking LLMs across multiple benchmarks. Run via `npx` without installing.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Interactive CLI** - Keyboard-driven benchmark selection and configuration
|
|
8
|
+
- **Multi-Provider Support** - OpenAI, Anthropic, Together.ai, Groq, Fireworks AI, Perplexity, OpenRouter, and any OpenAI-compatible API
|
|
9
|
+
- **LLM-as-Judge** - Optional secondary model evaluation for code, math, SQL, bash, and truthfulness benchmarks
|
|
10
|
+
- **Progress Tracking** - Resume interrupted evaluations from where you left off
|
|
11
|
+
- **Result Explorer** - Built-in TUI to browse past results, filter by pass/fail, and inspect individual responses
|
|
12
|
+
- **Config Persistence** - Saves provider, endpoint, and model settings between runs
|
|
13
|
+
- **Shuffle & Sampling** - Run a percentage of each benchmark with optional shuffling for diverse distribution
|
|
14
|
+
|
|
15
|
+
## Screenshots
|
|
16
|
+
|
|
17
|
+
**Select benchmarks to run**
|
|
18
|
+

|
|
19
|
+
|
|
20
|
+
**Progress during benchmark run**
|
|
21
|
+

|
|
22
|
+
|
|
23
|
+
**Browse past benchmark runs**
|
|
24
|
+

|
|
25
|
+
|
|
26
|
+
**Run a specific benchmark**
|
|
27
|
+

|
|
28
|
+
|
|
29
|
+
**Inspect individual test results**
|
|
30
|
+

|
|
31
|
+
|
|
32
|
+
## Supported Benchmarks
|
|
33
|
+
|
|
34
|
+
| Benchmark | Tests | Description | Judge |
|
|
35
|
+
|-----------|-------|-------------|-------|
|
|
36
|
+
| GSM8K | 1,319 | Grade School Math | |
|
|
37
|
+
| MATH | 12,500 | Competition Math | Yes |
|
|
38
|
+
| BIG-Bench Hard | 6,511 | 23 Challenging Tasks | |
|
|
39
|
+
| ARC-Challenge | 1,172 | AI2 Reasoning Challenge | |
|
|
40
|
+
| HellaSwag | 10,042 | Commonsense Reasoning | |
|
|
41
|
+
| MMLU | 14,042 | Multitask Language Understanding | |
|
|
42
|
+
| HumanEval | 164 | Python Code Generation | Yes |
|
|
43
|
+
| MBPP | 500 | Mostly Basic Programming Problems | Yes |
|
|
44
|
+
| APPS | 5,000 | Automated Programming Progress Standard | Yes |
|
|
45
|
+
| TypeScript (MultiPL-E) | 161 | TypeScript Code Generation | |
|
|
46
|
+
| NL2Bash | 24 | Natural Language to Bash | Yes |
|
|
47
|
+
| Spider | 1,034 | Text-to-SQL Generation | Yes |
|
|
48
|
+
| TruthfulQA | 817 | Truthfulness Evaluation | Yes |
|
|
49
|
+
|
|
50
|
+
## Installation
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
# Run directly with npx
|
|
54
|
+
npx llmtester
|
|
55
|
+
|
|
56
|
+
# Or clone and run locally
|
|
57
|
+
git clone <repo>
|
|
58
|
+
cd llmbenchmark
|
|
59
|
+
npm install
|
|
60
|
+
npm run build
|
|
61
|
+
npm start
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Configuration
|
|
65
|
+
|
|
66
|
+
The app prompts for all configuration on first run and saves it to a config file (`~/.config/llmtester/config.json` on Linux). You can edit this file manually to change settings:
|
|
67
|
+
|
|
68
|
+
```json
|
|
69
|
+
{
|
|
70
|
+
"provider": "openai",
|
|
71
|
+
"apiKey": "your_api_key",
|
|
72
|
+
"baseUrl": "https://api.openai.com/v1",
|
|
73
|
+
"modelName": "gpt-4o"
|
|
74
|
+
}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## LLM-as-Judge
|
|
78
|
+
|
|
79
|
+
Some benchmarks use a secondary LLM ("judge") to evaluate responses instead of exact matching. This is necessary for:
|
|
80
|
+
|
|
81
|
+
- **Code generation** (HumanEval, MBPP, APPS) - since many different implementations can be functionally correct
|
|
82
|
+
- **Math** (MATH) - since multiple answer formats can represent the same solution
|
|
83
|
+
- **SQL** (Spider) - since multiple SQL queries can return the same result
|
|
84
|
+
- **Truthfulness** (TruthfulQA) - requires semantic evaluation
|
|
85
|
+
|
|
86
|
+
**Recommended judge models:**
|
|
87
|
+
- `gpt-4o-mini` - Fast and cost-effective, good quality
|
|
88
|
+
- `gpt-4o` - Higher quality for more accurate evaluation
|
|
89
|
+
- `claude-sonnet-4-20250514` - Good alternative from Anthropic
|
|
90
|
+
- `deepseek-ai/DeepSeek-V3` - Cost-effective option
|
|
91
|
+
|
|
92
|
+
The judge is configured separately from the main model when you run benchmarks that require one.
|
|
93
|
+
|
|
94
|
+
## Usage
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
npx llmtester
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
1. Enter your API key
|
|
101
|
+
2. Select or enter the endpoint URL
|
|
102
|
+
3. Enter the model name
|
|
103
|
+
4. Select benchmarks to run (multi-select with Space, confirm with Enter)
|
|
104
|
+
5. Set percentage to run per benchmark (1-100)
|
|
105
|
+
6. Choose whether to shuffle samples
|
|
106
|
+
7. If any selected benchmark uses a judge, configure the judge model
|
|
107
|
+
8. Review configuration summary and confirm to start
|
|
108
|
+
|
|
109
|
+
## Preset Endpoints
|
|
110
|
+
|
|
111
|
+
| Provider | URL |
|
|
112
|
+
|----------|-----|
|
|
113
|
+
| OpenAI | `https://api.openai.com/v1` |
|
|
114
|
+
| Together.ai | `https://api.together.xyz/v1` |
|
|
115
|
+
| Groq | `https://api.groq.com/openai/v1` |
|
|
116
|
+
| Fireworks AI | `https://api.fireworks.ai/inference/v1` |
|
|
117
|
+
| Perplexity | `https://api.perplexity.ai` |
|
|
118
|
+
| OpenRouter | `https://openrouter.ai/api/v1` |
|
|
119
|
+
|
|
120
|
+
## Output
|
|
121
|
+
|
|
122
|
+
Results are stored in platform-specific application data directories:
|
|
123
|
+
|
|
124
|
+
| Platform | Data Directory |
|
|
125
|
+
|----------|---------------|
|
|
126
|
+
| Linux | `~/.local/share/llmtester/` |
|
|
127
|
+
| macOS | `~/Library/Application Support/llmtester/` |
|
|
128
|
+
| Windows | `%APPDATA%/llmtester/` |
|
|
129
|
+
|
|
130
|
+
Within the data directory:
|
|
131
|
+
|
|
132
|
+
- **`results/eval_results_{timestamp}.json`** - Aggregate results per benchmark run
|
|
133
|
+
- **`detailed_logs/*.jsonl`** - Per-question logs (question, response, correctness, judge feedback)
|
|
134
|
+
- **`eval_progress/`** - Resume files for interrupted runs
|
|
135
|
+
|
|
136
|
+
## Config Directory
|
|
137
|
+
|
|
138
|
+
Config is stored in the platform config directory:
|
|
139
|
+
|
|
140
|
+
| Platform | Config Directory |
|
|
141
|
+
|----------|-----------------|
|
|
142
|
+
| Linux | `~/.config/llmtester/` |
|
|
143
|
+
| macOS | `~/Library/Application Support/llmtester/` |
|
|
144
|
+
| Windows | `%APPDATA%/llmtester/` |
|
|
145
|
+
|
|
146
|
+
## Development
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
npm install
|
|
150
|
+
npm run build
|
|
151
|
+
npm start
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## License
|
|
155
|
+
|
|
156
|
+
MIT
|
package/bin/cli.js
ADDED
package/bin/tui.js
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
 * Configuration and metadata for one runnable benchmark
 * (compiled declarations from src/benchmarks.ts).
 */
export interface Benchmark {
  /** Stable identifier; key used to look the benchmark up in BENCHMARK_DEFINITIONS. */
  id: string;
  /** Human-readable name shown in the selection UI. */
  name: string;
  /** Short description of what the benchmark measures. */
  description: string;
  /** Benchmark category tag — exact semantics defined in src/benchmarks.ts; not visible here. */
  type: string;
  /** Presumably whether an execution harness is needed to score responses — TODO confirm against src. */
  requiresHarness: boolean;
  /** Number of samples run when no percentage override is supplied. */
  defaultSamples: number;
  /** Optional percentage (1-100) of the dataset to run (see README "Usage", step 5). */
  percentage?: number;
  /** When true, samples are shuffled for a more diverse distribution (README "Shuffle & Sampling"). */
  shuffle?: boolean;
  /** Optional template used to build the prompt sent to the model under test. */
  promptTemplate?: string;
  /** Presumably the dataset field holding the expected answer — verify against src/benchmarks.ts. */
  answerField?: string;
  /** True for benchmarks scored by an LLM judge (per README: MATH, HumanEval, MBPP, APPS, NL2Bash, Spider, TruthfulQA). */
  useJudge?: boolean;
  /** Optional template used to build the judge model's evaluation prompt. */
  judgePromptTemplate?: string;
}

/** Registry of all supported benchmarks, keyed by benchmark id. */
export declare const BENCHMARK_DEFINITIONS: Record<string, Benchmark>;

/**
 * Loads the dataset for one benchmark, optionally sampling and shuffling it.
 *
 * @param benchmarkId - Key into BENCHMARK_DEFINITIONS.
 * @param percentage - Percentage (1-100) of the dataset to return — default not visible in this declaration; confirm in src.
 * @param shuffle - Whether to shuffle samples before taking the percentage.
 * @param seed - Presumably an RNG seed for reproducible shuffles — TODO confirm.
 * @returns The selected samples. NOTE(review): `any[]` leaks untyped data; consider `unknown[]` in the source.
 */
export declare function fetchBenchmark(benchmarkId: string, percentage?: number, shuffle?: boolean, seed?: number): Promise<any[]>;
//# sourceMappingURL=benchmarks.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"benchmarks.d.ts","sourceRoot":"","sources":["../src/benchmarks.ts"],"names":[],"mappings":"AAmBA,MAAM,WAAW,SAAS;IACxB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB,IAAI,EAAE,MAAM,CAAC;IACb,eAAe,EAAE,OAAO,CAAC;IACzB,cAAc,EAAE,MAAM,CAAC;IACvB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,mBAAmB,CAAC,EAAE,MAAM,CAAC;CAC9B;AAED,eAAO,MAAM,qBAAqB,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAiR3D,CAAC;AAwUF,wBAAsB,cAAc,CAAC,WAAW,EAAE,MAAM,EAAE,UAAU,GAAE,MAAY,EAAE,OAAO,GAAE,OAAc,EAAE,IAAI,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,GAAG,EAAE,CAAC,CA+B1I"}
|