promptbench-cli 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promptbench_cli-0.1.0/LICENSE +21 -0
- promptbench_cli-0.1.0/PKG-INFO +240 -0
- promptbench_cli-0.1.0/README.md +200 -0
- promptbench_cli-0.1.0/promptbench/__init__.py +4 -0
- promptbench_cli-0.1.0/promptbench/__main__.py +6 -0
- promptbench_cli-0.1.0/promptbench/cli.py +228 -0
- promptbench_cli-0.1.0/promptbench/display.py +209 -0
- promptbench_cli-0.1.0/promptbench/providers.py +420 -0
- promptbench_cli-0.1.0/promptbench/runner.py +168 -0
- promptbench_cli-0.1.0/promptbench_cli.egg-info/PKG-INFO +240 -0
- promptbench_cli-0.1.0/promptbench_cli.egg-info/SOURCES.txt +16 -0
- promptbench_cli-0.1.0/promptbench_cli.egg-info/dependency_links.txt +1 -0
- promptbench_cli-0.1.0/promptbench_cli.egg-info/entry_points.txt +2 -0
- promptbench_cli-0.1.0/promptbench_cli.egg-info/requires.txt +19 -0
- promptbench_cli-0.1.0/promptbench_cli.egg-info/top_level.txt +1 -0
- promptbench_cli-0.1.0/pyproject.toml +53 -0
- promptbench_cli-0.1.0/setup.cfg +4 -0
- promptbench_cli-0.1.0/tests/test_promptbench.py +316 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Shehriyar Ali Rustam
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: promptbench-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A/B test prompts across LLM providers from your terminal
|
|
5
|
+
Author-email: Shehriyar Ali Rustam <shehriyar@example.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Shehriyar-Ali-Rustam/promptbench
|
|
8
|
+
Project-URL: Repository, https://github.com/Shehriyar-Ali-Rustam/promptbench
|
|
9
|
+
Project-URL: Issues, https://github.com/Shehriyar-Ali-Rustam/promptbench/issues
|
|
10
|
+
Keywords: llm,prompt,benchmark,openai,anthropic,gemini,cli,ai,testing
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Software Development :: Testing
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Provides-Extra: openai
|
|
26
|
+
Requires-Dist: openai>=1.0; extra == "openai"
|
|
27
|
+
Provides-Extra: anthropic
|
|
28
|
+
Requires-Dist: anthropic>=0.30; extra == "anthropic"
|
|
29
|
+
Provides-Extra: google
|
|
30
|
+
Requires-Dist: google-genai>=1.0; extra == "google"
|
|
31
|
+
Provides-Extra: all
|
|
32
|
+
Requires-Dist: openai>=1.0; extra == "all"
|
|
33
|
+
Requires-Dist: anthropic>=0.30; extra == "all"
|
|
34
|
+
Requires-Dist: google-genai>=1.0; extra == "all"
|
|
35
|
+
Provides-Extra: dev
|
|
36
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
37
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
38
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
39
|
+
Dynamic: license-file
|
|
40
|
+
|
|
41
|
+
# ⚡ PromptBench
|
|
42
|
+
|
|
43
|
+
**A/B test prompts across LLM providers from your terminal.**
|
|
44
|
+
|
|
45
|
+
[![CI](https://github.com/Shehriyar-Ali-Rustam/promptbench/actions/workflows/ci.yml/badge.svg)](https://github.com/Shehriyar-Ali-Rustam/promptbench/actions/workflows/ci.yml)
|
|
46
|
+
[![Python 3.9+](https://img.shields.io/badge/python-3.9%2B-blue)](https://www.python.org/downloads/)
|
|
47
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
48
|
+
[![PyPI](https://img.shields.io/pypi/v/promptbench-cli)](https://pypi.org/project/promptbench-cli/)
|
|
49
|
+
|
|
50
|
+
Compare how different LLMs respond to the same prompt — see latency, token usage, cost, and responses side by side in one command.
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
$ promptbench "Explain quantum computing in one sentence"
|
|
54
|
+
|
|
55
|
+
⚡ PROMPTBENCH RESULTS
|
|
56
|
+
────────────────────────────────────────────────────────────
|
|
57
|
+
|
|
58
|
+
Prompt: Explain quantum computing in one sentence
|
|
59
|
+
|
|
60
|
+
Model Latency Tokens Cost
|
|
61
|
+
─────────────────────────────────── ────────── ────────── ────────────
|
|
62
|
+
claude-sonnet-4-20250514 1.24s 142 $0.0006
|
|
63
|
+
gemini-2.0-flash ⚡💰 312ms 98 $0.0000
|
|
64
|
+
gpt-4o 845ms 127 $0.0010
|
|
65
|
+
|
|
66
|
+
┌─ claude-sonnet-4-20250514
|
|
67
|
+
│ Quantum computing uses qubits that can exist in superpositions of
|
|
68
|
+
│ 0 and 1 simultaneously, enabling parallel computation that can
|
|
69
|
+
│ solve certain problems exponentially faster than classical computers.
|
|
70
|
+
└───────────────────────────────────────────────────────────
|
|
71
|
+
|
|
72
|
+
┌─ gemini-2.0-flash
|
|
73
|
+
│ Quantum computing harnesses quantum mechanical phenomena like
|
|
74
|
+
│ superposition and entanglement to process information in ways
|
|
75
|
+
│ impossible for traditional computers.
|
|
76
|
+
└───────────────────────────────────────────────────────────
|
|
77
|
+
|
|
78
|
+
┌─ gpt-4o
|
|
79
|
+
│ Quantum computing leverages the principles of quantum mechanics —
|
|
80
|
+
│ superposition and entanglement — to perform computations that
|
|
81
|
+
│ would be infeasible for classical computers.
|
|
82
|
+
└───────────────────────────────────────────────────────────
|
|
83
|
+
|
|
84
|
+
Total cost: $0.0016 · Avg latency: 799ms · 3 model(s)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Why PromptBench?
|
|
88
|
+
|
|
89
|
+
- **One command, multiple models** — no switching between playgrounds
|
|
90
|
+
- **Side-by-side comparison** — latency, tokens, cost, and full responses
|
|
91
|
+
- **Supports major providers** — OpenAI, Anthropic, Google Gemini
|
|
92
|
+
- **Multiple output formats** — terminal table, JSON, CSV, Markdown
|
|
93
|
+
- **Fast parallel execution** — all models run concurrently by default
|
|
94
|
+
- **Batch mode** — test multiple prompts from a file
|
|
95
|
+
- **Zero required dependencies** — install only the providers you need
|
|
96
|
+
- **Pipe-friendly** — works with stdin for scripting workflows
|
|
97
|
+
|
|
98
|
+
## Install
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
pip install promptbench-cli
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Install with provider SDKs:
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
# Individual providers
|
|
108
|
+
pip install "promptbench-cli[openai]"
|
|
109
|
+
pip install "promptbench-cli[anthropic]"
|
|
110
|
+
pip install "promptbench-cli[google]"
|
|
111
|
+
|
|
112
|
+
# All providers at once
|
|
113
|
+
pip install "promptbench-cli[all]"
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Setup
|
|
117
|
+
|
|
118
|
+
Export your API keys:
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
export OPENAI_API_KEY="sk-..."
|
|
122
|
+
export ANTHROPIC_API_KEY="sk-ant-..."
|
|
123
|
+
export GOOGLE_API_KEY="AI..."
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
You only need keys for the providers you want to use. PromptBench will warn you if a key is missing.
|
|
127
|
+
|
|
128
|
+
## Usage
|
|
129
|
+
|
|
130
|
+
### Basic comparison (default: GPT-4o, Claude Sonnet, Gemini Flash)
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
promptbench "Explain quantum computing in one sentence"
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
### Pick specific models
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
promptbench "Write a haiku about coding" -m gpt4o sonnet flash
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### With a system prompt
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
promptbench "Summarize this text" -s "You are a concise technical writer" -m gpt4mini haiku
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Batch prompts from a file
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
# prompts.txt — one prompt per line
|
|
152
|
+
promptbench -f prompts.txt -m gpt4o sonnet
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### JSON output
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
promptbench "What is Python?" -o json
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### CSV output
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
promptbench "What is Python?" -o csv > results.csv
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### Save results to file
|
|
168
|
+
|
|
169
|
+
```bash
|
|
170
|
+
promptbench "Compare REST vs GraphQL" --save results.json
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### Pipe from stdin
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
echo "What is the meaning of life?" | promptbench -m gpt4o sonnet
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### List all supported models
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
promptbench --list-models
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## Supported Models
|
|
186
|
+
|
|
187
|
+
| Alias | Model | Provider |
|
|
188
|
+
|-------|-------|----------|
|
|
189
|
+
| `gpt4o` / `gpt4` | gpt-4o | OpenAI |
|
|
190
|
+
| `gpt4mini` | gpt-4o-mini | OpenAI |
|
|
191
|
+
| `gpt3.5` | gpt-3.5-turbo | OpenAI |
|
|
192
|
+
| `sonnet` / `claude-sonnet` | claude-sonnet-4-20250514 | Anthropic |
|
|
193
|
+
| `haiku` / `claude-haiku` | claude-haiku-4-5-20251001 | Anthropic |
|
|
194
|
+
| `flash` / `gemini-flash` | gemini-2.0-flash | Google |
|
|
195
|
+
| `gemini-pro` | gemini-1.5-pro | Google |
|
|
196
|
+
|
|
197
|
+
You can also use full model names directly (e.g., `gpt-4-turbo`, `gemini-1.5-flash`).
|
|
198
|
+
|
|
199
|
+
## Python Library Usage
|
|
200
|
+
|
|
201
|
+
```python
|
|
202
|
+
from promptbench.runner import run_bench
|
|
203
|
+
from promptbench.display import display_comparison
|
|
204
|
+
|
|
205
|
+
run = run_bench(
|
|
206
|
+
prompt="Explain recursion simply",
|
|
207
|
+
models=["gpt4o", "sonnet", "flash"],
|
|
208
|
+
temperature=0.5,
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
print(display_comparison(run))
|
|
212
|
+
|
|
213
|
+
# Access individual results
|
|
214
|
+
for result in run.results:
|
|
215
|
+
print(f"{result.model}: {result.latency_ms:.0f}ms, {result.cost_usd:.6f} USD")
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
## Configuration Flags
|
|
219
|
+
|
|
220
|
+
| Flag | Description | Default |
|
|
221
|
+
|------|-------------|---------|
|
|
222
|
+
| `-m, --models` | Models to test | `gpt4o sonnet flash` |
|
|
223
|
+
| `-s, --system` | System prompt | None |
|
|
224
|
+
| `-t, --temperature` | Sampling temperature | `0.7` |
|
|
225
|
+
| `--max-tokens` | Max output tokens | `1024` |
|
|
226
|
+
| `-f, --file` | Prompts file path | None |
|
|
227
|
+
| `-o, --output` | Output format: table, json, csv, markdown | `table` |
|
|
228
|
+
| `--full` | Show full responses (no truncation) | Off |
|
|
229
|
+
| `--no-parallel` | Run models sequentially | Off |
|
|
230
|
+
| `--save` | Save results to JSON file | None |
|
|
231
|
+
| `--list-models` | List supported models | — |
|
|
232
|
+
| `--version` | Show version | — |
|
|
233
|
+
|
|
234
|
+
## Contributing
|
|
235
|
+
|
|
236
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup, how to add providers, and PR guidelines.
|
|
237
|
+
|
|
238
|
+
## License
|
|
239
|
+
|
|
240
|
+
MIT — see [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
# ⚡ PromptBench
|
|
2
|
+
|
|
3
|
+
**A/B test prompts across LLM providers from your terminal.**
|
|
4
|
+
|
|
5
|
+
[![CI](https://github.com/Shehriyar-Ali-Rustam/promptbench/actions/workflows/ci.yml/badge.svg)](https://github.com/Shehriyar-Ali-Rustam/promptbench/actions/workflows/ci.yml)
|
|
6
|
+
[![Python 3.9+](https://img.shields.io/badge/python-3.9%2B-blue)](https://www.python.org/downloads/)
|
|
7
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
8
|
+
[![PyPI](https://img.shields.io/pypi/v/promptbench-cli)](https://pypi.org/project/promptbench-cli/)
|
|
9
|
+
|
|
10
|
+
Compare how different LLMs respond to the same prompt — see latency, token usage, cost, and responses side by side in one command.
|
|
11
|
+
|
|
12
|
+
```
|
|
13
|
+
$ promptbench "Explain quantum computing in one sentence"
|
|
14
|
+
|
|
15
|
+
⚡ PROMPTBENCH RESULTS
|
|
16
|
+
────────────────────────────────────────────────────────────
|
|
17
|
+
|
|
18
|
+
Prompt: Explain quantum computing in one sentence
|
|
19
|
+
|
|
20
|
+
Model Latency Tokens Cost
|
|
21
|
+
─────────────────────────────────── ────────── ────────── ────────────
|
|
22
|
+
claude-sonnet-4-20250514 1.24s 142 $0.0006
|
|
23
|
+
gemini-2.0-flash ⚡💰 312ms 98 $0.0000
|
|
24
|
+
gpt-4o 845ms 127 $0.0010
|
|
25
|
+
|
|
26
|
+
┌─ claude-sonnet-4-20250514
|
|
27
|
+
│ Quantum computing uses qubits that can exist in superpositions of
|
|
28
|
+
│ 0 and 1 simultaneously, enabling parallel computation that can
|
|
29
|
+
│ solve certain problems exponentially faster than classical computers.
|
|
30
|
+
└───────────────────────────────────────────────────────────
|
|
31
|
+
|
|
32
|
+
┌─ gemini-2.0-flash
|
|
33
|
+
│ Quantum computing harnesses quantum mechanical phenomena like
|
|
34
|
+
│ superposition and entanglement to process information in ways
|
|
35
|
+
│ impossible for traditional computers.
|
|
36
|
+
└───────────────────────────────────────────────────────────
|
|
37
|
+
|
|
38
|
+
┌─ gpt-4o
|
|
39
|
+
│ Quantum computing leverages the principles of quantum mechanics —
|
|
40
|
+
│ superposition and entanglement — to perform computations that
|
|
41
|
+
│ would be infeasible for classical computers.
|
|
42
|
+
└───────────────────────────────────────────────────────────
|
|
43
|
+
|
|
44
|
+
Total cost: $0.0016 · Avg latency: 799ms · 3 model(s)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Why PromptBench?
|
|
48
|
+
|
|
49
|
+
- **One command, multiple models** — no switching between playgrounds
|
|
50
|
+
- **Side-by-side comparison** — latency, tokens, cost, and full responses
|
|
51
|
+
- **Supports major providers** — OpenAI, Anthropic, Google Gemini
|
|
52
|
+
- **Multiple output formats** — terminal table, JSON, CSV, Markdown
|
|
53
|
+
- **Fast parallel execution** — all models run concurrently by default
|
|
54
|
+
- **Batch mode** — test multiple prompts from a file
|
|
55
|
+
- **Zero required dependencies** — install only the providers you need
|
|
56
|
+
- **Pipe-friendly** — works with stdin for scripting workflows
|
|
57
|
+
|
|
58
|
+
## Install
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pip install promptbench-cli
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Install with provider SDKs:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
# Individual providers
|
|
68
|
+
pip install "promptbench-cli[openai]"
|
|
69
|
+
pip install "promptbench-cli[anthropic]"
|
|
70
|
+
pip install "promptbench-cli[google]"
|
|
71
|
+
|
|
72
|
+
# All providers at once
|
|
73
|
+
pip install "promptbench-cli[all]"
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Setup
|
|
77
|
+
|
|
78
|
+
Export your API keys:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
export OPENAI_API_KEY="sk-..."
|
|
82
|
+
export ANTHROPIC_API_KEY="sk-ant-..."
|
|
83
|
+
export GOOGLE_API_KEY="AI..."
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
You only need keys for the providers you want to use. PromptBench will warn you if a key is missing.
|
|
87
|
+
|
|
88
|
+
## Usage
|
|
89
|
+
|
|
90
|
+
### Basic comparison (default: GPT-4o, Claude Sonnet, Gemini Flash)
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
promptbench "Explain quantum computing in one sentence"
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Pick specific models
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
promptbench "Write a haiku about coding" -m gpt4o sonnet flash
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### With a system prompt
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
promptbench "Summarize this text" -s "You are a concise technical writer" -m gpt4mini haiku
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### Batch prompts from a file
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
# prompts.txt — one prompt per line
|
|
112
|
+
promptbench -f prompts.txt -m gpt4o sonnet
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### JSON output
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
promptbench "What is Python?" -o json
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### CSV output
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
promptbench "What is Python?" -o csv > results.csv
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### Save results to file
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
promptbench "Compare REST vs GraphQL" --save results.json
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Pipe from stdin
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
echo "What is the meaning of life?" | promptbench -m gpt4o sonnet
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### List all supported models
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
promptbench --list-models
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## Supported Models
|
|
146
|
+
|
|
147
|
+
| Alias | Model | Provider |
|
|
148
|
+
|-------|-------|----------|
|
|
149
|
+
| `gpt4o` / `gpt4` | gpt-4o | OpenAI |
|
|
150
|
+
| `gpt4mini` | gpt-4o-mini | OpenAI |
|
|
151
|
+
| `gpt3.5` | gpt-3.5-turbo | OpenAI |
|
|
152
|
+
| `sonnet` / `claude-sonnet` | claude-sonnet-4-20250514 | Anthropic |
|
|
153
|
+
| `haiku` / `claude-haiku` | claude-haiku-4-5-20251001 | Anthropic |
|
|
154
|
+
| `flash` / `gemini-flash` | gemini-2.0-flash | Google |
|
|
155
|
+
| `gemini-pro` | gemini-1.5-pro | Google |
|
|
156
|
+
|
|
157
|
+
You can also use full model names directly (e.g., `gpt-4-turbo`, `gemini-1.5-flash`).
|
|
158
|
+
|
|
159
|
+
## Python Library Usage
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
from promptbench.runner import run_bench
|
|
163
|
+
from promptbench.display import display_comparison
|
|
164
|
+
|
|
165
|
+
run = run_bench(
|
|
166
|
+
prompt="Explain recursion simply",
|
|
167
|
+
models=["gpt4o", "sonnet", "flash"],
|
|
168
|
+
temperature=0.5,
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
print(display_comparison(run))
|
|
172
|
+
|
|
173
|
+
# Access individual results
|
|
174
|
+
for result in run.results:
|
|
175
|
+
print(f"{result.model}: {result.latency_ms:.0f}ms, {result.cost_usd:.6f} USD")
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
## Configuration Flags
|
|
179
|
+
|
|
180
|
+
| Flag | Description | Default |
|
|
181
|
+
|------|-------------|---------|
|
|
182
|
+
| `-m, --models` | Models to test | `gpt4o sonnet flash` |
|
|
183
|
+
| `-s, --system` | System prompt | None |
|
|
184
|
+
| `-t, --temperature` | Sampling temperature | `0.7` |
|
|
185
|
+
| `--max-tokens` | Max output tokens | `1024` |
|
|
186
|
+
| `-f, --file` | Prompts file path | None |
|
|
187
|
+
| `-o, --output` | Output format: table, json, csv, markdown | `table` |
|
|
188
|
+
| `--full` | Show full responses (no truncation) | Off |
|
|
189
|
+
| `--no-parallel` | Run models sequentially | Off |
|
|
190
|
+
| `--save` | Save results to JSON file | None |
|
|
191
|
+
| `--list-models` | List supported models | — |
|
|
192
|
+
| `--version` | Show version | — |
|
|
193
|
+
|
|
194
|
+
## Contributing
|
|
195
|
+
|
|
196
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup, how to add providers, and PR guidelines.
|
|
197
|
+
|
|
198
|
+
## License
|
|
199
|
+
|
|
200
|
+
MIT — see [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
"""CLI entry point for PromptBench."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
|
|
10
|
+
from promptbench import __version__
|
|
11
|
+
from promptbench.display import display_comparison, display_csv, display_json, display_markdown
|
|
12
|
+
from promptbench.providers import (
|
|
13
|
+
MODEL_ALIASES,
|
|
14
|
+
PRICING,
|
|
15
|
+
detect_provider,
|
|
16
|
+
resolve_model,
|
|
17
|
+
)
|
|
18
|
+
from promptbench.runner import run_bench, run_bench_from_file
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _check_api_keys(models: list[str]) -> list[str]:
    """Return warning strings for every provider whose API key env var is unset.

    Each requested model (name or alias) is resolved and mapped to its
    provider; a warning is produced per missing provider key, sorted by
    provider name for stable output.
    """
    env_var_by_provider = {
        "openai": "OPENAI_API_KEY",
        "anthropic": "ANTHROPIC_API_KEY",
        "google": "GOOGLE_API_KEY",
    }

    # Distinct providers backing the requested models.
    required_providers = {detect_provider(resolve_model(m)) for m in models}

    return [
        f"⚠ {env_var} not set — {provider} models will fail"
        for provider in sorted(required_providers)
        if (env_var := env_var_by_provider.get(provider)) and not os.environ.get(env_var)
    ]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _list_models() -> str:
    """Format the list of supported models and aliases.

    Produces two sorted tables: per-model pricing (USD per million tokens,
    input and output) with the detected provider, followed by the alias map.
    """
    rows: list[str] = ["\n⚡ PromptBench — Supported Models\n"]

    # Pricing table: one row per known model, sorted by model name.
    rows.append("Models with pricing info:")
    rows.append(f" {'Model':<35} {'Input $/1M':>12} {'Output $/1M':>12} {'Provider':<10}")
    rows.append(f" {'─' * 35} {'─' * 12} {'─' * 12} {'─' * 10}")
    for model, (price_in, price_out) in sorted(PRICING.items()):
        owner = detect_provider(model)
        rows.append(f" {model:<35} ${price_in:<11.3f} ${price_out:<11.3f} {owner:<10}")

    # Alias table: short names users may pass with -m.
    rows.append("\nAliases:")
    rows.append(f" {'Alias':<20} → {'Model':<35}")
    rows.append(f" {'─' * 20} {'─' * 35}")
    for alias, target in sorted(MODEL_ALIASES.items()):
        rows.append(f" {alias:<20} → {target:<35}")

    rows.append("")
    return "\n".join(rows)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Build and return the argument parser.

    Declares the positional prompt plus every option documented in the
    README's Configuration Flags table. The epilog is rendered verbatim
    via RawDescriptionHelpFormatter.
    """
    epilog = """Examples:
  promptbench "Explain quantum computing in one sentence"
  promptbench "Write a haiku about coding" -m gpt4o sonnet flash
  promptbench "Summarize this" -s "You are a helpful assistant" -m gpt4mini haiku
  promptbench -f prompts.txt -m gpt4o sonnet -o json
  promptbench "Hello world" -o csv --save results.json
  echo "What is Python?" | promptbench -m gpt4o
  promptbench --list-models
"""
    parser = argparse.ArgumentParser(
        prog="promptbench",
        description="⚡ PromptBench — A/B test prompts across LLM providers from your terminal.",
        epilog=epilog,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    add = parser.add_argument
    # Positional prompt is optional: it may come from --file or piped stdin instead.
    add("prompt", nargs="?", default=None,
        help="The prompt to test across models")
    add("-m", "--models", nargs="+", default=["gpt4o", "sonnet", "flash"],
        help="Models to test (names or aliases). Default: gpt4o sonnet flash")
    add("-s", "--system", default=None,
        help="System prompt to use for all models")
    add("-t", "--temperature", type=float, default=0.7,
        help="Sampling temperature (default: 0.7)")
    add("--max-tokens", type=int, default=1024,
        help="Maximum output tokens (default: 1024)")
    add("-f", "--file", default=None,
        help="Path to a file with prompts (one per line or JSON array)")
    add("-o", "--output", choices=["table", "json", "csv", "markdown"], default="table",
        help="Output format (default: table)")
    add("--full", action="store_true",
        help="Show full responses (no truncation)")
    add("--no-parallel", action="store_true",
        help="Run models sequentially instead of in parallel")
    add("--save", default=None, metavar="FILE",
        help="Save results to a JSON file")
    add("--list-models", action="store_true",
        help="List all supported models and aliases")
    add("--version", action="version",
        version=f"promptbench {__version__}")

    return parser
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _format_run(run, output: str, full: bool) -> str:
    """Format a BenchRun according to the chosen output format.

    Dispatches "json"/"csv"/"markdown" to the matching display helper;
    anything else falls through to the human-readable comparison table,
    where *full* disables response truncation.
    """
    formatters = {
        "json": display_json,
        "csv": display_csv,
        "markdown": display_markdown,
    }
    formatter = formatters.get(output)
    if formatter is not None:
        return formatter(run)
    return display_comparison(run, show_full=full)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _save_results(path: str, data) -> None:
    """Write *data* to *path* as pretty-printed JSON and confirm on stdout."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    print(f"\n✓ Results saved to {path}")


def main(argv: list[str] | None = None) -> None:
    """Main CLI entry point.

    Resolves the prompt source (argument, --file, or piped stdin), warns
    about missing API keys, runs the benchmark(s), prints results in the
    requested format, and optionally saves them as JSON via --save.
    Exits with status 1 when no prompt source is available.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # List models and exit — no prompt needed.
    if args.list_models:
        print(_list_models())
        return

    # Determine prompt source.
    prompt = args.prompt
    file_path = args.file

    # No explicit prompt or file: fall back to piped stdin, if any.
    if not prompt and not file_path and not sys.stdin.isatty():
        prompt = sys.stdin.read().strip()

    if not prompt and not file_path:
        parser.print_help()
        sys.exit(1)

    # Warn about missing API keys on stderr so structured output stays clean.
    for warning in _check_api_keys(args.models):
        print(warning, file=sys.stderr)

    parallel = not args.no_parallel

    # Batch mode: one BenchRun per prompt in the file.
    if file_path:
        runs = run_bench_from_file(
            filepath=file_path,
            models=args.models,
            system=args.system,
            temperature=args.temperature,
            max_tokens=args.max_tokens,
            parallel=parallel,
            verbose=True,
        )
        for run in runs:
            print(_format_run(run, args.output, args.full))

        if args.save:
            _save_results(args.save, [r.to_dict() for r in runs])
        return

    # Single-prompt mode.
    run = run_bench(
        prompt=prompt,
        models=args.models,
        system=args.system,
        temperature=args.temperature,
        max_tokens=args.max_tokens,
        parallel=parallel,
        verbose=True,
    )
    print(_format_run(run, args.output, args.full))

    if args.save:
        _save_results(args.save, run.to_dict())