promptbench-cli 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Shehriyar Ali Rustam
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,240 @@
1
+ Metadata-Version: 2.4
2
+ Name: promptbench-cli
3
+ Version: 0.1.0
4
+ Summary: A/B test prompts across LLM providers from your terminal
5
+ Author-email: Shehriyar Ali Rustam <shehriyar@example.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/Shehriyar-Ali-Rustam/promptbench
8
+ Project-URL: Repository, https://github.com/Shehriyar-Ali-Rustam/promptbench
9
+ Project-URL: Issues, https://github.com/Shehriyar-Ali-Rustam/promptbench/issues
10
+ Keywords: llm,prompt,benchmark,openai,anthropic,gemini,cli,ai,testing
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Software Development :: Testing
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Requires-Python: >=3.9
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Provides-Extra: openai
26
+ Requires-Dist: openai>=1.0; extra == "openai"
27
+ Provides-Extra: anthropic
28
+ Requires-Dist: anthropic>=0.30; extra == "anthropic"
29
+ Provides-Extra: google
30
+ Requires-Dist: google-genai>=1.0; extra == "google"
31
+ Provides-Extra: all
32
+ Requires-Dist: openai>=1.0; extra == "all"
33
+ Requires-Dist: anthropic>=0.30; extra == "all"
34
+ Requires-Dist: google-genai>=1.0; extra == "all"
35
+ Provides-Extra: dev
36
+ Requires-Dist: pytest>=7.0; extra == "dev"
37
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
38
+ Requires-Dist: ruff>=0.4; extra == "dev"
39
+ Dynamic: license-file
40
+
41
+ # ⚡ PromptBench
42
+
43
+ **A/B test prompts across LLM providers from your terminal.**
44
+
45
+ [![CI](https://github.com/Shehriyar-Ali-Rustam/promptbench/actions/workflows/ci.yml/badge.svg)](https://github.com/Shehriyar-Ali-Rustam/promptbench/actions/workflows/ci.yml)
46
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
47
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
48
+ [![PyPI](https://img.shields.io/pypi/v/promptbench-cli.svg)](https://pypi.org/project/promptbench-cli/)
49
+
50
+ Compare how different LLMs respond to the same prompt — see latency, token usage, cost, and responses side by side in one command.
51
+
52
+ ```
53
+ $ promptbench "Explain quantum computing in one sentence"
54
+
55
+ ⚡ PROMPTBENCH RESULTS
56
+ ────────────────────────────────────────────────────────────
57
+
58
+ Prompt: Explain quantum computing in one sentence
59
+
60
+ Model Latency Tokens Cost
61
+ ─────────────────────────────────── ────────── ────────── ────────────
62
+ claude-sonnet-4-20250514 1.24s 142 $0.0006
63
+ gemini-2.0-flash ⚡💰 312ms 98 $0.0000
64
+ gpt-4o 845ms 127 $0.0010
65
+
66
+ ┌─ claude-sonnet-4-20250514
67
+ │ Quantum computing uses qubits that can exist in superpositions of
68
+ │ 0 and 1 simultaneously, enabling parallel computation that can
69
+ │ solve certain problems exponentially faster than classical computers.
70
+ └───────────────────────────────────────────────────────────
71
+
72
+ ┌─ gemini-2.0-flash
73
+ │ Quantum computing harnesses quantum mechanical phenomena like
74
+ │ superposition and entanglement to process information in ways
75
+ │ impossible for traditional computers.
76
+ └───────────────────────────────────────────────────────────
77
+
78
+ ┌─ gpt-4o
79
+ │ Quantum computing leverages the principles of quantum mechanics —
80
+ │ superposition and entanglement — to perform computations that
81
+ │ would be infeasible for classical computers.
82
+ └───────────────────────────────────────────────────────────
83
+
84
+ Total cost: $0.0016 · Avg latency: 799ms · 3 model(s)
85
+ ```
86
+
87
+ ## Why PromptBench?
88
+
89
+ - **One command, multiple models** — no switching between playgrounds
90
+ - **Side-by-side comparison** — latency, tokens, cost, and full responses
91
+ - **Supports major providers** — OpenAI, Anthropic, Google Gemini
92
+ - **Multiple output formats** — terminal table, JSON, CSV, Markdown
93
+ - **Fast parallel execution** — all models run concurrently by default
94
+ - **Batch mode** — test multiple prompts from a file
95
+ - **Zero required dependencies** — install only the providers you need
96
+ - **Pipe-friendly** — works with stdin for scripting workflows
97
+
98
+ ## Install
99
+
100
+ ```bash
101
+ pip install promptbench-cli
102
+ ```
103
+
104
+ Install with provider SDKs:
105
+
106
+ ```bash
107
+ # Individual providers
108
+ pip install "promptbench-cli[openai]"
109
+ pip install "promptbench-cli[anthropic]"
110
+ pip install "promptbench-cli[google]"
111
+
112
+ # All providers at once
113
+ pip install "promptbench-cli[all]"
114
+ ```
115
+
116
+ ## Setup
117
+
118
+ Export your API keys:
119
+
120
+ ```bash
121
+ export OPENAI_API_KEY="sk-..."
122
+ export ANTHROPIC_API_KEY="sk-ant-..."
123
+ export GOOGLE_API_KEY="AI..."
124
+ ```
125
+
126
+ You only need keys for the providers you want to use. PromptBench will warn you if a key is missing.
127
+
128
+ ## Usage
129
+
130
+ ### Basic comparison (default: GPT-4o, Claude Sonnet, Gemini Flash)
131
+
132
+ ```bash
133
+ promptbench "Explain quantum computing in one sentence"
134
+ ```
135
+
136
+ ### Pick specific models
137
+
138
+ ```bash
139
+ promptbench "Write a haiku about coding" -m gpt4o sonnet flash
140
+ ```
141
+
142
+ ### With a system prompt
143
+
144
+ ```bash
145
+ promptbench "Summarize this text" -s "You are a concise technical writer" -m gpt4mini haiku
146
+ ```
147
+
148
+ ### Batch prompts from a file
149
+
150
+ ```bash
151
+ # prompts.txt — one prompt per line
152
+ promptbench -f prompts.txt -m gpt4o sonnet
153
+ ```
154
+
155
+ ### JSON output
156
+
157
+ ```bash
158
+ promptbench "What is Python?" -o json
159
+ ```
160
+
161
+ ### CSV output
162
+
163
+ ```bash
164
+ promptbench "What is Python?" -o csv > results.csv
165
+ ```
166
+
167
+ ### Save results to file
168
+
169
+ ```bash
170
+ promptbench "Compare REST vs GraphQL" --save results.json
171
+ ```
172
+
173
+ ### Pipe from stdin
174
+
175
+ ```bash
176
+ echo "What is the meaning of life?" | promptbench -m gpt4o sonnet
177
+ ```
178
+
179
+ ### List all supported models
180
+
181
+ ```bash
182
+ promptbench --list-models
183
+ ```
184
+
185
+ ## Supported Models
186
+
187
+ | Alias | Model | Provider |
188
+ |-------|-------|----------|
189
+ | `gpt4o` / `gpt4` | gpt-4o | OpenAI |
190
+ | `gpt4mini` | gpt-4o-mini | OpenAI |
191
+ | `gpt3.5` | gpt-3.5-turbo | OpenAI |
192
+ | `sonnet` / `claude-sonnet` | claude-sonnet-4-20250514 | Anthropic |
193
+ | `haiku` / `claude-haiku` | claude-haiku-4-5-20251001 | Anthropic |
194
+ | `flash` / `gemini-flash` | gemini-2.0-flash | Google |
195
+ | `gemini-pro` | gemini-1.5-pro | Google |
196
+
197
+ You can also use full model names directly (e.g., `gpt-4-turbo`, `gemini-1.5-flash`).
198
+
199
+ ## Python Library Usage
200
+
201
+ ```python
202
+ from promptbench.runner import run_bench
203
+ from promptbench.display import display_comparison
204
+
205
+ run = run_bench(
206
+ prompt="Explain recursion simply",
207
+ models=["gpt4o", "sonnet", "flash"],
208
+ temperature=0.5,
209
+ )
210
+
211
+ print(display_comparison(run))
212
+
213
+ # Access individual results
214
+ for result in run.results:
215
+ print(f"{result.model}: {result.latency_ms:.0f}ms, {result.cost_usd:.6f} USD")
216
+ ```
217
+
218
+ ## Configuration Flags
219
+
220
+ | Flag | Description | Default |
221
+ |------|-------------|---------|
222
+ | `-m, --models` | Models to test | `gpt4o sonnet flash` |
223
+ | `-s, --system` | System prompt | None |
224
+ | `-t, --temperature` | Sampling temperature | `0.7` |
225
+ | `--max-tokens` | Max output tokens | `1024` |
226
+ | `-f, --file` | Prompts file path | None |
227
+ | `-o, --output` | Output format: table, json, csv, markdown | `table` |
228
+ | `--full` | Show full responses (no truncation) | Off |
229
+ | `--no-parallel` | Run models sequentially | Off |
230
+ | `--save` | Save results to JSON file | None |
231
+ | `--list-models` | List supported models | — |
232
+ | `--version` | Show version | — |
233
+
234
+ ## Contributing
235
+
236
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup, how to add providers, and PR guidelines.
237
+
238
+ ## License
239
+
240
+ MIT — see [LICENSE](LICENSE) for details.
@@ -0,0 +1,200 @@
1
+ # ⚡ PromptBench
2
+
3
+ **A/B test prompts across LLM providers from your terminal.**
4
+
5
+ [![CI](https://github.com/Shehriyar-Ali-Rustam/promptbench/actions/workflows/ci.yml/badge.svg)](https://github.com/Shehriyar-Ali-Rustam/promptbench/actions/workflows/ci.yml)
6
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
7
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
+ [![PyPI](https://img.shields.io/pypi/v/promptbench-cli.svg)](https://pypi.org/project/promptbench-cli/)
9
+
10
+ Compare how different LLMs respond to the same prompt — see latency, token usage, cost, and responses side by side in one command.
11
+
12
+ ```
13
+ $ promptbench "Explain quantum computing in one sentence"
14
+
15
+ ⚡ PROMPTBENCH RESULTS
16
+ ────────────────────────────────────────────────────────────
17
+
18
+ Prompt: Explain quantum computing in one sentence
19
+
20
+ Model Latency Tokens Cost
21
+ ─────────────────────────────────── ────────── ────────── ────────────
22
+ claude-sonnet-4-20250514 1.24s 142 $0.0006
23
+ gemini-2.0-flash ⚡💰 312ms 98 $0.0000
24
+ gpt-4o 845ms 127 $0.0010
25
+
26
+ ┌─ claude-sonnet-4-20250514
27
+ │ Quantum computing uses qubits that can exist in superpositions of
28
+ │ 0 and 1 simultaneously, enabling parallel computation that can
29
+ │ solve certain problems exponentially faster than classical computers.
30
+ └───────────────────────────────────────────────────────────
31
+
32
+ ┌─ gemini-2.0-flash
33
+ │ Quantum computing harnesses quantum mechanical phenomena like
34
+ │ superposition and entanglement to process information in ways
35
+ │ impossible for traditional computers.
36
+ └───────────────────────────────────────────────────────────
37
+
38
+ ┌─ gpt-4o
39
+ │ Quantum computing leverages the principles of quantum mechanics —
40
+ │ superposition and entanglement — to perform computations that
41
+ │ would be infeasible for classical computers.
42
+ └───────────────────────────────────────────────────────────
43
+
44
+ Total cost: $0.0016 · Avg latency: 799ms · 3 model(s)
45
+ ```
46
+
47
+ ## Why PromptBench?
48
+
49
+ - **One command, multiple models** — no switching between playgrounds
50
+ - **Side-by-side comparison** — latency, tokens, cost, and full responses
51
+ - **Supports major providers** — OpenAI, Anthropic, Google Gemini
52
+ - **Multiple output formats** — terminal table, JSON, CSV, Markdown
53
+ - **Fast parallel execution** — all models run concurrently by default
54
+ - **Batch mode** — test multiple prompts from a file
55
+ - **Zero required dependencies** — install only the providers you need
56
+ - **Pipe-friendly** — works with stdin for scripting workflows
57
+
58
+ ## Install
59
+
60
+ ```bash
61
+ pip install promptbench-cli
62
+ ```
63
+
64
+ Install with provider SDKs:
65
+
66
+ ```bash
67
+ # Individual providers
68
+ pip install "promptbench-cli[openai]"
69
+ pip install "promptbench-cli[anthropic]"
70
+ pip install "promptbench-cli[google]"
71
+
72
+ # All providers at once
73
+ pip install "promptbench-cli[all]"
74
+ ```
75
+
76
+ ## Setup
77
+
78
+ Export your API keys:
79
+
80
+ ```bash
81
+ export OPENAI_API_KEY="sk-..."
82
+ export ANTHROPIC_API_KEY="sk-ant-..."
83
+ export GOOGLE_API_KEY="AI..."
84
+ ```
85
+
86
+ You only need keys for the providers you want to use. PromptBench will warn you if a key is missing.
87
+
88
+ ## Usage
89
+
90
+ ### Basic comparison (default: GPT-4o, Claude Sonnet, Gemini Flash)
91
+
92
+ ```bash
93
+ promptbench "Explain quantum computing in one sentence"
94
+ ```
95
+
96
+ ### Pick specific models
97
+
98
+ ```bash
99
+ promptbench "Write a haiku about coding" -m gpt4o sonnet flash
100
+ ```
101
+
102
+ ### With a system prompt
103
+
104
+ ```bash
105
+ promptbench "Summarize this text" -s "You are a concise technical writer" -m gpt4mini haiku
106
+ ```
107
+
108
+ ### Batch prompts from a file
109
+
110
+ ```bash
111
+ # prompts.txt — one prompt per line
112
+ promptbench -f prompts.txt -m gpt4o sonnet
113
+ ```
114
+
115
+ ### JSON output
116
+
117
+ ```bash
118
+ promptbench "What is Python?" -o json
119
+ ```
120
+
121
+ ### CSV output
122
+
123
+ ```bash
124
+ promptbench "What is Python?" -o csv > results.csv
125
+ ```
126
+
127
+ ### Save results to file
128
+
129
+ ```bash
130
+ promptbench "Compare REST vs GraphQL" --save results.json
131
+ ```
132
+
133
+ ### Pipe from stdin
134
+
135
+ ```bash
136
+ echo "What is the meaning of life?" | promptbench -m gpt4o sonnet
137
+ ```
138
+
139
+ ### List all supported models
140
+
141
+ ```bash
142
+ promptbench --list-models
143
+ ```
144
+
145
+ ## Supported Models
146
+
147
+ | Alias | Model | Provider |
148
+ |-------|-------|----------|
149
+ | `gpt4o` / `gpt4` | gpt-4o | OpenAI |
150
+ | `gpt4mini` | gpt-4o-mini | OpenAI |
151
+ | `gpt3.5` | gpt-3.5-turbo | OpenAI |
152
+ | `sonnet` / `claude-sonnet` | claude-sonnet-4-20250514 | Anthropic |
153
+ | `haiku` / `claude-haiku` | claude-haiku-4-5-20251001 | Anthropic |
154
+ | `flash` / `gemini-flash` | gemini-2.0-flash | Google |
155
+ | `gemini-pro` | gemini-1.5-pro | Google |
156
+
157
+ You can also use full model names directly (e.g., `gpt-4-turbo`, `gemini-1.5-flash`).
158
+
159
+ ## Python Library Usage
160
+
161
+ ```python
162
+ from promptbench.runner import run_bench
163
+ from promptbench.display import display_comparison
164
+
165
+ run = run_bench(
166
+ prompt="Explain recursion simply",
167
+ models=["gpt4o", "sonnet", "flash"],
168
+ temperature=0.5,
169
+ )
170
+
171
+ print(display_comparison(run))
172
+
173
+ # Access individual results
174
+ for result in run.results:
175
+ print(f"{result.model}: {result.latency_ms:.0f}ms, {result.cost_usd:.6f} USD")
176
+ ```
177
+
178
+ ## Configuration Flags
179
+
180
+ | Flag | Description | Default |
181
+ |------|-------------|---------|
182
+ | `-m, --models` | Models to test | `gpt4o sonnet flash` |
183
+ | `-s, --system` | System prompt | None |
184
+ | `-t, --temperature` | Sampling temperature | `0.7` |
185
+ | `--max-tokens` | Max output tokens | `1024` |
186
+ | `-f, --file` | Prompts file path | None |
187
+ | `-o, --output` | Output format: table, json, csv, markdown | `table` |
188
+ | `--full` | Show full responses (no truncation) | Off |
189
+ | `--no-parallel` | Run models sequentially | Off |
190
+ | `--save` | Save results to JSON file | None |
191
+ | `--list-models` | List supported models | — |
192
+ | `--version` | Show version | — |
193
+
194
+ ## Contributing
195
+
196
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup, how to add providers, and PR guidelines.
197
+
198
+ ## License
199
+
200
+ MIT — see [LICENSE](LICENSE) for details.
@@ -0,0 +1,4 @@
1
"""PromptBench - A/B test prompts across LLM providers from your terminal."""

# Single source of truth for the package version; the CLI's --version flag
# reads this value via `from promptbench import __version__`.
__version__ = "0.1.0"
__author__ = "Shehriyar Ali Rustam"
@@ -0,0 +1,6 @@
1
"""Allow running promptbench as a module: python -m promptbench."""

from promptbench.cli import main

# Delegate directly to the CLI entry point when executed as a module.
if __name__ == "__main__":
    main()
@@ -0,0 +1,228 @@
1
+ """CLI entry point for PromptBench."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ import os
8
+ import sys
9
+
10
+ from promptbench import __version__
11
+ from promptbench.display import display_comparison, display_csv, display_json, display_markdown
12
+ from promptbench.providers import (
13
+ MODEL_ALIASES,
14
+ PRICING,
15
+ detect_provider,
16
+ resolve_model,
17
+ )
18
+ from promptbench.runner import run_bench, run_bench_from_file
19
+
20
+
21
def _check_api_keys(models: list[str]) -> list[str]:
    """Check which API keys are missing for the requested models.

    Resolves each model name/alias to its provider, then returns one
    human-readable warning string per provider whose expected environment
    variable is unset. An empty list means all needed keys are present.
    """
    key_map = {
        "openai": "OPENAI_API_KEY",
        "anthropic": "ANTHROPIC_API_KEY",
        "google": "GOOGLE_API_KEY",
    }

    # Distinct providers backing the requested models (aliases resolved first).
    providers_needed = {detect_provider(resolve_model(m)) for m in models}

    # Sorted for deterministic warning order across runs.
    return [
        f"⚠ {env_var} not set — {provider} models will fail"
        for provider in sorted(providers_needed)
        if (env_var := key_map.get(provider)) and not os.environ.get(env_var)
    ]
41
+
42
+
43
def _list_models() -> str:
    """Format the list of supported models and aliases.

    Renders two aligned text tables: per-model input/output pricing
    (USD per 1M tokens) from PRICING, and the short-name → model
    mapping from MODEL_ALIASES. Returns the whole listing as one string.
    """
    out: list[str] = ["\n⚡ PromptBench — Supported Models\n"]

    # Pricing table, sorted by model name for stable output.
    out.append("Models with pricing info:")
    out.append(f"  {'Model':<35} {'Input $/1M':>12} {'Output $/1M':>12} {'Provider':<10}")
    out.append(f"  {'─' * 35} {'─' * 12} {'─' * 12} {'─' * 10}")
    for name, (in_price, out_price) in sorted(PRICING.items()):
        out.append(f"  {name:<35} ${in_price:<11.3f} ${out_price:<11.3f} {detect_provider(name):<10}")

    # Alias table, also sorted alphabetically.
    out.append("\nAliases:")
    out.append(f"  {'Alias':<20} → {'Model':<35}")
    out.append(f"  {'─' * 20} {'─' * 35}")
    for short, target in sorted(MODEL_ALIASES.items()):
        out.append(f"  {short:<20} → {target:<35}")

    out.append("")
    return "\n".join(out)
63
+
64
+
65
def _build_parser() -> argparse.ArgumentParser:
    """Build and return the argument parser.

    Defines every flag promptbench accepts; defaults here mirror the
    values documented in the README's "Configuration Flags" table.
    """
    parser = argparse.ArgumentParser(
        prog="promptbench",
        description="⚡ PromptBench — A/B test prompts across LLM providers from your terminal.",
        epilog="""Examples:
  promptbench "Explain quantum computing in one sentence"
  promptbench "Write a haiku about coding" -m gpt4o sonnet flash
  promptbench "Summarize this" -s "You are a helpful assistant" -m gpt4mini haiku
  promptbench -f prompts.txt -m gpt4o sonnet -o json
  promptbench "Hello world" -o csv --save results.json
  echo "What is Python?" | promptbench -m gpt4o
  promptbench --list-models
""",
        # RawDescriptionHelpFormatter preserves the epilog's line breaks.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Positional prompt is optional: it may instead come from --file or stdin
    # (resolved in main()).
    parser.add_argument(
        "prompt",
        nargs="?",
        default=None,
        help="The prompt to test across models",
    )
    parser.add_argument(
        "-m", "--models",
        nargs="+",
        default=["gpt4o", "sonnet", "flash"],
        help="Models to test (names or aliases). Default: gpt4o sonnet flash",
    )
    parser.add_argument(
        "-s", "--system",
        default=None,
        help="System prompt to use for all models",
    )
    parser.add_argument(
        "-t", "--temperature",
        type=float,
        default=0.7,
        help="Sampling temperature (default: 0.7)",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=1024,
        help="Maximum output tokens (default: 1024)",
    )
    parser.add_argument(
        "-f", "--file",
        default=None,
        help="Path to a file with prompts (one per line or JSON array)",
    )
    parser.add_argument(
        "-o", "--output",
        choices=["table", "json", "csv", "markdown"],
        default="table",
        help="Output format (default: table)",
    )
    parser.add_argument(
        "--full",
        action="store_true",
        help="Show full responses (no truncation)",
    )
    # Parallel execution is the default; this flag opts out of it.
    parser.add_argument(
        "--no-parallel",
        action="store_true",
        help="Run models sequentially instead of in parallel",
    )
    parser.add_argument(
        "--save",
        default=None,
        metavar="FILE",
        help="Save results to a JSON file",
    )
    parser.add_argument(
        "--list-models",
        action="store_true",
        help="List all supported models and aliases",
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"promptbench {__version__}",
    )

    return parser
150
+
151
+
152
def _format_run(run, output: str, full: bool) -> str:
    """Format a BenchRun according to the chosen output format.

    Args:
        run: The BenchRun to render.
        output: One of "table", "json", "csv", or "markdown".
        full: When True, the table view shows untruncated responses.
    """
    # Table is the fallback; only it honors the `full` flag.
    formatters = {
        "json": display_json,
        "csv": display_csv,
        "markdown": display_markdown,
    }
    formatter = formatters.get(output)
    if formatter is not None:
        return formatter(run)
    return display_comparison(run, show_full=full)
161
+
162
+
163
def _save_json(path: str, data) -> None:
    """Serialize *data* to *path* as indented JSON and print a confirmation.

    Shared by the batch and single-prompt paths so save behavior stays
    identical in both (previously duplicated inline).
    """
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    print(f"\n✓ Results saved to {path}")


def main(argv: list[str] | None = None) -> None:
    """Main CLI entry point.

    Resolves the prompt source (positional argument, --file, or piped
    stdin), warns about missing provider API keys, runs the benchmark,
    prints results in the chosen output format, and optionally saves
    them to a JSON file.

    Args:
        argv: Argument list for testing; defaults to sys.argv[1:].

    Raises:
        SystemExit: With status 1 when no prompt source is provided.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # --list-models short-circuits everything else.
    if args.list_models:
        print(_list_models())
        return

    prompt = args.prompt
    file_path = args.file

    # Fall back to piped stdin only when neither a prompt nor a file was given.
    if not prompt and not file_path and not sys.stdin.isatty():
        prompt = sys.stdin.read().strip()

    if not prompt and not file_path:
        parser.print_help()
        sys.exit(1)

    # Warn (to stderr, without aborting) about providers whose keys are unset.
    for warning in _check_api_keys(args.models):
        print(warning, file=sys.stderr)

    parallel = not args.no_parallel

    # Batch mode: one BenchRun per prompt in the file.
    if file_path:
        runs = run_bench_from_file(
            filepath=file_path,
            models=args.models,
            system=args.system,
            temperature=args.temperature,
            max_tokens=args.max_tokens,
            parallel=parallel,
            verbose=True,
        )
        for run in runs:
            print(_format_run(run, args.output, args.full))

        if args.save:
            # Batch saves a JSON array, one object per run.
            _save_json(args.save, [r.to_dict() for r in runs])
        return

    # Single-prompt mode.
    run = run_bench(
        prompt=prompt,
        models=args.models,
        system=args.system,
        temperature=args.temperature,
        max_tokens=args.max_tokens,
        parallel=parallel,
        verbose=True,
    )
    print(_format_run(run, args.output, args.full))

    if args.save:
        # Single mode saves one JSON object.
        _save_json(args.save, run.to_dict())