promptum 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promptum/__init__.py +44 -0
- promptum/benchmark/__init__.py +4 -0
- promptum/benchmark/benchmark.py +50 -0
- promptum/benchmark/report.py +75 -0
- promptum/core/__init__.py +12 -0
- promptum/core/metrics.py +16 -0
- promptum/core/result.py +17 -0
- promptum/core/retry.py +19 -0
- promptum/core/test_case.py +22 -0
- promptum/execution/__init__.py +3 -0
- promptum/execution/runner.py +75 -0
- promptum/providers/__init__.py +7 -0
- promptum/providers/openrouter.py +123 -0
- promptum/providers/protocol.py +22 -0
- promptum/py.typed +0 -0
- promptum/serialization/__init__.py +11 -0
- promptum/serialization/base.py +48 -0
- promptum/serialization/html.py +52 -0
- promptum/serialization/json.py +28 -0
- promptum/serialization/protocol.py +13 -0
- promptum/serialization/report_template.html +293 -0
- promptum/serialization/yaml.py +17 -0
- promptum/storage/__init__.py +7 -0
- promptum/storage/file.py +157 -0
- promptum/storage/protocol.py +23 -0
- promptum/validation/__init__.py +15 -0
- promptum/validation/protocol.py +16 -0
- promptum/validation/validators.py +108 -0
- promptum-0.0.1.dist-info/METADATA +280 -0
- promptum-0.0.1.dist-info/RECORD +32 -0
- promptum-0.0.1.dist-info/WHEEL +4 -0
- promptum-0.0.1.dist-info/licenses/LICENSE +21 -0
promptum-0.0.1.dist-info/METADATA
@@ -0,0 +1,280 @@
Metadata-Version: 2.4
Name: promptum
Version: 0.0.1
Summary: Async LLM benchmarking library with protocol-based extensibility
Project-URL: Homepage, https://github.com/deyna256/promptum
Project-URL: Repository, https://github.com/deyna256/promptum
Project-URL: Issues, https://github.com/deyna256/promptum/issues
Author-email: deyna256 <literallybugcreator@gmail.com>
License: MIT License

Copyright (c) 2026 Ivan Deyna

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
License-File: LICENSE
Keywords: anthropic,async,benchmarking,llm,openai
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.13
Requires-Dist: httpx>=0.27.0
Requires-Dist: jinja2>=3.1.0
Requires-Dist: pyyaml>=6.0
Description-Content-Type: text/markdown

# promptum

<div align="center">




**Benchmark LLMs Like a Pro. In 5 Lines of Code.**

Stop writing boilerplate to test LLMs. Start getting results.

</div>

---

## What's This?

A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get beautiful reports.

```python
benchmark = Benchmark(provider=client, name="my_test")
benchmark.add_test(TestCase(
    prompt="What is 2+2?",
    model="openai/gpt-3.5-turbo",
    validator=Contains("4")
))
report = await benchmark.run_async()
```

That's it. No setup. No config files. Just results.

---

## Why You Need This

**Before promptum:**
```python
# Custom API client for each provider
openai_client = OpenAI(api_key=...)
anthropic_client = Anthropic(api_key=...)

# Manual validation logic
if "correct answer" not in response:
    failed_tests.append(...)

# Track metrics yourself
latency = end_time - start_time
tokens = response.usage.total_tokens

# Write your own retry logic
for attempt in range(max_retries):
    try:
        response = client.chat.completions.create(...)
        break
    except Exception:
        sleep(2 ** attempt)

# Export results manually
json.dump(results, open("results.json", "w"))
```

**After promptum:**
```python
report = await benchmark.run_async()
HTMLSerializer().serialize(report)  # Beautiful HTML report
```

---

## Quick Start

```bash
pip install promptum  # (or: uv pip install promptum)
export OPENROUTER_API_KEY="your-key"
```

```python
import asyncio
from promptum import Benchmark, TestCase, OpenRouterClient, Contains

async def main():
    async with OpenRouterClient(api_key="your-key") as client:
        benchmark = Benchmark(provider=client, name="quick_test")

        benchmark.add_test(TestCase(
            name="basic_math",
            prompt="What is 15 * 7? Reply with just the number.",
            model="openai/gpt-3.5-turbo",
            validator=Contains("105")
        ))

        report = await benchmark.run_async()
        summary = report.get_summary()

        print(f"✓ {summary['passed']}/{summary['total']} tests passed")
        print(f"⚡ {summary['avg_latency_ms']:.0f}ms average")
        print(f"💰 ${summary['total_cost_usd']:.6f} total cost")

asyncio.run(main())
```

Run it:
```bash
python your_script.py
```

---

## What You Get

✅ **One API for 100+ Models** - OpenRouter support out of the box (OpenAI, Anthropic, Google, etc.)
✅ **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own (see the sketch below)
✅ **Automatic Retries** - Exponential/linear backoff with configurable attempts
✅ **Metrics Tracking** - Latency, tokens, cost - automatically captured
✅ **Beautiful Reports** - JSON, YAML, or interactive HTML with charts
✅ **Async by Default** - Run 100 tests in parallel without breaking a sweat
✅ **Type Safe** - Full type hints, catches errors before runtime
✅ **Zero Config** - No YAML files, no setup scripts, just Python
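
Want the validators side by side? Here's a sketch. `Contains(..., case_sensitive=False)` and `Regex` appear elsewhere in this README; the `ExactMatch("105")` call is inferred from that same pattern rather than a documented signature, and `JsonSchema` is left out because its constructor isn't shown here.

```python
# Sketch of the built-in validators attached to TestCases.
# ExactMatch("105") is an assumed signature, mirroring Contains/Regex usage
# shown in this README; check promptum.validation for the exact API.
from promptum import TestCase, ExactMatch, Contains, Regex

tests = [
    # Passes only if the response is exactly "105"
    TestCase(
        name="strict_math",
        prompt="What is 15 * 7? Reply with just the number.",
        model="openai/gpt-3.5-turbo",
        validator=ExactMatch("105"),
    ),
    # Passes if the response mentions "python" in any casing
    TestCase(
        name="loose_match",
        prompt="Name a popular programming language for data science.",
        model="openai/gpt-3.5-turbo",
        validator=Contains("python", case_sensitive=False),
    ),
    # Passes if the response matches a regular expression
    TestCase(
        name="iso_date",
        prompt="Reply with today's date in YYYY-MM-DD format.",
        model="openai/gpt-3.5-turbo",
        validator=Regex(r"\d{4}-\d{2}-\d{2}"),
    ),
]
```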

---

## Real Example

Compare GPT-4 vs Claude on your tasks:

```python
from promptum import Benchmark, TestCase, ExactMatch, Contains, Regex

tests = [
    TestCase(
        name="json_output",
        prompt='Output JSON: {"status": "ok"}',
        model="openai/gpt-4",
        validator=Regex(r'\{"status":\s*"ok"\}')
    ),
    TestCase(
        name="json_output",
        prompt='Output JSON: {"status": "ok"}',
        model="anthropic/claude-3-5-sonnet",
        validator=Regex(r'\{"status":\s*"ok"\}')
    ),
    TestCase(
        name="creative_writing",
        prompt="Write a haiku about Python",
        model="openai/gpt-4",
        validator=Contains("Python", case_sensitive=False)
    ),
]

# assumes benchmark was created as in Quick Start: Benchmark(provider=client, name=...)
benchmark.add_tests(tests)
report = await benchmark.run_async()

# Export as HTML
from promptum import HTMLSerializer

html = HTMLSerializer().serialize(report)
with open("comparison.html", "w") as f:
    f.write(html)
```

Open `comparison.html` in your browser - see side-by-side model performance with charts.
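
Serializers follow the same protocol-based design as everything else, so you aren't limited to the built-ins. A minimal sketch of a drop-in custom format, assuming only the `.serialize(report)` call shown above and the `get_summary()` keys from Quick Start (the formal contract lives in `promptum/serialization/protocol.py` and may require more than this):

```python
# Sketch: a custom serializer shaped like HTMLSerializer().serialize(report).
# Uses only report.get_summary(), whose keys are shown in Quick Start.
class MarkdownSerializer:
    def serialize(self, report) -> str:
        s = report.get_summary()
        return (
            "# Benchmark results\n\n"
            f"- passed: {s['passed']}/{s['total']}\n"
            f"- avg latency: {s['avg_latency_ms']:.0f} ms\n"
            f"- total cost: ${s['total_cost_usd']:.6f}\n"
        )

# `report` comes from the example above
with open("comparison.md", "w") as f:
    f.write(MarkdownSerializer().serialize(report))
```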

---

## Use Cases

**🔬 Model Evaluation** - Compare GPT-4, Claude, Gemini on your specific tasks
**🎯 Prompt Engineering** - Test 100 prompt variations, find what works
**⚡ Latency Testing** - Measure real-world response times across providers
**💰 Cost Analysis** - Track spending per model/task before production
**🔄 Regression Testing** - Ensure model updates don't break your prompts
**📊 A/B Testing** - Data-driven model selection for your product

---

## Requirements

- Python 3.13+
- An OpenRouter API key (or implement your own provider)

That's it. No Docker, no complex setup.

---

## Why Protocol-Based?

Most libraries force inheritance:
```python
class MyProvider(BaseProvider):  # Tightly coupled
    def generate(self): ...
```

We use protocols (structural typing):
```python
class MyProvider:  # No inheritance needed
    async def generate(self) -> tuple[str, Metrics]:
        # Your implementation
        return response, metrics

# It just works
benchmark = Benchmark(provider=MyProvider())
```

Cleaner. More flexible. More Pythonic.
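
If protocols are new to you: a `typing.Protocol` describes a shape, and any class with matching methods satisfies it, no base class required. The sketch below is generic Python to show the mechanism, not promptum's actual provider protocol (that lives in `promptum/providers/protocol.py`, returns promptum's `Metrics` rather than a plain dict, and its `generate` presumably takes the prompt and model as arguments).

```python
# Illustrative only: how structural typing works in general, not the
# exact Provider protocol shipped in promptum/providers/protocol.py.
from typing import Protocol, runtime_checkable

@runtime_checkable
class SupportsGenerate(Protocol):
    async def generate(self) -> tuple[str, dict]: ...

class EchoProvider:  # no inheritance from the protocol
    async def generate(self) -> tuple[str, dict]:
        return "echo", {"latency_ms": 0.0}

# Static checkers accept EchoProvider wherever SupportsGenerate is expected,
# and @runtime_checkable allows an isinstance() check too:
assert isinstance(EchoProvider(), SupportsGenerate)
```

That's why `Benchmark(provider=MyProvider())` above type-checks without `MyProvider` importing anything from promptum.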

---

## Contributing

Found a bug? Want a feature? PRs welcome!

```bash
# Development setup
git clone https://github.com/deyna256/promptum.git
cd promptum
just sync       # Install dependencies
just test       # Run tests

# Development commands
just lint       # Check code style
just format     # Format code
just typecheck  # Type checking
```

---

## License

MIT - do whatever you want with it.

---

<div align="center">

**[⭐ Star on GitHub](https://github.com/deyna256/promptum)** | **[🐛 Report Bug](https://github.com/deyna256/promptum/issues)** | **[💡 Request Feature](https://github.com/deyna256/promptum/issues)**

Made for developers who value their time.

</div>

promptum-0.0.1.dist-info/RECORD
@@ -0,0 +1,32 @@
promptum/__init__.py,sha256=AjeGgmIbpp9Uv-0ybq6knejEJMK-Dnn_-fV9Z86Bp74,932
promptum/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
promptum/benchmark/__init__.py,sha256=NJYiXm6wVFKMloxKNAXMY4H3bMQORTtLh6__nYWYWa0,131
promptum/benchmark/benchmark.py,sha256=3enQSACdLwHW78fqSZj0Un3r7_Ua3V-MjfbEIIKFSWs,1589
promptum/benchmark/report.py,sha256=ol_UO8rw43zbQxhs2o4AwYN5TP7O_Apa77V-pZKq6Uw,2754
promptum/core/__init__.py,sha256=mqajsOdUBNJfcR2krxpwa7rM_wd88vJaAov-9SnVm68,294
promptum/core/metrics.py,sha256=FnS10nHFjQ5Clj5X21C_nW6zAUJU_ZHt0s2fLgp6L28,427
promptum/core/result.py,sha256=nyuVMQFY6DmZwzpgqDPsj0FaAuairpKLJ-0be5WQtTg,472
promptum/core/retry.py,sha256=mA_RRz9_9J_mge_AUd9f1A-gACOxZLGTI8vTIstAr8s,538
promptum/core/test_case.py,sha256=YNlVNj7FkoCyBFb2N0Dzrhce6o3DzUtke4PR6WoXhZo,593
promptum/execution/__init__.py,sha256=fUZa7Bo7yn921sl49cS6TCGsG-lOUNVdhdeRsIa5vCc,67
promptum/execution/runner.py,sha256=sP3uDu2VDLxFi9BkltMHwsyMuCXnz4oP1kVN28KpVZ0,2434
promptum/providers/__init__.py,sha256=OW-CK198wOV7_bz_keOaxxQeRlFPZgINQcVJUZq_uus,169
promptum/providers/openrouter.py,sha256=owquGxHaTB-pZ8jr06l4HouETuFj1lEg92oGX2mM5uo,4601
promptum/providers/protocol.py,sha256=vdTGAGKN3FzThHLwyMMWicU87_LpW-gn0cM3vMcWiEY,488
promptum/serialization/__init__.py,sha256=0dlpgF3dngaw_oR4mg7nuc4Z_VFVl2bATmhe2mHA9T4,319
promptum/serialization/base.py,sha256=JnB4zb7D4oy44k6ndbJu3Xw1PVLpY_9-Y7k3Et2p43g,1851
promptum/serialization/html.py,sha256=kJEd2s6fVfFHH7snJWrD5RGaUW66x3vtMKGMJ_ekmcI,1901
promptum/serialization/json.py,sha256=koqgr5_WHmrpWUOCq6rWXoC07um3mkDDaob2k9vkEK8,870
promptum/serialization/protocol.py,sha256=MZeMYt_HZJIYSyrRd_ZYbEJXDiXLMuJ5tosAeHLxpTM,353
promptum/serialization/report_template.html,sha256=RC8qSLzolqWkWBIGfyhPtPkRWM7_0JkauEWPkaKiB9A,10802
promptum/serialization/yaml.py,sha256=50A612OkX2L3EjhxTZJMZQb5zL8-2PmwcBjjNUhCWsA,528
promptum/storage/__init__.py,sha256=QWOP5Al43WmmQ_kFCM9JGi8amXJzO_pR-x5AKDNy4ds,153
promptum/storage/file.py,sha256=gnNBpNBQ_NeAWn7P2itsw2L99AxS7zOd8Nef6PyYxlk,5750
promptum/storage/protocol.py,sha256=_NpkJzOQB_98Ud_TA_ZYubHf3o2DDXGMveRN3kRyYKI,517
promptum/validation/__init__.py,sha256=mhykyxaIwn2PJh2RXAi0fi2NRIveFmlC5bg1nyCbfVU,252
promptum/validation/protocol.py,sha256=xqxm23YX6eNeZHKMLMZ-Wz8iQKn4ZRzAI5Xryxg0uq4,418
promptum/validation/validators.py,sha256=3lJwSMhhWb9x8BK_-S0FJBj7PFgno79II_i3Z1mCKTs,3217
promptum-0.0.1.dist-info/METADATA,sha256=vt_PN0Ns0JuJalM7p8hJZsz-Y2hwQrbHZ4Jacy7P6L8,8083
promptum-0.0.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
promptum-0.0.1.dist-info/licenses/LICENSE,sha256=Fgn285H5Vy9diOlqO1TzS3hD97WcdF6-GFHvUcFNtmg,1067
promptum-0.0.1.dist-info/RECORD,,
promptum-0.0.1.dist-info/licenses/LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 Ivan Deyna

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.