promptum 0.0.2__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promptum/__init__.py +3 -2
- promptum/benchmark/__init__.py +2 -1
- promptum/benchmark/report.py +13 -26
- promptum/benchmark/summary.py +14 -0
- {promptum-0.0.2.dist-info → promptum-0.0.3.dist-info}/METADATA +43 -35
- {promptum-0.0.2.dist-info → promptum-0.0.3.dist-info}/RECORD +8 -7
- {promptum-0.0.2.dist-info → promptum-0.0.3.dist-info}/WHEEL +0 -0
- {promptum-0.0.2.dist-info → promptum-0.0.3.dist-info}/licenses/LICENSE +0 -0
promptum/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-from promptum.benchmark import Benchmark, Report, Runner, TestCase, TestResult
+from promptum.benchmark import Benchmark, Report, Runner, Summary, TestCase, TestResult
 from promptum.providers import LLMProvider, Metrics, OpenRouterClient, RetryConfig, RetryStrategy
 from promptum.validation import (
     Contains,
@@ -8,11 +8,12 @@ from promptum.validation import (
     Validator,
 )
 
-__version__ = "0.0.2"
+__version__ = "0.0.3"
 
 __all__ = [
     "TestCase",
     "TestResult",
+    "Summary",
     "Metrics",
     "RetryConfig",
     "RetryStrategy",
promptum/benchmark/__init__.py
CHANGED
@@ -2,6 +2,7 @@ from promptum.benchmark.benchmark import Benchmark
 from promptum.benchmark.report import Report
 from promptum.benchmark.result import TestResult
 from promptum.benchmark.runner import Runner
+from promptum.benchmark.summary import Summary
 from promptum.benchmark.test_case import TestCase
 
-__all__ = ["Benchmark", "Report", "Runner", "TestCase", "TestResult"]
+__all__ = ["Benchmark", "Report", "Runner", "Summary", "TestCase", "TestResult"]
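Taken together, the two `__init__` changes mean `Summary` is importable from both the package root and `promptum.benchmark`. A minimal sketch of the new import surface and the typed return value it enables (the `describe` helper is hypothetical, added here only for illustration):

```python
# Sketch only: Summary is re-exported at the top level in 0.0.3.
from promptum import Report, Summary


def describe(report: Report) -> str:
    # get_summary() now returns the typed Summary dataclass
    # rather than the old dict-style summary.
    summary: Summary = report.get_summary()
    return f"{summary.passed}/{summary.total} passed ({summary.pass_rate:.0%})"
```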
promptum/benchmark/report.py
CHANGED
@@ -1,15 +1,15 @@
 from collections.abc import Callable, Sequence
 from dataclasses import dataclass
-from typing import Any
 
 from promptum.benchmark.result import TestResult
+from promptum.benchmark.summary import Summary
 
 
 @dataclass(frozen=True, slots=True)
 class Report:
     results: Sequence[TestResult]
 
-    def get_summary(self) ->
+    def get_summary(self) -> Summary:
         total = len(self.results)
         passed = sum(1 for r in self.results if r.passed)
 
@@ -17,18 +17,17 @@ class Report:
         total_cost = sum(r.metrics.cost_usd or 0 for r in self.results if r.metrics)
         total_tokens = sum(r.metrics.total_tokens or 0 for r in self.results if r.metrics)
 
-        return
-            ...
-        }
+        return Summary(
+            total=total,
+            passed=passed,
+            failed=total - passed,
+            pass_rate=passed / total if total > 0 else 0,
+            avg_latency_ms=sum(latencies) / len(latencies) if latencies else 0,
+            min_latency_ms=min(latencies) if latencies else 0,
+            max_latency_ms=max(latencies) if latencies else 0,
+            total_cost_usd=total_cost,
+            total_tokens=total_tokens,
+        )
 
     def filter(
         self,
@@ -60,15 +59,3 @@ class Report:
             groups[group_key].append(result)
 
         return {k: Report(results=v) for k, v in groups.items()}
-
-    def compare_models(self) -> dict[str, dict[str, Any]]:
-        by_model = self.group_by(lambda r: r.test_case.model)
-        return {model: report.get_summary() for model, report in by_model.items()}
-
-    @staticmethod
-    def _percentile(values: list[float], p: float) -> float:
-        if not values:
-            return 0
-        sorted_values = sorted(values)
-        index = int((len(sorted_values) - 1) * p)
-        return sorted_values[index]
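With `get_summary()` now returning `Summary`, the removed `compare_models()` and `_percentile()` helpers have no direct replacement in `Report`. A hedged sketch of how equivalent behaviour can be recovered with the surviving `group_by()` API and the standard library (the helper names below are illustrative, not part of promptum):

```python
import statistics

from promptum import Report, Summary


def compare_models(report: Report) -> dict[str, Summary]:
    # Mirrors the removed Report.compare_models(): bucket results by model,
    # then summarise each bucket with the new typed Summary.
    by_model = report.group_by(lambda r: r.test_case.model)
    return {model: sub.get_summary() for model, sub in by_model.items()}


def p95(latencies_ms: list[float]) -> float:
    # Rough stand-in for the removed _percentile() helper.
    if len(latencies_ms) < 2:
        return latencies_ms[0] if latencies_ms else 0.0
    return statistics.quantiles(latencies_ms, n=100)[94]
```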
promptum/benchmark/summary.py
ADDED
@@ -0,0 +1,14 @@
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True, slots=True)
+class Summary:
+    total: int
+    passed: int
+    failed: int
+    pass_rate: float
+    avg_latency_ms: float
+    min_latency_ms: float
+    max_latency_ms: float
+    total_cost_usd: float
+    total_tokens: int
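Because `Summary` is declared with `frozen=True, slots=True`, instances are immutable and work with the standard `dataclasses` helpers. A small sketch of serialising one (the field values below are invented purely for illustration):

```python
import json
from dataclasses import asdict

from promptum import Summary

# Invented numbers; in practice this object comes from Report.get_summary().
summary = Summary(
    total=10, passed=9, failed=1, pass_rate=0.9,
    avg_latency_ms=420.0, min_latency_ms=180.0, max_latency_ms=910.0,
    total_cost_usd=0.0123, total_tokens=4567,
)

print(json.dumps(asdict(summary), indent=2))
# summary.passed = 10  # would raise FrozenInstanceError: the dataclass is frozen
```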
{promptum-0.0.2.dist-info → promptum-0.0.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: promptum
-Version: 0.0.2
+Version: 0.0.3
 Summary: Async LLM benchmarking library with protocol-based extensibility
 Project-URL: Homepage, https://github.com/deyna256/promptum
 Project-URL: Repository, https://github.com/deyna256/promptum
@@ -46,7 +46,7 @@ Description-Content-Type: text/markdown
 ![Python 3.13+](https://img.shields.io/badge/python-3.13+-blue.svg)
 ![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)
 
-**Benchmark LLMs Like a Pro
+**Benchmark LLMs Like a Pro.**
 
 Stop writing boilerplate to test LLMs. Start getting results.
 
@@ -56,11 +56,12 @@ Stop writing boilerplate to test LLMs. Start getting results.
 
 ## What's This?
 
-A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get
+A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get structured results.
 
 ```python
 benchmark = Benchmark(provider=client, name="my_test")
 benchmark.add_test(TestCase(
+    name="basic_math",
     prompt="What is 2+2?",
     model="gpt-3.5-turbo",
     validator=Contains("4")
@@ -130,9 +131,9 @@ async def main():
     report = await benchmark.run_async()
     summary = report.get_summary()
 
-    print(f"✓ {summary
-    print(f"⚡ {summary
-    print(f"💰 ${summary
+    print(f"✓ {summary.passed}/{summary.total} tests passed")
+    print(f"⚡ {summary.avg_latency_ms:.0f}ms average")
+    print(f"💰 ${summary.total_cost_usd:.6f} total cost")
 
 asyncio.run(main())
 ```
@@ -146,7 +147,7 @@ python your_script.py
 
 ## What You Get
 
-- [x] **
+- [x] **100+ Models via OpenRouter** - One client for OpenAI, Anthropic, Google, and more
 - [x] **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own
 - [x] **Automatic Retries** - Exponential/linear backoff with configurable attempts
 - [x] **Metrics Tracking** - Latency, tokens, cost - automatically captured
@@ -161,35 +162,42 @@ python your_script.py
 Compare GPT-4 vs Claude on your tasks:
 
 ```python
-...
+import asyncio
+from promptum import Benchmark, TestCase, Contains, Regex, OpenRouterClient
+
+async def main():
+    async with OpenRouterClient(api_key="your-key") as client:
+        benchmark = Benchmark(provider=client, name="model_comparison")
+
+        benchmark.add_tests([
+            TestCase(
+                name="json_output_gpt4",
+                prompt='Output JSON: {"status": "ok"}',
+                model="openai/gpt-4",
+                validator=Regex(r'\{"status":\s*"ok"\}')
+            ),
+            TestCase(
+                name="json_output_claude",
+                prompt='Output JSON: {"status": "ok"}',
+                model="anthropic/claude-3-5-sonnet",
+                validator=Regex(r'\{"status":\s*"ok"\}')
+            ),
+            TestCase(
+                name="creative_writing",
+                prompt="Write a haiku about Python",
+                model="openai/gpt-4",
+                validator=Contains("Python", case_sensitive=False)
+            ),
+        ])
 
-...
+        report = await benchmark.run_async()
+
+        # Side-by-side model comparison
+        for model, model_report in report.group_by(lambda r: r.test_case.model).items():
+            summary = model_report.get_summary()
+            print(f"{model}: {summary.pass_rate:.0%} pass rate, {summary.avg_latency_ms:.0f}ms avg")
+
+asyncio.run(main())
 ```
 
 ---
{promptum-0.0.2.dist-info → promptum-0.0.3.dist-info}/RECORD
CHANGED
@@ -1,10 +1,11 @@
-promptum/__init__.py,sha256=
+promptum/__init__.py,sha256=F2dfvZUHDnAwQOGDXINgFnu8vtnwfS_twr6O5tmL1K8,606
 promptum/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-promptum/benchmark/__init__.py,sha256=
+promptum/benchmark/__init__.py,sha256=eyeJHW4mZdKv2vuQRfXFOAKSH07YNROeNlBKJCOfWHg,369
 promptum/benchmark/benchmark.py,sha256=hZ3557qPKqFeNNuxrRLPs-b6XBy2JCowIhRDDwatfeI,1403
-promptum/benchmark/report.py,sha256=
+promptum/benchmark/report.py,sha256=IkCpd3cswLtL9vTUIXAAYGvZk_t9wyQAzbIFpOSXnkI,2080
 promptum/benchmark/result.py,sha256=nKh-T4zlam2LxsaFoL8jeVaO6kZJ1sfB_tnp4gdNPhM,482
 promptum/benchmark/runner.py,sha256=5p6JBwjTlEHTh6jNv_iuFH1nIrI4_Gv3wmzCT0TWpvA,2407
+promptum/benchmark/summary.py,sha256=dD-i8m2BUMFhMC_7v-ITDNUP5X7fG0ARErwKB5D9_yE,281
 promptum/benchmark/test_case.py,sha256=Okypf2334ewVrvmQG7M3I3D7BzqXDsQ2ihjNw9gGF00,598
 promptum/providers/__init__.py,sha256=UprvJ4vxHqo-VTzzUmZ4wFCj6VybP9xBd7HtpPPSvbI,335
 promptum/providers/metrics.py,sha256=FnS10nHFjQ5Clj5X21C_nW6zAUJU_ZHt0s2fLgp6L28,427
@@ -14,7 +15,7 @@ promptum/providers/retry.py,sha256=mA_RRz9_9J_mge_AUd9f1A-gACOxZLGTI8vTIstAr8s,5
 promptum/validation/__init__.py,sha256=mhykyxaIwn2PJh2RXAi0fi2NRIveFmlC5bg1nyCbfVU,252
 promptum/validation/protocol.py,sha256=xqxm23YX6eNeZHKMLMZ-Wz8iQKn4ZRzAI5Xryxg0uq4,418
 promptum/validation/validators.py,sha256=qSMva2P2miXXJJ5XeTKJsyYgh2x5wORi3dhOnBYuACE,2686
-promptum-0.0.
-promptum-0.0.
-promptum-0.0.
-promptum-0.0.
+promptum-0.0.3.dist-info/METADATA,sha256=P4d6Mf35ly_xATjxAUWZXjQPYyfb5xfBMqiXoGkkDkc,8278
+promptum-0.0.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+promptum-0.0.3.dist-info/licenses/LICENSE,sha256=Fgn285H5Vy9diOlqO1TzS3hD97WcdF6-GFHvUcFNtmg,1067
+promptum-0.0.3.dist-info/RECORD,,
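Each RECORD entry is `path,sha256=<digest>,<size>`, where the digest is the urlsafe-base64, unpadded SHA-256 of the file, per the wheel spec. A standalone sketch for checking one of the new 0.0.3 entries against a file from the unpacked wheel (the local path is an assumption):

```python
import base64
import hashlib
from pathlib import Path


def record_digest(path: Path) -> str:
    # RECORD hashes are sha256, urlsafe base64 encoded with '=' padding stripped.
    raw = hashlib.sha256(path.read_bytes()).digest()
    return base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii")


# Assumes the 0.0.3 wheel has been unpacked into the current directory.
path = Path("promptum/benchmark/summary.py")
print(record_digest(path))  # expected per RECORD: dD-i8m2BUMFhMC_7v-ITDNUP5X7fG0ARErwKB5D9_yE
```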
File without changes
|
|
File without changes
|