promptum-0.0.2-py3-none-any.whl → promptum-0.0.3-py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
promptum/__init__.py CHANGED
@@ -1,4 +1,4 @@
- from promptum.benchmark import Benchmark, Report, Runner, TestCase, TestResult
+ from promptum.benchmark import Benchmark, Report, Runner, Summary, TestCase, TestResult
  from promptum.providers import LLMProvider, Metrics, OpenRouterClient, RetryConfig, RetryStrategy
  from promptum.validation import (
      Contains,
@@ -8,11 +8,12 @@ from promptum.validation import (
      Validator,
  )

- __version__ = "0.0.1"
+ __version__ = "0.0.3"

  __all__ = [
      "TestCase",
      "TestResult",
+     "Summary",
      "Metrics",
      "RetryConfig",
      "RetryStrategy",
promptum/benchmark/__init__.py CHANGED
@@ -2,6 +2,7 @@ from promptum.benchmark.benchmark import Benchmark
  from promptum.benchmark.report import Report
  from promptum.benchmark.result import TestResult
  from promptum.benchmark.runner import Runner
+ from promptum.benchmark.summary import Summary
  from promptum.benchmark.test_case import TestCase

- __all__ = ["Benchmark", "Report", "Runner", "TestCase", "TestResult"]
+ __all__ = ["Benchmark", "Report", "Runner", "Summary", "TestCase", "TestResult"]
promptum/benchmark/report.py CHANGED
@@ -1,15 +1,15 @@
  from collections.abc import Callable, Sequence
  from dataclasses import dataclass
- from typing import Any

  from promptum.benchmark.result import TestResult
+ from promptum.benchmark.summary import Summary


  @dataclass(frozen=True, slots=True)
  class Report:
      results: Sequence[TestResult]

-     def get_summary(self) -> dict[str, Any]:
+     def get_summary(self) -> Summary:
          total = len(self.results)
          passed = sum(1 for r in self.results if r.passed)

@@ -17,18 +17,17 @@ class Report:
          total_cost = sum(r.metrics.cost_usd or 0 for r in self.results if r.metrics)
          total_tokens = sum(r.metrics.total_tokens or 0 for r in self.results if r.metrics)

-         return {
-             "total": total,
-             "passed": passed,
-             "failed": total - passed,
-             "pass_rate": passed / total if total > 0 else 0,
-             "avg_latency_ms": sum(latencies) / len(latencies) if latencies else 0,
-             "p50_latency_ms": self._percentile(latencies, 0.5) if latencies else 0,
-             "p95_latency_ms": self._percentile(latencies, 0.95) if latencies else 0,
-             "p99_latency_ms": self._percentile(latencies, 0.99) if latencies else 0,
-             "total_cost_usd": total_cost,
-             "total_tokens": total_tokens,
-         }
+         return Summary(
+             total=total,
+             passed=passed,
+             failed=total - passed,
+             pass_rate=passed / total if total > 0 else 0,
+             avg_latency_ms=sum(latencies) / len(latencies) if latencies else 0,
+             min_latency_ms=min(latencies) if latencies else 0,
+             max_latency_ms=max(latencies) if latencies else 0,
+             total_cost_usd=total_cost,
+             total_tokens=total_tokens,
+         )

      def filter(
          self,
@@ -60,15 +59,3 @@ class Report:
              groups[group_key].append(result)

          return {k: Report(results=v) for k, v in groups.items()}
-
-     def compare_models(self) -> dict[str, dict[str, Any]]:
-         by_model = self.group_by(lambda r: r.test_case.model)
-         return {model: report.get_summary() for model, report in by_model.items()}
-
-     @staticmethod
-     def _percentile(values: list[float], p: float) -> float:
-         if not values:
-             return 0
-         sorted_values = sorted(values)
-         index = int((len(sorted_values) - 1) * p)
-         return sorted_values[index]
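Callers of the removed `Report.compare_models()` can build the same per-model breakdown from the surviving `group_by()` and the new `Summary` return type of `get_summary()`, as the updated README further below also shows. A minimal sketch, assuming `report` is the `Report` returned by `await benchmark.run_async()`:

```python
# Sketch of a compare_models() replacement using the remaining Report API.
# `report` is assumed to be the Report returned by `await benchmark.run_async()`.
by_model = report.group_by(lambda r: r.test_case.model)

for model, model_report in by_model.items():
    summary = model_report.get_summary()  # a Summary dataclass in 0.0.3, not a dict
    print(f"{model}: {summary.pass_rate:.0%} pass rate, {summary.avg_latency_ms:.0f}ms avg")
```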
promptum/benchmark/summary.py ADDED
@@ -0,0 +1,14 @@
+ from dataclasses import dataclass
+
+
+ @dataclass(frozen=True, slots=True)
+ class Summary:
+     total: int
+     passed: int
+     failed: int
+     pass_rate: float
+     avg_latency_ms: float
+     min_latency_ms: float
+     max_latency_ms: float
+     total_cost_usd: float
+     total_tokens: int
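With `get_summary()` now returning this frozen `Summary` dataclass instead of a `dict[str, Any]`, consumers switch from key lookups to attribute access. A minimal before/after sketch, assuming `report` comes from a completed benchmark run:

```python
summary = report.get_summary()

# 0.0.2 (dict keys):  summary["passed"], summary["avg_latency_ms"], ...
# 0.0.3 (attributes on the Summary dataclass):
print(f"{summary.passed}/{summary.total} passed ({summary.pass_rate:.0%})")
print(f"latency: avg {summary.avg_latency_ms:.0f}ms, "
      f"min {summary.min_latency_ms:.0f}ms, max {summary.max_latency_ms:.0f}ms")
print(f"usage: {summary.total_tokens} tokens, ${summary.total_cost_usd:.6f}")
```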
promptum-0.0.2.dist-info/METADATA → promptum-0.0.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: promptum
- Version: 0.0.2
+ Version: 0.0.3
  Summary: Async LLM benchmarking library with protocol-based extensibility
  Project-URL: Homepage, https://github.com/deyna256/promptum
  Project-URL: Repository, https://github.com/deyna256/promptum
@@ -46,7 +46,7 @@ Description-Content-Type: text/markdown
  ![Async](https://img.shields.io/badge/Async-First-green?style=for-the-badge)
  ![License: MIT](https://img.shields.io/badge/License-MIT-yellow?style=for-the-badge)

- **Benchmark LLMs Like a Pro. In 5 Lines of Code.**
+ **Benchmark LLMs Like a Pro.**

  Stop writing boilerplate to test LLMs. Start getting results.

@@ -56,11 +56,12 @@ Stop writing boilerplate to test LLMs. Start getting results.

  ## What's This?

- A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get beautiful reports.
+ A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get structured results.

  ```python
  benchmark = Benchmark(provider=client, name="my_test")
  benchmark.add_test(TestCase(
+     name="basic_math",
      prompt="What is 2+2?",
      model="gpt-3.5-turbo",
      validator=Contains("4")
@@ -130,9 +131,9 @@ async def main():
      report = await benchmark.run_async()
      summary = report.get_summary()

-     print(f"✓ {summary['passed']}/{summary['total']} tests passed")
-     print(f"⚡ {summary['avg_latency_ms']:.0f}ms average")
-     print(f"💰 ${summary['total_cost_usd']:.6f} total cost")
+     print(f"✓ {summary.passed}/{summary.total} tests passed")
+     print(f"⚡ {summary.avg_latency_ms:.0f}ms average")
+     print(f"💰 ${summary.total_cost_usd:.6f} total cost")

  asyncio.run(main())
  ```
@@ -146,7 +147,7 @@ python your_script.py

  ## What You Get

- - [x] **One API for 100+ Models** - OpenRouter support out of the box (OpenAI, Anthropic, Google, etc.)
+ - [x] **100+ Models via OpenRouter** - One client for OpenAI, Anthropic, Google, and more
  - [x] **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own
  - [x] **Automatic Retries** - Exponential/linear backoff with configurable attempts
  - [x] **Metrics Tracking** - Latency, tokens, cost - automatically captured
@@ -161,35 +162,42 @@ python your_script.py
  Compare GPT-4 vs Claude on your tasks:

  ```python
- from promptum import Benchmark, TestCase, ExactMatch, Contains, Regex
-
- tests = [
-     TestCase(
-         name="json_output",
-         prompt='Output JSON: {"status": "ok"}',
-         model="openai/gpt-4",
-         validator=Regex(r'\{"status":\s*"ok"\}')
-     ),
-     TestCase(
-         name="json_output",
-         prompt='Output JSON: {"status": "ok"}',
-         model="anthropic/claude-3-5-sonnet",
-         validator=Regex(r'\{"status":\s*"ok"\}')
-     ),
-     TestCase(
-         name="creative_writing",
-         prompt="Write a haiku about Python",
-         model="openai/gpt-4",
-         validator=Contains("Python", case_sensitive=False)
-     ),
- ]
-
- benchmark.add_tests(tests)
- report = await benchmark.run_async()
+ import asyncio
+ from promptum import Benchmark, TestCase, Contains, Regex, OpenRouterClient
+
+ async def main():
+     async with OpenRouterClient(api_key="your-key") as client:
+         benchmark = Benchmark(provider=client, name="model_comparison")
+
+         benchmark.add_tests([
+             TestCase(
+                 name="json_output_gpt4",
+                 prompt='Output JSON: {"status": "ok"}',
+                 model="openai/gpt-4",
+                 validator=Regex(r'\{"status":\s*"ok"\}')
+             ),
+             TestCase(
+                 name="json_output_claude",
+                 prompt='Output JSON: {"status": "ok"}',
+                 model="anthropic/claude-3-5-sonnet",
+                 validator=Regex(r'\{"status":\s*"ok"\}')
+             ),
+             TestCase(
+                 name="creative_writing",
+                 prompt="Write a haiku about Python",
+                 model="openai/gpt-4",
+                 validator=Contains("Python", case_sensitive=False)
+             ),
+         ])

- # Side-by-side model comparison
- for model, summary in report.compare_models().items():
-     print(f"{model}: {summary['pass_rate']:.0%} pass rate, {summary['avg_latency_ms']:.0f}ms avg")
+         report = await benchmark.run_async()
+
+         # Side-by-side model comparison
+         for model, model_report in report.group_by(lambda r: r.test_case.model).items():
+             summary = model_report.get_summary()
+             print(f"{model}: {summary.pass_rate:.0%} pass rate, {summary.avg_latency_ms:.0f}ms avg")
+
+ asyncio.run(main())
  ```

  ---
promptum-0.0.2.dist-info/RECORD → promptum-0.0.3.dist-info/RECORD CHANGED
@@ -1,10 +1,11 @@
- promptum/__init__.py,sha256=8IAk_9VlnKEJIdwf-hEDkOfOCV456H2Jng-HrZfewso,582
+ promptum/__init__.py,sha256=F2dfvZUHDnAwQOGDXINgFnu8vtnwfS_twr6O5tmL1K8,606
  promptum/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- promptum/benchmark/__init__.py,sha256=0FXYDnK4SGa5ZqX2k9aVkwy3ENDlF_5nW2Mut_OCCbg,311
+ promptum/benchmark/__init__.py,sha256=eyeJHW4mZdKv2vuQRfXFOAKSH07YNROeNlBKJCOfWHg,369
  promptum/benchmark/benchmark.py,sha256=hZ3557qPKqFeNNuxrRLPs-b6XBy2JCowIhRDDwatfeI,1403
- promptum/benchmark/report.py,sha256=DhY1p3n29xOSwRYUiQW6V6FhGFGGn-JF6nuNuvj9rro,2659
+ promptum/benchmark/report.py,sha256=IkCpd3cswLtL9vTUIXAAYGvZk_t9wyQAzbIFpOSXnkI,2080
  promptum/benchmark/result.py,sha256=nKh-T4zlam2LxsaFoL8jeVaO6kZJ1sfB_tnp4gdNPhM,482
  promptum/benchmark/runner.py,sha256=5p6JBwjTlEHTh6jNv_iuFH1nIrI4_Gv3wmzCT0TWpvA,2407
+ promptum/benchmark/summary.py,sha256=dD-i8m2BUMFhMC_7v-ITDNUP5X7fG0ARErwKB5D9_yE,281
  promptum/benchmark/test_case.py,sha256=Okypf2334ewVrvmQG7M3I3D7BzqXDsQ2ihjNw9gGF00,598
  promptum/providers/__init__.py,sha256=UprvJ4vxHqo-VTzzUmZ4wFCj6VybP9xBd7HtpPPSvbI,335
  promptum/providers/metrics.py,sha256=FnS10nHFjQ5Clj5X21C_nW6zAUJU_ZHt0s2fLgp6L28,427
@@ -14,7 +15,7 @@ promptum/providers/retry.py,sha256=mA_RRz9_9J_mge_AUd9f1A-gACOxZLGTI8vTIstAr8s,5
  promptum/validation/__init__.py,sha256=mhykyxaIwn2PJh2RXAi0fi2NRIveFmlC5bg1nyCbfVU,252
  promptum/validation/protocol.py,sha256=xqxm23YX6eNeZHKMLMZ-Wz8iQKn4ZRzAI5Xryxg0uq4,418
  promptum/validation/validators.py,sha256=qSMva2P2miXXJJ5XeTKJsyYgh2x5wORi3dhOnBYuACE,2686
- promptum-0.0.2.dist-info/METADATA,sha256=MQcy0pxUoMpu4uZgM_Q3HEE_RnY3Krcg-_FTF9vvQ54,7845
- promptum-0.0.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- promptum-0.0.2.dist-info/licenses/LICENSE,sha256=Fgn285H5Vy9diOlqO1TzS3hD97WcdF6-GFHvUcFNtmg,1067
- promptum-0.0.2.dist-info/RECORD,,
+ promptum-0.0.3.dist-info/METADATA,sha256=P4d6Mf35ly_xATjxAUWZXjQPYyfb5xfBMqiXoGkkDkc,8278
+ promptum-0.0.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ promptum-0.0.3.dist-info/licenses/LICENSE,sha256=Fgn285H5Vy9diOlqO1TzS3hD97WcdF6-GFHvUcFNtmg,1067
+ promptum-0.0.3.dist-info/RECORD,,