promptum 0.0.2__tar.gz → 0.0.3__tar.gz

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
Files changed (52)
  1. {promptum-0.0.2 → promptum-0.0.3}/PKG-INFO +43 -35
  2. {promptum-0.0.2 → promptum-0.0.3}/README.md +42 -34
  3. {promptum-0.0.2 → promptum-0.0.3}/pyproject.toml +1 -1
  4. {promptum-0.0.2 → promptum-0.0.3}/src/promptum/__init__.py +3 -2
  5. {promptum-0.0.2 → promptum-0.0.3}/src/promptum/benchmark/__init__.py +2 -1
  6. {promptum-0.0.2 → promptum-0.0.3}/src/promptum/benchmark/report.py +13 -26
  7. promptum-0.0.3/src/promptum/benchmark/summary.py +14 -0
  8. {promptum-0.0.2 → promptum-0.0.3}/tests/benchmark/test_report_filtering.py +0 -7
  9. promptum-0.0.3/tests/benchmark/test_report_summary.py +28 -0
  10. {promptum-0.0.2 → promptum-0.0.3}/uv.lock +1 -1
  11. promptum-0.0.2/tests/benchmark/test_report_summary.py +0 -24
  12. {promptum-0.0.2 → promptum-0.0.3}/.coveragerc +0 -0
  13. {promptum-0.0.2 → promptum-0.0.3}/.github/workflows/lint.yml +0 -0
  14. {promptum-0.0.2 → promptum-0.0.3}/.github/workflows/publish-test.yml +0 -0
  15. {promptum-0.0.2 → promptum-0.0.3}/.github/workflows/publish.yml +0 -0
  16. {promptum-0.0.2 → promptum-0.0.3}/.github/workflows/test.yml +0 -0
  17. {promptum-0.0.2 → promptum-0.0.3}/.github/workflows/typecheck.yml +0 -0
  18. {promptum-0.0.2 → promptum-0.0.3}/.gitignore +0 -0
  19. {promptum-0.0.2 → promptum-0.0.3}/.python-version +0 -0
  20. {promptum-0.0.2 → promptum-0.0.3}/CONTRIBUTING.md +0 -0
  21. {promptum-0.0.2 → promptum-0.0.3}/Justfile +0 -0
  22. {promptum-0.0.2 → promptum-0.0.3}/LICENSE +0 -0
  23. {promptum-0.0.2 → promptum-0.0.3}/pytest.ini +0 -0
  24. {promptum-0.0.2 → promptum-0.0.3}/ruff.toml +0 -0
  25. {promptum-0.0.2 → promptum-0.0.3}/src/promptum/benchmark/benchmark.py +0 -0
  26. {promptum-0.0.2 → promptum-0.0.3}/src/promptum/benchmark/result.py +0 -0
  27. {promptum-0.0.2 → promptum-0.0.3}/src/promptum/benchmark/runner.py +0 -0
  28. {promptum-0.0.2 → promptum-0.0.3}/src/promptum/benchmark/test_case.py +0 -0
  29. {promptum-0.0.2 → promptum-0.0.3}/src/promptum/providers/__init__.py +0 -0
  30. {promptum-0.0.2 → promptum-0.0.3}/src/promptum/providers/metrics.py +0 -0
  31. {promptum-0.0.2 → promptum-0.0.3}/src/promptum/providers/openrouter.py +0 -0
  32. {promptum-0.0.2 → promptum-0.0.3}/src/promptum/providers/protocol.py +0 -0
  33. {promptum-0.0.2 → promptum-0.0.3}/src/promptum/providers/retry.py +0 -0
  34. {promptum-0.0.2 → promptum-0.0.3}/src/promptum/py.typed +0 -0
  35. {promptum-0.0.2 → promptum-0.0.3}/src/promptum/validation/__init__.py +0 -0
  36. {promptum-0.0.2 → promptum-0.0.3}/src/promptum/validation/protocol.py +0 -0
  37. {promptum-0.0.2 → promptum-0.0.3}/src/promptum/validation/validators.py +0 -0
  38. {promptum-0.0.2 → promptum-0.0.3}/tests/__init__.py +0 -0
  39. {promptum-0.0.2 → promptum-0.0.3}/tests/benchmark/__init__.py +0 -0
  40. {promptum-0.0.2 → promptum-0.0.3}/tests/benchmark/conftest.py +0 -0
  41. {promptum-0.0.2 → promptum-0.0.3}/tests/benchmark/test_test_case.py +0 -0
  42. {promptum-0.0.2 → promptum-0.0.3}/tests/conftest.py +0 -0
  43. {promptum-0.0.2 → promptum-0.0.3}/tests/providers/__init__.py +0 -0
  44. {promptum-0.0.2 → promptum-0.0.3}/tests/providers/conftest.py +0 -0
  45. {promptum-0.0.2 → promptum-0.0.3}/tests/providers/test_metrics.py +0 -0
  46. {promptum-0.0.2 → promptum-0.0.3}/tests/providers/test_retry.py +0 -0
  47. {promptum-0.0.2 → promptum-0.0.3}/tests/validation/__init__.py +0 -0
  48. {promptum-0.0.2 → promptum-0.0.3}/tests/validation/conftest.py +0 -0
  49. {promptum-0.0.2 → promptum-0.0.3}/tests/validation/test_contains.py +0 -0
  50. {promptum-0.0.2 → promptum-0.0.3}/tests/validation/test_exact_match.py +0 -0
  51. {promptum-0.0.2 → promptum-0.0.3}/tests/validation/test_json_schema.py +0 -0
  52. {promptum-0.0.2 → promptum-0.0.3}/tests/validation/test_regex.py +0 -0
{promptum-0.0.2 → promptum-0.0.3}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: promptum
- Version: 0.0.2
+ Version: 0.0.3
  Summary: Async LLM benchmarking library with protocol-based extensibility
  Project-URL: Homepage, https://github.com/deyna256/promptum
  Project-URL: Repository, https://github.com/deyna256/promptum
@@ -46,7 +46,7 @@ Description-Content-Type: text/markdown
  ![Async](https://img.shields.io/badge/Async-First-green?style=for-the-badge)
  ![License: MIT](https://img.shields.io/badge/License-MIT-yellow?style=for-the-badge)
 
- **Benchmark LLMs Like a Pro. In 5 Lines of Code.**
+ **Benchmark LLMs Like a Pro.**
 
  Stop writing boilerplate to test LLMs. Start getting results.
 
@@ -56,11 +56,12 @@ Stop writing boilerplate to test LLMs. Start getting results.
 
  ## What's This?
 
- A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get beautiful reports.
+ A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get structured results.
 
  ```python
  benchmark = Benchmark(provider=client, name="my_test")
  benchmark.add_test(TestCase(
+     name="basic_math",
      prompt="What is 2+2?",
      model="gpt-3.5-turbo",
      validator=Contains("4")
@@ -130,9 +131,9 @@ async def main():
      report = await benchmark.run_async()
      summary = report.get_summary()
 
-     print(f"✓ {summary['passed']}/{summary['total']} tests passed")
-     print(f"⚡ {summary['avg_latency_ms']:.0f}ms average")
-     print(f"💰 ${summary['total_cost_usd']:.6f} total cost")
+     print(f"✓ {summary.passed}/{summary.total} tests passed")
+     print(f"⚡ {summary.avg_latency_ms:.0f}ms average")
+     print(f"💰 ${summary.total_cost_usd:.6f} total cost")
 
  asyncio.run(main())
  ```
@@ -146,7 +147,7 @@ python your_script.py
 
  ## What You Get
 
- - [x] **One API for 100+ Models** - OpenRouter support out of the box (OpenAI, Anthropic, Google, etc.)
+ - [x] **100+ Models via OpenRouter** - One client for OpenAI, Anthropic, Google, and more
  - [x] **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own
  - [x] **Automatic Retries** - Exponential/linear backoff with configurable attempts
  - [x] **Metrics Tracking** - Latency, tokens, cost - automatically captured
@@ -161,35 +162,42 @@ python your_script.py
  Compare GPT-4 vs Claude on your tasks:
 
  ```python
- from promptum import Benchmark, TestCase, ExactMatch, Contains, Regex
-
- tests = [
-     TestCase(
-         name="json_output",
-         prompt='Output JSON: {"status": "ok"}',
-         model="openai/gpt-4",
-         validator=Regex(r'\{"status":\s*"ok"\}')
-     ),
-     TestCase(
-         name="json_output",
-         prompt='Output JSON: {"status": "ok"}',
-         model="anthropic/claude-3-5-sonnet",
-         validator=Regex(r'\{"status":\s*"ok"\}')
-     ),
-     TestCase(
-         name="creative_writing",
-         prompt="Write a haiku about Python",
-         model="openai/gpt-4",
-         validator=Contains("Python", case_sensitive=False)
-     ),
- ]
-
- benchmark.add_tests(tests)
- report = await benchmark.run_async()
+ import asyncio
+ from promptum import Benchmark, TestCase, Contains, Regex, OpenRouterClient
+
+ async def main():
+     async with OpenRouterClient(api_key="your-key") as client:
+         benchmark = Benchmark(provider=client, name="model_comparison")
+
+         benchmark.add_tests([
+             TestCase(
+                 name="json_output_gpt4",
+                 prompt='Output JSON: {"status": "ok"}',
+                 model="openai/gpt-4",
+                 validator=Regex(r'\{"status":\s*"ok"\}')
+             ),
+             TestCase(
+                 name="json_output_claude",
+                 prompt='Output JSON: {"status": "ok"}',
+                 model="anthropic/claude-3-5-sonnet",
+                 validator=Regex(r'\{"status":\s*"ok"\}')
+             ),
+             TestCase(
+                 name="creative_writing",
+                 prompt="Write a haiku about Python",
+                 model="openai/gpt-4",
+                 validator=Contains("Python", case_sensitive=False)
+             ),
+         ])
 
- # Side-by-side model comparison
- for model, summary in report.compare_models().items():
-     print(f"{model}: {summary['pass_rate']:.0%} pass rate, {summary['avg_latency_ms']:.0f}ms avg")
+         report = await benchmark.run_async()
+
+         # Side-by-side model comparison
+         for model, model_report in report.group_by(lambda r: r.test_case.model).items():
+             summary = model_report.get_summary()
+             print(f"{model}: {summary.pass_rate:.0%} pass rate, {summary.avg_latency_ms:.0f}ms avg")
+
+ asyncio.run(main())
  ```
 
  ---
{promptum-0.0.2 → promptum-0.0.3}/README.md

@@ -6,7 +6,7 @@
  ![Async](https://img.shields.io/badge/Async-First-green?style=for-the-badge)
  ![License: MIT](https://img.shields.io/badge/License-MIT-yellow?style=for-the-badge)
 
- **Benchmark LLMs Like a Pro. In 5 Lines of Code.**
+ **Benchmark LLMs Like a Pro.**
 
  Stop writing boilerplate to test LLMs. Start getting results.
 
@@ -16,11 +16,12 @@ Stop writing boilerplate to test LLMs. Start getting results.
 
  ## What's This?
 
- A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get beautiful reports.
+ A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get structured results.
 
  ```python
  benchmark = Benchmark(provider=client, name="my_test")
  benchmark.add_test(TestCase(
+     name="basic_math",
      prompt="What is 2+2?",
      model="gpt-3.5-turbo",
      validator=Contains("4")
@@ -90,9 +91,9 @@ async def main():
      report = await benchmark.run_async()
      summary = report.get_summary()
 
-     print(f"✓ {summary['passed']}/{summary['total']} tests passed")
-     print(f"⚡ {summary['avg_latency_ms']:.0f}ms average")
-     print(f"💰 ${summary['total_cost_usd']:.6f} total cost")
+     print(f"✓ {summary.passed}/{summary.total} tests passed")
+     print(f"⚡ {summary.avg_latency_ms:.0f}ms average")
+     print(f"💰 ${summary.total_cost_usd:.6f} total cost")
 
  asyncio.run(main())
  ```
@@ -106,7 +107,7 @@ python your_script.py
 
  ## What You Get
 
- - [x] **One API for 100+ Models** - OpenRouter support out of the box (OpenAI, Anthropic, Google, etc.)
+ - [x] **100+ Models via OpenRouter** - One client for OpenAI, Anthropic, Google, and more
  - [x] **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own
  - [x] **Automatic Retries** - Exponential/linear backoff with configurable attempts
  - [x] **Metrics Tracking** - Latency, tokens, cost - automatically captured
@@ -121,35 +122,42 @@ python your_script.py
  Compare GPT-4 vs Claude on your tasks:
 
  ```python
- from promptum import Benchmark, TestCase, ExactMatch, Contains, Regex
-
- tests = [
-     TestCase(
-         name="json_output",
-         prompt='Output JSON: {"status": "ok"}',
-         model="openai/gpt-4",
-         validator=Regex(r'\{"status":\s*"ok"\}')
-     ),
-     TestCase(
-         name="json_output",
-         prompt='Output JSON: {"status": "ok"}',
-         model="anthropic/claude-3-5-sonnet",
-         validator=Regex(r'\{"status":\s*"ok"\}')
-     ),
-     TestCase(
-         name="creative_writing",
-         prompt="Write a haiku about Python",
-         model="openai/gpt-4",
-         validator=Contains("Python", case_sensitive=False)
-     ),
- ]
-
- benchmark.add_tests(tests)
- report = await benchmark.run_async()
+ import asyncio
+ from promptum import Benchmark, TestCase, Contains, Regex, OpenRouterClient
+
+ async def main():
+     async with OpenRouterClient(api_key="your-key") as client:
+         benchmark = Benchmark(provider=client, name="model_comparison")
+
+         benchmark.add_tests([
+             TestCase(
+                 name="json_output_gpt4",
+                 prompt='Output JSON: {"status": "ok"}',
+                 model="openai/gpt-4",
+                 validator=Regex(r'\{"status":\s*"ok"\}')
+             ),
+             TestCase(
+                 name="json_output_claude",
+                 prompt='Output JSON: {"status": "ok"}',
+                 model="anthropic/claude-3-5-sonnet",
+                 validator=Regex(r'\{"status":\s*"ok"\}')
+             ),
+             TestCase(
+                 name="creative_writing",
+                 prompt="Write a haiku about Python",
+                 model="openai/gpt-4",
+                 validator=Contains("Python", case_sensitive=False)
+             ),
+         ])
 
- # Side-by-side model comparison
- for model, summary in report.compare_models().items():
-     print(f"{model}: {summary['pass_rate']:.0%} pass rate, {summary['avg_latency_ms']:.0f}ms avg")
+         report = await benchmark.run_async()
+
+         # Side-by-side model comparison
+         for model, model_report in report.group_by(lambda r: r.test_case.model).items():
+             summary = model_report.get_summary()
+             print(f"{model}: {summary.pass_rate:.0%} pass rate, {summary.avg_latency_ms:.0f}ms avg")
+
+ asyncio.run(main())
  ```
 
  ---
{promptum-0.0.2 → promptum-0.0.3}/pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "promptum"
- version = "0.0.2"
+ version = "0.0.3"
  description = "Async LLM benchmarking library with protocol-based extensibility"
  readme = "README.md"
  requires-python = ">=3.13"
{promptum-0.0.2 → promptum-0.0.3}/src/promptum/__init__.py

@@ -1,4 +1,4 @@
- from promptum.benchmark import Benchmark, Report, Runner, TestCase, TestResult
+ from promptum.benchmark import Benchmark, Report, Runner, Summary, TestCase, TestResult
  from promptum.providers import LLMProvider, Metrics, OpenRouterClient, RetryConfig, RetryStrategy
  from promptum.validation import (
      Contains,
@@ -8,11 +8,12 @@ from promptum.validation import (
      Validator,
  )
 
- __version__ = "0.0.1"
+ __version__ = "0.0.3"
 
  __all__ = [
      "TestCase",
      "TestResult",
+     "Summary",
      "Metrics",
      "RetryConfig",
      "RetryStrategy",
{promptum-0.0.2 → promptum-0.0.3}/src/promptum/benchmark/__init__.py

@@ -2,6 +2,7 @@ from promptum.benchmark.benchmark import Benchmark
  from promptum.benchmark.report import Report
  from promptum.benchmark.result import TestResult
  from promptum.benchmark.runner import Runner
+ from promptum.benchmark.summary import Summary
  from promptum.benchmark.test_case import TestCase
 
- __all__ = ["Benchmark", "Report", "Runner", "TestCase", "TestResult"]
+ __all__ = ["Benchmark", "Report", "Runner", "Summary", "TestCase", "TestResult"]
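With `Summary` re-exported from both `promptum.benchmark` and the package root (per the `__init__.py` hunks above), downstream code can type-annotate report handling directly against it. A minimal sketch assuming only the exports shown in this diff; `format_summary` is a hypothetical user-side helper, not part of the package:

```python
from promptum import Report, Summary


def format_summary(report: Report) -> str:
    """Render a one-line digest of a benchmark report (illustrative helper)."""
    summary: Summary = report.get_summary()  # returns a Summary dataclass as of 0.0.3
    return f"{summary.passed}/{summary.total} passed ({summary.pass_rate:.0%})"
```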
{promptum-0.0.2 → promptum-0.0.3}/src/promptum/benchmark/report.py

@@ -1,15 +1,15 @@
  from collections.abc import Callable, Sequence
  from dataclasses import dataclass
- from typing import Any
 
  from promptum.benchmark.result import TestResult
+ from promptum.benchmark.summary import Summary
 
 
  @dataclass(frozen=True, slots=True)
  class Report:
      results: Sequence[TestResult]
 
-     def get_summary(self) -> dict[str, Any]:
+     def get_summary(self) -> Summary:
          total = len(self.results)
          passed = sum(1 for r in self.results if r.passed)
 
@@ -17,18 +17,17 @@ class Report:
          total_cost = sum(r.metrics.cost_usd or 0 for r in self.results if r.metrics)
          total_tokens = sum(r.metrics.total_tokens or 0 for r in self.results if r.metrics)
 
-         return {
-             "total": total,
-             "passed": passed,
-             "failed": total - passed,
-             "pass_rate": passed / total if total > 0 else 0,
-             "avg_latency_ms": sum(latencies) / len(latencies) if latencies else 0,
-             "p50_latency_ms": self._percentile(latencies, 0.5) if latencies else 0,
-             "p95_latency_ms": self._percentile(latencies, 0.95) if latencies else 0,
-             "p99_latency_ms": self._percentile(latencies, 0.99) if latencies else 0,
-             "total_cost_usd": total_cost,
-             "total_tokens": total_tokens,
-         }
+         return Summary(
+             total=total,
+             passed=passed,
+             failed=total - passed,
+             pass_rate=passed / total if total > 0 else 0,
+             avg_latency_ms=sum(latencies) / len(latencies) if latencies else 0,
+             min_latency_ms=min(latencies) if latencies else 0,
+             max_latency_ms=max(latencies) if latencies else 0,
+             total_cost_usd=total_cost,
+             total_tokens=total_tokens,
+         )
 
      def filter(
          self,
@@ -60,15 +59,3 @@ class Report:
              groups[group_key].append(result)
 
          return {k: Report(results=v) for k, v in groups.items()}
-
-     def compare_models(self) -> dict[str, dict[str, Any]]:
-         by_model = self.group_by(lambda r: r.test_case.model)
-         return {model: report.get_summary() for model, report in by_model.items()}
-
-     @staticmethod
-     def _percentile(values: list[float], p: float) -> float:
-         if not values:
-             return 0
-         sorted_values = sorted(values)
-         index = int((len(sorted_values) - 1) * p)
-         return sorted_values[index]
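`Report.compare_models()` and the p50/p95/p99 percentile fields are removed in 0.0.3. Callers that relied on the per-model comparison can rebuild it from `group_by()` and the new `Summary`-returning `get_summary()`, both of which remain in the diff above. A minimal sketch under that assumption; `compare_models` here is a user-side helper, not a library method:

```python
from promptum import Report, Summary


def compare_models(report: Report) -> dict[str, Summary]:
    """User-side replacement for the removed Report.compare_models()."""
    # Group results by the model recorded on each test case, then summarise
    # each per-model sub-report.
    by_model = report.group_by(lambda r: r.test_case.model)
    return {model: sub.get_summary() for model, sub in by_model.items()}
```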
promptum-0.0.3/src/promptum/benchmark/summary.py

@@ -0,0 +1,14 @@
+ from dataclasses import dataclass
+
+
+ @dataclass(frozen=True, slots=True)
+ class Summary:
+     total: int
+     passed: int
+     failed: int
+     pass_rate: float
+     avg_latency_ms: float
+     min_latency_ms: float
+     max_latency_ms: float
+     total_cost_usd: float
+     total_tokens: int
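Because `Summary` is a frozen, slotted dataclass rather than the 0.0.2 dict, results are read by attribute; code still written against the old dict shape can get a mapping via `dataclasses.asdict`. A small sketch assuming only the fields declared above, with made-up numbers for illustration:

```python
from dataclasses import asdict

from promptum import Summary

summary = Summary(
    total=3, passed=2, failed=1, pass_rate=2 / 3,
    avg_latency_ms=123.3, min_latency_ms=100.0, max_latency_ms=150.0,
    total_cost_usd=0.045, total_tokens=420,
)

print(summary.pass_rate)           # attribute access replaces summary["pass_rate"]
print(asdict(summary)["passed"])   # dict view for code still expecting the 0.0.2 shape
```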
{promptum-0.0.2 → promptum-0.0.3}/tests/benchmark/test_report_filtering.py

@@ -42,10 +42,3 @@ def test_report_group_by_model(sample_report: Report) -> None:
      assert len(grouped["model2"].results) == 1
 
 
- def test_report_compare_models(sample_report: Report) -> None:
-     comparison = sample_report.compare_models()
-
-     assert "model1" in comparison
-     assert "model2" in comparison
-     assert comparison["model1"]["total"] == 2
-     assert comparison["model2"]["total"] == 1
promptum-0.0.3/tests/benchmark/test_report_summary.py

@@ -0,0 +1,28 @@
+ from promptum.benchmark import Report
+
+
+ def test_report_summary(sample_report: Report) -> None:
+     summary = sample_report.get_summary()
+
+     assert summary.total == 3
+     assert summary.passed == 2
+     assert summary.failed == 1
+     assert summary.pass_rate == 2 / 3
+     assert summary.total_cost_usd == 0.045
+     assert summary.avg_latency_ms == 123.33333333333333
+     assert summary.min_latency_ms == 100.0
+     assert summary.max_latency_ms == 150.0
+
+
+ def test_report_summary_empty() -> None:
+     report = Report(results=[])
+     summary = report.get_summary()
+
+     assert summary.total == 0
+     assert summary.passed == 0
+     assert summary.failed == 0
+     assert summary.pass_rate == 0
+     assert summary.total_cost_usd == 0
+     assert summary.avg_latency_ms == 0
+     assert summary.min_latency_ms == 0
+     assert summary.max_latency_ms == 0
{promptum-0.0.2 → promptum-0.0.3}/uv.lock

@@ -168,7 +168,7 @@ wheels = [
 
  [[package]]
  name = "promptum"
- version = "0.0.1"
+ version = "0.0.3"
  source = { editable = "." }
  dependencies = [
      { name = "httpx" },
@@ -1,24 +0,0 @@
1
- from promptum.benchmark import Report
2
-
3
-
4
- def test_report_summary(sample_report: Report) -> None:
5
- summary = sample_report.get_summary()
6
-
7
- assert summary["total"] == 3
8
- assert summary["passed"] == 2
9
- assert summary["failed"] == 1
10
- assert summary["pass_rate"] == 2 / 3
11
- assert summary["total_cost_usd"] == 0.045
12
- assert summary["avg_latency_ms"] == 123.33333333333333
13
-
14
-
15
- def test_report_summary_empty() -> None:
16
- report = Report(results=[])
17
- summary = report.get_summary()
18
-
19
- assert summary["total"] == 0
20
- assert summary["passed"] == 0
21
- assert summary["failed"] == 0
22
- assert summary["pass_rate"] == 0
23
- assert summary["total_cost_usd"] == 0
24
- assert summary["avg_latency_ms"] == 0