promptum 0.0.1.tar.gz → 0.0.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. promptum-0.0.3/CONTRIBUTING.md +78 -0
  2. {promptum-0.0.1 → promptum-0.0.3}/Justfile +0 -4
  3. {promptum-0.0.1 → promptum-0.0.3}/PKG-INFO +52 -53
  4. {promptum-0.0.1 → promptum-0.0.3}/README.md +51 -50
  5. {promptum-0.0.1 → promptum-0.0.3}/pyproject.toml +1 -3
  6. promptum-0.0.3/src/promptum/__init__.py +30 -0
  7. promptum-0.0.3/src/promptum/benchmark/__init__.py +8 -0
  8. {promptum-0.0.1 → promptum-0.0.3}/src/promptum/benchmark/benchmark.py +8 -12
  9. {promptum-0.0.1 → promptum-0.0.3}/src/promptum/benchmark/report.py +16 -30
  10. {promptum-0.0.1/src/promptum/core → promptum-0.0.3/src/promptum/benchmark}/result.py +2 -2
  11. {promptum-0.0.1/src/promptum/execution → promptum-0.0.3/src/promptum/benchmark}/runner.py +2 -3
  12. promptum-0.0.3/src/promptum/benchmark/summary.py +14 -0
  13. {promptum-0.0.1/src/promptum/core → promptum-0.0.3/src/promptum/benchmark}/test_case.py +1 -1
  14. {promptum-0.0.1 → promptum-0.0.3}/src/promptum/providers/__init__.py +5 -0
  15. {promptum-0.0.1 → promptum-0.0.3}/src/promptum/providers/openrouter.py +3 -3
  16. {promptum-0.0.1 → promptum-0.0.3}/src/promptum/providers/protocol.py +1 -1
  17. {promptum-0.0.1 → promptum-0.0.3}/src/promptum/validation/validators.py +0 -18
  18. {promptum-0.0.1 → promptum-0.0.3}/tests/benchmark/conftest.py +3 -3
  19. {promptum-0.0.1 → promptum-0.0.3}/tests/benchmark/test_report_filtering.py +0 -7
  20. promptum-0.0.3/tests/benchmark/test_report_summary.py +28 -0
  21. {promptum-0.0.1/tests/core → promptum-0.0.3/tests/benchmark}/test_test_case.py +1 -1
  22. promptum-0.0.3/tests/conftest.py +1 -0
  23. promptum-0.0.3/tests/providers/__init__.py +1 -0
  24. {promptum-0.0.1/tests/core → promptum-0.0.3/tests/providers}/conftest.py +1 -1
  25. {promptum-0.0.1/tests/core → promptum-0.0.3/tests/providers}/test_metrics.py +1 -1
  26. {promptum-0.0.1/tests/core → promptum-0.0.3/tests/providers}/test_retry.py +1 -1
  27. {promptum-0.0.1 → promptum-0.0.3}/uv.lock +2 -108
  28. promptum-0.0.1/src/promptum/__init__.py +0 -44
  29. promptum-0.0.1/src/promptum/benchmark/__init__.py +0 -4
  30. promptum-0.0.1/src/promptum/core/__init__.py +0 -12
  31. promptum-0.0.1/src/promptum/execution/__init__.py +0 -3
  32. promptum-0.0.1/src/promptum/serialization/__init__.py +0 -11
  33. promptum-0.0.1/src/promptum/serialization/base.py +0 -48
  34. promptum-0.0.1/src/promptum/serialization/html.py +0 -52
  35. promptum-0.0.1/src/promptum/serialization/json.py +0 -28
  36. promptum-0.0.1/src/promptum/serialization/protocol.py +0 -13
  37. promptum-0.0.1/src/promptum/serialization/report_template.html +0 -293
  38. promptum-0.0.1/src/promptum/serialization/yaml.py +0 -17
  39. promptum-0.0.1/src/promptum/storage/__init__.py +0 -7
  40. promptum-0.0.1/src/promptum/storage/file.py +0 -157
  41. promptum-0.0.1/src/promptum/storage/protocol.py +0 -23
  42. promptum-0.0.1/tests/benchmark/test_report_summary.py +0 -24
  43. promptum-0.0.1/tests/conftest.py +0 -40
  44. promptum-0.0.1/tests/validation/__init__.py +0 -0
  45. {promptum-0.0.1 → promptum-0.0.3}/.coveragerc +0 -0
  46. {promptum-0.0.1 → promptum-0.0.3}/.github/workflows/lint.yml +0 -0
  47. {promptum-0.0.1 → promptum-0.0.3}/.github/workflows/publish-test.yml +0 -0
  48. {promptum-0.0.1 → promptum-0.0.3}/.github/workflows/publish.yml +0 -0
  49. {promptum-0.0.1 → promptum-0.0.3}/.github/workflows/test.yml +0 -0
  50. {promptum-0.0.1 → promptum-0.0.3}/.github/workflows/typecheck.yml +0 -0
  51. {promptum-0.0.1 → promptum-0.0.3}/.gitignore +0 -0
  52. {promptum-0.0.1 → promptum-0.0.3}/.python-version +0 -0
  53. {promptum-0.0.1 → promptum-0.0.3}/LICENSE +0 -0
  54. {promptum-0.0.1 → promptum-0.0.3}/pytest.ini +0 -0
  55. {promptum-0.0.1 → promptum-0.0.3}/ruff.toml +0 -0
  56. {promptum-0.0.1/src/promptum/core → promptum-0.0.3/src/promptum/providers}/metrics.py +0 -0
  57. {promptum-0.0.1/src/promptum/core → promptum-0.0.3/src/promptum/providers}/retry.py +0 -0
  58. {promptum-0.0.1 → promptum-0.0.3}/src/promptum/py.typed +0 -0
  59. {promptum-0.0.1 → promptum-0.0.3}/src/promptum/validation/__init__.py +0 -0
  60. {promptum-0.0.1 → promptum-0.0.3}/src/promptum/validation/protocol.py +0 -0
  61. {promptum-0.0.1 → promptum-0.0.3}/tests/__init__.py +0 -0
  62. {promptum-0.0.1 → promptum-0.0.3}/tests/benchmark/__init__.py +0 -0
  63. {promptum-0.0.1/tests/core → promptum-0.0.3/tests/validation}/__init__.py +0 -0
  64. {promptum-0.0.1 → promptum-0.0.3}/tests/validation/conftest.py +0 -0
  65. {promptum-0.0.1 → promptum-0.0.3}/tests/validation/test_contains.py +0 -0
  66. {promptum-0.0.1 → promptum-0.0.3}/tests/validation/test_exact_match.py +0 -0
  67. {promptum-0.0.1 → promptum-0.0.3}/tests/validation/test_json_schema.py +0 -0
  68. {promptum-0.0.1 → promptum-0.0.3}/tests/validation/test_regex.py +0 -0
promptum-0.0.3/CONTRIBUTING.md
@@ -0,0 +1,78 @@
+ # Contributing to Promptum
+
+ Thank you for your interest in contributing to Promptum! We welcome contributions from the community.
+
+ ## Getting Started
+
+ 1. **Fork the repository** to your own GitHub account
+ 2. **Clone your fork** locally:
+    ```bash
+    git clone https://github.com/YOUR_USERNAME/promptum.git
+    cd promptum
+    ```
+ 3. **Set up the development environment**:
+    ```bash
+    just sync # Install/sync dependencies
+    ```
+
+ ## Making Changes
+
+ ### Branch Naming
+
+ Create a new branch named after the issue number you're working on:
+
+ ```bash
+ git checkout -b 42 # For issue #42
+ ```
+
+ ### One PR = One Issue
+
+ Each pull request should address exactly one issue. If you want to work on multiple issues, create separate branches and PRs for each.
+
+ ### Work in Progress
+
+ If your PR is not ready for review, add `[WIP]` to the title:
+
+ ```
+ [WIP] #42: Fix retry logic in OpenRouterClient
+ ```
+
+ Remove `[WIP]` when the PR is ready for review.
+
+ ## Submitting Changes
+
+ 1. **Run tests and linting** before committing:
+    ```bash
+    just lint # Lint and auto-fix
+    just typecheck # Type check
+    just test # Run tests
+    ```
+
+ 2. **Commit your changes** with clear, descriptive messages:
+    ```bash
+    git commit -m "#42: Fix retry logic in OpenRouterClient"
+    ```
+
+ 3. **Push to your fork**:
+    ```bash
+    git push origin 42
+    ```
+
+ 4. **Create a Pull Request** from your fork to the main repository
+
+ 5. **Tag the maintainer** (@deyna256) in a comment when your PR is ready for review
+
+ ## CI Requirements
+
+ Pull requests must pass all CI checks before review. The maintainer will not review PRs with failing checks.
+
+ CI runs:
+ - Linting
+ - Type checking
+ - Tests
+
+ ## Questions?
+
+ Feel free to ask questions in the issue comments or open a discussion.
+
+ Thank you for contributing!

{promptum-0.0.1 → promptum-0.0.3}/Justfile
@@ -23,10 +23,6 @@ cov-html:
      uv run pytest tests/ --cov-report=html
      xdg-open htmlcov/index.html
 
- # Open benchmark HTML report
- report:
-     xdg-open results/report.html
-
  # Clean up generated files and caches
  clean:
      rm -rf .pytest_cache .ruff_cache .coverage htmlcov results/

{promptum-0.0.1 → promptum-0.0.3}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: promptum
- Version: 0.0.1
+ Version: 0.0.3
  Summary: Async LLM benchmarking library with protocol-based extensibility
  Project-URL: Homepage, https://github.com/deyna256/promptum
  Project-URL: Repository, https://github.com/deyna256/promptum
@@ -36,8 +36,6 @@ Classifier: Programming Language :: Python :: 3.13
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Requires-Python: >=3.13
  Requires-Dist: httpx>=0.27.0
- Requires-Dist: jinja2>=3.1.0
- Requires-Dist: pyyaml>=6.0
  Description-Content-Type: text/markdown
 
  # promptum
@@ -48,7 +46,7 @@ Description-Content-Type: text/markdown
  ![Async](https://img.shields.io/badge/Async-First-green?style=for-the-badge)
  ![License: MIT](https://img.shields.io/badge/License-MIT-yellow?style=for-the-badge)
 
- **Benchmark LLMs Like a Pro. In 5 Lines of Code.**
+ **Benchmark LLMs Like a Pro.**
 
  Stop writing boilerplate to test LLMs. Start getting results.
 
@@ -58,11 +56,12 @@ Stop writing boilerplate to test LLMs. Start getting results.
 
  ## What's This?
 
- A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get beautiful reports.
+ A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get structured results.
 
  ```python
  benchmark = Benchmark(provider=client, name="my_test")
  benchmark.add_test(TestCase(
+     name="basic_math",
      prompt="What is 2+2?",
      model="gpt-3.5-turbo",
      validator=Contains("4")
@@ -97,15 +96,12 @@ for attempt in range(max_retries):
          break
      except Exception:
          sleep(2 ** attempt)
-
- # Export results manually
- json.dump(results, open("results.json", "w"))
  ```
 
  **After promptum:**
  ```python
  report = await benchmark.run_async()
- HTMLSerializer().serialize(report) # Beautiful HTML report
+ summary = report.get_summary() # Metrics captured automatically
  ```
 
  ---
@@ -135,9 +131,9 @@ async def main():
      report = await benchmark.run_async()
      summary = report.get_summary()
 
-     print(f"✓ {summary['passed']}/{summary['total']} tests passed")
-     print(f"⚡ {summary['avg_latency_ms']:.0f}ms average")
-     print(f"💰 ${summary['total_cost_usd']:.6f} total cost")
+     print(f"✓ {summary.passed}/{summary.total} tests passed")
+     print(f"⚡ {summary.avg_latency_ms:.0f}ms average")
+     print(f"💰 ${summary.total_cost_usd:.6f} total cost")
 
  asyncio.run(main())
  ```
@@ -151,14 +147,13 @@ python your_script.py
 
  ## What You Get
 
- **One API for 100+ Models** - OpenRouter support out of the box (OpenAI, Anthropic, Google, etc.)
- **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own
- **Automatic Retries** - Exponential/linear backoff with configurable attempts
- **Metrics Tracking** - Latency, tokens, cost - automatically captured
- **Beautiful Reports** - JSON, YAML, or interactive HTML with charts
- **Async by Default** - Run 100 tests in parallel without breaking a sweat
- **Type Safe** - Full type hints, catches errors before runtime
- ✅ **Zero Config** - No YAML files, no setup scripts, just Python
+ - [x] **100+ Models via OpenRouter** - One client for OpenAI, Anthropic, Google, and more
+ - [x] **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own
+ - [x] **Automatic Retries** - Exponential/linear backoff with configurable attempts
+ - [x] **Metrics Tracking** - Latency, tokens, cost - automatically captured
+ - [x] **Async by Default** - Run 100 tests in parallel without breaking a sweat
+ - [x] **Type Safe** - Full type hints, catches errors before runtime
+ - [x] **Zero Config** - No YAML files, no setup scripts, just Python
 
  ---
 
@@ -167,39 +162,43 @@ python your_script.py
  Compare GPT-4 vs Claude on your tasks:
 
  ```python
- from promptum import Benchmark, TestCase, ExactMatch, Contains, Regex
-
- tests = [
-     TestCase(
-         name="json_output",
-         prompt='Output JSON: {"status": "ok"}',
-         model="openai/gpt-4",
-         validator=Regex(r'\{"status":\s*"ok"\}')
-     ),
-     TestCase(
-         name="json_output",
-         prompt='Output JSON: {"status": "ok"}',
-         model="anthropic/claude-3-5-sonnet",
-         validator=Regex(r'\{"status":\s*"ok"\}')
-     ),
-     TestCase(
-         name="creative_writing",
-         prompt="Write a haiku about Python",
-         model="openai/gpt-4",
-         validator=Contains("Python", case_sensitive=False)
-     ),
- ]
-
- benchmark.add_tests(tests)
- report = await benchmark.run_async()
+ import asyncio
+ from promptum import Benchmark, TestCase, Contains, Regex, OpenRouterClient
 
- # Export as HTML
- from promptum import HTMLSerializer
- html = HTMLSerializer().serialize(report)
- open("comparison.html", "w").write(html)
- ```
+ async def main():
+     async with OpenRouterClient(api_key="your-key") as client:
+         benchmark = Benchmark(provider=client, name="model_comparison")
+
+         benchmark.add_tests([
+             TestCase(
+                 name="json_output_gpt4",
+                 prompt='Output JSON: {"status": "ok"}',
+                 model="openai/gpt-4",
+                 validator=Regex(r'\{"status":\s*"ok"\}')
+             ),
+             TestCase(
+                 name="json_output_claude",
+                 prompt='Output JSON: {"status": "ok"}',
+                 model="anthropic/claude-3-5-sonnet",
+                 validator=Regex(r'\{"status":\s*"ok"\}')
+             ),
+             TestCase(
+                 name="creative_writing",
+                 prompt="Write a haiku about Python",
+                 model="openai/gpt-4",
+                 validator=Contains("Python", case_sensitive=False)
+             ),
+         ])
 
- Open `comparison.html` in your browser - see side-by-side model performance with charts.
+         report = await benchmark.run_async()
+
+         # Side-by-side model comparison
+         for model, model_report in report.group_by(lambda r: r.test_case.model).items():
+             summary = model_report.get_summary()
+             print(f"{model}: {summary.pass_rate:.0%} pass rate, {summary.avg_latency_ms:.0f}ms avg")
+
+ asyncio.run(main())
+ ```
 
  ---
 
@@ -252,7 +251,7 @@ Found a bug? Want a feature? PRs welcome!
 
  ```bash
  # Development setup
- git clone https://github.com/yourusername/promptum.git
+ git clone https://github.com/deyna256/promptum.git
  cd promptum
  just sync # Install dependencies
  just test # Run tests
@@ -273,7 +272,7 @@ MIT - do whatever you want with it.
 
  <div align="center">
 
- **[⭐ Star on GitHub](https://github.com/yourusername/promptum)** | **[🐛 Report Bug](https://github.com/yourusername/promptum/issues)** | **[💡 Request Feature](https://github.com/yourusername/promptum/issues)**
+ **[⭐ Star on GitHub](https://github.com/deyna256/promptum)** | **[🐛 Report Bug](https://github.com/deyna256/promptum/issues)** | **[💡 Request Feature](https://github.com/deyna256/promptum/issues)**
 
  Made for developers who value their time.
 

{promptum-0.0.1 → promptum-0.0.3}/README.md
@@ -6,7 +6,7 @@
  ![Async](https://img.shields.io/badge/Async-First-green?style=for-the-badge)
  ![License: MIT](https://img.shields.io/badge/License-MIT-yellow?style=for-the-badge)
 
- **Benchmark LLMs Like a Pro. In 5 Lines of Code.**
+ **Benchmark LLMs Like a Pro.**
 
  Stop writing boilerplate to test LLMs. Start getting results.
 
@@ -16,11 +16,12 @@ Stop writing boilerplate to test LLMs. Start getting results.
 
  ## What's This?
 
- A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get beautiful reports.
+ A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get structured results.
 
  ```python
  benchmark = Benchmark(provider=client, name="my_test")
  benchmark.add_test(TestCase(
+     name="basic_math",
      prompt="What is 2+2?",
      model="gpt-3.5-turbo",
      validator=Contains("4")
@@ -55,15 +56,12 @@ for attempt in range(max_retries):
          break
      except Exception:
          sleep(2 ** attempt)
-
- # Export results manually
- json.dump(results, open("results.json", "w"))
  ```
 
  **After promptum:**
  ```python
  report = await benchmark.run_async()
- HTMLSerializer().serialize(report) # Beautiful HTML report
+ summary = report.get_summary() # Metrics captured automatically
  ```
 
  ---
@@ -93,9 +91,9 @@ async def main():
      report = await benchmark.run_async()
      summary = report.get_summary()
 
-     print(f"✓ {summary['passed']}/{summary['total']} tests passed")
-     print(f"⚡ {summary['avg_latency_ms']:.0f}ms average")
-     print(f"💰 ${summary['total_cost_usd']:.6f} total cost")
+     print(f"✓ {summary.passed}/{summary.total} tests passed")
+     print(f"⚡ {summary.avg_latency_ms:.0f}ms average")
+     print(f"💰 ${summary.total_cost_usd:.6f} total cost")
 
  asyncio.run(main())
  ```
@@ -109,14 +107,13 @@ python your_script.py
 
  ## What You Get
 
- **One API for 100+ Models** - OpenRouter support out of the box (OpenAI, Anthropic, Google, etc.)
- **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own
- **Automatic Retries** - Exponential/linear backoff with configurable attempts
- **Metrics Tracking** - Latency, tokens, cost - automatically captured
- **Beautiful Reports** - JSON, YAML, or interactive HTML with charts
- **Async by Default** - Run 100 tests in parallel without breaking a sweat
- **Type Safe** - Full type hints, catches errors before runtime
- ✅ **Zero Config** - No YAML files, no setup scripts, just Python
+ - [x] **100+ Models via OpenRouter** - One client for OpenAI, Anthropic, Google, and more
+ - [x] **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own
+ - [x] **Automatic Retries** - Exponential/linear backoff with configurable attempts
+ - [x] **Metrics Tracking** - Latency, tokens, cost - automatically captured
+ - [x] **Async by Default** - Run 100 tests in parallel without breaking a sweat
+ - [x] **Type Safe** - Full type hints, catches errors before runtime
+ - [x] **Zero Config** - No YAML files, no setup scripts, just Python
 
  ---
 
@@ -125,39 +122,43 @@ python your_script.py
  Compare GPT-4 vs Claude on your tasks:
 
  ```python
- from promptum import Benchmark, TestCase, ExactMatch, Contains, Regex
-
- tests = [
-     TestCase(
-         name="json_output",
-         prompt='Output JSON: {"status": "ok"}',
-         model="openai/gpt-4",
-         validator=Regex(r'\{"status":\s*"ok"\}')
-     ),
-     TestCase(
-         name="json_output",
-         prompt='Output JSON: {"status": "ok"}',
-         model="anthropic/claude-3-5-sonnet",
-         validator=Regex(r'\{"status":\s*"ok"\}')
-     ),
-     TestCase(
-         name="creative_writing",
-         prompt="Write a haiku about Python",
-         model="openai/gpt-4",
-         validator=Contains("Python", case_sensitive=False)
-     ),
- ]
-
- benchmark.add_tests(tests)
- report = await benchmark.run_async()
+ import asyncio
+ from promptum import Benchmark, TestCase, Contains, Regex, OpenRouterClient
 
- # Export as HTML
- from promptum import HTMLSerializer
- html = HTMLSerializer().serialize(report)
- open("comparison.html", "w").write(html)
- ```
+ async def main():
+     async with OpenRouterClient(api_key="your-key") as client:
+         benchmark = Benchmark(provider=client, name="model_comparison")
+
+         benchmark.add_tests([
+             TestCase(
+                 name="json_output_gpt4",
+                 prompt='Output JSON: {"status": "ok"}',
+                 model="openai/gpt-4",
+                 validator=Regex(r'\{"status":\s*"ok"\}')
+             ),
+             TestCase(
+                 name="json_output_claude",
+                 prompt='Output JSON: {"status": "ok"}',
+                 model="anthropic/claude-3-5-sonnet",
+                 validator=Regex(r'\{"status":\s*"ok"\}')
+             ),
+             TestCase(
+                 name="creative_writing",
+                 prompt="Write a haiku about Python",
+                 model="openai/gpt-4",
+                 validator=Contains("Python", case_sensitive=False)
+             ),
+         ])
 
- Open `comparison.html` in your browser - see side-by-side model performance with charts.
+         report = await benchmark.run_async()
+
+         # Side-by-side model comparison
+         for model, model_report in report.group_by(lambda r: r.test_case.model).items():
+             summary = model_report.get_summary()
+             print(f"{model}: {summary.pass_rate:.0%} pass rate, {summary.avg_latency_ms:.0f}ms avg")
+
+ asyncio.run(main())
+ ```
 
  ---
 
@@ -210,7 +211,7 @@ Found a bug? Want a feature? PRs welcome!
 
  ```bash
  # Development setup
- git clone https://github.com/yourusername/promptum.git
+ git clone https://github.com/deyna256/promptum.git
  cd promptum
  just sync # Install dependencies
  just test # Run tests
@@ -231,7 +232,7 @@ MIT - do whatever you want with it.
 
  <div align="center">
 
- **[⭐ Star on GitHub](https://github.com/yourusername/promptum)** | **[🐛 Report Bug](https://github.com/yourusername/promptum/issues)** | **[💡 Request Feature](https://github.com/yourusername/promptum/issues)**
+ **[⭐ Star on GitHub](https://github.com/deyna256/promptum)** | **[🐛 Report Bug](https://github.com/deyna256/promptum/issues)** | **[💡 Request Feature](https://github.com/deyna256/promptum/issues)**
 
  Made for developers who value their time.
 

{promptum-0.0.1 → promptum-0.0.3}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "promptum"
- version = "0.0.1"
+ version = "0.0.3"
  description = "Async LLM benchmarking library with protocol-based extensibility"
  readme = "README.md"
  requires-python = ">=3.13"
@@ -18,8 +18,6 @@ classifiers = [
  ]
  dependencies = [
      "httpx>=0.27.0",
-     "pyyaml>=6.0",
-     "jinja2>=3.1.0",
  ]
 
  [project.urls]
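
With `pyyaml` and `jinja2` dropped from the dependencies and the `serialization`/`storage` modules deleted (files 32-41 in the list above), 0.0.3 no longer ships JSON/YAML/HTML exporters. For users who relied on them, a minimal standard-library-only export sketch; it assumes nothing beyond the `Report.results` sequence and the `TestResult` dataclass visible later in this diff, and the output path is illustrative:

```python
# Export sketch for promptum 0.0.3 (the bundled serializers were removed).
# Assumes only Report.results and the frozen TestResult dataclass shown in
# this diff; "results.json" is an illustrative path, not a promptum default.
import json
from dataclasses import asdict


def dump_report_json(report, path: str = "results.json") -> None:
    # asdict() recurses into the nested TestCase/Metrics dataclasses;
    # default=str covers datetimes and validator objects.
    payload = [asdict(result) for result in report.results]
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, indent=2, default=str)
```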

promptum-0.0.3/src/promptum/__init__.py
@@ -0,0 +1,30 @@
+ from promptum.benchmark import Benchmark, Report, Runner, Summary, TestCase, TestResult
+ from promptum.providers import LLMProvider, Metrics, OpenRouterClient, RetryConfig, RetryStrategy
+ from promptum.validation import (
+     Contains,
+     ExactMatch,
+     JsonSchema,
+     Regex,
+     Validator,
+ )
+
+ __version__ = "0.0.3"
+
+ __all__ = [
+     "TestCase",
+     "TestResult",
+     "Summary",
+     "Metrics",
+     "RetryConfig",
+     "RetryStrategy",
+     "Validator",
+     "ExactMatch",
+     "Contains",
+     "Regex",
+     "JsonSchema",
+     "LLMProvider",
+     "OpenRouterClient",
+     "Runner",
+     "Benchmark",
+     "Report",
+ ]
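
The new top-level `__init__.py` reflects the reorganisation visible in the file list: `promptum.core` and `promptum.execution` are gone, their modules now live under `promptum.benchmark` and `promptum.providers`, and the serializer exports (the 0.0.1 README imported `HTMLSerializer` from `promptum`) have been removed. A hedged import-migration sketch, using only paths that appear in this diff:

```python
# Import migration sketch, promptum 0.0.1 -> 0.0.3 (paths taken from this diff).

# Top-level imports are the simplest path and still resolve in 0.0.3:
from promptum import Benchmark, Contains, Report, Summary, TestCase, TestResult

# Deep imports moved with the package layout:
#   promptum.core.test_case   -> promptum.benchmark.test_case
#   promptum.core.metrics     -> promptum.providers.metrics
#   promptum.execution.runner -> promptum.benchmark.runner
from promptum.providers.metrics import Metrics

# Gone entirely: promptum.serialization and promptum.storage
# (e.g. `from promptum import HTMLSerializer` no longer resolves).
```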

promptum-0.0.3/src/promptum/benchmark/__init__.py
@@ -0,0 +1,8 @@
+ from promptum.benchmark.benchmark import Benchmark
+ from promptum.benchmark.report import Report
+ from promptum.benchmark.result import TestResult
+ from promptum.benchmark.runner import Runner
+ from promptum.benchmark.summary import Summary
+ from promptum.benchmark.test_case import TestCase
+
+ __all__ = ["Benchmark", "Report", "Runner", "Summary", "TestCase", "TestResult"]

{promptum-0.0.1 → promptum-0.0.3}/src/promptum/benchmark/benchmark.py
@@ -1,11 +1,10 @@
  import asyncio
  from collections.abc import Callable, Sequence
- from typing import Any
 
  from promptum.benchmark.report import Report
- from promptum.core.result import TestResult
- from promptum.core.test_case import TestCase
- from promptum.execution.runner import Runner
+ from promptum.benchmark.result import TestResult
+ from promptum.benchmark.runner import Runner
+ from promptum.benchmark.test_case import TestCase
  from promptum.providers.protocol import LLMProvider
 
 
@@ -29,12 +28,12 @@ class Benchmark:
      def add_tests(self, test_cases: Sequence[TestCase]) -> None:
          self._test_cases.extend(test_cases)
 
-     def run(self, metadata: dict[str, Any] | None = None) -> Report:
-         return asyncio.run(self.run_async(metadata))
+     def run(self) -> Report:
+         return asyncio.run(self.run_async())
 
-     async def run_async(self, metadata: dict[str, Any] | None = None) -> Report:
+     async def run_async(self) -> Report:
          if not self._test_cases:
-             return Report(results=[], metadata=metadata or {})
+             return Report(results=[])
 
          runner = Runner(
              provider=self.provider,
@@ -44,7 +43,4 @@ class Benchmark:
 
          results = await runner.run(self._test_cases)
 
-         return Report(
-             results=results,
-             metadata=metadata or {},
-         )
+         return Report(results=results)
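
Note that `run()` and `run_async()` no longer accept a `metadata` argument, and `Report` (next file) no longer carries a `metadata` field. A small migration sketch; the `run_id` value is illustrative, and under 0.0.3 any run metadata has to be kept alongside the report rather than inside it:

```python
# Migration sketch for the dropped metadata parameter (values illustrative).
async def run_nightly(benchmark):
    # 0.0.1: report = await benchmark.run_async(metadata={"run_id": "nightly-42"})
    # 0.0.3: run_async() takes no arguments, so track metadata separately.
    metadata = {"run_id": "nightly-42"}
    report = await benchmark.run_async()
    return metadata, report.get_summary()
```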

{promptum-0.0.1 → promptum-0.0.3}/src/promptum/benchmark/report.py
@@ -1,16 +1,15 @@
  from collections.abc import Callable, Sequence
  from dataclasses import dataclass
- from typing import Any
 
- from promptum.core.result import TestResult
+ from promptum.benchmark.result import TestResult
+ from promptum.benchmark.summary import Summary
 
 
  @dataclass(frozen=True, slots=True)
  class Report:
      results: Sequence[TestResult]
-     metadata: dict[str, Any]
 
-     def get_summary(self) -> dict[str, Any]:
+     def get_summary(self) -> Summary:
          total = len(self.results)
          passed = sum(1 for r in self.results if r.passed)
 
@@ -18,18 +17,17 @@ class Report:
          total_cost = sum(r.metrics.cost_usd or 0 for r in self.results if r.metrics)
          total_tokens = sum(r.metrics.total_tokens or 0 for r in self.results if r.metrics)
 
-         return {
-             "total": total,
-             "passed": passed,
-             "failed": total - passed,
-             "pass_rate": passed / total if total > 0 else 0,
-             "avg_latency_ms": sum(latencies) / len(latencies) if latencies else 0,
-             "p50_latency_ms": self._percentile(latencies, 0.5) if latencies else 0,
-             "p95_latency_ms": self._percentile(latencies, 0.95) if latencies else 0,
-             "p99_latency_ms": self._percentile(latencies, 0.99) if latencies else 0,
-             "total_cost_usd": total_cost,
-             "total_tokens": total_tokens,
-         }
+         return Summary(
+             total=total,
+             passed=passed,
+             failed=total - passed,
+             pass_rate=passed / total if total > 0 else 0,
+             avg_latency_ms=sum(latencies) / len(latencies) if latencies else 0,
+             min_latency_ms=min(latencies) if latencies else 0,
+             max_latency_ms=max(latencies) if latencies else 0,
+             total_cost_usd=total_cost,
+             total_tokens=total_tokens,
+         )
 
      def filter(
          self,
@@ -49,7 +47,7 @@ class Report:
          if passed is not None:
              filtered = [r for r in filtered if r.passed == passed]
 
-         return Report(results=filtered, metadata=self.metadata)
+         return Report(results=filtered)
 
      def group_by(self, key: Callable[[TestResult], str]) -> dict[str, "Report"]:
          groups: dict[str, list[TestResult]] = {}
@@ -60,16 +58,4 @@ class Report:
              groups[group_key] = []
          groups[group_key].append(result)
 
-         return {k: Report(results=v, metadata=self.metadata) for k, v in groups.items()}
-
-     def compare_models(self) -> dict[str, dict[str, Any]]:
-         by_model = self.group_by(lambda r: r.test_case.model)
-         return {model: report.get_summary() for model, report in by_model.items()}
-
-     @staticmethod
-     def _percentile(values: list[float], p: float) -> float:
-         if not values:
-             return 0
-         sorted_values = sorted(values)
-         index = int(len(sorted_values) * p)
-         return sorted_values[min(index, len(sorted_values) - 1)]
+         return {k: Report(results=v) for k, v in groups.items()}
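
`compare_models()` and the `_percentile()` helper are gone, but the per-model comparison can still be assembled from the surviving `group_by()` and `get_summary()` methods, as the updated README above does. A minimal equivalent, assuming only those two methods and the top-level exports shown earlier:

```python
from promptum import Report, Summary


def compare_models(report: Report) -> dict[str, Summary]:
    # Rebuilds the removed 0.0.1 helper on top of the 0.0.3 API.
    by_model = report.group_by(lambda r: r.test_case.model)
    return {model: sub_report.get_summary() for model, sub_report in by_model.items()}
```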

{promptum-0.0.1/src/promptum/core → promptum-0.0.3/src/promptum/benchmark}/result.py
@@ -2,8 +2,8 @@ from dataclasses import dataclass, field
  from datetime import datetime
  from typing import Any
 
- from promptum.core.metrics import Metrics
- from promptum.core.test_case import TestCase
+ from promptum.benchmark.test_case import TestCase
+ from promptum.providers.metrics import Metrics
 
 
  @dataclass(frozen=True, slots=True)

{promptum-0.0.1/src/promptum/execution → promptum-0.0.3/src/promptum/benchmark}/runner.py
@@ -3,8 +3,8 @@ from collections.abc import Callable, Sequence
 
  import httpx
 
- from promptum.core.result import TestResult
- from promptum.core.test_case import TestCase
+ from promptum.benchmark.result import TestResult
+ from promptum.benchmark.test_case import TestCase
  from promptum.providers.protocol import LLMProvider
 
 
@@ -37,7 +37,6 @@ class Runner:
 
          results = await asyncio.gather(
              *[run_with_semaphore(tc) for tc in test_cases],
-             return_exceptions=False,
          )
 
          return list(results)

promptum-0.0.3/src/promptum/benchmark/summary.py
@@ -0,0 +1,14 @@
+ from dataclasses import dataclass
+
+
+ @dataclass(frozen=True, slots=True)
+ class Summary:
+     total: int
+     passed: int
+     failed: int
+     pass_rate: float
+     avg_latency_ms: float
+     min_latency_ms: float
+     max_latency_ms: float
+     total_cost_usd: float
+     total_tokens: int
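
`Summary` replaces the summary dict of 0.0.1 and exposes `min_latency_ms`/`max_latency_ms` where the old dict reported p50/p95/p99 latencies. Callers that still need percentiles can reuse the logic of the removed `_percentile()` helper over latencies collected from `report.results`; a sketch, noting that the per-result `latency_ms` field on `Metrics` is assumed here and not confirmed by this diff:

```python
# Percentile sketch for 0.0.3 (Summary only reports avg/min/max latency).
# Mirrors the _percentile() helper removed from Report; metrics.latency_ms
# is an assumed field name, not shown in this diff.
def latency_percentile(report, p: float) -> float:
    latencies = sorted(r.metrics.latency_ms for r in report.results if r.metrics)
    if not latencies:
        return 0.0
    index = int(len(latencies) * p)
    return latencies[min(index, len(latencies) - 1)]
```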

{promptum-0.0.1/src/promptum/core → promptum-0.0.3/src/promptum/benchmark}/test_case.py
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any
  if TYPE_CHECKING:
      from promptum.validation.protocol import Validator
 
- from promptum.core.retry import RetryConfig
+ from promptum.providers.retry import RetryConfig
 
 
  @dataclass(frozen=True, slots=True)