promptum 0.0.1__tar.gz → 0.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promptum-0.0.3/CONTRIBUTING.md +78 -0
- {promptum-0.0.1 → promptum-0.0.3}/Justfile +0 -4
- {promptum-0.0.1 → promptum-0.0.3}/PKG-INFO +52 -53
- {promptum-0.0.1 → promptum-0.0.3}/README.md +51 -50
- {promptum-0.0.1 → promptum-0.0.3}/pyproject.toml +1 -3
- promptum-0.0.3/src/promptum/__init__.py +30 -0
- promptum-0.0.3/src/promptum/benchmark/__init__.py +8 -0
- {promptum-0.0.1 → promptum-0.0.3}/src/promptum/benchmark/benchmark.py +8 -12
- {promptum-0.0.1 → promptum-0.0.3}/src/promptum/benchmark/report.py +16 -30
- {promptum-0.0.1/src/promptum/core → promptum-0.0.3/src/promptum/benchmark}/result.py +2 -2
- {promptum-0.0.1/src/promptum/execution → promptum-0.0.3/src/promptum/benchmark}/runner.py +2 -3
- promptum-0.0.3/src/promptum/benchmark/summary.py +14 -0
- {promptum-0.0.1/src/promptum/core → promptum-0.0.3/src/promptum/benchmark}/test_case.py +1 -1
- {promptum-0.0.1 → promptum-0.0.3}/src/promptum/providers/__init__.py +5 -0
- {promptum-0.0.1 → promptum-0.0.3}/src/promptum/providers/openrouter.py +3 -3
- {promptum-0.0.1 → promptum-0.0.3}/src/promptum/providers/protocol.py +1 -1
- {promptum-0.0.1 → promptum-0.0.3}/src/promptum/validation/validators.py +0 -18
- {promptum-0.0.1 → promptum-0.0.3}/tests/benchmark/conftest.py +3 -3
- {promptum-0.0.1 → promptum-0.0.3}/tests/benchmark/test_report_filtering.py +0 -7
- promptum-0.0.3/tests/benchmark/test_report_summary.py +28 -0
- {promptum-0.0.1/tests/core → promptum-0.0.3/tests/benchmark}/test_test_case.py +1 -1
- promptum-0.0.3/tests/conftest.py +1 -0
- promptum-0.0.3/tests/providers/__init__.py +1 -0
- {promptum-0.0.1/tests/core → promptum-0.0.3/tests/providers}/conftest.py +1 -1
- {promptum-0.0.1/tests/core → promptum-0.0.3/tests/providers}/test_metrics.py +1 -1
- {promptum-0.0.1/tests/core → promptum-0.0.3/tests/providers}/test_retry.py +1 -1
- {promptum-0.0.1 → promptum-0.0.3}/uv.lock +2 -108
- promptum-0.0.1/src/promptum/__init__.py +0 -44
- promptum-0.0.1/src/promptum/benchmark/__init__.py +0 -4
- promptum-0.0.1/src/promptum/core/__init__.py +0 -12
- promptum-0.0.1/src/promptum/execution/__init__.py +0 -3
- promptum-0.0.1/src/promptum/serialization/__init__.py +0 -11
- promptum-0.0.1/src/promptum/serialization/base.py +0 -48
- promptum-0.0.1/src/promptum/serialization/html.py +0 -52
- promptum-0.0.1/src/promptum/serialization/json.py +0 -28
- promptum-0.0.1/src/promptum/serialization/protocol.py +0 -13
- promptum-0.0.1/src/promptum/serialization/report_template.html +0 -293
- promptum-0.0.1/src/promptum/serialization/yaml.py +0 -17
- promptum-0.0.1/src/promptum/storage/__init__.py +0 -7
- promptum-0.0.1/src/promptum/storage/file.py +0 -157
- promptum-0.0.1/src/promptum/storage/protocol.py +0 -23
- promptum-0.0.1/tests/benchmark/test_report_summary.py +0 -24
- promptum-0.0.1/tests/conftest.py +0 -40
- promptum-0.0.1/tests/validation/__init__.py +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/.coveragerc +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/.github/workflows/lint.yml +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/.github/workflows/publish-test.yml +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/.github/workflows/publish.yml +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/.github/workflows/test.yml +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/.github/workflows/typecheck.yml +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/.gitignore +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/.python-version +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/LICENSE +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/pytest.ini +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/ruff.toml +0 -0
- {promptum-0.0.1/src/promptum/core → promptum-0.0.3/src/promptum/providers}/metrics.py +0 -0
- {promptum-0.0.1/src/promptum/core → promptum-0.0.3/src/promptum/providers}/retry.py +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/src/promptum/py.typed +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/src/promptum/validation/__init__.py +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/src/promptum/validation/protocol.py +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/tests/__init__.py +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/tests/benchmark/__init__.py +0 -0
- {promptum-0.0.1/tests/core → promptum-0.0.3/tests/validation}/__init__.py +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/tests/validation/conftest.py +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/tests/validation/test_contains.py +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/tests/validation/test_exact_match.py +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/tests/validation/test_json_schema.py +0 -0
- {promptum-0.0.1 → promptum-0.0.3}/tests/validation/test_regex.py +0 -0
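
The move entries above amount to a package restructure: the old `core/` and `execution/` modules land under `benchmark/` and `providers/`, while `serialization/` and `storage/` (and, as the pyproject diff below shows, the `pyyaml`/`jinja2` dependencies) are dropped. A minimal sketch of how import paths shift, based only on the file moves listed above (old dotted paths follow the 0.0.1 directory layout):

```python
# promptum 0.0.1 module paths (per the old layout in the listing above):
#   from promptum.core.result import TestResult
#   from promptum.core.test_case import TestCase
#   from promptum.core.metrics import Metrics
#   from promptum.execution.runner import Runner

# promptum 0.0.3 module paths (per the new layout above):
from promptum.benchmark.result import TestResult
from promptum.benchmark.test_case import TestCase
from promptum.benchmark.runner import Runner
from promptum.providers.metrics import Metrics
```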
promptum-0.0.3/CONTRIBUTING.md (new file)
@@ -0,0 +1,78 @@
+# Contributing to Promptum
+
+Thank you for your interest in contributing to Promptum! We welcome contributions from the community.
+
+## Getting Started
+
+1. **Fork the repository** to your own GitHub account
+2. **Clone your fork** locally:
+   ```bash
+   git clone https://github.com/YOUR_USERNAME/promptum.git
+   cd promptum
+   ```
+3. **Set up the development environment**:
+   ```bash
+   just sync  # Install/sync dependencies
+   ```
+
+## Making Changes
+
+### Branch Naming
+
+Create a new branch named after the issue number you're working on:
+
+```bash
+git checkout -b 42  # For issue #42
+```
+
+### One PR = One Issue
+
+Each pull request should address exactly one issue. If you want to work on multiple issues, create separate branches and PRs for each.
+
+### Work in Progress
+
+If your PR is not ready for review, add `[WIP]` to the title:
+
+```
+[WIP] #42: Fix retry logic in OpenRouterClient
+```
+
+Remove `[WIP]` when the PR is ready for review.
+
+## Submitting Changes
+
+1. **Run tests and linting** before committing:
+   ```bash
+   just lint       # Lint and auto-fix
+   just typecheck  # Type check
+   just test       # Run tests
+   ```
+
+2. **Commit your changes** with clear, descriptive messages:
+   ```bash
+   git commit -m "#42: Fix retry logic in OpenRouterClient"
+   ```
+
+3. **Push to your fork**:
+   ```bash
+   git push origin 42
+   ```
+
+4. **Create a Pull Request** from your fork to the main repository
+
+5. **Tag the maintainer** (@deyna256) in a comment when your PR is ready for review
+
+## CI Requirements
+
+Pull requests must pass all CI checks before review. The maintainer will not review PRs with failing checks.
+
+CI runs:
+- Linting
+- Type checking
+- Tests
+
+## Questions?
+
+Feel free to ask questions in the issue comments or open a discussion.
+
+Thank you for contributing!
{promptum-0.0.1 → promptum-0.0.3}/Justfile
@@ -23,10 +23,6 @@ cov-html:
     uv run pytest tests/ --cov-report=html
     xdg-open htmlcov/index.html
 
-# Open benchmark HTML report
-report:
-    xdg-open results/report.html
-
 # Clean up generated files and caches
 clean:
     rm -rf .pytest_cache .ruff_cache .coverage htmlcov results/
{promptum-0.0.1 → promptum-0.0.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: promptum
-Version: 0.0.1
+Version: 0.0.3
 Summary: Async LLM benchmarking library with protocol-based extensibility
 Project-URL: Homepage, https://github.com/deyna256/promptum
 Project-URL: Repository, https://github.com/deyna256/promptum
@@ -36,8 +36,6 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.13
 Requires-Dist: httpx>=0.27.0
-Requires-Dist: jinja2>=3.1.0
-Requires-Dist: pyyaml>=6.0
 Description-Content-Type: text/markdown
 
 # promptum
@@ -48,7 +46,7 @@ Description-Content-Type: text/markdown
 ![]()
 ![]()
 
-**Benchmark LLMs Like a Pro
+**Benchmark LLMs Like a Pro.**
 
 Stop writing boilerplate to test LLMs. Start getting results.
 
@@ -58,11 +56,12 @@ Stop writing boilerplate to test LLMs. Start getting results.
 
 ## What's This?
 
-A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get
+A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get structured results.
 
 ```python
 benchmark = Benchmark(provider=client, name="my_test")
 benchmark.add_test(TestCase(
+    name="basic_math",
     prompt="What is 2+2?",
     model="gpt-3.5-turbo",
     validator=Contains("4")
@@ -97,15 +96,12 @@ for attempt in range(max_retries):
         break
     except Exception:
         sleep(2 ** attempt)
-
-# Export results manually
-json.dump(results, open("results.json", "w"))
 ```
 
 **After promptum:**
 ```python
 report = await benchmark.run_async()
-
+summary = report.get_summary()  # Metrics captured automatically
 ```
 
 ---
@@ -135,9 +131,9 @@ async def main():
     report = await benchmark.run_async()
     summary = report.get_summary()
 
-    print(f"✓ {summary
-    print(f"⚡ {summary
-    print(f"💰 ${summary
+    print(f"✓ {summary.passed}/{summary.total} tests passed")
+    print(f"⚡ {summary.avg_latency_ms:.0f}ms average")
+    print(f"💰 ${summary.total_cost_usd:.6f} total cost")
 
 asyncio.run(main())
 ```
@@ -151,14 +147,13 @@ python your_script.py
 
 ## What You Get
 
-
-
-
-
-
-
-
-✅ **Zero Config** - No YAML files, no setup scripts, just Python
+- [x] **100+ Models via OpenRouter** - One client for OpenAI, Anthropic, Google, and more
+- [x] **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own
+- [x] **Automatic Retries** - Exponential/linear backoff with configurable attempts
+- [x] **Metrics Tracking** - Latency, tokens, cost - automatically captured
+- [x] **Async by Default** - Run 100 tests in parallel without breaking a sweat
+- [x] **Type Safe** - Full type hints, catches errors before runtime
+- [x] **Zero Config** - No YAML files, no setup scripts, just Python
 
 ---
 
@@ -167,39 +162,43 @@ python your_script.py
 Compare GPT-4 vs Claude on your tasks:
 
 ```python
-
-
-tests = [
-    TestCase(
-        name="json_output",
-        prompt='Output JSON: {"status": "ok"}',
-        model="openai/gpt-4",
-        validator=Regex(r'\{"status":\s*"ok"\}')
-    ),
-    TestCase(
-        name="json_output",
-        prompt='Output JSON: {"status": "ok"}',
-        model="anthropic/claude-3-5-sonnet",
-        validator=Regex(r'\{"status":\s*"ok"\}')
-    ),
-    TestCase(
-        name="creative_writing",
-        prompt="Write a haiku about Python",
-        model="openai/gpt-4",
-        validator=Contains("Python", case_sensitive=False)
-    ),
-]
-
-benchmark.add_tests(tests)
-report = await benchmark.run_async()
+import asyncio
+from promptum import Benchmark, TestCase, Contains, Regex, OpenRouterClient
 
-
-
-
-
-
+async def main():
+    async with OpenRouterClient(api_key="your-key") as client:
+        benchmark = Benchmark(provider=client, name="model_comparison")
+
+        benchmark.add_tests([
+            TestCase(
+                name="json_output_gpt4",
+                prompt='Output JSON: {"status": "ok"}',
+                model="openai/gpt-4",
+                validator=Regex(r'\{"status":\s*"ok"\}')
+            ),
+            TestCase(
+                name="json_output_claude",
+                prompt='Output JSON: {"status": "ok"}',
+                model="anthropic/claude-3-5-sonnet",
+                validator=Regex(r'\{"status":\s*"ok"\}')
+            ),
+            TestCase(
+                name="creative_writing",
+                prompt="Write a haiku about Python",
+                model="openai/gpt-4",
+                validator=Contains("Python", case_sensitive=False)
+            ),
+        ])
 
-
+        report = await benchmark.run_async()
+
+        # Side-by-side model comparison
+        for model, model_report in report.group_by(lambda r: r.test_case.model).items():
+            summary = model_report.get_summary()
+            print(f"{model}: {summary.pass_rate:.0%} pass rate, {summary.avg_latency_ms:.0f}ms avg")
+
+asyncio.run(main())
+```
 
 ---
 
@@ -252,7 +251,7 @@ Found a bug? Want a feature? PRs welcome!
 
 ```bash
 # Development setup
-git clone https://github.com/
+git clone https://github.com/deyna256/promptum.git
 cd promptum
 just sync  # Install dependencies
 just test  # Run tests
@@ -273,7 +272,7 @@ MIT - do whatever you want with it.
 
 <div align="center">
 
-**[⭐ Star on GitHub](https://github.com/
+**[⭐ Star on GitHub](https://github.com/deyna256/promptum)** | **[🐛 Report Bug](https://github.com/deyna256/promptum/issues)** | **[💡 Request Feature](https://github.com/deyna256/promptum/issues)**
 
 Made for developers who value their time.
 
{promptum-0.0.1 → promptum-0.0.3}/README.md
@@ -6,7 +6,7 @@
 ![]()
 ![]()
 
-**Benchmark LLMs Like a Pro
+**Benchmark LLMs Like a Pro.**
 
 Stop writing boilerplate to test LLMs. Start getting results.
 
@@ -16,11 +16,12 @@ Stop writing boilerplate to test LLMs. Start getting results.
 
 ## What's This?
 
-A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get
+A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get structured results.
 
 ```python
 benchmark = Benchmark(provider=client, name="my_test")
 benchmark.add_test(TestCase(
+    name="basic_math",
     prompt="What is 2+2?",
     model="gpt-3.5-turbo",
     validator=Contains("4")
@@ -55,15 +56,12 @@ for attempt in range(max_retries):
         break
     except Exception:
         sleep(2 ** attempt)
-
-# Export results manually
-json.dump(results, open("results.json", "w"))
 ```
 
 **After promptum:**
 ```python
 report = await benchmark.run_async()
-
+summary = report.get_summary()  # Metrics captured automatically
 ```
 
 ---
@@ -93,9 +91,9 @@ async def main():
     report = await benchmark.run_async()
     summary = report.get_summary()
 
-    print(f"✓ {summary
-    print(f"⚡ {summary
-    print(f"💰 ${summary
+    print(f"✓ {summary.passed}/{summary.total} tests passed")
+    print(f"⚡ {summary.avg_latency_ms:.0f}ms average")
+    print(f"💰 ${summary.total_cost_usd:.6f} total cost")
 
 asyncio.run(main())
 ```
@@ -109,14 +107,13 @@ python your_script.py
 
 ## What You Get
 
-
-
-
-
-
-
-
-✅ **Zero Config** - No YAML files, no setup scripts, just Python
+- [x] **100+ Models via OpenRouter** - One client for OpenAI, Anthropic, Google, and more
+- [x] **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own
+- [x] **Automatic Retries** - Exponential/linear backoff with configurable attempts
+- [x] **Metrics Tracking** - Latency, tokens, cost - automatically captured
+- [x] **Async by Default** - Run 100 tests in parallel without breaking a sweat
+- [x] **Type Safe** - Full type hints, catches errors before runtime
+- [x] **Zero Config** - No YAML files, no setup scripts, just Python
 
 ---
 
@@ -125,39 +122,43 @@ python your_script.py
 Compare GPT-4 vs Claude on your tasks:
 
 ```python
-
-
-tests = [
-    TestCase(
-        name="json_output",
-        prompt='Output JSON: {"status": "ok"}',
-        model="openai/gpt-4",
-        validator=Regex(r'\{"status":\s*"ok"\}')
-    ),
-    TestCase(
-        name="json_output",
-        prompt='Output JSON: {"status": "ok"}',
-        model="anthropic/claude-3-5-sonnet",
-        validator=Regex(r'\{"status":\s*"ok"\}')
-    ),
-    TestCase(
-        name="creative_writing",
-        prompt="Write a haiku about Python",
-        model="openai/gpt-4",
-        validator=Contains("Python", case_sensitive=False)
-    ),
-]
-
-benchmark.add_tests(tests)
-report = await benchmark.run_async()
+import asyncio
+from promptum import Benchmark, TestCase, Contains, Regex, OpenRouterClient
 
-
-
-
-
-
+async def main():
+    async with OpenRouterClient(api_key="your-key") as client:
+        benchmark = Benchmark(provider=client, name="model_comparison")
+
+        benchmark.add_tests([
+            TestCase(
+                name="json_output_gpt4",
+                prompt='Output JSON: {"status": "ok"}',
+                model="openai/gpt-4",
+                validator=Regex(r'\{"status":\s*"ok"\}')
+            ),
+            TestCase(
+                name="json_output_claude",
+                prompt='Output JSON: {"status": "ok"}',
+                model="anthropic/claude-3-5-sonnet",
+                validator=Regex(r'\{"status":\s*"ok"\}')
+            ),
+            TestCase(
+                name="creative_writing",
+                prompt="Write a haiku about Python",
+                model="openai/gpt-4",
+                validator=Contains("Python", case_sensitive=False)
+            ),
+        ])
 
-
+        report = await benchmark.run_async()
+
+        # Side-by-side model comparison
+        for model, model_report in report.group_by(lambda r: r.test_case.model).items():
+            summary = model_report.get_summary()
+            print(f"{model}: {summary.pass_rate:.0%} pass rate, {summary.avg_latency_ms:.0f}ms avg")
+
+asyncio.run(main())
+```
 
 ---
 
@@ -210,7 +211,7 @@ Found a bug? Want a feature? PRs welcome!
 
 ```bash
 # Development setup
-git clone https://github.com/
+git clone https://github.com/deyna256/promptum.git
 cd promptum
 just sync  # Install dependencies
 just test  # Run tests
@@ -231,7 +232,7 @@ MIT - do whatever you want with it.
 
 <div align="center">
 
-**[⭐ Star on GitHub](https://github.com/
+**[⭐ Star on GitHub](https://github.com/deyna256/promptum)** | **[🐛 Report Bug](https://github.com/deyna256/promptum/issues)** | **[💡 Request Feature](https://github.com/deyna256/promptum/issues)**
 
 Made for developers who value their time.
 
{promptum-0.0.1 → promptum-0.0.3}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "promptum"
-version = "0.0.1"
+version = "0.0.3"
 description = "Async LLM benchmarking library with protocol-based extensibility"
 readme = "README.md"
 requires-python = ">=3.13"
@@ -18,8 +18,6 @@ classifiers = [
 ]
 dependencies = [
     "httpx>=0.27.0",
-    "pyyaml>=6.0",
-    "jinja2>=3.1.0",
 ]
 
 [project.urls]
promptum-0.0.3/src/promptum/__init__.py (new file)
@@ -0,0 +1,30 @@
+from promptum.benchmark import Benchmark, Report, Runner, Summary, TestCase, TestResult
+from promptum.providers import LLMProvider, Metrics, OpenRouterClient, RetryConfig, RetryStrategy
+from promptum.validation import (
+    Contains,
+    ExactMatch,
+    JsonSchema,
+    Regex,
+    Validator,
+)
+
+__version__ = "0.0.3"
+
+__all__ = [
+    "TestCase",
+    "TestResult",
+    "Summary",
+    "Metrics",
+    "RetryConfig",
+    "RetryStrategy",
+    "Validator",
+    "ExactMatch",
+    "Contains",
+    "Regex",
+    "JsonSchema",
+    "LLMProvider",
+    "OpenRouterClient",
+    "Runner",
+    "Benchmark",
+    "Report",
+]
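
The new root `__init__.py` re-exports the whole public API, so 0.0.3 user code can import everything from `promptum` directly instead of reaching into submodules. A short sketch using only names from the `__all__` list above:

```python
# All of these names come straight from promptum/__init__.py shown above.
from promptum import (
    Benchmark,
    Contains,
    OpenRouterClient,
    RetryConfig,
    Summary,
    TestCase,
)
```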
promptum-0.0.3/src/promptum/benchmark/__init__.py (new file)
@@ -0,0 +1,8 @@
+from promptum.benchmark.benchmark import Benchmark
+from promptum.benchmark.report import Report
+from promptum.benchmark.result import TestResult
+from promptum.benchmark.runner import Runner
+from promptum.benchmark.summary import Summary
+from promptum.benchmark.test_case import TestCase
+
+__all__ = ["Benchmark", "Report", "Runner", "Summary", "TestCase", "TestResult"]
{promptum-0.0.1 → promptum-0.0.3}/src/promptum/benchmark/benchmark.py
@@ -1,11 +1,10 @@
 import asyncio
 from collections.abc import Callable, Sequence
-from typing import Any
 
 from promptum.benchmark.report import Report
-from promptum.
-from promptum.
-from promptum.
+from promptum.benchmark.result import TestResult
+from promptum.benchmark.runner import Runner
+from promptum.benchmark.test_case import TestCase
 from promptum.providers.protocol import LLMProvider
 
 
@@ -29,12 +28,12 @@ class Benchmark:
     def add_tests(self, test_cases: Sequence[TestCase]) -> None:
         self._test_cases.extend(test_cases)
 
-    def run(self
-        return asyncio.run(self.run_async(
+    def run(self) -> Report:
+        return asyncio.run(self.run_async())
 
-    async def run_async(self
+    async def run_async(self) -> Report:
         if not self._test_cases:
-            return Report(results=[]
+            return Report(results=[])
 
         runner = Runner(
             provider=self.provider,
@@ -44,7 +43,4 @@ class Benchmark:
 
         results = await runner.run(self._test_cases)
 
-        return Report(
-            results=results,
-            metadata=metadata or {},
-        )
+        return Report(results=results)
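
After this change both entry points are argument-free and return a `Report`; the old `metadata` plumbing is gone. A small sketch of the two call styles, assuming a `Benchmark` has already been populated with test cases:

```python
from promptum import Benchmark, Report


def collect(benchmark: Benchmark) -> Report:
    # Synchronous callers: run() wraps asyncio.run(run_async()), as in the diff above.
    return benchmark.run()


async def collect_async(benchmark: Benchmark) -> Report:
    # Callers already inside an event loop await run_async() directly.
    return await benchmark.run_async()
```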
{promptum-0.0.1 → promptum-0.0.3}/src/promptum/benchmark/report.py
@@ -1,16 +1,15 @@
 from collections.abc import Callable, Sequence
 from dataclasses import dataclass
-from typing import Any
 
-from promptum.
+from promptum.benchmark.result import TestResult
+from promptum.benchmark.summary import Summary
 
 
 @dataclass(frozen=True, slots=True)
 class Report:
     results: Sequence[TestResult]
-    metadata: dict[str, Any]
 
-    def get_summary(self) ->
+    def get_summary(self) -> Summary:
         total = len(self.results)
         passed = sum(1 for r in self.results if r.passed)
 
@@ -18,18 +17,17 @@ class Report:
         total_cost = sum(r.metrics.cost_usd or 0 for r in self.results if r.metrics)
         total_tokens = sum(r.metrics.total_tokens or 0 for r in self.results if r.metrics)
 
-        return
-
-
-
-
-
-
-
-
-
-
-        }
+        return Summary(
+            total=total,
+            passed=passed,
+            failed=total - passed,
+            pass_rate=passed / total if total > 0 else 0,
+            avg_latency_ms=sum(latencies) / len(latencies) if latencies else 0,
+            min_latency_ms=min(latencies) if latencies else 0,
+            max_latency_ms=max(latencies) if latencies else 0,
+            total_cost_usd=total_cost,
+            total_tokens=total_tokens,
+        )
 
     def filter(
         self,
@@ -49,7 +47,7 @@ class Report:
         if passed is not None:
             filtered = [r for r in filtered if r.passed == passed]
 
-        return Report(results=filtered
+        return Report(results=filtered)
 
     def group_by(self, key: Callable[[TestResult], str]) -> dict[str, "Report"]:
         groups: dict[str, list[TestResult]] = {}
@@ -60,16 +58,4 @@ class Report:
                 groups[group_key] = []
             groups[group_key].append(result)
 
-        return {k: Report(results=v
-
-    def compare_models(self) -> dict[str, dict[str, Any]]:
-        by_model = self.group_by(lambda r: r.test_case.model)
-        return {model: report.get_summary() for model, report in by_model.items()}
-
-    @staticmethod
-    def _percentile(values: list[float], p: float) -> float:
-        if not values:
-            return 0
-        sorted_values = sorted(values)
-        index = int(len(sorted_values) * p)
-        return sorted_values[min(index, len(sorted_values) - 1)]
+        return {k: Report(results=v) for k, v in groups.items()}
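
`Report.compare_models()` and the unused `_percentile()` helper are removed, but the deleted `compare_models()` body was just `group_by()` plus `get_summary()`, so callers can rebuild it with the surviving API. A hedged equivalent (the standalone helper name is ours, not the library's):

```python
from promptum import Report, Summary


def compare_models(report: Report) -> dict[str, Summary]:
    # Reproduces the removed Report.compare_models() with methods that still exist:
    # group results by model, then summarise each group.
    by_model = report.group_by(lambda r: r.test_case.model)
    return {model: model_report.get_summary() for model, model_report in by_model.items()}
```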
{promptum-0.0.1/src/promptum/core → promptum-0.0.3/src/promptum/benchmark}/result.py
@@ -2,8 +2,8 @@ from dataclasses import dataclass, field
 from datetime import datetime
 from typing import Any
 
-from promptum.
-from promptum.
+from promptum.benchmark.test_case import TestCase
+from promptum.providers.metrics import Metrics
 
 
 @dataclass(frozen=True, slots=True)
{promptum-0.0.1/src/promptum/execution → promptum-0.0.3/src/promptum/benchmark}/runner.py
@@ -3,8 +3,8 @@ from collections.abc import Callable, Sequence
 
 import httpx
 
-from promptum.
-from promptum.
+from promptum.benchmark.result import TestResult
+from promptum.benchmark.test_case import TestCase
 from promptum.providers.protocol import LLMProvider
 
 
@@ -37,7 +37,6 @@ class Runner:
 
         results = await asyncio.gather(
            *[run_with_semaphore(tc) for tc in test_cases],
-            return_exceptions=False,
         )
 
         return list(results)
promptum-0.0.3/src/promptum/benchmark/summary.py (new file)
@@ -0,0 +1,14 @@
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True, slots=True)
+class Summary:
+    total: int
+    passed: int
+    failed: int
+    pass_rate: float
+    avg_latency_ms: float
+    min_latency_ms: float
+    max_latency_ms: float
+    total_cost_usd: float
+    total_tokens: int
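
`Summary` replaces the dict-shaped summary that `get_summary()` returned in 0.0.1, so results are read as attributes rather than keys. A small formatting sketch that touches every field defined above (the function name is ours, for illustration only):

```python
from promptum import Summary


def format_summary(summary: Summary) -> str:
    # Every attribute below is a field of the Summary dataclass above.
    return (
        f"{summary.passed}/{summary.total} passed, {summary.failed} failed "
        f"({summary.pass_rate:.0%}); latency {summary.avg_latency_ms:.0f}ms avg, "
        f"{summary.min_latency_ms:.0f}-{summary.max_latency_ms:.0f}ms range; "
        f"{summary.total_tokens} tokens, ${summary.total_cost_usd:.6f}"
    )
```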