promptum 0.0.1.tar.gz → 0.0.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promptum-0.0.2/CONTRIBUTING.md +78 -0
- {promptum-0.0.1 → promptum-0.0.2}/Justfile +0 -4
- {promptum-0.0.1 → promptum-0.0.2}/PKG-INFO +14 -23
- {promptum-0.0.1 → promptum-0.0.2}/README.md +13 -20
- {promptum-0.0.1 → promptum-0.0.2}/pyproject.toml +1 -3
- promptum-0.0.2/src/promptum/__init__.py +29 -0
- promptum-0.0.2/src/promptum/benchmark/__init__.py +7 -0
- {promptum-0.0.1 → promptum-0.0.2}/src/promptum/benchmark/benchmark.py +8 -12
- {promptum-0.0.1 → promptum-0.0.2}/src/promptum/benchmark/report.py +5 -6
- {promptum-0.0.1/src/promptum/core → promptum-0.0.2/src/promptum/benchmark}/result.py +2 -2
- {promptum-0.0.1/src/promptum/execution → promptum-0.0.2/src/promptum/benchmark}/runner.py +2 -3
- {promptum-0.0.1/src/promptum/core → promptum-0.0.2/src/promptum/benchmark}/test_case.py +1 -1
- {promptum-0.0.1 → promptum-0.0.2}/src/promptum/providers/__init__.py +5 -0
- {promptum-0.0.1 → promptum-0.0.2}/src/promptum/providers/openrouter.py +3 -3
- {promptum-0.0.1 → promptum-0.0.2}/src/promptum/providers/protocol.py +1 -1
- {promptum-0.0.1 → promptum-0.0.2}/src/promptum/validation/validators.py +0 -18
- {promptum-0.0.1 → promptum-0.0.2}/tests/benchmark/conftest.py +3 -3
- {promptum-0.0.1 → promptum-0.0.2}/tests/benchmark/test_report_summary.py +1 -1
- {promptum-0.0.1/tests/core → promptum-0.0.2/tests/benchmark}/test_test_case.py +1 -1
- promptum-0.0.2/tests/conftest.py +1 -0
- promptum-0.0.2/tests/providers/__init__.py +1 -0
- {promptum-0.0.1/tests/core → promptum-0.0.2/tests/providers}/conftest.py +1 -1
- {promptum-0.0.1/tests/core → promptum-0.0.2/tests/providers}/test_metrics.py +1 -1
- {promptum-0.0.1/tests/core → promptum-0.0.2/tests/providers}/test_retry.py +1 -1
- {promptum-0.0.1 → promptum-0.0.2}/uv.lock +1 -107
- promptum-0.0.1/src/promptum/__init__.py +0 -44
- promptum-0.0.1/src/promptum/benchmark/__init__.py +0 -4
- promptum-0.0.1/src/promptum/core/__init__.py +0 -12
- promptum-0.0.1/src/promptum/execution/__init__.py +0 -3
- promptum-0.0.1/src/promptum/serialization/__init__.py +0 -11
- promptum-0.0.1/src/promptum/serialization/base.py +0 -48
- promptum-0.0.1/src/promptum/serialization/html.py +0 -52
- promptum-0.0.1/src/promptum/serialization/json.py +0 -28
- promptum-0.0.1/src/promptum/serialization/protocol.py +0 -13
- promptum-0.0.1/src/promptum/serialization/report_template.html +0 -293
- promptum-0.0.1/src/promptum/serialization/yaml.py +0 -17
- promptum-0.0.1/src/promptum/storage/__init__.py +0 -7
- promptum-0.0.1/src/promptum/storage/file.py +0 -157
- promptum-0.0.1/src/promptum/storage/protocol.py +0 -23
- promptum-0.0.1/tests/conftest.py +0 -40
- promptum-0.0.1/tests/validation/__init__.py +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/.coveragerc +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/.github/workflows/lint.yml +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/.github/workflows/publish-test.yml +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/.github/workflows/publish.yml +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/.github/workflows/test.yml +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/.github/workflows/typecheck.yml +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/.gitignore +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/.python-version +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/LICENSE +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/pytest.ini +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/ruff.toml +0 -0
- {promptum-0.0.1/src/promptum/core → promptum-0.0.2/src/promptum/providers}/metrics.py +0 -0
- {promptum-0.0.1/src/promptum/core → promptum-0.0.2/src/promptum/providers}/retry.py +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/src/promptum/py.typed +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/src/promptum/validation/__init__.py +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/src/promptum/validation/protocol.py +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/tests/__init__.py +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/tests/benchmark/__init__.py +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/tests/benchmark/test_report_filtering.py +0 -0
- {promptum-0.0.1/tests/core → promptum-0.0.2/tests/validation}/__init__.py +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/tests/validation/conftest.py +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/tests/validation/test_contains.py +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/tests/validation/test_exact_match.py +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/tests/validation/test_json_schema.py +0 -0
- {promptum-0.0.1 → promptum-0.0.2}/tests/validation/test_regex.py +0 -0

--- /dev/null
+++ promptum-0.0.2/CONTRIBUTING.md
@@ -0,0 +1,78 @@
+# Contributing to Promptum
+
+Thank you for your interest in contributing to Promptum! We welcome contributions from the community.
+
+## Getting Started
+
+1. **Fork the repository** to your own GitHub account
+2. **Clone your fork** locally:
+   ```bash
+   git clone https://github.com/YOUR_USERNAME/promptum.git
+   cd promptum
+   ```
+3. **Set up the development environment**:
+   ```bash
+   just sync # Install/sync dependencies
+   ```
+
+## Making Changes
+
+### Branch Naming
+
+Create a new branch named after the issue number you're working on:
+
+```bash
+git checkout -b 42 # For issue #42
+```
+
+### One PR = One Issue
+
+Each pull request should address exactly one issue. If you want to work on multiple issues, create separate branches and PRs for each.
+
+### Work in Progress
+
+If your PR is not ready for review, add `[WIP]` to the title:
+
+```
+[WIP] #42: Fix retry logic in OpenRouterClient
+```
+
+Remove `[WIP]` when the PR is ready for review.
+
+## Submitting Changes
+
+1. **Run tests and linting** before committing:
+   ```bash
+   just lint # Lint and auto-fix
+   just typecheck # Type check
+   just test # Run tests
+   ```
+
+2. **Commit your changes** with clear, descriptive messages:
+   ```bash
+   git commit -m "#42: Fix retry logic in OpenRouterClient"
+   ```
+
+3. **Push to your fork**:
+   ```bash
+   git push origin 42
+   ```
+
+4. **Create a Pull Request** from your fork to the main repository
+
+5. **Tag the maintainer** (@deyna256) in a comment when your PR is ready for review
+
+## CI Requirements
+
+Pull requests must pass all CI checks before review. The maintainer will not review PRs with failing checks.
+
+CI runs:
+- Linting
+- Type checking
+- Tests
+
+## Questions?
+
+Feel free to ask questions in the issue comments or open a discussion.
+
+Thank you for contributing!

--- promptum-0.0.1/Justfile
+++ promptum-0.0.2/Justfile
@@ -23,10 +23,6 @@ cov-html:
     uv run pytest tests/ --cov-report=html
     xdg-open htmlcov/index.html

-# Open benchmark HTML report
-report:
-    xdg-open results/report.html
-
 # Clean up generated files and caches
 clean:
     rm -rf .pytest_cache .ruff_cache .coverage htmlcov results/

--- promptum-0.0.1/PKG-INFO
+++ promptum-0.0.2/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: promptum
-Version: 0.0.1
+Version: 0.0.2
 Summary: Async LLM benchmarking library with protocol-based extensibility
 Project-URL: Homepage, https://github.com/deyna256/promptum
 Project-URL: Repository, https://github.com/deyna256/promptum

@@ -36,8 +36,6 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.13
 Requires-Dist: httpx>=0.27.0
-Requires-Dist: jinja2>=3.1.0
-Requires-Dist: pyyaml>=6.0
 Description-Content-Type: text/markdown

 # promptum

@@ -97,15 +95,12 @@ for attempt in range(max_retries):
         break
     except Exception:
         sleep(2 ** attempt)
-
-# Export results manually
-json.dump(results, open("results.json", "w"))
 ```

 **After promptum:**
 ```python
 report = await benchmark.run_async()
-
+summary = report.get_summary()  # Metrics captured automatically
 ```

 ---

@@ -151,14 +146,13 @@ python your_script.py

 ## What You Get

-
-
-
-
-
-
-
-✅ **Zero Config** - No YAML files, no setup scripts, just Python
+- [x] **One API for 100+ Models** - OpenRouter support out of the box (OpenAI, Anthropic, Google, etc.)
+- [x] **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own
+- [x] **Automatic Retries** - Exponential/linear backoff with configurable attempts
+- [x] **Metrics Tracking** - Latency, tokens, cost - automatically captured
+- [x] **Async by Default** - Run 100 tests in parallel without breaking a sweat
+- [x] **Type Safe** - Full type hints, catches errors before runtime
+- [x] **Zero Config** - No YAML files, no setup scripts, just Python

 ---

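The "Automatic Retries" bullet above names exponential and linear backoff. `RetryConfig`'s actual fields never appear in this diff, so the standalone sketch below only models the two schedules; every name in it is hypothetical.

```python
# Hypothetical sketch of the two backoff schedules the feature list names;
# promptum's real RetryConfig fields are not visible in this diff.
def backoff_delays(attempts: int, base_delay: float = 1.0, exponential: bool = True) -> list[float]:
    if exponential:
        return [base_delay * (2**i) for i in range(attempts)]  # 1s, 2s, 4s, ...
    return [base_delay * (i + 1) for i in range(attempts)]     # 1s, 2s, 3s, ...

assert backoff_delays(3) == [1.0, 2.0, 4.0]
assert backoff_delays(3, exponential=False) == [1.0, 2.0, 3.0]
```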
@@ -193,14 +187,11 @@ tests = [
 benchmark.add_tests(tests)
 report = await benchmark.run_async()

-#
-
-
-open("comparison.html", "w").write(html)
+# Side-by-side model comparison
+for model, summary in report.compare_models().items():
+    print(f"{model}: {summary['pass_rate']:.0%} pass rate, {summary['avg_latency_ms']:.0f}ms avg")
 ```

-Open `comparison.html` in your browser - see side-by-side model performance with charts.
-
 ---

 ## Use Cases

@@ -252,7 +243,7 @@ Found a bug? Want a feature? PRs welcome!

 ```bash
 # Development setup
-git clone https://github.com/
+git clone https://github.com/deyna256/promptum.git
 cd promptum
 just sync # Install dependencies
 just test # Run tests

@@ -273,7 +264,7 @@ MIT - do whatever you want with it.

 <div align="center">

-**[⭐ Star on GitHub](https://github.com/
+**[⭐ Star on GitHub](https://github.com/deyna256/promptum)** | **[🐛 Report Bug](https://github.com/deyna256/promptum/issues)** | **[💡 Request Feature](https://github.com/deyna256/promptum/issues)**

 Made for developers who value their time.

--- promptum-0.0.1/README.md
+++ promptum-0.0.2/README.md
@@ -55,15 +55,12 @@ for attempt in range(max_retries):
         break
     except Exception:
         sleep(2 ** attempt)
-
-# Export results manually
-json.dump(results, open("results.json", "w"))
 ```

 **After promptum:**
 ```python
 report = await benchmark.run_async()
-
+summary = report.get_summary()  # Metrics captured automatically
 ```

 ---

@@ -109,14 +106,13 @@ python your_script.py

 ## What You Get

-
-
-
-
-
-
-
-✅ **Zero Config** - No YAML files, no setup scripts, just Python
+- [x] **One API for 100+ Models** - OpenRouter support out of the box (OpenAI, Anthropic, Google, etc.)
+- [x] **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own
+- [x] **Automatic Retries** - Exponential/linear backoff with configurable attempts
+- [x] **Metrics Tracking** - Latency, tokens, cost - automatically captured
+- [x] **Async by Default** - Run 100 tests in parallel without breaking a sweat
+- [x] **Type Safe** - Full type hints, catches errors before runtime
+- [x] **Zero Config** - No YAML files, no setup scripts, just Python

 ---

@@ -151,14 +147,11 @@ tests = [
 benchmark.add_tests(tests)
 report = await benchmark.run_async()

-#
-
-
-open("comparison.html", "w").write(html)
+# Side-by-side model comparison
+for model, summary in report.compare_models().items():
+    print(f"{model}: {summary['pass_rate']:.0%} pass rate, {summary['avg_latency_ms']:.0f}ms avg")
 ```

-Open `comparison.html` in your browser - see side-by-side model performance with charts.
-
 ---

 ## Use Cases

@@ -210,7 +203,7 @@ Found a bug? Want a feature? PRs welcome!

 ```bash
 # Development setup
-git clone https://github.com/
+git clone https://github.com/deyna256/promptum.git
 cd promptum
 just sync # Install dependencies
 just test # Run tests

@@ -231,7 +224,7 @@ MIT - do whatever you want with it.

 <div align="center">

-**[⭐ Star on GitHub](https://github.com/
+**[⭐ Star on GitHub](https://github.com/deyna256/promptum)** | **[🐛 Report Bug](https://github.com/deyna256/promptum/issues)** | **[💡 Request Feature](https://github.com/deyna256/promptum/issues)**

 Made for developers who value their time.

--- promptum-0.0.1/pyproject.toml
+++ promptum-0.0.2/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "promptum"
-version = "0.0.1"
+version = "0.0.2"
 description = "Async LLM benchmarking library with protocol-based extensibility"
 readme = "README.md"
 requires-python = ">=3.13"

@@ -18,8 +18,6 @@ classifiers = [
 ]
 dependencies = [
     "httpx>=0.27.0",
-    "pyyaml>=6.0",
-    "jinja2>=3.1.0",
 ]

 [project.urls]

--- /dev/null
+++ promptum-0.0.2/src/promptum/__init__.py
@@ -0,0 +1,29 @@
+from promptum.benchmark import Benchmark, Report, Runner, TestCase, TestResult
+from promptum.providers import LLMProvider, Metrics, OpenRouterClient, RetryConfig, RetryStrategy
+from promptum.validation import (
+    Contains,
+    ExactMatch,
+    JsonSchema,
+    Regex,
+    Validator,
+)
+
+__version__ = "0.0.1"
+
+__all__ = [
+    "TestCase",
+    "TestResult",
+    "Metrics",
+    "RetryConfig",
+    "RetryStrategy",
+    "Validator",
+    "ExactMatch",
+    "Contains",
+    "Regex",
+    "JsonSchema",
+    "LLMProvider",
+    "OpenRouterClient",
+    "Runner",
+    "Benchmark",
+    "Report",
+]
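The new top-level `__init__.py` flattens the public API into a single import surface. A hypothetical end-to-end sketch follows; the exported names come from this diff, but the constructor arguments are assumptions, not the documented API.

```python
# Hypothetical usage sketch of the flattened 0.0.2 import surface.
# Exported names are from this diff; constructor arguments are assumptions.
from promptum import Benchmark, Contains, OpenRouterClient, TestCase

provider = OpenRouterClient(api_key="sk-or-...")  # parameter name assumed
benchmark = Benchmark(provider=provider)          # `provider` appears in benchmark.py below
benchmark.add_tests([
    TestCase(  # field names assumed; only `model` is visible (in report.py)
        prompt="Reply with the word pong.",
        model="openai/gpt-4o-mini",
        validator=Contains("pong"),
    ),
])
report = benchmark.run()     # sync wrapper over run_async(), per benchmark.py
print(report.get_summary())  # metrics captured automatically, per the README
```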
--- /dev/null
+++ promptum-0.0.2/src/promptum/benchmark/__init__.py
@@ -0,0 +1,7 @@
+from promptum.benchmark.benchmark import Benchmark
+from promptum.benchmark.report import Report
+from promptum.benchmark.result import TestResult
+from promptum.benchmark.runner import Runner
+from promptum.benchmark.test_case import TestCase
+
+__all__ = ["Benchmark", "Report", "Runner", "TestCase", "TestResult"]

--- promptum-0.0.1/src/promptum/benchmark/benchmark.py
+++ promptum-0.0.2/src/promptum/benchmark/benchmark.py
@@ -1,11 +1,10 @@
 import asyncio
 from collections.abc import Callable, Sequence
-from typing import Any

 from promptum.benchmark.report import Report
-from promptum.core.result import TestResult
-from promptum.execution.runner import Runner
-from promptum.core.test_case import TestCase
+from promptum.benchmark.result import TestResult
+from promptum.benchmark.runner import Runner
+from promptum.benchmark.test_case import TestCase
 from promptum.providers.protocol import LLMProvider

@@ -29,12 +28,12 @@ class Benchmark:
     def add_tests(self, test_cases: Sequence[TestCase]) -> None:
         self._test_cases.extend(test_cases)

-    def run(self, metadata: dict[str, Any] | None = None) -> Report:
-        return asyncio.run(self.run_async(metadata))
+    def run(self) -> Report:
+        return asyncio.run(self.run_async())

-    async def run_async(self, metadata: dict[str, Any] | None = None) -> Report:
+    async def run_async(self) -> Report:
         if not self._test_cases:
-            return Report(results=[], metadata=metadata or {})
+            return Report(results=[])

         runner = Runner(
             provider=self.provider,
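The `run()`/`run_async()` pair above is the standard sync-over-async wrapper. A minimal standalone illustration of the pattern (the `Job` class is hypothetical):

```python
import asyncio

class Job:
    """Minimal illustration of the sync-over-async pattern Benchmark uses."""

    async def run_async(self) -> str:
        await asyncio.sleep(0)  # stand-in for real async work
        return "done"

    def run(self) -> str:
        # asyncio.run() creates and owns the event loop, so this entry point
        # is for synchronous callers only; it fails inside a running loop.
        return asyncio.run(self.run_async())

print(Job().run())  # -> done
```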
@@ -44,7 +43,4 @@
 
         results = await runner.run(self._test_cases)

-        return Report(
-            results=results,
-            metadata=metadata or {},
-        )
+        return Report(results=results)

--- promptum-0.0.1/src/promptum/benchmark/report.py
+++ promptum-0.0.2/src/promptum/benchmark/report.py
@@ -2,13 +2,12 @@ from collections.abc import Callable, Sequence
 from dataclasses import dataclass
 from typing import Any

-from promptum.core.result import TestResult
+from promptum.benchmark.result import TestResult


 @dataclass(frozen=True, slots=True)
 class Report:
     results: Sequence[TestResult]
-    metadata: dict[str, Any]

     def get_summary(self) -> dict[str, Any]:
         total = len(self.results)
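With `metadata` gone, `Report` is a single-field frozen, slotted dataclass. A minimal reproduction of what that buys, immutability enforced at runtime:

```python
from collections.abc import Sequence
from dataclasses import FrozenInstanceError, dataclass

# Minimal reproduction of the Report shape after the metadata removal.
@dataclass(frozen=True, slots=True)
class Report:
    results: Sequence[str]

r = Report(results=("ok",))
try:
    r.results = ()  # frozen=True rejects any attribute assignment
except FrozenInstanceError:
    print("Report instances are immutable")
```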
@@ -49,7 +48,7 @@
         if passed is not None:
             filtered = [r for r in filtered if r.passed == passed]

-        return Report(results=filtered, metadata=self.metadata)
+        return Report(results=filtered)

     def group_by(self, key: Callable[[TestResult], str]) -> dict[str, "Report"]:
         groups: dict[str, list[TestResult]] = {}

@@ -60,7 +59,7 @@
                 groups[group_key] = []
             groups[group_key].append(result)

-        return {k: Report(results=v, metadata=self.metadata) for k, v in groups.items()}
+        return {k: Report(results=v) for k, v in groups.items()}

     def compare_models(self) -> dict[str, dict[str, Any]]:
         by_model = self.group_by(lambda r: r.test_case.model)

@@ -71,5 +70,5 @@
         if not values:
             return 0
         sorted_values = sorted(values)
-        index = int(len(sorted_values) * p)
-        return sorted_values[
+        index = int((len(sorted_values) - 1) * p)
+        return sorted_values[index]
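This hunk fixes an off-by-one: with `n` values, `int(n * p)` evaluates to `n` at `p = 1.0`, one past the last valid index, while `int((n - 1) * p)` stays within `[0, n - 1]` for any `p` in `[0, 1]`. A standalone check of the corrected formula:

```python
# Standalone check of the corrected 0.0.2 percentile index formula.
def percentile(values: list[float], p: float) -> float:
    sorted_values = sorted(values)
    index = int((len(sorted_values) - 1) * p)  # max index is len - 1, even at p = 1.0
    return sorted_values[index]

latencies = [120.0, 250.0, 310.0, 980.0]
assert percentile(latencies, 0.5) == 250.0
assert percentile(latencies, 1.0) == 980.0  # int(len * 1.0) would index past the end
```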
--- promptum-0.0.1/src/promptum/core/result.py
+++ promptum-0.0.2/src/promptum/benchmark/result.py
@@ -2,8 +2,8 @@ from dataclasses import dataclass, field
 from datetime import datetime
 from typing import Any

-from promptum.core.test_case import TestCase
-from promptum.core.metrics import Metrics
+from promptum.benchmark.test_case import TestCase
+from promptum.providers.metrics import Metrics


 @dataclass(frozen=True, slots=True)

--- promptum-0.0.1/src/promptum/execution/runner.py
+++ promptum-0.0.2/src/promptum/benchmark/runner.py
@@ -3,8 +3,8 @@ from collections.abc import Callable, Sequence

 import httpx

-from promptum.core.result import TestResult
-from promptum.core.test_case import TestCase
+from promptum.benchmark.result import TestResult
+from promptum.benchmark.test_case import TestCase
 from promptum.providers.protocol import LLMProvider

@@ -37,7 +37,6 @@ class Runner:

         results = await asyncio.gather(
             *[run_with_semaphore(tc) for tc in test_cases],
-            return_exceptions=False,
         )

         return list(results)
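Dropping `return_exceptions=False` is behavior-preserving: `False` is `asyncio.gather`'s default, so the first test-case exception still propagates to the caller. For reference:

```python
import asyncio

async def main() -> None:
    # return_exceptions defaults to False: gather re-raises the first failure
    # instead of collecting exceptions into the result list.
    results = await asyncio.gather(*(asyncio.sleep(0, result=i) for i in range(3)))
    assert results == [0, 1, 2]

asyncio.run(main())
```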
--- promptum-0.0.1/src/promptum/providers/__init__.py
+++ promptum-0.0.2/src/promptum/providers/__init__.py
@@ -1,7 +1,12 @@
+from promptum.providers.metrics import Metrics
 from promptum.providers.openrouter import OpenRouterClient
 from promptum.providers.protocol import LLMProvider
+from promptum.providers.retry import RetryConfig, RetryStrategy

 __all__ = [
     "LLMProvider",
+    "Metrics",
     "OpenRouterClient",
+    "RetryConfig",
+    "RetryStrategy",
 ]

--- promptum-0.0.1/src/promptum/providers/openrouter.py
+++ promptum-0.0.2/src/promptum/providers/openrouter.py
@@ -4,8 +4,8 @@ from typing import Any

 import httpx

-from promptum.core.metrics import Metrics
-from promptum.core.retry import RetryConfig, RetryStrategy
+from promptum.providers.metrics import Metrics
+from promptum.providers.retry import RetryConfig, RetryStrategy


 class OpenRouterClient:
@@ -61,7 +61,7 @@ class OpenRouterClient:
             "messages": messages,
             "temperature": temperature,
         }
-        if max_tokens:
+        if max_tokens is not None:
             payload["max_tokens"] = max_tokens
         payload.update(kwargs)

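The `is not None` change fixes a falsy-zero bug: a caller passing `max_tokens=0` was silently dropped by the old truthiness check. Standalone demonstration:

```python
# max_tokens=0 is falsy, so the old check dropped it from the request payload.
max_tokens = 0
payload: dict[str, object] = {}

if max_tokens:  # old check: 0 is falsy, key never set
    payload["max_tokens"] = max_tokens
assert "max_tokens" not in payload

if max_tokens is not None:  # new check: 0 survives
    payload["max_tokens"] = max_tokens
assert payload["max_tokens"] == 0
```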
--- promptum-0.0.1/src/promptum/validation/validators.py
+++ promptum-0.0.2/src/promptum/validation/validators.py
@@ -88,21 +88,3 @@ class JsonSchema:
             keys = ", ".join(self.required_keys)
             return f"Valid JSON with keys: {keys}"
         return "Valid JSON object"
-
-
-@dataclass(frozen=True, slots=True)
-class PlaceholderValidator:
-    """
-    Placeholder validator for deserialized reports.
-
-    Used when original validator cannot be reconstructed from storage.
-    Always returns True. Original validator logic is not preserved.
-    """
-
-    description: str
-
-    def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
-        return True, {"placeholder": True, "note": "Original validator could not be reconstructed"}
-
-    def describe(self) -> str:
-        return self.description
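`PlaceholderValidator` existed only to back the deleted serialization layer, but its removed body is the one place this diff shows the validator interface: `validate()` returning a `(passed, details)` tuple and `describe()` returning a label. A custom-validator sketch mirroring that shape (the `MaxLength` class is hypothetical, and promptum's actual `Validator` protocol may differ):

```python
from dataclasses import dataclass
from typing import Any

# Hypothetical validator following the shape visible in the removed code:
# validate() -> (passed, details), describe() -> human-readable label.
@dataclass(frozen=True, slots=True)
class MaxLength:
    limit: int

    def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
        return len(response) <= self.limit, {"length": len(response), "limit": self.limit}

    def describe(self) -> str:
        return f"Response no longer than {self.limit} characters"

ok, details = MaxLength(limit=10).validate("hello")
assert ok and details["length"] == 5
```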
--- promptum-0.0.1/tests/benchmark/conftest.py
+++ promptum-0.0.2/tests/benchmark/conftest.py
@@ -2,8 +2,8 @@ from datetime import datetime

 import pytest

-from promptum.benchmark import Report
-from promptum.core import Metrics, TestCase, TestResult
+from promptum.benchmark import Report, TestCase, TestResult
+from promptum.providers import Metrics
 from promptum.validation import Contains

@@ -57,4 +57,4 @@ def sample_results() -> list[TestResult]:

 @pytest.fixture
 def sample_report(sample_results: list[TestResult]) -> Report:
-    return Report(results=sample_results, metadata={})
+    return Report(results=sample_results)

--- /dev/null
+++ promptum-0.0.2/tests/conftest.py
@@ -0,0 +1 @@
+
--- /dev/null
+++ promptum-0.0.2/tests/providers/__init__.py
@@ -0,0 +1 @@
+