promptum 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promptum/__init__.py +3 -18
- promptum/benchmark/__init__.py +4 -1
- promptum/benchmark/benchmark.py +8 -12
- promptum/benchmark/report.py +5 -6
- promptum/{core → benchmark}/result.py +2 -2
- promptum/{execution → benchmark}/runner.py +2 -3
- promptum/{core → benchmark}/test_case.py +1 -1
- promptum/providers/__init__.py +5 -0
- promptum/providers/openrouter.py +3 -3
- promptum/providers/protocol.py +1 -1
- promptum/validation/validators.py +0 -18
- {promptum-0.0.1.dist-info → promptum-0.0.2.dist-info}/METADATA +14 -23
- promptum-0.0.2.dist-info/RECORD +20 -0
- promptum/core/__init__.py +0 -12
- promptum/execution/__init__.py +0 -3
- promptum/serialization/__init__.py +0 -11
- promptum/serialization/base.py +0 -48
- promptum/serialization/html.py +0 -52
- promptum/serialization/json.py +0 -28
- promptum/serialization/protocol.py +0 -13
- promptum/serialization/report_template.html +0 -293
- promptum/serialization/yaml.py +0 -17
- promptum/storage/__init__.py +0 -7
- promptum/storage/file.py +0 -157
- promptum/storage/protocol.py +0 -23
- promptum-0.0.1.dist-info/RECORD +0 -32
- /promptum/{core → providers}/metrics.py +0 -0
- /promptum/{core → providers}/retry.py +0 -0
- {promptum-0.0.1.dist-info → promptum-0.0.2.dist-info}/WHEEL +0 -0
- {promptum-0.0.1.dist-info → promptum-0.0.2.dist-info}/licenses/LICENSE +0 -0
promptum/__init__.py
CHANGED
@@ -1,14 +1,5 @@
-from promptum.benchmark import Benchmark, Report
-from promptum.
-from promptum.execution import Runner
-from promptum.providers import LLMProvider, OpenRouterClient
-from promptum.serialization import (
-    HTMLSerializer,
-    JSONSerializer,
-    Serializer,
-    YAMLSerializer,
-)
-from promptum.storage import FileStorage, ResultStorage
+from promptum.benchmark import Benchmark, Report, Runner, TestCase, TestResult
+from promptum.providers import LLMProvider, Metrics, OpenRouterClient, RetryConfig, RetryStrategy
 from promptum.validation import (
     Contains,
     ExactMatch,
@@ -17,7 +8,7 @@ from promptum.validation import (
     Validator,
 )
 
-__version__ = "0.1
+__version__ = "0.0.1"
 
 __all__ = [
     "TestCase",
@@ -35,10 +26,4 @@ __all__ = [
     "Runner",
     "Benchmark",
     "Report",
-    "Serializer",
-    "JSONSerializer",
-    "YAMLSerializer",
-    "HTMLSerializer",
-    "ResultStorage",
-    "FileStorage",
 ]
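In 0.0.2 the `core`, `execution`, `serialization`, and `storage` subpackages disappear from the public surface; everything re-exported at the package root now comes from `promptum.benchmark`, `promptum.providers`, and `promptum.validation`. A minimal sketch of what imports look like against the layout shown above (illustrative only):

```python
# Top-level imports available in 0.0.2, per the rewritten __init__.py above.
from promptum import Benchmark, Report, Runner, TestCase, TestResult
from promptum import LLMProvider, Metrics, OpenRouterClient, RetryConfig, RetryStrategy

# The 0.0.1-era serializer/storage exports are gone from the package root:
# e.g. `from promptum import JSONSerializer` or `FileStorage` would now fail.
```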
promptum/benchmark/__init__.py
CHANGED
@@ -1,4 +1,7 @@
 from promptum.benchmark.benchmark import Benchmark
 from promptum.benchmark.report import Report
+from promptum.benchmark.result import TestResult
+from promptum.benchmark.runner import Runner
+from promptum.benchmark.test_case import TestCase
 
-__all__ = ["Benchmark", "Report"]
+__all__ = ["Benchmark", "Report", "Runner", "TestCase", "TestResult"]
promptum/benchmark/benchmark.py
CHANGED
@@ -1,11 +1,10 @@
 import asyncio
 from collections.abc import Callable, Sequence
-from typing import Any
 
 from promptum.benchmark.report import Report
-from promptum.
-from promptum.
-from promptum.
+from promptum.benchmark.result import TestResult
+from promptum.benchmark.runner import Runner
+from promptum.benchmark.test_case import TestCase
 from promptum.providers.protocol import LLMProvider
 
 
@@ -29,12 +28,12 @@ class Benchmark:
     def add_tests(self, test_cases: Sequence[TestCase]) -> None:
         self._test_cases.extend(test_cases)
 
-    def run(self
-        return asyncio.run(self.run_async(
+    def run(self) -> Report:
+        return asyncio.run(self.run_async())
 
-    async def run_async(self
+    async def run_async(self) -> Report:
         if not self._test_cases:
-            return Report(results=[]
+            return Report(results=[])
 
         runner = Runner(
             provider=self.provider,
@@ -44,7 +43,4 @@ class Benchmark:
 
         results = await runner.run(self._test_cases)
 
-        return Report(
-            results=results,
-            metadata=metadata or {},
-        )
+        return Report(results=results)
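`run()` and `run_async()` no longer take a metadata argument, and the `Report` they return no longer carries one. A hedged usage sketch consistent with the 0.0.2 signatures above (benchmark and provider construction elided; `benchmark` and `tests` are assumed to exist):

```python
# Assumes `benchmark` is a configured Benchmark and `tests` is a sequence of TestCase objects.
benchmark.add_tests(tests)
report = benchmark.run()        # sync wrapper; equivalent to asyncio.run(benchmark.run_async())
summary = report.get_summary()  # the per-run metadata dict is gone; summaries come from results
```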
promptum/benchmark/report.py
CHANGED
@@ -2,13 +2,12 @@ from collections.abc import Callable, Sequence
 from dataclasses import dataclass
 from typing import Any
 
-from promptum.
+from promptum.benchmark.result import TestResult
 
 
 @dataclass(frozen=True, slots=True)
 class Report:
     results: Sequence[TestResult]
-    metadata: dict[str, Any]
 
     def get_summary(self) -> dict[str, Any]:
         total = len(self.results)
@@ -49,7 +48,7 @@
         if passed is not None:
             filtered = [r for r in filtered if r.passed == passed]
 
-        return Report(results=filtered
+        return Report(results=filtered)
 
     def group_by(self, key: Callable[[TestResult], str]) -> dict[str, "Report"]:
         groups: dict[str, list[TestResult]] = {}
@@ -60,7 +59,7 @@
                 groups[group_key] = []
             groups[group_key].append(result)
 
-        return {k: Report(results=v
+        return {k: Report(results=v) for k, v in groups.items()}
 
     def compare_models(self) -> dict[str, dict[str, Any]]:
         by_model = self.group_by(lambda r: r.test_case.model)
@@ -71,5 +70,5 @@
         if not values:
             return 0
         sorted_values = sorted(values)
-        index = int(len(sorted_values) * p)
-        return sorted_values[
+        index = int((len(sorted_values) - 1) * p)
+        return sorted_values[index]
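The percentile helper previously computed `int(len(sorted_values) * p)`, which indexes one past the end for p = 1.0 and biases other percentiles upward; 0.0.2 scales by `len - 1` instead. A small illustration with hypothetical values:

```python
values = [120.0, 150.0, 300.0]
sorted_values = sorted(values)

p = 1.0
# 0.0.1 behaviour: int(3 * 1.0) == 3       -> IndexError on a 3-element list
# 0.0.2 behaviour: int((3 - 1) * 1.0) == 2 -> 300.0, the maximum, as expected for p100
index = int((len(sorted_values) - 1) * p)
assert sorted_values[index] == 300.0
```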
promptum/{core → benchmark}/result.py
CHANGED
@@ -2,8 +2,8 @@ from dataclasses import dataclass, field
 from datetime import datetime
 from typing import Any
 
-from promptum.
-from promptum.
+from promptum.benchmark.test_case import TestCase
+from promptum.providers.metrics import Metrics
 
 
 @dataclass(frozen=True, slots=True)

promptum/{execution → benchmark}/runner.py
CHANGED
@@ -3,8 +3,8 @@ from collections.abc import Callable, Sequence
 
 import httpx
 
-from promptum.
-from promptum.
+from promptum.benchmark.result import TestResult
+from promptum.benchmark.test_case import TestCase
 from promptum.providers.protocol import LLMProvider
 
 
@@ -37,7 +37,6 @@ class Runner:
 
         results = await asyncio.gather(
             *[run_with_semaphore(tc) for tc in test_cases],
-            return_exceptions=False,
         )
 
         return list(results)
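Dropping `return_exceptions=False` is purely cosmetic: that is already `asyncio.gather`'s default, so the first failing test case still propagates its exception. A standalone sketch with hypothetical coroutines:

```python
import asyncio


async def main() -> None:
    async def work(i: int) -> int:
        return i * 2

    # Same behaviour with or without return_exceptions=False (it is the default):
    results = await asyncio.gather(*[work(i) for i in range(3)])
    print(results)  # [0, 2, 4]


asyncio.run(main())
```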
promptum/providers/__init__.py
CHANGED
@@ -1,7 +1,12 @@
+from promptum.providers.metrics import Metrics
 from promptum.providers.openrouter import OpenRouterClient
 from promptum.providers.protocol import LLMProvider
+from promptum.providers.retry import RetryConfig, RetryStrategy
 
 __all__ = [
     "LLMProvider",
+    "Metrics",
     "OpenRouterClient",
+    "RetryConfig",
+    "RetryStrategy",
 ]
promptum/providers/openrouter.py
CHANGED
@@ -4,8 +4,8 @@ from typing import Any
 
 import httpx
 
-from promptum.
-from promptum.
+from promptum.providers.metrics import Metrics
+from promptum.providers.retry import RetryConfig, RetryStrategy
 
 
 class OpenRouterClient:
@@ -61,7 +61,7 @@ class OpenRouterClient:
             "messages": messages,
             "temperature": temperature,
         }
-        if max_tokens:
+        if max_tokens is not None:
             payload["max_tokens"] = max_tokens
         payload.update(kwargs)
 
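The old truthiness check silently dropped `max_tokens=0` (or any other falsy value) from the request payload; the `is not None` test only skips the key when the caller did not set it. An illustrative comparison, not the library's actual code:

```python
payload: dict[str, object] = {}
max_tokens = 0  # an explicit, if unusual, caller-supplied value

if max_tokens:                # 0.0.1: 0 is falsy, so the key is silently omitted
    payload["max_tokens"] = max_tokens

if max_tokens is not None:    # 0.0.2: only an unset (None) value is omitted
    payload["max_tokens"] = max_tokens
```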
promptum/providers/protocol.py
CHANGED
promptum/validation/validators.py
CHANGED
@@ -88,21 +88,3 @@ class JsonSchema:
             keys = ", ".join(self.required_keys)
             return f"Valid JSON with keys: {keys}"
         return "Valid JSON object"
-
-
-@dataclass(frozen=True, slots=True)
-class PlaceholderValidator:
-    """
-    Placeholder validator for deserialized reports.
-
-    Used when original validator cannot be reconstructed from storage.
-    Always returns True. Original validator logic is not preserved.
-    """
-
-    description: str
-
-    def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
-        return True, {"placeholder": True, "note": "Original validator could not be reconstructed"}
-
-    def describe(self) -> str:
-        return self.description

{promptum-0.0.1.dist-info → promptum-0.0.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: promptum
-Version: 0.0.
+Version: 0.0.2
 Summary: Async LLM benchmarking library with protocol-based extensibility
 Project-URL: Homepage, https://github.com/deyna256/promptum
 Project-URL: Repository, https://github.com/deyna256/promptum
@@ -36,8 +36,6 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.13
 Requires-Dist: httpx>=0.27.0
-Requires-Dist: jinja2>=3.1.0
-Requires-Dist: pyyaml>=6.0
 Description-Content-Type: text/markdown
 
 # promptum
@@ -97,15 +95,12 @@ for attempt in range(max_retries):
         break
     except Exception:
         sleep(2 ** attempt)
-
-# Export results manually
-json.dump(results, open("results.json", "w"))
 ```
 
 **After promptum:**
 ```python
 report = await benchmark.run_async()
-
+summary = report.get_summary()  # Metrics captured automatically
 ```
 
 ---
@@ -151,14 +146,13 @@ python your_script.py
 
 ## What You Get
 
-
-
-
-
-
-
-
-✅ **Zero Config** - No YAML files, no setup scripts, just Python
+- [x] **One API for 100+ Models** - OpenRouter support out of the box (OpenAI, Anthropic, Google, etc.)
+- [x] **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own
+- [x] **Automatic Retries** - Exponential/linear backoff with configurable attempts
+- [x] **Metrics Tracking** - Latency, tokens, cost - automatically captured
+- [x] **Async by Default** - Run 100 tests in parallel without breaking a sweat
+- [x] **Type Safe** - Full type hints, catches errors before runtime
+- [x] **Zero Config** - No YAML files, no setup scripts, just Python
 
 ---
 
@@ -193,14 +187,11 @@ tests = [
 benchmark.add_tests(tests)
 report = await benchmark.run_async()
 
-#
-
-
-open("comparison.html", "w").write(html)
+# Side-by-side model comparison
+for model, summary in report.compare_models().items():
+    print(f"{model}: {summary['pass_rate']:.0%} pass rate, {summary['avg_latency_ms']:.0f}ms avg")
 ```
 
-Open `comparison.html` in your browser - see side-by-side model performance with charts.
-
 ---
 
 ## Use Cases
@@ -252,7 +243,7 @@ Found a bug? Want a feature? PRs welcome!
 
 ```bash
 # Development setup
-git clone https://github.com/
+git clone https://github.com/deyna256/promptum.git
 cd promptum
 just sync # Install dependencies
 just test # Run tests
@@ -273,7 +264,7 @@ MIT - do whatever you want with it.
 
 <div align="center">
 
-**[⭐ Star on GitHub](https://github.com/
+**[⭐ Star on GitHub](https://github.com/deyna256/promptum)** | **[🐛 Report Bug](https://github.com/deyna256/promptum/issues)** | **[💡 Request Feature](https://github.com/deyna256/promptum/issues)**
 
 Made for developers who value their time.
 

promptum-0.0.2.dist-info/RECORD
ADDED
@@ -0,0 +1,20 @@
+promptum/__init__.py,sha256=8IAk_9VlnKEJIdwf-hEDkOfOCV456H2Jng-HrZfewso,582
+promptum/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+promptum/benchmark/__init__.py,sha256=0FXYDnK4SGa5ZqX2k9aVkwy3ENDlF_5nW2Mut_OCCbg,311
+promptum/benchmark/benchmark.py,sha256=hZ3557qPKqFeNNuxrRLPs-b6XBy2JCowIhRDDwatfeI,1403
+promptum/benchmark/report.py,sha256=DhY1p3n29xOSwRYUiQW6V6FhGFGGn-JF6nuNuvj9rro,2659
+promptum/benchmark/result.py,sha256=nKh-T4zlam2LxsaFoL8jeVaO6kZJ1sfB_tnp4gdNPhM,482
+promptum/benchmark/runner.py,sha256=5p6JBwjTlEHTh6jNv_iuFH1nIrI4_Gv3wmzCT0TWpvA,2407
+promptum/benchmark/test_case.py,sha256=Okypf2334ewVrvmQG7M3I3D7BzqXDsQ2ihjNw9gGF00,598
+promptum/providers/__init__.py,sha256=UprvJ4vxHqo-VTzzUmZ4wFCj6VybP9xBd7HtpPPSvbI,335
+promptum/providers/metrics.py,sha256=FnS10nHFjQ5Clj5X21C_nW6zAUJU_ZHt0s2fLgp6L28,427
+promptum/providers/openrouter.py,sha256=fOqBm4ak7szNNeKNhSI6y4WpFsUx6iQg_3jaFsXc0dQ,4623
+promptum/providers/protocol.py,sha256=g9zIH91HysBIATMHd9Z2Mpk1tKiTOkAyd-zynRaQsuk,493
+promptum/providers/retry.py,sha256=mA_RRz9_9J_mge_AUd9f1A-gACOxZLGTI8vTIstAr8s,538
+promptum/validation/__init__.py,sha256=mhykyxaIwn2PJh2RXAi0fi2NRIveFmlC5bg1nyCbfVU,252
+promptum/validation/protocol.py,sha256=xqxm23YX6eNeZHKMLMZ-Wz8iQKn4ZRzAI5Xryxg0uq4,418
+promptum/validation/validators.py,sha256=qSMva2P2miXXJJ5XeTKJsyYgh2x5wORi3dhOnBYuACE,2686
+promptum-0.0.2.dist-info/METADATA,sha256=MQcy0pxUoMpu4uZgM_Q3HEE_RnY3Krcg-_FTF9vvQ54,7845
+promptum-0.0.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+promptum-0.0.2.dist-info/licenses/LICENSE,sha256=Fgn285H5Vy9diOlqO1TzS3hD97WcdF6-GFHvUcFNtmg,1067
+promptum-0.0.2.dist-info/RECORD,,
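With `PlaceholderValidator` removed from validators.py (see the hunk earlier in this diff), report deserialization no longer needs a stand-in validator. The removed class still documents the validator interface: `validate()` returns a `(passed, details)` pair and `describe()` returns a label. A hypothetical user-defined validator following that shape:

```python
from dataclasses import dataclass
from typing import Any


@dataclass(frozen=True, slots=True)
class NonEmpty:
    """Hypothetical example validator; passes when the response is not blank."""

    def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
        return bool(response.strip()), {"length": len(response)}

    def describe(self) -> str:
        return "Non-empty response"
```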
promptum/core/__init__.py
DELETED
@@ -1,12 +0,0 @@
-from promptum.core.metrics import Metrics
-from promptum.core.result import TestResult
-from promptum.core.retry import RetryConfig, RetryStrategy
-from promptum.core.test_case import TestCase
-
-__all__ = [
-    "Metrics",
-    "RetryConfig",
-    "RetryStrategy",
-    "TestCase",
-    "TestResult",
-]
promptum/execution/__init__.py
DELETED
promptum/serialization/__init__.py
DELETED
@@ -1,11 +0,0 @@
-from promptum.serialization.html import HTMLSerializer
-from promptum.serialization.json import JSONSerializer
-from promptum.serialization.protocol import Serializer
-from promptum.serialization.yaml import YAMLSerializer
-
-__all__ = [
-    "Serializer",
-    "JSONSerializer",
-    "YAMLSerializer",
-    "HTMLSerializer",
-]
promptum/serialization/base.py
DELETED
@@ -1,48 +0,0 @@
-"""Base serializer with shared result serialization logic."""
-
-from typing import Any
-
-from promptum.core.result import TestResult
-
-
-class BaseSerializer:
-    """
-    Base class for serializers with common result serialization logic.
-
-    Subclasses should implement:
-    - serialize(report: Report) -> str
-    - get_file_extension() -> str
-    """
-
-    @staticmethod
-    def _serialize_result(result: TestResult) -> dict[str, Any]:
-        """Convert TestResult to dictionary representation."""
-        return {
-            "test_case": {
-                "name": result.test_case.name,
-                "prompt": result.test_case.prompt,
-                "model": result.test_case.model,
-                "tags": list(result.test_case.tags),
-                "system_prompt": result.test_case.system_prompt,
-                "temperature": result.test_case.temperature,
-                "max_tokens": result.test_case.max_tokens,
-                "metadata": result.test_case.metadata,
-                "validator": result.test_case.validator.describe(),
-            },
-            "response": result.response,
-            "passed": result.passed,
-            "metrics": {
-                "latency_ms": result.metrics.latency_ms,
-                "prompt_tokens": result.metrics.prompt_tokens,
-                "completion_tokens": result.metrics.completion_tokens,
-                "total_tokens": result.metrics.total_tokens,
-                "cost_usd": result.metrics.cost_usd,
-                "retry_delays": list(result.metrics.retry_delays),
-                "total_attempts": result.metrics.total_attempts,
-            }
-            if result.metrics
-            else None,
-            "validation_details": result.validation_details,
-            "execution_error": result.execution_error,
-            "timestamp": result.timestamp.isoformat(),
-        }
promptum/serialization/html.py
DELETED
@@ -1,52 +0,0 @@
-import json
-from pathlib import Path
-
-from jinja2 import Template
-
-from promptum.benchmark.report import Report
-
-
-class HTMLSerializer:
-    def __init__(self) -> None:
-        template_path = Path(__file__).parent / "report_template.html"
-        self._template = Template(template_path.read_text())
-
-    def serialize(self, report: Report) -> str:
-        summary = report.get_summary()
-
-        results_data = []
-        for result in report.results:
-            results_data.append(
-                {
-                    "test_case": {
-                        "name": result.test_case.name,
-                        "prompt": result.test_case.prompt,
-                        "model": result.test_case.model,
-                        "tags": list(result.test_case.tags),
-                        "system_prompt": result.test_case.system_prompt,
-                        "validator": result.test_case.validator.describe(),
-                    },
-                    "response": result.response,
-                    "passed": result.passed,
-                    "metrics": {
-                        "latency_ms": result.metrics.latency_ms,
-                        "prompt_tokens": result.metrics.prompt_tokens,
-                        "completion_tokens": result.metrics.completion_tokens,
-                        "total_tokens": result.metrics.total_tokens,
-                        "cost_usd": result.metrics.cost_usd,
-                        "total_attempts": result.metrics.total_attempts,
-                    }
-                    if result.metrics
-                    else None,
-                    "execution_error": result.execution_error,
-                }
-            )
-
-        return self._template.render(
-            summary=summary,
-            results=results_data,
-            results_json=json.dumps(results_data),
-        )
-
-    def get_file_extension(self) -> str:
-        return "html"
promptum/serialization/json.py
DELETED
@@ -1,28 +0,0 @@
-import json
-from datetime import datetime
-from typing import Any
-
-from promptum.benchmark.report import Report
-from promptum.serialization.base import BaseSerializer
-
-
-class JSONSerializer(BaseSerializer):
-    def __init__(self, indent: int = 2):
-        self.indent = indent
-
-    def serialize(self, report: Report) -> str:
-        data = {
-            "metadata": report.metadata,
-            "summary": report.get_summary(),
-            "results": [self._serialize_result(r) for r in report.results],
-        }
-        return json.dumps(data, indent=self.indent, default=self._json_default)
-
-    def get_file_extension(self) -> str:
-        return "json"
-
-    @staticmethod
-    def _json_default(obj: Any) -> Any:
-        if isinstance(obj, datetime):
-            return obj.isoformat()
-        raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

promptum/serialization/protocol.py
DELETED
@@ -1,13 +0,0 @@
-from typing import Protocol
-
-from promptum.benchmark.report import Report
-
-
-class Serializer(Protocol):
-    def serialize(self, report: Report) -> str:
-        """Serializes a Report to a string format."""
-        ...
-
-    def get_file_extension(self) -> str:
-        """Returns the file extension for this format (e.g., 'json', 'html')."""
-        ...

promptum/serialization/report_template.html
DELETED
@@ -1,293 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>LLM Benchmark Report</title>
-    <script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.0/dist/chart.umd.js"></script>
-    <style>
-        * { margin: 0; padding: 0; box-sizing: border-box; }
-        :root {
-            --bg: #ffffff;
-            --surface: #f5f5f5;
-            --text: #1a1a1a;
-            --text-muted: #666;
-            --border: #ddd;
-            --success: #22c55e;
-            --error: #ef4444;
-            --warning: #f59e0b;
-        }
-        @media (prefers-color-scheme: dark) {
-            :root {
-                --bg: #0a0a0a;
-                --surface: #1a1a1a;
-                --text: #e5e5e5;
-                --text-muted: #a3a3a3;
-                --border: #333;
-            }
-        }
-        body {
-            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
-            background: var(--bg);
-            color: var(--text);
-            line-height: 1.6;
-        }
-        .container { max-width: 1400px; margin: 0 auto; padding: 2rem; }
-        h1 { font-size: 2rem; margin-bottom: 0.5rem; }
-        h2 { font-size: 1.5rem; margin: 2rem 0 1rem; }
-        .summary { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
-        .card {
-            background: var(--surface);
-            border: 1px solid var(--border);
-            border-radius: 8px;
-            padding: 1.5rem;
-        }
-        .card-title { font-size: 0.875rem; color: var(--text-muted); margin-bottom: 0.5rem; }
-        .card-value { font-size: 2rem; font-weight: 700; }
-        .chart-container { height: 300px; margin-bottom: 2rem; }
-        table {
-            width: 100%;
-            border-collapse: collapse;
-            background: var(--surface);
-            border-radius: 8px;
-            overflow: hidden;
-        }
-        th, td {
-            text-align: left;
-            padding: 1rem;
-            border-bottom: 1px solid var(--border);
-        }
-        th {
-            background: var(--surface);
-            font-weight: 600;
-            position: sticky;
-            top: 0;
-        }
-        tr:hover { background: var(--bg); }
-        .badge {
-            display: inline-block;
-            padding: 0.25rem 0.75rem;
-            border-radius: 12px;
-            font-size: 0.75rem;
-            font-weight: 600;
-        }
-        .badge-success { background: var(--success); color: white; }
-        .badge-error { background: var(--error); color: white; }
-        .tag {
-            display: inline-block;
-            padding: 0.125rem 0.5rem;
-            background: var(--border);
-            border-radius: 4px;
-            font-size: 0.75rem;
-            margin-right: 0.25rem;
-        }
-        .search {
-            width: 100%;
-            padding: 0.75rem;
-            margin-bottom: 1rem;
-            background: var(--surface);
-            border: 1px solid var(--border);
-            border-radius: 8px;
-            color: var(--text);
-            font-size: 1rem;
-        }
-        .truncate {
-            max-width: 300px;
-            white-space: nowrap;
-            overflow: hidden;
-            text-overflow: ellipsis;
-        }
-        button {
-            background: var(--surface);
-            border: 1px solid var(--border);
-            color: var(--text);
-            padding: 0.5rem 1rem;
-            border-radius: 6px;
-            cursor: pointer;
-            font-size: 0.875rem;
-        }
-        button:hover { background: var(--border); }
-        .modal {
-            display: none;
-            position: fixed;
-            top: 0;
-            left: 0;
-            width: 100%;
-            height: 100%;
-            background: rgba(0, 0, 0, 0.7);
-            z-index: 1000;
-            overflow: auto;
-        }
-        .modal-content {
-            background: var(--surface);
-            margin: 2rem auto;
-            padding: 2rem;
-            max-width: 800px;
-            border-radius: 12px;
-            position: relative;
-        }
-        .modal-close {
-            position: absolute;
-            top: 1rem;
-            right: 1rem;
-            font-size: 1.5rem;
-            cursor: pointer;
-        }
-        pre {
-            background: var(--bg);
-            padding: 1rem;
-            border-radius: 6px;
-            overflow-x: auto;
-            margin: 0.5rem 0;
-        }
-        code { font-family: 'Courier New', monospace; font-size: 0.875rem; }
-    </style>
-</head>
-<body>
-    <div class="container">
-        <h1>LLM Benchmark Report</h1>
-        <p style="color: var(--text-muted); margin-bottom: 2rem;">{{ summary.total }} tests executed</p>
-
-        <div class="summary">
-            <div class="card">
-                <div class="card-title">Pass Rate</div>
-                <div class="card-value" style="color: var(--success);">{{ "%.1f"|format(summary.pass_rate * 100) }}%</div>
-            </div>
-            <div class="card">
-                <div class="card-title">Avg Latency</div>
-                <div class="card-value">{{ "%.0f"|format(summary.avg_latency_ms) }}ms</div>
-            </div>
-            <div class="card">
-                <div class="card-title">Total Cost</div>
-                <div class="card-value">${{ "%.6f"|format(summary.total_cost_usd) }}</div>
-            </div>
-            <div class="card">
-                <div class="card-title">Total Tokens</div>
-                <div class="card-value">{{ "{:,}".format(summary.total_tokens) }}</div>
-            </div>
-        </div>
-
-        <div class="card chart-container">
-            <canvas id="latencyChart"></canvas>
-        </div>
-
-        <h2>Test Results</h2>
-        <input type="text" class="search" id="searchInput" placeholder="Search tests...">
-
-        <table id="resultsTable">
-            <thead>
-                <tr>
-                    <th>Status</th>
-                    <th>Name</th>
-                    <th>Model</th>
-                    <th>Latency</th>
-                    <th>Cost</th>
-                    <th>Tags</th>
-                    <th>Actions</th>
-                </tr>
-            </thead>
-            <tbody>
-                {% for result in results %}
-                <tr class="result-row">
-                    <td>
-                        {% if result.passed %}
-                        <span class="badge badge-success">PASS</span>
-                        {% else %}
-                        <span class="badge badge-error">FAIL</span>
-                        {% endif %}
-                    </td>
-                    <td>{{ result.test_case.name }}</td>
-                    <td>{{ result.test_case.model }}</td>
-                    <td>{{ "%.0f"|format(result.metrics.latency_ms if result.metrics else 0) }}ms</td>
-                    <td>${{ "%.6f"|format(result.metrics.cost_usd if result.metrics and result.metrics.cost_usd else 0) }}</td>
-                    <td>
-                        {% for tag in result.test_case.tags %}
-                        <span class="tag">{{ tag }}</span>
-                        {% endfor %}
-                    </td>
-                    <td><button onclick="showDetails({{ loop.index0 }})">Details</button></td>
-                </tr>
-                {% endfor %}
-            </tbody>
-        </table>
-    </div>
-
-    <div id="detailsModal" class="modal">
-        <div class="modal-content">
-            <span class="modal-close" onclick="closeModal()">×</span>
-            <div id="modalBody"></div>
-        </div>
-    </div>
-
-    <script>
-        const results = {{ results_json }};
-
-        new Chart(document.getElementById('latencyChart'), {
-            type: 'bar',
-            data: {
-                labels: results.map((r, i) => r.test_case.name),
-                datasets: [{
-                    label: 'Latency (ms)',
-                    data: results.map(r => r.metrics ? r.metrics.latency_ms : 0),
-                    backgroundColor: results.map(r => r.passed ? '#22c55e' : '#ef4444')
-                }]
-            },
-            options: {
-                responsive: true,
-                maintainAspectRatio: false,
-                plugins: { legend: { display: false } }
-            }
-        });
-
-        document.getElementById('searchInput').addEventListener('input', function(e) {
-            const term = e.target.value.toLowerCase();
-            document.querySelectorAll('.result-row').forEach(row => {
-                const text = row.textContent.toLowerCase();
-                row.style.display = text.includes(term) ? '' : 'none';
-            });
-        });
-
-        function showDetails(index) {
-            const result = results[index];
-            const html = `
-                <h2>${result.test_case.name}</h2>
-                <p><strong>Status:</strong> <span class="badge ${result.passed ? 'badge-success' : 'badge-error'}">${result.passed ? 'PASS' : 'FAIL'}</span></p>
-                <p><strong>Model:</strong> ${result.test_case.model}</p>
-                <p><strong>Validator:</strong> ${result.test_case.validator}</p>
-                <h3>Prompt</h3>
-                <pre><code>${escapeHtml(result.test_case.prompt)}</code></pre>
-                ${result.test_case.system_prompt ? `<h3>System Prompt</h3><pre><code>${escapeHtml(result.test_case.system_prompt)}</code></pre>` : ''}
-                <h3>Response</h3>
-                <pre><code>${escapeHtml(result.response || 'No response')}</code></pre>
-                ${result.execution_error ? `<h3>Error</h3><pre style="color: var(--error);"><code>${escapeHtml(result.execution_error)}</code></pre>` : ''}
-                ${result.metrics ? `
-                <h3>Metrics</h3>
-                <ul>
-                    <li>Latency: ${result.metrics.latency_ms.toFixed(0)}ms</li>
-                    <li>Tokens: ${result.metrics.total_tokens || 'N/A'}</li>
-                    <li>Cost: $${(result.metrics.cost_usd || 0).toFixed(6)}</li>
-                    <li>Attempts: ${result.metrics.total_attempts}</li>
-                </ul>
-                ` : ''}
-            `;
-            document.getElementById('modalBody').innerHTML = html;
-            document.getElementById('detailsModal').style.display = 'block';
-        }
-
-        function closeModal() {
-            document.getElementById('detailsModal').style.display = 'none';
-        }
-
-        function escapeHtml(text) {
-            const div = document.createElement('div');
-            div.textContent = text;
-            return div.innerHTML;
-        }
-
-        window.onclick = function(event) {
-            const modal = document.getElementById('detailsModal');
-            if (event.target === modal) closeModal();
-        }
-    </script>
-</body>
-</html>
promptum/serialization/yaml.py
DELETED
@@ -1,17 +0,0 @@
-import yaml
-
-from promptum.benchmark.report import Report
-from promptum.serialization.base import BaseSerializer
-
-
-class YAMLSerializer(BaseSerializer):
-    def serialize(self, report: Report) -> str:
-        data = {
-            "metadata": report.metadata,
-            "summary": report.get_summary(),
-            "results": [self._serialize_result(r) for r in report.results],
-        }
-        return yaml.dump(data, default_flow_style=False, sort_keys=False)
-
-    def get_file_extension(self) -> str:
-        return "yaml"
promptum/storage/__init__.py
DELETED
promptum/storage/file.py
DELETED
@@ -1,157 +0,0 @@
-import json
-import tempfile
-from datetime import datetime
-from pathlib import Path
-from typing import Any
-
-from promptum.benchmark.report import Report
-from promptum.core.metrics import Metrics
-from promptum.core.result import TestResult
-from promptum.core.test_case import TestCase
-from promptum.validation.validators import PlaceholderValidator
-
-
-class FileStorage:
-    def __init__(self, base_dir: str = "results"):
-        self.base_dir = Path(base_dir)
-        self.reports_dir = self.base_dir / "reports"
-        self.metadata_file = self.base_dir / "metadata.json"
-
-        self.reports_dir.mkdir(parents=True, exist_ok=True)
-
-    def save(self, report: Report, name: str) -> str:
-        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-        identifier = f"{timestamp}_{name}"
-        filename = f"{identifier}.json"
-        filepath = self.reports_dir / filename
-
-        data = self._serialize_report(report)
-
-        with tempfile.NamedTemporaryFile(
-            mode="w", delete=False, dir=self.reports_dir, suffix=".tmp"
-        ) as tmp:
-            json.dump(data, tmp, indent=2)
-            tmp_path = Path(tmp.name)
-
-        tmp_path.replace(filepath)
-
-        self._update_metadata(identifier, name, str(filepath))
-
-        return identifier
-
-    def load(self, identifier: str) -> Report:
-        filepath = self.reports_dir / f"{identifier}.json"
-
-        if not filepath.exists():
-            raise FileNotFoundError(f"Report not found: {identifier}")
-
-        with open(filepath) as f:
-            data = json.load(f)
-
-        return self._deserialize_report(data)
-
-    def list_reports(self) -> list[dict[str, Any]]:
-        if not self.metadata_file.exists():
-            return []
-
-        with open(self.metadata_file) as f:
-            return json.load(f)
-
-    def _update_metadata(self, identifier: str, name: str, path: str) -> None:
-        metadata = self.list_reports()
-
-        metadata.append(
-            {
-                "id": identifier,
-                "name": name,
-                "path": path,
-                "timestamp": datetime.now().isoformat(),
-            }
-        )
-
-        with tempfile.NamedTemporaryFile(
-            mode="w", delete=False, dir=self.base_dir, suffix=".tmp"
-        ) as tmp:
-            json.dump(metadata, tmp, indent=2)
-            tmp_path = Path(tmp.name)
-
-        tmp_path.replace(self.metadata_file)
-
-    @staticmethod
-    def _serialize_report(report: Report) -> dict[str, Any]:
-        return {
-            "metadata": report.metadata,
-            "results": [
-                {
-                    "test_case": {
-                        "name": r.test_case.name,
-                        "prompt": r.test_case.prompt,
-                        "model": r.test_case.model,
-                        "tags": list(r.test_case.tags),
-                        "system_prompt": r.test_case.system_prompt,
-                        "temperature": r.test_case.temperature,
-                        "max_tokens": r.test_case.max_tokens,
-                        "metadata": r.test_case.metadata,
-                        "validator_description": r.test_case.validator.describe(),
-                    },
-                    "response": r.response,
-                    "passed": r.passed,
-                    "metrics": {
-                        "latency_ms": r.metrics.latency_ms,
-                        "prompt_tokens": r.metrics.prompt_tokens,
-                        "completion_tokens": r.metrics.completion_tokens,
-                        "total_tokens": r.metrics.total_tokens,
-                        "cost_usd": r.metrics.cost_usd,
-                        "retry_delays": list(r.metrics.retry_delays),
-                    }
-                    if r.metrics
-                    else None,
-                    "validation_details": r.validation_details,
-                    "execution_error": r.execution_error,
-                    "timestamp": r.timestamp.isoformat(),
-                }
-                for r in report.results
-            ],
-        }
-
-    @staticmethod
-    def _deserialize_report(data: dict[str, Any]) -> Report:
-        results = []
-        for r in data["results"]:
-            test_case = TestCase(
-                name=r["test_case"]["name"],
-                prompt=r["test_case"]["prompt"],
-                model=r["test_case"]["model"],
-                validator=PlaceholderValidator(
-                    description=r["test_case"]["validator_description"],
-                ),
-                tags=tuple(r["test_case"]["tags"]),
-                system_prompt=r["test_case"]["system_prompt"],
-                temperature=r["test_case"]["temperature"],
-                max_tokens=r["test_case"]["max_tokens"],
-                metadata=r["test_case"]["metadata"],
-            )
-
-            metrics = None
-            if r["metrics"]:
-                metrics = Metrics(
-                    latency_ms=r["metrics"]["latency_ms"],
-                    prompt_tokens=r["metrics"]["prompt_tokens"],
-                    completion_tokens=r["metrics"]["completion_tokens"],
-                    total_tokens=r["metrics"]["total_tokens"],
-                    cost_usd=r["metrics"]["cost_usd"],
-                    retry_delays=tuple(r["metrics"]["retry_delays"]),
-                )
-
-            result = TestResult(
-                test_case=test_case,
-                response=r["response"],
-                passed=r["passed"],
-                metrics=metrics,
-                validation_details=r["validation_details"],
-                execution_error=r["execution_error"],
-                timestamp=datetime.fromisoformat(r["timestamp"]),
-            )
-            results.append(result)
-
-        return Report(results=results, metadata=data["metadata"])
promptum/storage/protocol.py
DELETED
@@ -1,23 +0,0 @@
-from typing import Any, Protocol
-
-from promptum.benchmark.report import Report
-
-
-class ResultStorage(Protocol):
-    def save(self, report: Report, name: str) -> str:
-        """
-        Saves a report and returns its identifier.
-        """
-        ...
-
-    def load(self, identifier: str) -> Report:
-        """
-        Loads a report by its identifier.
-        """
-        ...
-
-    def list_reports(self) -> list[dict[str, Any]]:
-        """
-        Returns metadata for all stored reports.
-        """
-        ...
promptum-0.0.1.dist-info/RECORD
DELETED
@@ -1,32 +0,0 @@
-promptum/__init__.py,sha256=AjeGgmIbpp9Uv-0ybq6knejEJMK-Dnn_-fV9Z86Bp74,932
-promptum/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-promptum/benchmark/__init__.py,sha256=NJYiXm6wVFKMloxKNAXMY4H3bMQORTtLh6__nYWYWa0,131
-promptum/benchmark/benchmark.py,sha256=3enQSACdLwHW78fqSZj0Un3r7_Ua3V-MjfbEIIKFSWs,1589
-promptum/benchmark/report.py,sha256=ol_UO8rw43zbQxhs2o4AwYN5TP7O_Apa77V-pZKq6Uw,2754
-promptum/core/__init__.py,sha256=mqajsOdUBNJfcR2krxpwa7rM_wd88vJaAov-9SnVm68,294
-promptum/core/metrics.py,sha256=FnS10nHFjQ5Clj5X21C_nW6zAUJU_ZHt0s2fLgp6L28,427
-promptum/core/result.py,sha256=nyuVMQFY6DmZwzpgqDPsj0FaAuairpKLJ-0be5WQtTg,472
-promptum/core/retry.py,sha256=mA_RRz9_9J_mge_AUd9f1A-gACOxZLGTI8vTIstAr8s,538
-promptum/core/test_case.py,sha256=YNlVNj7FkoCyBFb2N0Dzrhce6o3DzUtke4PR6WoXhZo,593
-promptum/execution/__init__.py,sha256=fUZa7Bo7yn921sl49cS6TCGsG-lOUNVdhdeRsIa5vCc,67
-promptum/execution/runner.py,sha256=sP3uDu2VDLxFi9BkltMHwsyMuCXnz4oP1kVN28KpVZ0,2434
-promptum/providers/__init__.py,sha256=OW-CK198wOV7_bz_keOaxxQeRlFPZgINQcVJUZq_uus,169
-promptum/providers/openrouter.py,sha256=owquGxHaTB-pZ8jr06l4HouETuFj1lEg92oGX2mM5uo,4601
-promptum/providers/protocol.py,sha256=vdTGAGKN3FzThHLwyMMWicU87_LpW-gn0cM3vMcWiEY,488
-promptum/serialization/__init__.py,sha256=0dlpgF3dngaw_oR4mg7nuc4Z_VFVl2bATmhe2mHA9T4,319
-promptum/serialization/base.py,sha256=JnB4zb7D4oy44k6ndbJu3Xw1PVLpY_9-Y7k3Et2p43g,1851
-promptum/serialization/html.py,sha256=kJEd2s6fVfFHH7snJWrD5RGaUW66x3vtMKGMJ_ekmcI,1901
-promptum/serialization/json.py,sha256=koqgr5_WHmrpWUOCq6rWXoC07um3mkDDaob2k9vkEK8,870
-promptum/serialization/protocol.py,sha256=MZeMYt_HZJIYSyrRd_ZYbEJXDiXLMuJ5tosAeHLxpTM,353
-promptum/serialization/report_template.html,sha256=RC8qSLzolqWkWBIGfyhPtPkRWM7_0JkauEWPkaKiB9A,10802
-promptum/serialization/yaml.py,sha256=50A612OkX2L3EjhxTZJMZQb5zL8-2PmwcBjjNUhCWsA,528
-promptum/storage/__init__.py,sha256=QWOP5Al43WmmQ_kFCM9JGi8amXJzO_pR-x5AKDNy4ds,153
-promptum/storage/file.py,sha256=gnNBpNBQ_NeAWn7P2itsw2L99AxS7zOd8Nef6PyYxlk,5750
-promptum/storage/protocol.py,sha256=_NpkJzOQB_98Ud_TA_ZYubHf3o2DDXGMveRN3kRyYKI,517
-promptum/validation/__init__.py,sha256=mhykyxaIwn2PJh2RXAi0fi2NRIveFmlC5bg1nyCbfVU,252
-promptum/validation/protocol.py,sha256=xqxm23YX6eNeZHKMLMZ-Wz8iQKn4ZRzAI5Xryxg0uq4,418
-promptum/validation/validators.py,sha256=3lJwSMhhWb9x8BK_-S0FJBj7PFgno79II_i3Z1mCKTs,3217
-promptum-0.0.1.dist-info/METADATA,sha256=vt_PN0Ns0JuJalM7p8hJZsz-Y2hwQrbHZ4Jacy7P6L8,8083
-promptum-0.0.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-promptum-0.0.1.dist-info/licenses/LICENSE,sha256=Fgn285H5Vy9diOlqO1TzS3hD97WcdF6-GFHvUcFNtmg,1067
-promptum-0.0.1.dist-info/RECORD,,

/promptum/{core → providers}/metrics.py
File without changes
/promptum/{core → providers}/retry.py
File without changes
{promptum-0.0.1.dist-info → promptum-0.0.2.dist-info}/WHEEL
File without changes
{promptum-0.0.1.dist-info → promptum-0.0.2.dist-info}/licenses/LICENSE
File without changes