promptum-0.0.1-py3-none-any.whl → promptum-0.0.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promptum/__init__.py +4 -18
- promptum/benchmark/__init__.py +5 -1
- promptum/benchmark/benchmark.py +8 -12
- promptum/benchmark/report.py +16 -30
- promptum/{core → benchmark}/result.py +2 -2
- promptum/{execution → benchmark}/runner.py +2 -3
- promptum/benchmark/summary.py +14 -0
- promptum/{core → benchmark}/test_case.py +1 -1
- promptum/providers/__init__.py +5 -0
- promptum/providers/openrouter.py +3 -3
- promptum/providers/protocol.py +1 -1
- promptum/validation/validators.py +0 -18
- {promptum-0.0.1.dist-info → promptum-0.0.3.dist-info}/METADATA +52 -53
- promptum-0.0.3.dist-info/RECORD +21 -0
- promptum/core/__init__.py +0 -12
- promptum/execution/__init__.py +0 -3
- promptum/serialization/__init__.py +0 -11
- promptum/serialization/base.py +0 -48
- promptum/serialization/html.py +0 -52
- promptum/serialization/json.py +0 -28
- promptum/serialization/protocol.py +0 -13
- promptum/serialization/report_template.html +0 -293
- promptum/serialization/yaml.py +0 -17
- promptum/storage/__init__.py +0 -7
- promptum/storage/file.py +0 -157
- promptum/storage/protocol.py +0 -23
- promptum-0.0.1.dist-info/RECORD +0 -32
- /promptum/{core → providers}/metrics.py +0 -0
- /promptum/{core → providers}/retry.py +0 -0
- {promptum-0.0.1.dist-info → promptum-0.0.3.dist-info}/WHEEL +0 -0
- {promptum-0.0.1.dist-info → promptum-0.0.3.dist-info}/licenses/LICENSE +0 -0
promptum/__init__.py
CHANGED
````diff
@@ -1,14 +1,5 @@
-from promptum.benchmark import Benchmark, Report
-from promptum.core import Metrics, RetryConfig, RetryStrategy, TestCase, TestResult
-from promptum.execution import Runner
-from promptum.providers import LLMProvider, OpenRouterClient
-from promptum.serialization import (
-    HTMLSerializer,
-    JSONSerializer,
-    Serializer,
-    YAMLSerializer,
-)
-from promptum.storage import FileStorage, ResultStorage
+from promptum.benchmark import Benchmark, Report, Runner, Summary, TestCase, TestResult
+from promptum.providers import LLMProvider, Metrics, OpenRouterClient, RetryConfig, RetryStrategy
 from promptum.validation import (
     Contains,
     ExactMatch,
@@ -17,11 +8,12 @@ from promptum.validation import (
     Validator,
 )
 
-__version__ = "0.0.1"
+__version__ = "0.0.3"
 
 __all__ = [
     "TestCase",
     "TestResult",
+    "Summary",
     "Metrics",
     "RetryConfig",
     "RetryStrategy",
@@ -35,10 +27,4 @@ __all__ = [
     "Runner",
     "Benchmark",
     "Report",
-    "Serializer",
-    "JSONSerializer",
-    "YAMLSerializer",
-    "HTMLSerializer",
-    "ResultStorage",
-    "FileStorage",
 ]
````
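The top-level import surface shrinks to two subpackages plus validation. A minimal before/after sketch of a caller migrating to 0.0.3 (both import lines appear verbatim in the diff above; the serialization and storage names simply have no replacement):

```python
# 0.0.1 - these top-level names are gone in 0.0.3:
# from promptum import FileStorage, JSONSerializer, YAMLSerializer, HTMLSerializer

# 0.0.3 - everything still exported resolves through promptum.benchmark,
# promptum.providers, or promptum.validation:
from promptum import Benchmark, Contains, OpenRouterClient, Summary, TestCase
```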
promptum/benchmark/__init__.py
CHANGED
````diff
@@ -1,4 +1,8 @@
 from promptum.benchmark.benchmark import Benchmark
 from promptum.benchmark.report import Report
+from promptum.benchmark.result import TestResult
+from promptum.benchmark.runner import Runner
+from promptum.benchmark.summary import Summary
+from promptum.benchmark.test_case import TestCase
 
-__all__ = ["Benchmark", "Report"]
+__all__ = ["Benchmark", "Report", "Runner", "Summary", "TestCase", "TestResult"]
````
promptum/benchmark/benchmark.py
CHANGED
````diff
@@ -1,11 +1,10 @@
 import asyncio
 from collections.abc import Callable, Sequence
-from typing import Any
 
 from promptum.benchmark.report import Report
-from promptum.core.result import TestResult
-from promptum.core.test_case import TestCase
-from promptum.execution.runner import Runner
+from promptum.benchmark.result import TestResult
+from promptum.benchmark.runner import Runner
+from promptum.benchmark.test_case import TestCase
 from promptum.providers.protocol import LLMProvider
 
 
@@ -29,12 +28,12 @@ class Benchmark:
     def add_tests(self, test_cases: Sequence[TestCase]) -> None:
         self._test_cases.extend(test_cases)
 
-    def run(self, metadata: dict[str, Any] | None = None) -> Report:
-        return asyncio.run(self.run_async(metadata))
+    def run(self) -> Report:
+        return asyncio.run(self.run_async())
 
-    async def run_async(self, metadata: dict[str, Any] | None = None) -> Report:
+    async def run_async(self) -> Report:
         if not self._test_cases:
-            return Report(results=[], metadata=metadata or {})
+            return Report(results=[])
 
         runner = Runner(
             provider=self.provider,
@@ -44,7 +43,4 @@ class Benchmark:
 
         results = await runner.run(self._test_cases)
 
-        return Report(
-            results=results,
-            metadata=metadata or {},
-        )
+        return Report(results=results)
````
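Benchmark.run() and run_async() no longer take a metadata argument, and Report no longer stores one. A sketch of the call-site migration, assuming the 0.0.1 signature inferred from the removed lines above:

```python
# 0.0.1 (inferred): metadata was threaded through to the Report
# report = benchmark.run(metadata={"suite": "nightly"})

# 0.0.3: run() takes no arguments; carry run context outside the library
run_context = {"suite": "nightly"}  # hypothetical caller-side bookkeeping
report = benchmark.run()
```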
promptum/benchmark/report.py
CHANGED
````diff
@@ -1,16 +1,15 @@
 from collections.abc import Callable, Sequence
 from dataclasses import dataclass
-from typing import Any
 
-from promptum.core.result import TestResult
+from promptum.benchmark.result import TestResult
+from promptum.benchmark.summary import Summary
 
 
 @dataclass(frozen=True, slots=True)
 class Report:
     results: Sequence[TestResult]
-    metadata: dict[str, Any]
 
-    def get_summary(self) -> dict[str, Any]:
+    def get_summary(self) -> Summary:
         total = len(self.results)
         passed = sum(1 for r in self.results if r.passed)
 
@@ -18,18 +17,17 @@ class Report:
         total_cost = sum(r.metrics.cost_usd or 0 for r in self.results if r.metrics)
         total_tokens = sum(r.metrics.total_tokens or 0 for r in self.results if r.metrics)
 
-        return {
-            …
-        }
+        return Summary(
+            total=total,
+            passed=passed,
+            failed=total - passed,
+            pass_rate=passed / total if total > 0 else 0,
+            avg_latency_ms=sum(latencies) / len(latencies) if latencies else 0,
+            min_latency_ms=min(latencies) if latencies else 0,
+            max_latency_ms=max(latencies) if latencies else 0,
+            total_cost_usd=total_cost,
+            total_tokens=total_tokens,
+        )
 
     def filter(
         self,
@@ -49,7 +47,7 @@ class Report:
         if passed is not None:
             filtered = [r for r in filtered if r.passed == passed]
 
-        return Report(results=filtered, metadata=self.metadata)
+        return Report(results=filtered)
 
     def group_by(self, key: Callable[[TestResult], str]) -> dict[str, "Report"]:
         groups: dict[str, list[TestResult]] = {}
@@ -60,16 +58,4 @@ class Report:
                 groups[group_key] = []
             groups[group_key].append(result)
 
-        return {k: Report(results=v, metadata=self.metadata) for k, v in groups.items()}
-
-    def compare_models(self) -> dict[str, dict[str, Any]]:
-        by_model = self.group_by(lambda r: r.test_case.model)
-        return {model: report.get_summary() for model, report in by_model.items()}
-
-    @staticmethod
-    def _percentile(values: list[float], p: float) -> float:
-        if not values:
-            return 0
-        sorted_values = sorted(values)
-        index = int(len(sorted_values) * p)
-        return sorted_values[min(index, len(sorted_values) - 1)]
+        return {k: Report(results=v) for k, v in groups.items()}
````
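The removed compare_models() helper is recoverable from the two methods that survived, group_by() and get_summary(). A minimal sketch using only API visible in this diff:

```python
# Per-model summaries in 0.0.3, replacing Report.compare_models()
by_model = report.group_by(lambda r: r.test_case.model)
for model, model_report in by_model.items():
    s = model_report.get_summary()
    print(f"{model}: {s.passed}/{s.total} passed, {s.avg_latency_ms:.0f}ms avg")
```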
promptum/{core → benchmark}/result.py
CHANGED
````diff
@@ -2,8 +2,8 @@ from dataclasses import dataclass, field
 from datetime import datetime
 from typing import Any
 
-from promptum.core.metrics import Metrics
-from promptum.core.test_case import TestCase
+from promptum.benchmark.test_case import TestCase
+from promptum.providers.metrics import Metrics
 
 
 @dataclass(frozen=True, slots=True)
````
promptum/{execution → benchmark}/runner.py
CHANGED
````diff
@@ -3,8 +3,8 @@ from collections.abc import Callable, Sequence
 
 import httpx
 
-from promptum.core.result import TestResult
-from promptum.core.test_case import TestCase
+from promptum.benchmark.result import TestResult
+from promptum.benchmark.test_case import TestCase
 from promptum.providers.protocol import LLMProvider
 
 
@@ -37,7 +37,6 @@ class Runner:
 
         results = await asyncio.gather(
             *[run_with_semaphore(tc) for tc in test_cases],
-            return_exceptions=False,
         )
 
         return list(results)
````
promptum/benchmark/summary.py
ADDED
````diff
@@ -0,0 +1,14 @@
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True, slots=True)
+class Summary:
+    total: int
+    passed: int
+    failed: int
+    pass_rate: float
+    avg_latency_ms: float
+    min_latency_ms: float
+    max_latency_ms: float
+    total_cost_usd: float
+    total_tokens: int
````
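Summary is a frozen, slotted dataclass, so the dict returned by get_summary() in 0.0.1 becomes attribute access (the exact 0.0.1 keys were not preserved in this diff). A minimal sketch:

```python
summary = report.get_summary()  # 0.0.1 returned a plain dict; 0.0.3 returns Summary
print(
    f"{summary.pass_rate:.0%} over {summary.total} tests, "
    f"latency {summary.min_latency_ms:.0f}-{summary.max_latency_ms:.0f}ms, "
    f"${summary.total_cost_usd:.6f} / {summary.total_tokens} tokens"
)
```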
promptum/providers/__init__.py
CHANGED
````diff
@@ -1,7 +1,12 @@
+from promptum.providers.metrics import Metrics
 from promptum.providers.openrouter import OpenRouterClient
 from promptum.providers.protocol import LLMProvider
+from promptum.providers.retry import RetryConfig, RetryStrategy
 
 __all__ = [
     "LLMProvider",
+    "Metrics",
     "OpenRouterClient",
+    "RetryConfig",
+    "RetryStrategy",
 ]
````
promptum/providers/openrouter.py
CHANGED
````diff
@@ -4,8 +4,8 @@ from typing import Any
 
 import httpx
 
-from promptum.core.metrics import Metrics
-from promptum.core.retry import RetryConfig, RetryStrategy
+from promptum.providers.metrics import Metrics
+from promptum.providers.retry import RetryConfig, RetryStrategy
 
 
 class OpenRouterClient:
@@ -61,7 +61,7 @@ class OpenRouterClient:
             "messages": messages,
             "temperature": temperature,
         }
-        if max_tokens:
+        if max_tokens is not None:
            payload["max_tokens"] = max_tokens
         payload.update(kwargs)
 
````
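The max_tokens guard is a truthiness fix: 0 is falsy, so `if max_tokens:` silently dropped an explicit max_tokens=0 from the request payload, while `is not None` only skips the unset case. A standalone sketch of the difference:

```python
max_tokens = 0  # caller explicitly requests a zero-token cap

payload = {}
if max_tokens:  # 0.0.1 behavior: 0 is falsy, key silently omitted
    payload["max_tokens"] = max_tokens
assert "max_tokens" not in payload

payload = {}
if max_tokens is not None:  # 0.0.3 behavior: only None means "unset"
    payload["max_tokens"] = max_tokens
assert payload["max_tokens"] == 0
```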
|
promptum/validation/validators.py
CHANGED
````diff
@@ -88,21 +88,3 @@ class JsonSchema:
             keys = ", ".join(self.required_keys)
             return f"Valid JSON with keys: {keys}"
         return "Valid JSON object"
-
-
-@dataclass(frozen=True, slots=True)
-class PlaceholderValidator:
-    """
-    Placeholder validator for deserialized reports.
-
-    Used when original validator cannot be reconstructed from storage.
-    Always returns True. Original validator logic is not preserved.
-    """
-
-    description: str
-
-    def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
-        return True, {"placeholder": True, "note": "Original validator could not be reconstructed"}
-
-    def describe(self) -> str:
-        return self.description
````
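PlaceholderValidator existed only to back the now-deleted storage layer, but its removed body documents the validator interface: validate() returning a (passed, details) pair plus describe(). A sketch of a hypothetical custom validator with the same shape:

```python
from dataclasses import dataclass
from typing import Any


@dataclass(frozen=True, slots=True)
class MaxLength:
    """Hypothetical validator: passes when the response stays under a length cap."""

    limit: int

    def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
        return len(response) <= self.limit, {"length": len(response), "limit": self.limit}

    def describe(self) -> str:
        return f"Response no longer than {self.limit} characters"
```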
{promptum-0.0.1.dist-info → promptum-0.0.3.dist-info}/METADATA
CHANGED
````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: promptum
-Version: 0.0.1
+Version: 0.0.3
 Summary: Async LLM benchmarking library with protocol-based extensibility
 Project-URL: Homepage, https://github.com/deyna256/promptum
 Project-URL: Repository, https://github.com/deyna256/promptum
@@ -36,8 +36,6 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.13
 Requires-Dist: httpx>=0.27.0
-Requires-Dist: jinja2>=3.1.0
-Requires-Dist: pyyaml>=6.0
 Description-Content-Type: text/markdown
 
 # promptum
@@ -48,7 +46,7 @@ Description-Content-Type: text/markdown
 …
 …
 
-**Benchmark LLMs Like a Pro**
+**Benchmark LLMs Like a Pro.**
 
 Stop writing boilerplate to test LLMs. Start getting results.
 
@@ -58,11 +56,12 @@ Stop writing boilerplate to test LLMs. Start getting results.
 
 ## What's This?
 
-A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get …
+A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get structured results.
 
 ```python
 benchmark = Benchmark(provider=client, name="my_test")
 benchmark.add_test(TestCase(
+    name="basic_math",
     prompt="What is 2+2?",
     model="gpt-3.5-turbo",
     validator=Contains("4")
@@ -97,15 +96,12 @@ for attempt in range(max_retries):
         break
     except Exception:
         sleep(2 ** attempt)
-
-# Export results manually
-json.dump(results, open("results.json", "w"))
 ```
 
 **After promptum:**
 ```python
 report = await benchmark.run_async()
-…
+summary = report.get_summary()  # Metrics captured automatically
 ```
 
 ---
@@ -135,9 +131,9 @@ async def main():
     report = await benchmark.run_async()
     summary = report.get_summary()
 
-    print(f"✓ {summary…")
-    print(f"⚡ {summary…")
-    print(f"💰 ${summary…")
+    print(f"✓ {summary.passed}/{summary.total} tests passed")
+    print(f"⚡ {summary.avg_latency_ms:.0f}ms average")
+    print(f"💰 ${summary.total_cost_usd:.6f} total cost")
 
 asyncio.run(main())
 ```
@@ -151,14 +147,13 @@ python your_script.py
 
 ## What You Get
 
-
-✅ **100+ Models via OpenRouter** - One client for OpenAI, Anthropic, Google, and more
-✅ **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own
-✅ **Automatic Retries** - Exponential/linear backoff with configurable attempts
-✅ **Metrics Tracking** - Latency, tokens, cost - automatically captured
-✅ **Async by Default** - Run 100 tests in parallel without breaking a sweat
-✅ **Type Safe** - Full type hints, catches errors before runtime
-✅ **Zero Config** - No YAML files, no setup scripts, just Python
+- [x] **100+ Models via OpenRouter** - One client for OpenAI, Anthropic, Google, and more
+- [x] **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own
+- [x] **Automatic Retries** - Exponential/linear backoff with configurable attempts
+- [x] **Metrics Tracking** - Latency, tokens, cost - automatically captured
+- [x] **Async by Default** - Run 100 tests in parallel without breaking a sweat
+- [x] **Type Safe** - Full type hints, catches errors before runtime
+- [x] **Zero Config** - No YAML files, no setup scripts, just Python
 
 ---
 
@@ -167,39 +162,43 @@
 Compare GPT-4 vs Claude on your tasks:
 
 ```python
-tests = [
-    TestCase(
-        name="json_output",
-        prompt='Output JSON: {"status": "ok"}',
-        model="openai/gpt-4",
-        validator=Regex(r'\{"status":\s*"ok"\}')
-    ),
-    TestCase(
-        name="json_output",
-        prompt='Output JSON: {"status": "ok"}',
-        model="anthropic/claude-3-5-sonnet",
-        validator=Regex(r'\{"status":\s*"ok"\}')
-    ),
-    TestCase(
-        name="creative_writing",
-        prompt="Write a haiku about Python",
-        model="openai/gpt-4",
-        validator=Contains("Python", case_sensitive=False)
-    ),
-]
-
-benchmark.add_tests(tests)
-report = await benchmark.run_async()
+import asyncio
+from promptum import Benchmark, TestCase, Contains, Regex, OpenRouterClient
+
+async def main():
+    async with OpenRouterClient(api_key="your-key") as client:
+        benchmark = Benchmark(provider=client, name="model_comparison")
+
+        benchmark.add_tests([
+            TestCase(
+                name="json_output_gpt4",
+                prompt='Output JSON: {"status": "ok"}',
+                model="openai/gpt-4",
+                validator=Regex(r'\{"status":\s*"ok"\}')
+            ),
+            TestCase(
+                name="json_output_claude",
+                prompt='Output JSON: {"status": "ok"}',
+                model="anthropic/claude-3-5-sonnet",
+                validator=Regex(r'\{"status":\s*"ok"\}')
+            ),
+            TestCase(
+                name="creative_writing",
+                prompt="Write a haiku about Python",
+                model="openai/gpt-4",
+                validator=Contains("Python", case_sensitive=False)
+            ),
+        ])
+
+        report = await benchmark.run_async()
+
+        # Side-by-side model comparison
+        for model, model_report in report.group_by(lambda r: r.test_case.model).items():
+            summary = model_report.get_summary()
+            print(f"{model}: {summary.pass_rate:.0%} pass rate, {summary.avg_latency_ms:.0f}ms avg")
+
+asyncio.run(main())
 ```
 
 ---
 
@@ -252,7 +251,7 @@ Found a bug? Want a feature? PRs welcome!
 
 ```bash
 # Development setup
-git clone https://github.com/…
+git clone https://github.com/deyna256/promptum.git
 cd promptum
 just sync  # Install dependencies
 just test  # Run tests
@@ -273,7 +272,7 @@ MIT - do whatever you want with it.
 
 <div align="center">
 
-**[⭐ Star on GitHub](https://github.com/…
+**[⭐ Star on GitHub](https://github.com/deyna256/promptum)** | **[🐛 Report Bug](https://github.com/deyna256/promptum/issues)** | **[💡 Request Feature](https://github.com/deyna256/promptum/issues)**
 
 Made for developers who value their time.
 
````
promptum-0.0.3.dist-info/RECORD
ADDED
````diff
@@ -0,0 +1,21 @@
+promptum/__init__.py,sha256=F2dfvZUHDnAwQOGDXINgFnu8vtnwfS_twr6O5tmL1K8,606
+promptum/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+promptum/benchmark/__init__.py,sha256=eyeJHW4mZdKv2vuQRfXFOAKSH07YNROeNlBKJCOfWHg,369
+promptum/benchmark/benchmark.py,sha256=hZ3557qPKqFeNNuxrRLPs-b6XBy2JCowIhRDDwatfeI,1403
+promptum/benchmark/report.py,sha256=IkCpd3cswLtL9vTUIXAAYGvZk_t9wyQAzbIFpOSXnkI,2080
+promptum/benchmark/result.py,sha256=nKh-T4zlam2LxsaFoL8jeVaO6kZJ1sfB_tnp4gdNPhM,482
+promptum/benchmark/runner.py,sha256=5p6JBwjTlEHTh6jNv_iuFH1nIrI4_Gv3wmzCT0TWpvA,2407
+promptum/benchmark/summary.py,sha256=dD-i8m2BUMFhMC_7v-ITDNUP5X7fG0ARErwKB5D9_yE,281
+promptum/benchmark/test_case.py,sha256=Okypf2334ewVrvmQG7M3I3D7BzqXDsQ2ihjNw9gGF00,598
+promptum/providers/__init__.py,sha256=UprvJ4vxHqo-VTzzUmZ4wFCj6VybP9xBd7HtpPPSvbI,335
+promptum/providers/metrics.py,sha256=FnS10nHFjQ5Clj5X21C_nW6zAUJU_ZHt0s2fLgp6L28,427
+promptum/providers/openrouter.py,sha256=fOqBm4ak7szNNeKNhSI6y4WpFsUx6iQg_3jaFsXc0dQ,4623
+promptum/providers/protocol.py,sha256=g9zIH91HysBIATMHd9Z2Mpk1tKiTOkAyd-zynRaQsuk,493
+promptum/providers/retry.py,sha256=mA_RRz9_9J_mge_AUd9f1A-gACOxZLGTI8vTIstAr8s,538
+promptum/validation/__init__.py,sha256=mhykyxaIwn2PJh2RXAi0fi2NRIveFmlC5bg1nyCbfVU,252
+promptum/validation/protocol.py,sha256=xqxm23YX6eNeZHKMLMZ-Wz8iQKn4ZRzAI5Xryxg0uq4,418
+promptum/validation/validators.py,sha256=qSMva2P2miXXJJ5XeTKJsyYgh2x5wORi3dhOnBYuACE,2686
+promptum-0.0.3.dist-info/METADATA,sha256=P4d6Mf35ly_xATjxAUWZXjQPYyfb5xfBMqiXoGkkDkc,8278
+promptum-0.0.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+promptum-0.0.3.dist-info/licenses/LICENSE,sha256=Fgn285H5Vy9diOlqO1TzS3hD97WcdF6-GFHvUcFNtmg,1067
+promptum-0.0.3.dist-info/RECORD,,
````
promptum/core/__init__.py
DELETED
````diff
@@ -1,12 +0,0 @@
-from promptum.core.metrics import Metrics
-from promptum.core.result import TestResult
-from promptum.core.retry import RetryConfig, RetryStrategy
-from promptum.core.test_case import TestCase
-
-__all__ = [
-    "Metrics",
-    "RetryConfig",
-    "RetryStrategy",
-    "TestCase",
-    "TestResult",
-]
````
promptum/execution/__init__.py
DELETED

promptum/serialization/__init__.py
DELETED
````diff
@@ -1,11 +0,0 @@
-from promptum.serialization.html import HTMLSerializer
-from promptum.serialization.json import JSONSerializer
-from promptum.serialization.protocol import Serializer
-from promptum.serialization.yaml import YAMLSerializer
-
-__all__ = [
-    "Serializer",
-    "JSONSerializer",
-    "YAMLSerializer",
-    "HTMLSerializer",
-]
````
|
promptum/serialization/base.py
DELETED
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
"""Base serializer with shared result serialization logic."""
|
|
2
|
-
|
|
3
|
-
from typing import Any
|
|
4
|
-
|
|
5
|
-
from promptum.core.result import TestResult
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class BaseSerializer:
|
|
9
|
-
"""
|
|
10
|
-
Base class for serializers with common result serialization logic.
|
|
11
|
-
|
|
12
|
-
Subclasses should implement:
|
|
13
|
-
- serialize(report: Report) -> str
|
|
14
|
-
- get_file_extension() -> str
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
@staticmethod
|
|
18
|
-
def _serialize_result(result: TestResult) -> dict[str, Any]:
|
|
19
|
-
"""Convert TestResult to dictionary representation."""
|
|
20
|
-
return {
|
|
21
|
-
"test_case": {
|
|
22
|
-
"name": result.test_case.name,
|
|
23
|
-
"prompt": result.test_case.prompt,
|
|
24
|
-
"model": result.test_case.model,
|
|
25
|
-
"tags": list(result.test_case.tags),
|
|
26
|
-
"system_prompt": result.test_case.system_prompt,
|
|
27
|
-
"temperature": result.test_case.temperature,
|
|
28
|
-
"max_tokens": result.test_case.max_tokens,
|
|
29
|
-
"metadata": result.test_case.metadata,
|
|
30
|
-
"validator": result.test_case.validator.describe(),
|
|
31
|
-
},
|
|
32
|
-
"response": result.response,
|
|
33
|
-
"passed": result.passed,
|
|
34
|
-
"metrics": {
|
|
35
|
-
"latency_ms": result.metrics.latency_ms,
|
|
36
|
-
"prompt_tokens": result.metrics.prompt_tokens,
|
|
37
|
-
"completion_tokens": result.metrics.completion_tokens,
|
|
38
|
-
"total_tokens": result.metrics.total_tokens,
|
|
39
|
-
"cost_usd": result.metrics.cost_usd,
|
|
40
|
-
"retry_delays": list(result.metrics.retry_delays),
|
|
41
|
-
"total_attempts": result.metrics.total_attempts,
|
|
42
|
-
}
|
|
43
|
-
if result.metrics
|
|
44
|
-
else None,
|
|
45
|
-
"validation_details": result.validation_details,
|
|
46
|
-
"execution_error": result.execution_error,
|
|
47
|
-
"timestamp": result.timestamp.isoformat(),
|
|
48
|
-
}
|
promptum/serialization/html.py
DELETED
````diff
@@ -1,52 +0,0 @@
-import json
-from pathlib import Path
-
-from jinja2 import Template
-
-from promptum.benchmark.report import Report
-
-
-class HTMLSerializer:
-    def __init__(self) -> None:
-        template_path = Path(__file__).parent / "report_template.html"
-        self._template = Template(template_path.read_text())
-
-    def serialize(self, report: Report) -> str:
-        summary = report.get_summary()
-
-        results_data = []
-        for result in report.results:
-            results_data.append(
-                {
-                    "test_case": {
-                        "name": result.test_case.name,
-                        "prompt": result.test_case.prompt,
-                        "model": result.test_case.model,
-                        "tags": list(result.test_case.tags),
-                        "system_prompt": result.test_case.system_prompt,
-                        "validator": result.test_case.validator.describe(),
-                    },
-                    "response": result.response,
-                    "passed": result.passed,
-                    "metrics": {
-                        "latency_ms": result.metrics.latency_ms,
-                        "prompt_tokens": result.metrics.prompt_tokens,
-                        "completion_tokens": result.metrics.completion_tokens,
-                        "total_tokens": result.metrics.total_tokens,
-                        "cost_usd": result.metrics.cost_usd,
-                        "total_attempts": result.metrics.total_attempts,
-                    }
-                    if result.metrics
-                    else None,
-                    "execution_error": result.execution_error,
-                }
-            )
-
-        return self._template.render(
-            summary=summary,
-            results=results_data,
-            results_json=json.dumps(results_data),
-        )
-
-    def get_file_extension(self) -> str:
-        return "html"
````
promptum/serialization/json.py
DELETED
````diff
@@ -1,28 +0,0 @@
-import json
-from datetime import datetime
-from typing import Any
-
-from promptum.benchmark.report import Report
-from promptum.serialization.base import BaseSerializer
-
-
-class JSONSerializer(BaseSerializer):
-    def __init__(self, indent: int = 2):
-        self.indent = indent
-
-    def serialize(self, report: Report) -> str:
-        data = {
-            "metadata": report.metadata,
-            "summary": report.get_summary(),
-            "results": [self._serialize_result(r) for r in report.results],
-        }
-        return json.dumps(data, indent=self.indent, default=self._json_default)
-
-    def get_file_extension(self) -> str:
-        return "json"
-
-    @staticmethod
-    def _json_default(obj: Any) -> Any:
-        if isinstance(obj, datetime):
-            return obj.isoformat()
-        raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
````
promptum/serialization/protocol.py
DELETED
````diff
@@ -1,13 +0,0 @@
-from typing import Protocol
-
-from promptum.benchmark.report import Report
-
-
-class Serializer(Protocol):
-    def serialize(self, report: Report) -> str:
-        """Serializes a Report to a string format."""
-        ...
-
-    def get_file_extension(self) -> str:
-        """Returns the file extension for this format (e.g., 'json', 'html')."""
-        ...
````
promptum/serialization/report_template.html
DELETED
````diff
@@ -1,293 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>LLM Benchmark Report</title>
-    <script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.0/dist/chart.umd.js"></script>
-    <style>
-        * { margin: 0; padding: 0; box-sizing: border-box; }
-        :root {
-            --bg: #ffffff;
-            --surface: #f5f5f5;
-            --text: #1a1a1a;
-            --text-muted: #666;
-            --border: #ddd;
-            --success: #22c55e;
-            --error: #ef4444;
-            --warning: #f59e0b;
-        }
-        @media (prefers-color-scheme: dark) {
-            :root {
-                --bg: #0a0a0a;
-                --surface: #1a1a1a;
-                --text: #e5e5e5;
-                --text-muted: #a3a3a3;
-                --border: #333;
-            }
-        }
-        body {
-            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
-            background: var(--bg);
-            color: var(--text);
-            line-height: 1.6;
-        }
-        .container { max-width: 1400px; margin: 0 auto; padding: 2rem; }
-        h1 { font-size: 2rem; margin-bottom: 0.5rem; }
-        h2 { font-size: 1.5rem; margin: 2rem 0 1rem; }
-        .summary { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
-        .card {
-            background: var(--surface);
-            border: 1px solid var(--border);
-            border-radius: 8px;
-            padding: 1.5rem;
-        }
-        .card-title { font-size: 0.875rem; color: var(--text-muted); margin-bottom: 0.5rem; }
-        .card-value { font-size: 2rem; font-weight: 700; }
-        .chart-container { height: 300px; margin-bottom: 2rem; }
-        table {
-            width: 100%;
-            border-collapse: collapse;
-            background: var(--surface);
-            border-radius: 8px;
-            overflow: hidden;
-        }
-        th, td {
-            text-align: left;
-            padding: 1rem;
-            border-bottom: 1px solid var(--border);
-        }
-        th {
-            background: var(--surface);
-            font-weight: 600;
-            position: sticky;
-            top: 0;
-        }
-        tr:hover { background: var(--bg); }
-        .badge {
-            display: inline-block;
-            padding: 0.25rem 0.75rem;
-            border-radius: 12px;
-            font-size: 0.75rem;
-            font-weight: 600;
-        }
-        .badge-success { background: var(--success); color: white; }
-        .badge-error { background: var(--error); color: white; }
-        .tag {
-            display: inline-block;
-            padding: 0.125rem 0.5rem;
-            background: var(--border);
-            border-radius: 4px;
-            font-size: 0.75rem;
-            margin-right: 0.25rem;
-        }
-        .search {
-            width: 100%;
-            padding: 0.75rem;
-            margin-bottom: 1rem;
-            background: var(--surface);
-            border: 1px solid var(--border);
-            border-radius: 8px;
-            color: var(--text);
-            font-size: 1rem;
-        }
-        .truncate {
-            max-width: 300px;
-            white-space: nowrap;
-            overflow: hidden;
-            text-overflow: ellipsis;
-        }
-        button {
-            background: var(--surface);
-            border: 1px solid var(--border);
-            color: var(--text);
-            padding: 0.5rem 1rem;
-            border-radius: 6px;
-            cursor: pointer;
-            font-size: 0.875rem;
-        }
-        button:hover { background: var(--border); }
-        .modal {
-            display: none;
-            position: fixed;
-            top: 0;
-            left: 0;
-            width: 100%;
-            height: 100%;
-            background: rgba(0, 0, 0, 0.7);
-            z-index: 1000;
-            overflow: auto;
-        }
-        .modal-content {
-            background: var(--surface);
-            margin: 2rem auto;
-            padding: 2rem;
-            max-width: 800px;
-            border-radius: 12px;
-            position: relative;
-        }
-        .modal-close {
-            position: absolute;
-            top: 1rem;
-            right: 1rem;
-            font-size: 1.5rem;
-            cursor: pointer;
-        }
-        pre {
-            background: var(--bg);
-            padding: 1rem;
-            border-radius: 6px;
-            overflow-x: auto;
-            margin: 0.5rem 0;
-        }
-        code { font-family: 'Courier New', monospace; font-size: 0.875rem; }
-    </style>
-</head>
-<body>
-    <div class="container">
-        <h1>LLM Benchmark Report</h1>
-        <p style="color: var(--text-muted); margin-bottom: 2rem;">{{ summary.total }} tests executed</p>
-
-        <div class="summary">
-            <div class="card">
-                <div class="card-title">Pass Rate</div>
-                <div class="card-value" style="color: var(--success);">{{ "%.1f"|format(summary.pass_rate * 100) }}%</div>
-            </div>
-            <div class="card">
-                <div class="card-title">Avg Latency</div>
-                <div class="card-value">{{ "%.0f"|format(summary.avg_latency_ms) }}ms</div>
-            </div>
-            <div class="card">
-                <div class="card-title">Total Cost</div>
-                <div class="card-value">${{ "%.6f"|format(summary.total_cost_usd) }}</div>
-            </div>
-            <div class="card">
-                <div class="card-title">Total Tokens</div>
-                <div class="card-value">{{ "{:,}".format(summary.total_tokens) }}</div>
-            </div>
-        </div>
-
-        <div class="card chart-container">
-            <canvas id="latencyChart"></canvas>
-        </div>
-
-        <h2>Test Results</h2>
-        <input type="text" class="search" id="searchInput" placeholder="Search tests...">
-
-        <table id="resultsTable">
-            <thead>
-                <tr>
-                    <th>Status</th>
-                    <th>Name</th>
-                    <th>Model</th>
-                    <th>Latency</th>
-                    <th>Cost</th>
-                    <th>Tags</th>
-                    <th>Actions</th>
-                </tr>
-            </thead>
-            <tbody>
-                {% for result in results %}
-                <tr class="result-row">
-                    <td>
-                        {% if result.passed %}
-                        <span class="badge badge-success">PASS</span>
-                        {% else %}
-                        <span class="badge badge-error">FAIL</span>
-                        {% endif %}
-                    </td>
-                    <td>{{ result.test_case.name }}</td>
-                    <td>{{ result.test_case.model }}</td>
-                    <td>{{ "%.0f"|format(result.metrics.latency_ms if result.metrics else 0) }}ms</td>
-                    <td>${{ "%.6f"|format(result.metrics.cost_usd if result.metrics and result.metrics.cost_usd else 0) }}</td>
-                    <td>
-                        {% for tag in result.test_case.tags %}
-                        <span class="tag">{{ tag }}</span>
-                        {% endfor %}
-                    </td>
-                    <td><button onclick="showDetails({{ loop.index0 }})">Details</button></td>
-                </tr>
-                {% endfor %}
-            </tbody>
-        </table>
-    </div>
-
-    <div id="detailsModal" class="modal">
-        <div class="modal-content">
-            <span class="modal-close" onclick="closeModal()">×</span>
-            <div id="modalBody"></div>
-        </div>
-    </div>
-
-    <script>
-        const results = {{ results_json }};
-
-        new Chart(document.getElementById('latencyChart'), {
-            type: 'bar',
-            data: {
-                labels: results.map((r, i) => r.test_case.name),
-                datasets: [{
-                    label: 'Latency (ms)',
-                    data: results.map(r => r.metrics ? r.metrics.latency_ms : 0),
-                    backgroundColor: results.map(r => r.passed ? '#22c55e' : '#ef4444')
-                }]
-            },
-            options: {
-                responsive: true,
-                maintainAspectRatio: false,
-                plugins: { legend: { display: false } }
-            }
-        });
-
-        document.getElementById('searchInput').addEventListener('input', function(e) {
-            const term = e.target.value.toLowerCase();
-            document.querySelectorAll('.result-row').forEach(row => {
-                const text = row.textContent.toLowerCase();
-                row.style.display = text.includes(term) ? '' : 'none';
-            });
-        });
-
-        function showDetails(index) {
-            const result = results[index];
-            const html = `
-                <h2>${result.test_case.name}</h2>
-                <p><strong>Status:</strong> <span class="badge ${result.passed ? 'badge-success' : 'badge-error'}">${result.passed ? 'PASS' : 'FAIL'}</span></p>
-                <p><strong>Model:</strong> ${result.test_case.model}</p>
-                <p><strong>Validator:</strong> ${result.test_case.validator}</p>
-                <h3>Prompt</h3>
-                <pre><code>${escapeHtml(result.test_case.prompt)}</code></pre>
-                ${result.test_case.system_prompt ? `<h3>System Prompt</h3><pre><code>${escapeHtml(result.test_case.system_prompt)}</code></pre>` : ''}
-                <h3>Response</h3>
-                <pre><code>${escapeHtml(result.response || 'No response')}</code></pre>
-                ${result.execution_error ? `<h3>Error</h3><pre style="color: var(--error);"><code>${escapeHtml(result.execution_error)}</code></pre>` : ''}
-                ${result.metrics ? `
-                <h3>Metrics</h3>
-                <ul>
-                    <li>Latency: ${result.metrics.latency_ms.toFixed(0)}ms</li>
-                    <li>Tokens: ${result.metrics.total_tokens || 'N/A'}</li>
-                    <li>Cost: $${(result.metrics.cost_usd || 0).toFixed(6)}</li>
-                    <li>Attempts: ${result.metrics.total_attempts}</li>
-                </ul>
-                ` : ''}
-            `;
-            document.getElementById('modalBody').innerHTML = html;
-            document.getElementById('detailsModal').style.display = 'block';
-        }
-
-        function closeModal() {
-            document.getElementById('detailsModal').style.display = 'none';
-        }
-
-        function escapeHtml(text) {
-            const div = document.createElement('div');
-            div.textContent = text;
-            return div.innerHTML;
-        }
-
-        window.onclick = function(event) {
-            const modal = document.getElementById('detailsModal');
-            if (event.target === modal) closeModal();
-        }
-    </script>
-</body>
-</html>
````
promptum/serialization/yaml.py
DELETED
````diff
@@ -1,17 +0,0 @@
-import yaml
-
-from promptum.benchmark.report import Report
-from promptum.serialization.base import BaseSerializer
-
-
-class YAMLSerializer(BaseSerializer):
-    def serialize(self, report: Report) -> str:
-        data = {
-            "metadata": report.metadata,
-            "summary": report.get_summary(),
-            "results": [self._serialize_result(r) for r in report.results],
-        }
-        return yaml.dump(data, default_flow_style=False, sort_keys=False)
-
-    def get_file_extension(self) -> str:
-        return "yaml"
````
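With the serializers gone (and the jinja2/pyyaml dependencies with them), persisting results is left to the caller. A minimal do-it-yourself sketch, using only TestResult fields that appear in the deleted serializers above:

```python
import json


def report_to_json(report) -> str:
    """Hypothetical stand-in for the removed JSONSerializer."""
    return json.dumps(
        [
            {
                "name": r.test_case.name,
                "model": r.test_case.model,
                "passed": r.passed,
                "response": r.response,
                "latency_ms": r.metrics.latency_ms if r.metrics else None,
                "timestamp": r.timestamp.isoformat(),
            }
            for r in report.results
        ],
        indent=2,
    )
```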
promptum/storage/__init__.py
DELETED
promptum/storage/file.py
DELETED
````diff
@@ -1,157 +0,0 @@
-import json
-import tempfile
-from datetime import datetime
-from pathlib import Path
-from typing import Any
-
-from promptum.benchmark.report import Report
-from promptum.core.metrics import Metrics
-from promptum.core.result import TestResult
-from promptum.core.test_case import TestCase
-from promptum.validation.validators import PlaceholderValidator
-
-
-class FileStorage:
-    def __init__(self, base_dir: str = "results"):
-        self.base_dir = Path(base_dir)
-        self.reports_dir = self.base_dir / "reports"
-        self.metadata_file = self.base_dir / "metadata.json"
-
-        self.reports_dir.mkdir(parents=True, exist_ok=True)
-
-    def save(self, report: Report, name: str) -> str:
-        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-        identifier = f"{timestamp}_{name}"
-        filename = f"{identifier}.json"
-        filepath = self.reports_dir / filename
-
-        data = self._serialize_report(report)
-
-        with tempfile.NamedTemporaryFile(
-            mode="w", delete=False, dir=self.reports_dir, suffix=".tmp"
-        ) as tmp:
-            json.dump(data, tmp, indent=2)
-            tmp_path = Path(tmp.name)
-
-        tmp_path.replace(filepath)
-
-        self._update_metadata(identifier, name, str(filepath))
-
-        return identifier
-
-    def load(self, identifier: str) -> Report:
-        filepath = self.reports_dir / f"{identifier}.json"
-
-        if not filepath.exists():
-            raise FileNotFoundError(f"Report not found: {identifier}")
-
-        with open(filepath) as f:
-            data = json.load(f)
-
-        return self._deserialize_report(data)
-
-    def list_reports(self) -> list[dict[str, Any]]:
-        if not self.metadata_file.exists():
-            return []
-
-        with open(self.metadata_file) as f:
-            return json.load(f)
-
-    def _update_metadata(self, identifier: str, name: str, path: str) -> None:
-        metadata = self.list_reports()
-
-        metadata.append(
-            {
-                "id": identifier,
-                "name": name,
-                "path": path,
-                "timestamp": datetime.now().isoformat(),
-            }
-        )
-
-        with tempfile.NamedTemporaryFile(
-            mode="w", delete=False, dir=self.base_dir, suffix=".tmp"
-        ) as tmp:
-            json.dump(metadata, tmp, indent=2)
-            tmp_path = Path(tmp.name)
-
-        tmp_path.replace(self.metadata_file)
-
-    @staticmethod
-    def _serialize_report(report: Report) -> dict[str, Any]:
-        return {
-            "metadata": report.metadata,
-            "results": [
-                {
-                    "test_case": {
-                        "name": r.test_case.name,
-                        "prompt": r.test_case.prompt,
-                        "model": r.test_case.model,
-                        "tags": list(r.test_case.tags),
-                        "system_prompt": r.test_case.system_prompt,
-                        "temperature": r.test_case.temperature,
-                        "max_tokens": r.test_case.max_tokens,
-                        "metadata": r.test_case.metadata,
-                        "validator_description": r.test_case.validator.describe(),
-                    },
-                    "response": r.response,
-                    "passed": r.passed,
-                    "metrics": {
-                        "latency_ms": r.metrics.latency_ms,
-                        "prompt_tokens": r.metrics.prompt_tokens,
-                        "completion_tokens": r.metrics.completion_tokens,
-                        "total_tokens": r.metrics.total_tokens,
-                        "cost_usd": r.metrics.cost_usd,
-                        "retry_delays": list(r.metrics.retry_delays),
-                    }
-                    if r.metrics
-                    else None,
-                    "validation_details": r.validation_details,
-                    "execution_error": r.execution_error,
-                    "timestamp": r.timestamp.isoformat(),
-                }
-                for r in report.results
-            ],
-        }
-
-    @staticmethod
-    def _deserialize_report(data: dict[str, Any]) -> Report:
-        results = []
-        for r in data["results"]:
-            test_case = TestCase(
-                name=r["test_case"]["name"],
-                prompt=r["test_case"]["prompt"],
-                model=r["test_case"]["model"],
-                validator=PlaceholderValidator(
-                    description=r["test_case"]["validator_description"],
-                ),
-                tags=tuple(r["test_case"]["tags"]),
-                system_prompt=r["test_case"]["system_prompt"],
-                temperature=r["test_case"]["temperature"],
-                max_tokens=r["test_case"]["max_tokens"],
-                metadata=r["test_case"]["metadata"],
-            )
-
-            metrics = None
-            if r["metrics"]:
-                metrics = Metrics(
-                    latency_ms=r["metrics"]["latency_ms"],
-                    prompt_tokens=r["metrics"]["prompt_tokens"],
-                    completion_tokens=r["metrics"]["completion_tokens"],
-                    total_tokens=r["metrics"]["total_tokens"],
-                    cost_usd=r["metrics"]["cost_usd"],
-                    retry_delays=tuple(r["metrics"]["retry_delays"]),
-                )
-
-            result = TestResult(
-                test_case=test_case,
-                response=r["response"],
-                passed=r["passed"],
-                metrics=metrics,
-                validation_details=r["validation_details"],
-                execution_error=r["execution_error"],
-                timestamp=datetime.fromisoformat(r["timestamp"]),
-            )
-            results.append(result)
-
-        return Report(results=results, metadata=data["metadata"])
````
promptum/storage/protocol.py
DELETED
````diff
@@ -1,23 +0,0 @@
-from typing import Any, Protocol
-
-from promptum.benchmark.report import Report
-
-
-class ResultStorage(Protocol):
-    def save(self, report: Report, name: str) -> str:
-        """
-        Saves a report and returns its identifier.
-        """
-        ...
-
-    def load(self, identifier: str) -> Report:
-        """
-        Loads a report by its identifier.
-        """
-        ...
-
-    def list_reports(self) -> list[dict[str, Any]]:
-        """
-        Returns metadata for all stored reports.
-        """
-        ...
````
promptum-0.0.1.dist-info/RECORD
DELETED
````diff
@@ -1,32 +0,0 @@
-promptum/__init__.py,sha256=AjeGgmIbpp9Uv-0ybq6knejEJMK-Dnn_-fV9Z86Bp74,932
-promptum/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-promptum/benchmark/__init__.py,sha256=NJYiXm6wVFKMloxKNAXMY4H3bMQORTtLh6__nYWYWa0,131
-promptum/benchmark/benchmark.py,sha256=3enQSACdLwHW78fqSZj0Un3r7_Ua3V-MjfbEIIKFSWs,1589
-promptum/benchmark/report.py,sha256=ol_UO8rw43zbQxhs2o4AwYN5TP7O_Apa77V-pZKq6Uw,2754
-promptum/core/__init__.py,sha256=mqajsOdUBNJfcR2krxpwa7rM_wd88vJaAov-9SnVm68,294
-promptum/core/metrics.py,sha256=FnS10nHFjQ5Clj5X21C_nW6zAUJU_ZHt0s2fLgp6L28,427
-promptum/core/result.py,sha256=nyuVMQFY6DmZwzpgqDPsj0FaAuairpKLJ-0be5WQtTg,472
-promptum/core/retry.py,sha256=mA_RRz9_9J_mge_AUd9f1A-gACOxZLGTI8vTIstAr8s,538
-promptum/core/test_case.py,sha256=YNlVNj7FkoCyBFb2N0Dzrhce6o3DzUtke4PR6WoXhZo,593
-promptum/execution/__init__.py,sha256=fUZa7Bo7yn921sl49cS6TCGsG-lOUNVdhdeRsIa5vCc,67
-promptum/execution/runner.py,sha256=sP3uDu2VDLxFi9BkltMHwsyMuCXnz4oP1kVN28KpVZ0,2434
-promptum/providers/__init__.py,sha256=OW-CK198wOV7_bz_keOaxxQeRlFPZgINQcVJUZq_uus,169
-promptum/providers/openrouter.py,sha256=owquGxHaTB-pZ8jr06l4HouETuFj1lEg92oGX2mM5uo,4601
-promptum/providers/protocol.py,sha256=vdTGAGKN3FzThHLwyMMWicU87_LpW-gn0cM3vMcWiEY,488
-promptum/serialization/__init__.py,sha256=0dlpgF3dngaw_oR4mg7nuc4Z_VFVl2bATmhe2mHA9T4,319
-promptum/serialization/base.py,sha256=JnB4zb7D4oy44k6ndbJu3Xw1PVLpY_9-Y7k3Et2p43g,1851
-promptum/serialization/html.py,sha256=kJEd2s6fVfFHH7snJWrD5RGaUW66x3vtMKGMJ_ekmcI,1901
-promptum/serialization/json.py,sha256=koqgr5_WHmrpWUOCq6rWXoC07um3mkDDaob2k9vkEK8,870
-promptum/serialization/protocol.py,sha256=MZeMYt_HZJIYSyrRd_ZYbEJXDiXLMuJ5tosAeHLxpTM,353
-promptum/serialization/report_template.html,sha256=RC8qSLzolqWkWBIGfyhPtPkRWM7_0JkauEWPkaKiB9A,10802
-promptum/serialization/yaml.py,sha256=50A612OkX2L3EjhxTZJMZQb5zL8-2PmwcBjjNUhCWsA,528
-promptum/storage/__init__.py,sha256=QWOP5Al43WmmQ_kFCM9JGi8amXJzO_pR-x5AKDNy4ds,153
-promptum/storage/file.py,sha256=gnNBpNBQ_NeAWn7P2itsw2L99AxS7zOd8Nef6PyYxlk,5750
-promptum/storage/protocol.py,sha256=_NpkJzOQB_98Ud_TA_ZYubHf3o2DDXGMveRN3kRyYKI,517
-promptum/validation/__init__.py,sha256=mhykyxaIwn2PJh2RXAi0fi2NRIveFmlC5bg1nyCbfVU,252
-promptum/validation/protocol.py,sha256=xqxm23YX6eNeZHKMLMZ-Wz8iQKn4ZRzAI5Xryxg0uq4,418
-promptum/validation/validators.py,sha256=3lJwSMhhWb9x8BK_-S0FJBj7PFgno79II_i3Z1mCKTs,3217
-promptum-0.0.1.dist-info/METADATA,sha256=vt_PN0Ns0JuJalM7p8hJZsz-Y2hwQrbHZ4Jacy7P6L8,8083
-promptum-0.0.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-promptum-0.0.1.dist-info/licenses/LICENSE,sha256=Fgn285H5Vy9diOlqO1TzS3hD97WcdF6-GFHvUcFNtmg,1067
-promptum-0.0.1.dist-info/RECORD,,
````
/promptum/{core → providers}/metrics.py
File without changes

/promptum/{core → providers}/retry.py
File without changes

{promptum-0.0.1.dist-info → promptum-0.0.3.dist-info}/WHEEL
File without changes

{promptum-0.0.1.dist-info → promptum-0.0.3.dist-info}/licenses/LICENSE
File without changes