promptum-0.0.1-py3-none-any.whl → promptum-0.0.2-py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
promptum/__init__.py CHANGED
@@ -1,14 +1,5 @@
- from promptum.benchmark import Benchmark, Report
- from promptum.core import Metrics, RetryConfig, RetryStrategy, TestCase, TestResult
- from promptum.execution import Runner
- from promptum.providers import LLMProvider, OpenRouterClient
- from promptum.serialization import (
- HTMLSerializer,
- JSONSerializer,
- Serializer,
- YAMLSerializer,
- )
- from promptum.storage import FileStorage, ResultStorage
+ from promptum.benchmark import Benchmark, Report, Runner, TestCase, TestResult
+ from promptum.providers import LLMProvider, Metrics, OpenRouterClient, RetryConfig, RetryStrategy
  from promptum.validation import (
  Contains,
  ExactMatch,
@@ -17,7 +8,7 @@ from promptum.validation import (
  Validator,
  )
 
- __version__ = "0.1.0"
+ __version__ = "0.0.1"
 
  __all__ = [
  "TestCase",
@@ -35,10 +26,4 @@ __all__ = [
  "Runner",
  "Benchmark",
  "Report",
- "Serializer",
- "JSONSerializer",
- "YAMLSerializer",
- "HTMLSerializer",
- "ResultStorage",
- "FileStorage",
  ]
@@ -1,4 +1,7 @@
  from promptum.benchmark.benchmark import Benchmark
  from promptum.benchmark.report import Report
+ from promptum.benchmark.result import TestResult
+ from promptum.benchmark.runner import Runner
+ from promptum.benchmark.test_case import TestCase
 
- __all__ = ["Benchmark", "Report"]
+ __all__ = ["Benchmark", "Report", "Runner", "TestCase", "TestResult"]
@@ -1,11 +1,10 @@
  import asyncio
  from collections.abc import Callable, Sequence
- from typing import Any
 
  from promptum.benchmark.report import Report
- from promptum.core.result import TestResult
- from promptum.core.test_case import TestCase
- from promptum.execution.runner import Runner
+ from promptum.benchmark.result import TestResult
+ from promptum.benchmark.runner import Runner
+ from promptum.benchmark.test_case import TestCase
  from promptum.providers.protocol import LLMProvider
 
 
@@ -29,12 +28,12 @@ class Benchmark:
  def add_tests(self, test_cases: Sequence[TestCase]) -> None:
  self._test_cases.extend(test_cases)
 
- def run(self, metadata: dict[str, Any] | None = None) -> Report:
- return asyncio.run(self.run_async(metadata))
+ def run(self) -> Report:
+ return asyncio.run(self.run_async())
 
- async def run_async(self, metadata: dict[str, Any] | None = None) -> Report:
+ async def run_async(self) -> Report:
  if not self._test_cases:
- return Report(results=[], metadata=metadata or {})
+ return Report(results=[])
 
  runner = Runner(
  provider=self.provider,
@@ -44,7 +43,4 @@ class Benchmark:
 
  results = await runner.run(self._test_cases)
 
- return Report(
- results=results,
- metadata=metadata or {},
- )
+ return Report(results=results)
@@ -2,13 +2,12 @@ from collections.abc import Callable, Sequence
  from dataclasses import dataclass
  from typing import Any
 
- from promptum.core.result import TestResult
+ from promptum.benchmark.result import TestResult
 
 
  @dataclass(frozen=True, slots=True)
  class Report:
  results: Sequence[TestResult]
- metadata: dict[str, Any]
 
  def get_summary(self) -> dict[str, Any]:
  total = len(self.results)
@@ -49,7 +48,7 @@ class Report:
  if passed is not None:
  filtered = [r for r in filtered if r.passed == passed]
 
- return Report(results=filtered, metadata=self.metadata)
+ return Report(results=filtered)
 
  def group_by(self, key: Callable[[TestResult], str]) -> dict[str, "Report"]:
  groups: dict[str, list[TestResult]] = {}
@@ -60,7 +59,7 @@ class Report:
  groups[group_key] = []
  groups[group_key].append(result)
 
- return {k: Report(results=v, metadata=self.metadata) for k, v in groups.items()}
+ return {k: Report(results=v) for k, v in groups.items()}
 
  def compare_models(self) -> dict[str, dict[str, Any]]:
  by_model = self.group_by(lambda r: r.test_case.model)
@@ -71,5 +70,5 @@
  if not values:
  return 0
  sorted_values = sorted(values)
- index = int(len(sorted_values) * p)
- return sorted_values[min(index, len(sorted_values) - 1)]
+ index = int((len(sorted_values) - 1) * p)
+ return sorted_values[index]
@@ -2,8 +2,8 @@ from dataclasses import dataclass, field
  from datetime import datetime
  from typing import Any
 
- from promptum.core.metrics import Metrics
- from promptum.core.test_case import TestCase
+ from promptum.benchmark.test_case import TestCase
+ from promptum.providers.metrics import Metrics
 
 
  @dataclass(frozen=True, slots=True)
@@ -3,8 +3,8 @@ from collections.abc import Callable, Sequence
 
  import httpx
 
- from promptum.core.result import TestResult
- from promptum.core.test_case import TestCase
+ from promptum.benchmark.result import TestResult
+ from promptum.benchmark.test_case import TestCase
  from promptum.providers.protocol import LLMProvider
 
 
@@ -37,7 +37,6 @@ class Runner:
 
  results = await asyncio.gather(
  *[run_with_semaphore(tc) for tc in test_cases],
- return_exceptions=False,
  )
 
  return list(results)
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any
  if TYPE_CHECKING:
  from promptum.validation.protocol import Validator
 
- from promptum.core.retry import RetryConfig
+ from promptum.providers.retry import RetryConfig
 
 
  @dataclass(frozen=True, slots=True)
@@ -1,7 +1,12 @@
+ from promptum.providers.metrics import Metrics
  from promptum.providers.openrouter import OpenRouterClient
  from promptum.providers.protocol import LLMProvider
+ from promptum.providers.retry import RetryConfig, RetryStrategy
 
  __all__ = [
  "LLMProvider",
+ "Metrics",
  "OpenRouterClient",
+ "RetryConfig",
+ "RetryStrategy",
  ]
@@ -4,8 +4,8 @@ from typing import Any
 
  import httpx
 
- from promptum.core.metrics import Metrics
- from promptum.core.retry import RetryConfig, RetryStrategy
+ from promptum.providers.metrics import Metrics
+ from promptum.providers.retry import RetryConfig, RetryStrategy
 
 
  class OpenRouterClient:
@@ -61,7 +61,7 @@ class OpenRouterClient:
  "messages": messages,
  "temperature": temperature,
  }
- if max_tokens:
+ if max_tokens is not None:
  payload["max_tokens"] = max_tokens
  payload.update(kwargs)
 
@@ -1,6 +1,6 @@
  from typing import Any, Protocol
 
- from promptum.core.metrics import Metrics
+ from promptum.providers.metrics import Metrics
 
 
  class LLMProvider(Protocol):
@@ -88,21 +88,3 @@ class JsonSchema:
  keys = ", ".join(self.required_keys)
  return f"Valid JSON with keys: {keys}"
  return "Valid JSON object"
-
-
- @dataclass(frozen=True, slots=True)
- class PlaceholderValidator:
- """
- Placeholder validator for deserialized reports.
-
- Used when original validator cannot be reconstructed from storage.
- Always returns True. Original validator logic is not preserved.
- """
-
- description: str
-
- def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
- return True, {"placeholder": True, "note": "Original validator could not be reconstructed"}
-
- def describe(self) -> str:
- return self.description
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: promptum
- Version: 0.0.1
+ Version: 0.0.2
  Summary: Async LLM benchmarking library with protocol-based extensibility
  Project-URL: Homepage, https://github.com/deyna256/promptum
  Project-URL: Repository, https://github.com/deyna256/promptum
@@ -36,8 +36,6 @@ Classifier: Programming Language :: Python :: 3.13
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Requires-Python: >=3.13
  Requires-Dist: httpx>=0.27.0
- Requires-Dist: jinja2>=3.1.0
- Requires-Dist: pyyaml>=6.0
  Description-Content-Type: text/markdown
 
  # promptum
@@ -97,15 +95,12 @@ for attempt in range(max_retries):
  break
  except Exception:
  sleep(2 ** attempt)
-
- # Export results manually
- json.dump(results, open("results.json", "w"))
  ```
 
  **After promptum:**
  ```python
  report = await benchmark.run_async()
- HTMLSerializer().serialize(report) # Beautiful HTML report
+ summary = report.get_summary() # Metrics captured automatically
  ```
 
  ---
@@ -151,14 +146,13 @@ python your_script.py
 
  ## What You Get
 
- **One API for 100+ Models** - OpenRouter support out of the box (OpenAI, Anthropic, Google, etc.)
- **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own
- **Automatic Retries** - Exponential/linear backoff with configurable attempts
- **Metrics Tracking** - Latency, tokens, cost - automatically captured
- **Beautiful Reports** - JSON, YAML, or interactive HTML with charts
- **Async by Default** - Run 100 tests in parallel without breaking a sweat
- **Type Safe** - Full type hints, catches errors before runtime
- ✅ **Zero Config** - No YAML files, no setup scripts, just Python
+ - [x] **One API for 100+ Models** - OpenRouter support out of the box (OpenAI, Anthropic, Google, etc.)
+ - [x] **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own
+ - [x] **Automatic Retries** - Exponential/linear backoff with configurable attempts
+ - [x] **Metrics Tracking** - Latency, tokens, cost - automatically captured
+ - [x] **Async by Default** - Run 100 tests in parallel without breaking a sweat
+ - [x] **Type Safe** - Full type hints, catches errors before runtime
+ - [x] **Zero Config** - No YAML files, no setup scripts, just Python
 
  ---
 
@@ -193,14 +187,11 @@ tests = [
  benchmark.add_tests(tests)
  report = await benchmark.run_async()
 
- # Export as HTML
- from promptum import HTMLSerializer
- html = HTMLSerializer().serialize(report)
- open("comparison.html", "w").write(html)
+ # Side-by-side model comparison
+ for model, summary in report.compare_models().items():
+ print(f"{model}: {summary['pass_rate']:.0%} pass rate, {summary['avg_latency_ms']:.0f}ms avg")
  ```
 
- Open `comparison.html` in your browser - see side-by-side model performance with charts.
-
 
  ---
  ## Use Cases
@@ -252,7 +243,7 @@ Found a bug? Want a feature? PRs welcome!
 
  ```bash
  # Development setup
- git clone https://github.com/yourusername/promptum.git
+ git clone https://github.com/deyna256/promptum.git
  cd promptum
  just sync # Install dependencies
  just test # Run tests
@@ -273,7 +264,7 @@ MIT - do whatever you want with it.
 
  <div align="center">
 
- **[⭐ Star on GitHub](https://github.com/yourusername/promptum)** | **[🐛 Report Bug](https://github.com/yourusername/promptum/issues)** | **[💡 Request Feature](https://github.com/yourusername/promptum/issues)**
+ **[⭐ Star on GitHub](https://github.com/deyna256/promptum)** | **[🐛 Report Bug](https://github.com/deyna256/promptum/issues)** | **[💡 Request Feature](https://github.com/deyna256/promptum/issues)**
 
  Made for developers who value their time.
 
@@ -0,0 +1,20 @@
+ promptum/__init__.py,sha256=8IAk_9VlnKEJIdwf-hEDkOfOCV456H2Jng-HrZfewso,582
+ promptum/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ promptum/benchmark/__init__.py,sha256=0FXYDnK4SGa5ZqX2k9aVkwy3ENDlF_5nW2Mut_OCCbg,311
+ promptum/benchmark/benchmark.py,sha256=hZ3557qPKqFeNNuxrRLPs-b6XBy2JCowIhRDDwatfeI,1403
+ promptum/benchmark/report.py,sha256=DhY1p3n29xOSwRYUiQW6V6FhGFGGn-JF6nuNuvj9rro,2659
+ promptum/benchmark/result.py,sha256=nKh-T4zlam2LxsaFoL8jeVaO6kZJ1sfB_tnp4gdNPhM,482
+ promptum/benchmark/runner.py,sha256=5p6JBwjTlEHTh6jNv_iuFH1nIrI4_Gv3wmzCT0TWpvA,2407
+ promptum/benchmark/test_case.py,sha256=Okypf2334ewVrvmQG7M3I3D7BzqXDsQ2ihjNw9gGF00,598
+ promptum/providers/__init__.py,sha256=UprvJ4vxHqo-VTzzUmZ4wFCj6VybP9xBd7HtpPPSvbI,335
+ promptum/providers/metrics.py,sha256=FnS10nHFjQ5Clj5X21C_nW6zAUJU_ZHt0s2fLgp6L28,427
+ promptum/providers/openrouter.py,sha256=fOqBm4ak7szNNeKNhSI6y4WpFsUx6iQg_3jaFsXc0dQ,4623
+ promptum/providers/protocol.py,sha256=g9zIH91HysBIATMHd9Z2Mpk1tKiTOkAyd-zynRaQsuk,493
+ promptum/providers/retry.py,sha256=mA_RRz9_9J_mge_AUd9f1A-gACOxZLGTI8vTIstAr8s,538
+ promptum/validation/__init__.py,sha256=mhykyxaIwn2PJh2RXAi0fi2NRIveFmlC5bg1nyCbfVU,252
+ promptum/validation/protocol.py,sha256=xqxm23YX6eNeZHKMLMZ-Wz8iQKn4ZRzAI5Xryxg0uq4,418
+ promptum/validation/validators.py,sha256=qSMva2P2miXXJJ5XeTKJsyYgh2x5wORi3dhOnBYuACE,2686
+ promptum-0.0.2.dist-info/METADATA,sha256=MQcy0pxUoMpu4uZgM_Q3HEE_RnY3Krcg-_FTF9vvQ54,7845
+ promptum-0.0.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ promptum-0.0.2.dist-info/licenses/LICENSE,sha256=Fgn285H5Vy9diOlqO1TzS3hD97WcdF6-GFHvUcFNtmg,1067
+ promptum-0.0.2.dist-info/RECORD,,
promptum/core/__init__.py DELETED
@@ -1,12 +0,0 @@
- from promptum.core.metrics import Metrics
- from promptum.core.result import TestResult
- from promptum.core.retry import RetryConfig, RetryStrategy
- from promptum.core.test_case import TestCase
-
- __all__ = [
- "Metrics",
- "RetryConfig",
- "RetryStrategy",
- "TestCase",
- "TestResult",
- ]
@@ -1,3 +0,0 @@
- from promptum.execution.runner import Runner
-
- __all__ = ["Runner"]
@@ -1,11 +0,0 @@
- from promptum.serialization.html import HTMLSerializer
- from promptum.serialization.json import JSONSerializer
- from promptum.serialization.protocol import Serializer
- from promptum.serialization.yaml import YAMLSerializer
-
- __all__ = [
- "Serializer",
- "JSONSerializer",
- "YAMLSerializer",
- "HTMLSerializer",
- ]
@@ -1,48 +0,0 @@
- """Base serializer with shared result serialization logic."""
-
- from typing import Any
-
- from promptum.core.result import TestResult
-
-
- class BaseSerializer:
- """
- Base class for serializers with common result serialization logic.
-
- Subclasses should implement:
- - serialize(report: Report) -> str
- - get_file_extension() -> str
- """
-
- @staticmethod
- def _serialize_result(result: TestResult) -> dict[str, Any]:
- """Convert TestResult to dictionary representation."""
- return {
- "test_case": {
- "name": result.test_case.name,
- "prompt": result.test_case.prompt,
- "model": result.test_case.model,
- "tags": list(result.test_case.tags),
- "system_prompt": result.test_case.system_prompt,
- "temperature": result.test_case.temperature,
- "max_tokens": result.test_case.max_tokens,
- "metadata": result.test_case.metadata,
- "validator": result.test_case.validator.describe(),
- },
- "response": result.response,
- "passed": result.passed,
- "metrics": {
- "latency_ms": result.metrics.latency_ms,
- "prompt_tokens": result.metrics.prompt_tokens,
- "completion_tokens": result.metrics.completion_tokens,
- "total_tokens": result.metrics.total_tokens,
- "cost_usd": result.metrics.cost_usd,
- "retry_delays": list(result.metrics.retry_delays),
- "total_attempts": result.metrics.total_attempts,
- }
- if result.metrics
- else None,
- "validation_details": result.validation_details,
- "execution_error": result.execution_error,
- "timestamp": result.timestamp.isoformat(),
- }
@@ -1,52 +0,0 @@
- import json
- from pathlib import Path
-
- from jinja2 import Template
-
- from promptum.benchmark.report import Report
-
-
- class HTMLSerializer:
- def __init__(self) -> None:
- template_path = Path(__file__).parent / "report_template.html"
- self._template = Template(template_path.read_text())
-
- def serialize(self, report: Report) -> str:
- summary = report.get_summary()
-
- results_data = []
- for result in report.results:
- results_data.append(
- {
- "test_case": {
- "name": result.test_case.name,
- "prompt": result.test_case.prompt,
- "model": result.test_case.model,
- "tags": list(result.test_case.tags),
- "system_prompt": result.test_case.system_prompt,
- "validator": result.test_case.validator.describe(),
- },
- "response": result.response,
- "passed": result.passed,
- "metrics": {
- "latency_ms": result.metrics.latency_ms,
- "prompt_tokens": result.metrics.prompt_tokens,
- "completion_tokens": result.metrics.completion_tokens,
- "total_tokens": result.metrics.total_tokens,
- "cost_usd": result.metrics.cost_usd,
- "total_attempts": result.metrics.total_attempts,
- }
- if result.metrics
- else None,
- "execution_error": result.execution_error,
- }
- )
-
- return self._template.render(
- summary=summary,
- results=results_data,
- results_json=json.dumps(results_data),
- )
-
- def get_file_extension(self) -> str:
- return "html"
@@ -1,28 +0,0 @@
- import json
- from datetime import datetime
- from typing import Any
-
- from promptum.benchmark.report import Report
- from promptum.serialization.base import BaseSerializer
-
-
- class JSONSerializer(BaseSerializer):
- def __init__(self, indent: int = 2):
- self.indent = indent
-
- def serialize(self, report: Report) -> str:
- data = {
- "metadata": report.metadata,
- "summary": report.get_summary(),
- "results": [self._serialize_result(r) for r in report.results],
- }
- return json.dumps(data, indent=self.indent, default=self._json_default)
-
- def get_file_extension(self) -> str:
- return "json"
-
- @staticmethod
- def _json_default(obj: Any) -> Any:
- if isinstance(obj, datetime):
- return obj.isoformat()
- raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
@@ -1,13 +0,0 @@
- from typing import Protocol
-
- from promptum.benchmark.report import Report
-
-
- class Serializer(Protocol):
- def serialize(self, report: Report) -> str:
- """Serializes a Report to a string format."""
- ...
-
- def get_file_extension(self) -> str:
- """Returns the file extension for this format (e.g., 'json', 'html')."""
- ...
@@ -1,293 +0,0 @@
- <!DOCTYPE html>
- <html lang="en">
- <head>
- <meta charset="UTF-8">
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
- <title>LLM Benchmark Report</title>
- <script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.0/dist/chart.umd.js"></script>
- <style>
- * { margin: 0; padding: 0; box-sizing: border-box; }
- :root {
- --bg: #ffffff;
- --surface: #f5f5f5;
- --text: #1a1a1a;
- --text-muted: #666;
- --border: #ddd;
- --success: #22c55e;
- --error: #ef4444;
- --warning: #f59e0b;
- }
- @media (prefers-color-scheme: dark) {
- :root {
- --bg: #0a0a0a;
- --surface: #1a1a1a;
- --text: #e5e5e5;
- --text-muted: #a3a3a3;
- --border: #333;
- }
- }
- body {
- font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
- background: var(--bg);
- color: var(--text);
- line-height: 1.6;
- }
- .container { max-width: 1400px; margin: 0 auto; padding: 2rem; }
- h1 { font-size: 2rem; margin-bottom: 0.5rem; }
- h2 { font-size: 1.5rem; margin: 2rem 0 1rem; }
- .summary { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
- .card {
- background: var(--surface);
- border: 1px solid var(--border);
- border-radius: 8px;
- padding: 1.5rem;
- }
- .card-title { font-size: 0.875rem; color: var(--text-muted); margin-bottom: 0.5rem; }
- .card-value { font-size: 2rem; font-weight: 700; }
- .chart-container { height: 300px; margin-bottom: 2rem; }
- table {
- width: 100%;
- border-collapse: collapse;
- background: var(--surface);
- border-radius: 8px;
- overflow: hidden;
- }
- th, td {
- text-align: left;
- padding: 1rem;
- border-bottom: 1px solid var(--border);
- }
- th {
- background: var(--surface);
- font-weight: 600;
- position: sticky;
- top: 0;
- }
- tr:hover { background: var(--bg); }
- .badge {
- display: inline-block;
- padding: 0.25rem 0.75rem;
- border-radius: 12px;
- font-size: 0.75rem;
- font-weight: 600;
- }
- .badge-success { background: var(--success); color: white; }
- .badge-error { background: var(--error); color: white; }
- .tag {
- display: inline-block;
- padding: 0.125rem 0.5rem;
- background: var(--border);
- border-radius: 4px;
- font-size: 0.75rem;
- margin-right: 0.25rem;
- }
- .search {
- width: 100%;
- padding: 0.75rem;
- margin-bottom: 1rem;
- background: var(--surface);
- border: 1px solid var(--border);
- border-radius: 8px;
- color: var(--text);
- font-size: 1rem;
- }
- .truncate {
- max-width: 300px;
- white-space: nowrap;
- overflow: hidden;
- text-overflow: ellipsis;
- }
- button {
- background: var(--surface);
- border: 1px solid var(--border);
- color: var(--text);
- padding: 0.5rem 1rem;
- border-radius: 6px;
- cursor: pointer;
- font-size: 0.875rem;
- }
- button:hover { background: var(--border); }
- .modal {
- display: none;
- position: fixed;
- top: 0;
- left: 0;
- width: 100%;
- height: 100%;
- background: rgba(0, 0, 0, 0.7);
- z-index: 1000;
- overflow: auto;
- }
- .modal-content {
- background: var(--surface);
- margin: 2rem auto;
- padding: 2rem;
- max-width: 800px;
- border-radius: 12px;
- position: relative;
- }
- .modal-close {
- position: absolute;
- top: 1rem;
- right: 1rem;
- font-size: 1.5rem;
- cursor: pointer;
- }
- pre {
- background: var(--bg);
- padding: 1rem;
- border-radius: 6px;
- overflow-x: auto;
- margin: 0.5rem 0;
- }
- code { font-family: 'Courier New', monospace; font-size: 0.875rem; }
- </style>
- </head>
- <body>
- <div class="container">
- <h1>LLM Benchmark Report</h1>
- <p style="color: var(--text-muted); margin-bottom: 2rem;">{{ summary.total }} tests executed</p>
-
- <div class="summary">
- <div class="card">
- <div class="card-title">Pass Rate</div>
- <div class="card-value" style="color: var(--success);">{{ "%.1f"|format(summary.pass_rate * 100) }}%</div>
- </div>
- <div class="card">
- <div class="card-title">Avg Latency</div>
- <div class="card-value">{{ "%.0f"|format(summary.avg_latency_ms) }}ms</div>
- </div>
- <div class="card">
- <div class="card-title">Total Cost</div>
- <div class="card-value">${{ "%.6f"|format(summary.total_cost_usd) }}</div>
- </div>
- <div class="card">
- <div class="card-title">Total Tokens</div>
- <div class="card-value">{{ "{:,}".format(summary.total_tokens) }}</div>
- </div>
- </div>
-
- <div class="card chart-container">
- <canvas id="latencyChart"></canvas>
- </div>
-
- <h2>Test Results</h2>
- <input type="text" class="search" id="searchInput" placeholder="Search tests...">
-
- <table id="resultsTable">
- <thead>
- <tr>
- <th>Status</th>
- <th>Name</th>
- <th>Model</th>
- <th>Latency</th>
- <th>Cost</th>
- <th>Tags</th>
- <th>Actions</th>
- </tr>
- </thead>
- <tbody>
- {% for result in results %}
- <tr class="result-row">
- <td>
- {% if result.passed %}
- <span class="badge badge-success">PASS</span>
- {% else %}
- <span class="badge badge-error">FAIL</span>
- {% endif %}
- </td>
- <td>{{ result.test_case.name }}</td>
- <td>{{ result.test_case.model }}</td>
- <td>{{ "%.0f"|format(result.metrics.latency_ms if result.metrics else 0) }}ms</td>
- <td>${{ "%.6f"|format(result.metrics.cost_usd if result.metrics and result.metrics.cost_usd else 0) }}</td>
- <td>
- {% for tag in result.test_case.tags %}
- <span class="tag">{{ tag }}</span>
- {% endfor %}
- </td>
- <td><button onclick="showDetails({{ loop.index0 }})">Details</button></td>
- </tr>
- {% endfor %}
- </tbody>
- </table>
- </div>
-
- <div id="detailsModal" class="modal">
- <div class="modal-content">
- <span class="modal-close" onclick="closeModal()">&times;</span>
- <div id="modalBody"></div>
- </div>
- </div>
-
- <script>
- const results = {{ results_json }};
-
- new Chart(document.getElementById('latencyChart'), {
- type: 'bar',
- data: {
- labels: results.map((r, i) => r.test_case.name),
- datasets: [{
- label: 'Latency (ms)',
- data: results.map(r => r.metrics ? r.metrics.latency_ms : 0),
- backgroundColor: results.map(r => r.passed ? '#22c55e' : '#ef4444')
- }]
- },
- options: {
- responsive: true,
- maintainAspectRatio: false,
- plugins: { legend: { display: false } }
- }
- });
-
- document.getElementById('searchInput').addEventListener('input', function(e) {
- const term = e.target.value.toLowerCase();
- document.querySelectorAll('.result-row').forEach(row => {
- const text = row.textContent.toLowerCase();
- row.style.display = text.includes(term) ? '' : 'none';
- });
- });
-
- function showDetails(index) {
- const result = results[index];
- const html = `
- <h2>${result.test_case.name}</h2>
- <p><strong>Status:</strong> <span class="badge ${result.passed ? 'badge-success' : 'badge-error'}">${result.passed ? 'PASS' : 'FAIL'}</span></p>
- <p><strong>Model:</strong> ${result.test_case.model}</p>
- <p><strong>Validator:</strong> ${result.test_case.validator}</p>
- <h3>Prompt</h3>
- <pre><code>${escapeHtml(result.test_case.prompt)}</code></pre>
- ${result.test_case.system_prompt ? `<h3>System Prompt</h3><pre><code>${escapeHtml(result.test_case.system_prompt)}</code></pre>` : ''}
- <h3>Response</h3>
- <pre><code>${escapeHtml(result.response || 'No response')}</code></pre>
- ${result.execution_error ? `<h3>Error</h3><pre style="color: var(--error);"><code>${escapeHtml(result.execution_error)}</code></pre>` : ''}
- ${result.metrics ? `
- <h3>Metrics</h3>
- <ul>
- <li>Latency: ${result.metrics.latency_ms.toFixed(0)}ms</li>
- <li>Tokens: ${result.metrics.total_tokens || 'N/A'}</li>
- <li>Cost: $${(result.metrics.cost_usd || 0).toFixed(6)}</li>
- <li>Attempts: ${result.metrics.total_attempts}</li>
- </ul>
- ` : ''}
- `;
- document.getElementById('modalBody').innerHTML = html;
- document.getElementById('detailsModal').style.display = 'block';
- }
-
- function closeModal() {
- document.getElementById('detailsModal').style.display = 'none';
- }
-
- function escapeHtml(text) {
- const div = document.createElement('div');
- div.textContent = text;
- return div.innerHTML;
- }
-
- window.onclick = function(event) {
- const modal = document.getElementById('detailsModal');
- if (event.target === modal) closeModal();
- }
- </script>
- </body>
- </html>
@@ -1,17 +0,0 @@
- import yaml
-
- from promptum.benchmark.report import Report
- from promptum.serialization.base import BaseSerializer
-
-
- class YAMLSerializer(BaseSerializer):
- def serialize(self, report: Report) -> str:
- data = {
- "metadata": report.metadata,
- "summary": report.get_summary(),
- "results": [self._serialize_result(r) for r in report.results],
- }
- return yaml.dump(data, default_flow_style=False, sort_keys=False)
-
- def get_file_extension(self) -> str:
- return "yaml"
@@ -1,7 +0,0 @@
- from promptum.storage.file import FileStorage
- from promptum.storage.protocol import ResultStorage
-
- __all__ = [
- "ResultStorage",
- "FileStorage",
- ]
promptum/storage/file.py DELETED
@@ -1,157 +0,0 @@
- import json
- import tempfile
- from datetime import datetime
- from pathlib import Path
- from typing import Any
-
- from promptum.benchmark.report import Report
- from promptum.core.metrics import Metrics
- from promptum.core.result import TestResult
- from promptum.core.test_case import TestCase
- from promptum.validation.validators import PlaceholderValidator
-
-
- class FileStorage:
- def __init__(self, base_dir: str = "results"):
- self.base_dir = Path(base_dir)
- self.reports_dir = self.base_dir / "reports"
- self.metadata_file = self.base_dir / "metadata.json"
-
- self.reports_dir.mkdir(parents=True, exist_ok=True)
-
- def save(self, report: Report, name: str) -> str:
- timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
- identifier = f"{timestamp}_{name}"
- filename = f"{identifier}.json"
- filepath = self.reports_dir / filename
-
- data = self._serialize_report(report)
-
- with tempfile.NamedTemporaryFile(
- mode="w", delete=False, dir=self.reports_dir, suffix=".tmp"
- ) as tmp:
- json.dump(data, tmp, indent=2)
- tmp_path = Path(tmp.name)
-
- tmp_path.replace(filepath)
-
- self._update_metadata(identifier, name, str(filepath))
-
- return identifier
-
- def load(self, identifier: str) -> Report:
- filepath = self.reports_dir / f"{identifier}.json"
-
- if not filepath.exists():
- raise FileNotFoundError(f"Report not found: {identifier}")
-
- with open(filepath) as f:
- data = json.load(f)
-
- return self._deserialize_report(data)
-
- def list_reports(self) -> list[dict[str, Any]]:
- if not self.metadata_file.exists():
- return []
-
- with open(self.metadata_file) as f:
- return json.load(f)
-
- def _update_metadata(self, identifier: str, name: str, path: str) -> None:
- metadata = self.list_reports()
-
- metadata.append(
- {
- "id": identifier,
- "name": name,
- "path": path,
- "timestamp": datetime.now().isoformat(),
- }
- )
-
- with tempfile.NamedTemporaryFile(
- mode="w", delete=False, dir=self.base_dir, suffix=".tmp"
- ) as tmp:
- json.dump(metadata, tmp, indent=2)
- tmp_path = Path(tmp.name)
-
- tmp_path.replace(self.metadata_file)
-
- @staticmethod
- def _serialize_report(report: Report) -> dict[str, Any]:
- return {
- "metadata": report.metadata,
- "results": [
- {
- "test_case": {
- "name": r.test_case.name,
- "prompt": r.test_case.prompt,
- "model": r.test_case.model,
- "tags": list(r.test_case.tags),
- "system_prompt": r.test_case.system_prompt,
- "temperature": r.test_case.temperature,
- "max_tokens": r.test_case.max_tokens,
- "metadata": r.test_case.metadata,
- "validator_description": r.test_case.validator.describe(),
- },
- "response": r.response,
- "passed": r.passed,
- "metrics": {
- "latency_ms": r.metrics.latency_ms,
- "prompt_tokens": r.metrics.prompt_tokens,
- "completion_tokens": r.metrics.completion_tokens,
- "total_tokens": r.metrics.total_tokens,
- "cost_usd": r.metrics.cost_usd,
- "retry_delays": list(r.metrics.retry_delays),
- }
- if r.metrics
- else None,
- "validation_details": r.validation_details,
- "execution_error": r.execution_error,
- "timestamp": r.timestamp.isoformat(),
- }
- for r in report.results
- ],
- }
-
- @staticmethod
- def _deserialize_report(data: dict[str, Any]) -> Report:
- results = []
- for r in data["results"]:
- test_case = TestCase(
- name=r["test_case"]["name"],
- prompt=r["test_case"]["prompt"],
- model=r["test_case"]["model"],
- validator=PlaceholderValidator(
- description=r["test_case"]["validator_description"],
- ),
- tags=tuple(r["test_case"]["tags"]),
- system_prompt=r["test_case"]["system_prompt"],
- temperature=r["test_case"]["temperature"],
- max_tokens=r["test_case"]["max_tokens"],
- metadata=r["test_case"]["metadata"],
- )
-
- metrics = None
- if r["metrics"]:
- metrics = Metrics(
- latency_ms=r["metrics"]["latency_ms"],
- prompt_tokens=r["metrics"]["prompt_tokens"],
- completion_tokens=r["metrics"]["completion_tokens"],
- total_tokens=r["metrics"]["total_tokens"],
- cost_usd=r["metrics"]["cost_usd"],
- retry_delays=tuple(r["metrics"]["retry_delays"]),
- )
-
- result = TestResult(
- test_case=test_case,
- response=r["response"],
- passed=r["passed"],
- metrics=metrics,
- validation_details=r["validation_details"],
- execution_error=r["execution_error"],
- timestamp=datetime.fromisoformat(r["timestamp"]),
- )
- results.append(result)
-
- return Report(results=results, metadata=data["metadata"])
@@ -1,23 +0,0 @@
- from typing import Any, Protocol
-
- from promptum.benchmark.report import Report
-
-
- class ResultStorage(Protocol):
- def save(self, report: Report, name: str) -> str:
- """
- Saves a report and returns its identifier.
- """
- ...
-
- def load(self, identifier: str) -> Report:
- """
- Loads a report by its identifier.
- """
- ...
-
- def list_reports(self) -> list[dict[str, Any]]:
- """
- Returns metadata for all stored reports.
- """
- ...
@@ -1,32 +0,0 @@
- promptum/__init__.py,sha256=AjeGgmIbpp9Uv-0ybq6knejEJMK-Dnn_-fV9Z86Bp74,932
- promptum/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- promptum/benchmark/__init__.py,sha256=NJYiXm6wVFKMloxKNAXMY4H3bMQORTtLh6__nYWYWa0,131
- promptum/benchmark/benchmark.py,sha256=3enQSACdLwHW78fqSZj0Un3r7_Ua3V-MjfbEIIKFSWs,1589
- promptum/benchmark/report.py,sha256=ol_UO8rw43zbQxhs2o4AwYN5TP7O_Apa77V-pZKq6Uw,2754
- promptum/core/__init__.py,sha256=mqajsOdUBNJfcR2krxpwa7rM_wd88vJaAov-9SnVm68,294
- promptum/core/metrics.py,sha256=FnS10nHFjQ5Clj5X21C_nW6zAUJU_ZHt0s2fLgp6L28,427
- promptum/core/result.py,sha256=nyuVMQFY6DmZwzpgqDPsj0FaAuairpKLJ-0be5WQtTg,472
- promptum/core/retry.py,sha256=mA_RRz9_9J_mge_AUd9f1A-gACOxZLGTI8vTIstAr8s,538
- promptum/core/test_case.py,sha256=YNlVNj7FkoCyBFb2N0Dzrhce6o3DzUtke4PR6WoXhZo,593
- promptum/execution/__init__.py,sha256=fUZa7Bo7yn921sl49cS6TCGsG-lOUNVdhdeRsIa5vCc,67
- promptum/execution/runner.py,sha256=sP3uDu2VDLxFi9BkltMHwsyMuCXnz4oP1kVN28KpVZ0,2434
- promptum/providers/__init__.py,sha256=OW-CK198wOV7_bz_keOaxxQeRlFPZgINQcVJUZq_uus,169
- promptum/providers/openrouter.py,sha256=owquGxHaTB-pZ8jr06l4HouETuFj1lEg92oGX2mM5uo,4601
- promptum/providers/protocol.py,sha256=vdTGAGKN3FzThHLwyMMWicU87_LpW-gn0cM3vMcWiEY,488
- promptum/serialization/__init__.py,sha256=0dlpgF3dngaw_oR4mg7nuc4Z_VFVl2bATmhe2mHA9T4,319
- promptum/serialization/base.py,sha256=JnB4zb7D4oy44k6ndbJu3Xw1PVLpY_9-Y7k3Et2p43g,1851
- promptum/serialization/html.py,sha256=kJEd2s6fVfFHH7snJWrD5RGaUW66x3vtMKGMJ_ekmcI,1901
- promptum/serialization/json.py,sha256=koqgr5_WHmrpWUOCq6rWXoC07um3mkDDaob2k9vkEK8,870
- promptum/serialization/protocol.py,sha256=MZeMYt_HZJIYSyrRd_ZYbEJXDiXLMuJ5tosAeHLxpTM,353
- promptum/serialization/report_template.html,sha256=RC8qSLzolqWkWBIGfyhPtPkRWM7_0JkauEWPkaKiB9A,10802
- promptum/serialization/yaml.py,sha256=50A612OkX2L3EjhxTZJMZQb5zL8-2PmwcBjjNUhCWsA,528
- promptum/storage/__init__.py,sha256=QWOP5Al43WmmQ_kFCM9JGi8amXJzO_pR-x5AKDNy4ds,153
- promptum/storage/file.py,sha256=gnNBpNBQ_NeAWn7P2itsw2L99AxS7zOd8Nef6PyYxlk,5750
- promptum/storage/protocol.py,sha256=_NpkJzOQB_98Ud_TA_ZYubHf3o2DDXGMveRN3kRyYKI,517
- promptum/validation/__init__.py,sha256=mhykyxaIwn2PJh2RXAi0fi2NRIveFmlC5bg1nyCbfVU,252
- promptum/validation/protocol.py,sha256=xqxm23YX6eNeZHKMLMZ-Wz8iQKn4ZRzAI5Xryxg0uq4,418
- promptum/validation/validators.py,sha256=3lJwSMhhWb9x8BK_-S0FJBj7PFgno79II_i3Z1mCKTs,3217
- promptum-0.0.1.dist-info/METADATA,sha256=vt_PN0Ns0JuJalM7p8hJZsz-Y2hwQrbHZ4Jacy7P6L8,8083
- promptum-0.0.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- promptum-0.0.1.dist-info/licenses/LICENSE,sha256=Fgn285H5Vy9diOlqO1TzS3hD97WcdF6-GFHvUcFNtmg,1067
- promptum-0.0.1.dist-info/RECORD,,
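For orientation, here is a minimal sketch of how the reorganized 0.0.2 surface fits together, inferred from the imports and signatures visible in this diff. The OpenRouterClient and Benchmark constructor arguments, the API key value, the model id, and the Contains validator signature are illustrative assumptions, not details confirmed by the diff.

```python
# Minimal sketch of the 0.0.2 API as suggested by this diff (not an official example).
import asyncio

from promptum import Benchmark, OpenRouterClient, TestCase
from promptum.validation import Contains  # the validation package is unchanged in this release


async def main() -> None:
    # Constructor arguments are assumed for illustration; the diff does not show them.
    provider = OpenRouterClient(api_key="sk-or-...")
    benchmark = Benchmark(provider=provider)

    benchmark.add_tests([
        TestCase(
            name="greeting",
            prompt="Say hello",
            model="openai/gpt-4o-mini",   # placeholder model id
            validator=Contains("hello"),  # assumed validator signature
        ),
    ])

    # In 0.0.2, run()/run_async() no longer take a metadata argument
    # and Report no longer carries a metadata field.
    report = await benchmark.run_async()
    print(report.get_summary())
    for model, summary in report.compare_models().items():
        print(model, summary)


if __name__ == "__main__":
    asyncio.run(main())
```

The sketch reflects the main API consequence of this release: serialization and storage helpers were dropped from the wheel, so results are consumed directly from the in-memory `Report` (`get_summary()`, `filter()`, `group_by()`, `compare_models()`) rather than exported via `HTMLSerializer`, `JSONSerializer`, `YAMLSerializer`, or `FileStorage`.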