promptum 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,293 @@
1
<!DOCTYPE html>
<!-- Jinja2 template: rendered with `summary`, `results`, and `results_json` in context. -->
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>LLM Benchmark Report</title>
    <!-- Chart.js (pinned 4.4.0) is loaded from a CDN and used by the script at the bottom of the page. -->
    <script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.0/dist/chart.umd.js"></script>
    <style>
        * { margin: 0; padding: 0; box-sizing: border-box; }
        /* Light-theme palette; the prefers-color-scheme block below overrides it for dark mode. */
        :root {
            --bg: #ffffff;
            --surface: #f5f5f5;
            --text: #1a1a1a;
            --text-muted: #666;
            --border: #ddd;
            --success: #22c55e;
            --error: #ef4444;
            --warning: #f59e0b;
        }
        @media (prefers-color-scheme: dark) {
            :root {
                --bg: #0a0a0a;
                --surface: #1a1a1a;
                --text: #e5e5e5;
                --text-muted: #a3a3a3;
                --border: #333;
            }
        }
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
            background: var(--bg);
            color: var(--text);
            line-height: 1.6;
        }
        .container { max-width: 1400px; margin: 0 auto; padding: 2rem; }
        h1 { font-size: 2rem; margin-bottom: 0.5rem; }
        h2 { font-size: 1.5rem; margin: 2rem 0 1rem; }
        /* Summary metric cards: responsive grid, one card per headline number. */
        .summary { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
        .card {
            background: var(--surface);
            border: 1px solid var(--border);
            border-radius: 8px;
            padding: 1.5rem;
        }
        .card-title { font-size: 0.875rem; color: var(--text-muted); margin-bottom: 0.5rem; }
        .card-value { font-size: 2rem; font-weight: 700; }
        .chart-container { height: 300px; margin-bottom: 2rem; }
        table {
            width: 100%;
            border-collapse: collapse;
            background: var(--surface);
            border-radius: 8px;
            overflow: hidden;
        }
        th, td {
            text-align: left;
            padding: 1rem;
            border-bottom: 1px solid var(--border);
        }
        th {
            background: var(--surface);
            font-weight: 600;
            position: sticky; /* keep column headers visible while scrolling results */
            top: 0;
        }
        tr:hover { background: var(--bg); }
        /* PASS/FAIL pills used in the Status column and the details modal. */
        .badge {
            display: inline-block;
            padding: 0.25rem 0.75rem;
            border-radius: 12px;
            font-size: 0.75rem;
            font-weight: 600;
        }
        .badge-success { background: var(--success); color: white; }
        .badge-error { background: var(--error); color: white; }
        .tag {
            display: inline-block;
            padding: 0.125rem 0.5rem;
            background: var(--border);
            border-radius: 4px;
            font-size: 0.75rem;
            margin-right: 0.25rem;
        }
        .search {
            width: 100%;
            padding: 0.75rem;
            margin-bottom: 1rem;
            background: var(--surface);
            border: 1px solid var(--border);
            border-radius: 8px;
            color: var(--text);
            font-size: 1rem;
        }
        /* NOTE(review): .truncate is defined but no element in this template uses it — confirm before removing. */
        .truncate {
            max-width: 300px;
            white-space: nowrap;
            overflow: hidden;
            text-overflow: ellipsis;
        }
        button {
            background: var(--surface);
            border: 1px solid var(--border);
            color: var(--text);
            padding: 0.5rem 1rem;
            border-radius: 6px;
            cursor: pointer;
            font-size: 0.875rem;
        }
        button:hover { background: var(--border); }
        /* Full-screen dimmed overlay for the per-test details dialog (hidden by default). */
        .modal {
            display: none;
            position: fixed;
            top: 0;
            left: 0;
            width: 100%;
            height: 100%;
            background: rgba(0, 0, 0, 0.7);
            z-index: 1000;
            overflow: auto;
        }
        .modal-content {
            background: var(--surface);
            margin: 2rem auto;
            padding: 2rem;
            max-width: 800px;
            border-radius: 12px;
            position: relative;
        }
        .modal-close {
            position: absolute;
            top: 1rem;
            right: 1rem;
            font-size: 1.5rem;
            cursor: pointer;
        }
        pre {
            background: var(--bg);
            padding: 1rem;
            border-radius: 6px;
            overflow-x: auto;
            margin: 0.5rem 0;
        }
        code { font-family: 'Courier New', monospace; font-size: 0.875rem; }
    </style>
</head>
146
<body>
    <div class="container">
        <h1>LLM Benchmark Report</h1>
        <p style="color: var(--text-muted); margin-bottom: 2rem;">{{ summary.total }} tests executed</p>

        <!-- Headline metrics rendered server-side from the `summary` mapping. -->
        <div class="summary">
            <div class="card">
                <div class="card-title">Pass Rate</div>
                <div class="card-value" style="color: var(--success);">{{ "%.1f"|format(summary.pass_rate * 100) }}%</div>
            </div>
            <div class="card">
                <div class="card-title">Avg Latency</div>
                <div class="card-value">{{ "%.0f"|format(summary.avg_latency_ms) }}ms</div>
            </div>
            <div class="card">
                <div class="card-title">Total Cost</div>
                <div class="card-value">${{ "%.6f"|format(summary.total_cost_usd) }}</div>
            </div>
            <div class="card">
                <div class="card-title">Total Tokens</div>
                <div class="card-value">{{ "{:,}".format(summary.total_tokens) }}</div>
            </div>
        </div>

        <!-- Canvas populated by the Chart.js setup in the script at the bottom of the page. -->
        <div class="card chart-container">
            <canvas id="latencyChart"></canvas>
        </div>

        <h2>Test Results</h2>
        <!-- Client-side substring filter; the input listener is wired up in the script below. -->
        <input type="text" class="search" id="searchInput" placeholder="Search tests...">

        <table id="resultsTable">
            <thead>
                <tr>
                    <th>Status</th>
                    <th>Name</th>
                    <th>Model</th>
                    <th>Latency</th>
                    <th>Cost</th>
                    <th>Tags</th>
                    <th>Actions</th>
                </tr>
            </thead>
            <tbody>
                {% for result in results %}
                <tr class="result-row">
                    <td>
                        {% if result.passed %}
                        <span class="badge badge-success">PASS</span>
                        {% else %}
                        <span class="badge badge-error">FAIL</span>
                        {% endif %}
                    </td>
                    <td>{{ result.test_case.name }}</td>
                    <td>{{ result.test_case.model }}</td>
                    <!-- Metrics may be absent (e.g. execution errors); fall back to 0 for display. -->
                    <td>{{ "%.0f"|format(result.metrics.latency_ms if result.metrics else 0) }}ms</td>
                    <td>${{ "%.6f"|format(result.metrics.cost_usd if result.metrics and result.metrics.cost_usd else 0) }}</td>
                    <td>
                        {% for tag in result.test_case.tags %}
                        <span class="tag">{{ tag }}</span>
                        {% endfor %}
                    </td>
                    <!-- loop.index0 must stay aligned with the results_json array consumed by showDetails(). -->
                    <td><button onclick="showDetails({{ loop.index0 }})">Details</button></td>
                </tr>
                {% endfor %}
            </tbody>
        </table>
    </div>

    <!-- Details dialog: hidden by default, opened by showDetails() and closed by closeModal(). -->
    <div id="detailsModal" class="modal">
        <div class="modal-content">
            <span class="modal-close" onclick="closeModal()">&times;</span>
            <div id="modalBody"></div>
        </div>
    </div>
221
+
222
+ <script>
223
+ const results = {{ results_json }};
224
+
225
+ new Chart(document.getElementById('latencyChart'), {
226
+ type: 'bar',
227
+ data: {
228
+ labels: results.map((r, i) => r.test_case.name),
229
+ datasets: [{
230
+ label: 'Latency (ms)',
231
+ data: results.map(r => r.metrics ? r.metrics.latency_ms : 0),
232
+ backgroundColor: results.map(r => r.passed ? '#22c55e' : '#ef4444')
233
+ }]
234
+ },
235
+ options: {
236
+ responsive: true,
237
+ maintainAspectRatio: false,
238
+ plugins: { legend: { display: false } }
239
+ }
240
+ });
241
+
242
+ document.getElementById('searchInput').addEventListener('input', function(e) {
243
+ const term = e.target.value.toLowerCase();
244
+ document.querySelectorAll('.result-row').forEach(row => {
245
+ const text = row.textContent.toLowerCase();
246
+ row.style.display = text.includes(term) ? '' : 'none';
247
+ });
248
+ });
249
+
250
+ function showDetails(index) {
251
+ const result = results[index];
252
+ const html = `
253
+ <h2>${result.test_case.name}</h2>
254
+ <p><strong>Status:</strong> <span class="badge ${result.passed ? 'badge-success' : 'badge-error'}">${result.passed ? 'PASS' : 'FAIL'}</span></p>
255
+ <p><strong>Model:</strong> ${result.test_case.model}</p>
256
+ <p><strong>Validator:</strong> ${result.test_case.validator}</p>
257
+ <h3>Prompt</h3>
258
+ <pre><code>${escapeHtml(result.test_case.prompt)}</code></pre>
259
+ ${result.test_case.system_prompt ? `<h3>System Prompt</h3><pre><code>${escapeHtml(result.test_case.system_prompt)}</code></pre>` : ''}
260
+ <h3>Response</h3>
261
+ <pre><code>${escapeHtml(result.response || 'No response')}</code></pre>
262
+ ${result.execution_error ? `<h3>Error</h3><pre style="color: var(--error);"><code>${escapeHtml(result.execution_error)}</code></pre>` : ''}
263
+ ${result.metrics ? `
264
+ <h3>Metrics</h3>
265
+ <ul>
266
+ <li>Latency: ${result.metrics.latency_ms.toFixed(0)}ms</li>
267
+ <li>Tokens: ${result.metrics.total_tokens || 'N/A'}</li>
268
+ <li>Cost: $${(result.metrics.cost_usd || 0).toFixed(6)}</li>
269
+ <li>Attempts: ${result.metrics.total_attempts}</li>
270
+ </ul>
271
+ ` : ''}
272
+ `;
273
+ document.getElementById('modalBody').innerHTML = html;
274
+ document.getElementById('detailsModal').style.display = 'block';
275
+ }
276
+
277
+ function closeModal() {
278
+ document.getElementById('detailsModal').style.display = 'none';
279
+ }
280
+
281
+ function escapeHtml(text) {
282
+ const div = document.createElement('div');
283
+ div.textContent = text;
284
+ return div.innerHTML;
285
+ }
286
+
287
+ window.onclick = function(event) {
288
+ const modal = document.getElementById('detailsModal');
289
+ if (event.target === modal) closeModal();
290
+ }
291
+ </script>
292
+ </body>
293
+ </html>
@@ -0,0 +1,17 @@
1
+ import yaml
2
+
3
+ from promptum.benchmark.report import Report
4
+ from promptum.serialization.base import BaseSerializer
5
+
6
+
7
class YAMLSerializer(BaseSerializer):
    """Serializes benchmark reports into YAML documents."""

    def serialize(self, report: Report) -> str:
        """Render *report* as a block-style YAML string, preserving key order."""
        payload = {
            "metadata": report.metadata,
            "summary": report.get_summary(),
            "results": [self._serialize_result(result) for result in report.results],
        }
        return yaml.dump(payload, default_flow_style=False, sort_keys=False)

    def get_file_extension(self) -> str:
        """File extension (without the leading dot) for YAML output."""
        return "yaml"
@@ -0,0 +1,7 @@
1
+ from promptum.storage.file import FileStorage
2
+ from promptum.storage.protocol import ResultStorage
3
+
4
+ __all__ = [
5
+ "ResultStorage",
6
+ "FileStorage",
7
+ ]
@@ -0,0 +1,157 @@
1
+ import json
2
+ import tempfile
3
+ from datetime import datetime
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ from promptum.benchmark.report import Report
8
+ from promptum.core.metrics import Metrics
9
+ from promptum.core.result import TestResult
10
+ from promptum.core.test_case import TestCase
11
+ from promptum.validation.validators import PlaceholderValidator
12
+
13
+
14
class FileStorage:
    """Stores benchmark reports as JSON files under a base directory.

    Layout:
        <base_dir>/reports/<timestamp>_<name>.json   -- one file per report
        <base_dir>/metadata.json                     -- index of saved reports

    All writes are atomic: data is first written to a temporary file in the
    destination directory, then moved into place with ``Path.replace`` (a
    same-filesystem rename), so readers never observe a half-written file.
    """

    def __init__(self, base_dir: str = "results"):
        self.base_dir = Path(base_dir)
        self.reports_dir = self.base_dir / "reports"
        self.metadata_file = self.base_dir / "metadata.json"

        # Creating reports_dir also creates base_dir, so the metadata writes
        # below never fail on a missing parent directory.
        self.reports_dir.mkdir(parents=True, exist_ok=True)

    def save(self, report: Report, name: str) -> str:
        """Persist *report* under *name* and return its unique identifier.

        The identifier is ``<timestamp>_<name>``; *name* is used verbatim in
        the filename, so callers should pass filesystem-safe names.
        """
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        identifier = f"{timestamp}_{name}"
        filepath = self.reports_dir / f"{identifier}.json"

        self._atomic_write_json(self._serialize_report(report), filepath)
        self._update_metadata(identifier, name, str(filepath))

        return identifier

    def load(self, identifier: str) -> Report:
        """Load a previously saved report.

        Raises:
            FileNotFoundError: if no report with *identifier* exists.
        """
        filepath = self.reports_dir / f"{identifier}.json"

        if not filepath.exists():
            raise FileNotFoundError(f"Report not found: {identifier}")

        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)

        return self._deserialize_report(data)

    def list_reports(self) -> list[dict[str, Any]]:
        """Return index entries (id/name/path/timestamp) for all saved reports."""
        if not self.metadata_file.exists():
            return []

        with open(self.metadata_file, encoding="utf-8") as f:
            return json.load(f)

    def _update_metadata(self, identifier: str, name: str, path: str) -> None:
        """Append an index entry for a newly saved report (atomic rewrite)."""
        metadata = self.list_reports()

        metadata.append(
            {
                "id": identifier,
                "name": name,
                "path": path,
                "timestamp": datetime.now().isoformat(),
            }
        )

        self._atomic_write_json(metadata, self.metadata_file)

    @staticmethod
    def _atomic_write_json(data: Any, target: Path) -> None:
        """Write *data* as indented JSON to *target* atomically.

        The temporary file is created in the target's own directory so the
        final ``replace`` is a same-filesystem rename. If serialization
        fails, the temporary file is removed instead of being leaked.
        """
        with tempfile.NamedTemporaryFile(
            mode="w", delete=False, dir=target.parent, suffix=".tmp", encoding="utf-8"
        ) as tmp:
            tmp_path = Path(tmp.name)
            try:
                json.dump(data, tmp, indent=2)
            except Exception:
                tmp.close()
                tmp_path.unlink(missing_ok=True)
                raise

        tmp_path.replace(target)

    @staticmethod
    def _serialize_report(report: Report) -> dict[str, Any]:
        """Convert *report* into a JSON-safe dict (tuples become lists)."""
        return {
            "metadata": report.metadata,
            "results": [
                {
                    "test_case": {
                        "name": r.test_case.name,
                        "prompt": r.test_case.prompt,
                        "model": r.test_case.model,
                        "tags": list(r.test_case.tags),
                        "system_prompt": r.test_case.system_prompt,
                        "temperature": r.test_case.temperature,
                        "max_tokens": r.test_case.max_tokens,
                        "metadata": r.test_case.metadata,
                        # Validators are not serializable; only their
                        # human-readable description survives a round trip.
                        "validator_description": r.test_case.validator.describe(),
                    },
                    "response": r.response,
                    "passed": r.passed,
                    "metrics": {
                        "latency_ms": r.metrics.latency_ms,
                        "prompt_tokens": r.metrics.prompt_tokens,
                        "completion_tokens": r.metrics.completion_tokens,
                        "total_tokens": r.metrics.total_tokens,
                        "cost_usd": r.metrics.cost_usd,
                        "retry_delays": list(r.metrics.retry_delays),
                    }
                    if r.metrics
                    else None,
                    "validation_details": r.validation_details,
                    "execution_error": r.execution_error,
                    "timestamp": r.timestamp.isoformat(),
                }
                for r in report.results
            ],
        }

    @staticmethod
    def _deserialize_report(data: dict[str, Any]) -> Report:
        """Rebuild a Report from `_serialize_report` output.

        The original validator cannot be reconstructed, so each test case
        gets a PlaceholderValidator carrying only the saved description.
        """
        results = []
        for r in data["results"]:
            test_case = TestCase(
                name=r["test_case"]["name"],
                prompt=r["test_case"]["prompt"],
                model=r["test_case"]["model"],
                validator=PlaceholderValidator(
                    description=r["test_case"]["validator_description"],
                ),
                tags=tuple(r["test_case"]["tags"]),
                system_prompt=r["test_case"]["system_prompt"],
                temperature=r["test_case"]["temperature"],
                max_tokens=r["test_case"]["max_tokens"],
                metadata=r["test_case"]["metadata"],
            )

            metrics = None
            if r["metrics"]:
                metrics = Metrics(
                    latency_ms=r["metrics"]["latency_ms"],
                    prompt_tokens=r["metrics"]["prompt_tokens"],
                    completion_tokens=r["metrics"]["completion_tokens"],
                    total_tokens=r["metrics"]["total_tokens"],
                    cost_usd=r["metrics"]["cost_usd"],
                    retry_delays=tuple(r["metrics"]["retry_delays"]),
                )

            result = TestResult(
                test_case=test_case,
                response=r["response"],
                passed=r["passed"],
                metrics=metrics,
                validation_details=r["validation_details"],
                execution_error=r["execution_error"],
                timestamp=datetime.fromisoformat(r["timestamp"]),
            )
            results.append(result)

        return Report(results=results, metadata=data["metadata"])
@@ -0,0 +1,23 @@
1
+ from typing import Any, Protocol
2
+
3
+ from promptum.benchmark.report import Report
4
+
5
+
6
class ResultStorage(Protocol):
    """Structural interface for report storage backends."""

    def save(self, report: Report, name: str) -> str:
        """Persist *report* under *name* and return its identifier."""
        ...

    def load(self, identifier: str) -> Report:
        """Retrieve the report previously saved under *identifier*."""
        ...

    def list_reports(self) -> list[dict[str, Any]]:
        """Return index metadata for every stored report."""
        ...
@@ -0,0 +1,15 @@
1
+ from promptum.validation.protocol import Validator
2
+ from promptum.validation.validators import (
3
+ Contains,
4
+ ExactMatch,
5
+ JsonSchema,
6
+ Regex,
7
+ )
8
+
9
+ __all__ = [
10
+ "Validator",
11
+ "ExactMatch",
12
+ "Contains",
13
+ "Regex",
14
+ "JsonSchema",
15
+ ]
@@ -0,0 +1,16 @@
1
+ from typing import Any, Protocol
2
+
3
+
4
class Validator(Protocol):
    """Structural interface for response validators."""

    def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
        """Check *response* against this validator's criteria.

        Returns:
            A ``(passed, details)`` pair; ``details`` carries diagnostic
            information about the check.
        """
        ...

    def describe(self) -> str:
        """Return a human-readable summary of the validation criteria."""
        ...
@@ -0,0 +1,108 @@
1
+ import json
2
+ import re
3
+ from dataclasses import dataclass
4
+ from typing import Any
5
+
6
+
7
+ @dataclass(frozen=True, slots=True)
8
+ class ExactMatch:
9
+ expected: str
10
+ case_sensitive: bool = True
11
+
12
+ def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
13
+ if self.case_sensitive:
14
+ passed = response == self.expected
15
+ else:
16
+ passed = response.lower() == self.expected.lower()
17
+
18
+ return passed, {
19
+ "expected": self.expected,
20
+ "actual": response,
21
+ "case_sensitive": self.case_sensitive,
22
+ }
23
+
24
+ def describe(self) -> str:
25
+ mode = "case-sensitive" if self.case_sensitive else "case-insensitive"
26
+ return f"Exact match ({mode}): {self.expected!r}"
27
+
28
+
29
+ @dataclass(frozen=True, slots=True)
30
+ class Contains:
31
+ substring: str
32
+ case_sensitive: bool = True
33
+
34
+ def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
35
+ if self.case_sensitive:
36
+ passed = self.substring in response
37
+ else:
38
+ passed = self.substring.lower() in response.lower()
39
+
40
+ return passed, {
41
+ "substring": self.substring,
42
+ "case_sensitive": self.case_sensitive,
43
+ }
44
+
45
+ def describe(self) -> str:
46
+ mode = "case-sensitive" if self.case_sensitive else "case-insensitive"
47
+ return f"Contains ({mode}): {self.substring!r}"
48
+
49
+
50
+ @dataclass(frozen=True, slots=True)
51
+ class Regex:
52
+ pattern: str
53
+ flags: int = 0
54
+
55
+ def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
56
+ match = re.search(self.pattern, response, self.flags)
57
+ return match is not None, {
58
+ "pattern": self.pattern,
59
+ "matched": match.group(0) if match else None,
60
+ }
61
+
62
+ def describe(self) -> str:
63
+ return f"Regex: {self.pattern!r}"
64
+
65
+
66
+ @dataclass(frozen=True, slots=True)
67
+ class JsonSchema:
68
+ required_keys: tuple[str, ...] = ()
69
+
70
+ def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
71
+ try:
72
+ data = json.loads(response)
73
+ if not isinstance(data, dict):
74
+ return False, {"error": "Response is not a JSON object"}
75
+
76
+ missing_keys = [key for key in self.required_keys if key not in data]
77
+ passed = len(missing_keys) == 0
78
+
79
+ return passed, {
80
+ "parsed": data,
81
+ "missing_keys": missing_keys,
82
+ }
83
+ except json.JSONDecodeError as e:
84
+ return False, {"error": f"Invalid JSON: {e}"}
85
+
86
+ def describe(self) -> str:
87
+ if self.required_keys:
88
+ keys = ", ".join(self.required_keys)
89
+ return f"Valid JSON with keys: {keys}"
90
+ return "Valid JSON object"
91
+
92
+
93
+ @dataclass(frozen=True, slots=True)
94
+ class PlaceholderValidator:
95
+ """
96
+ Placeholder validator for deserialized reports.
97
+
98
+ Used when original validator cannot be reconstructed from storage.
99
+ Always returns True. Original validator logic is not preserved.
100
+ """
101
+
102
+ description: str
103
+
104
+ def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
105
+ return True, {"placeholder": True, "note": "Original validator could not be reconstructed"}
106
+
107
+ def describe(self) -> str:
108
+ return self.description