promptum 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promptum/__init__.py +44 -0
- promptum/benchmark/__init__.py +4 -0
- promptum/benchmark/benchmark.py +50 -0
- promptum/benchmark/report.py +75 -0
- promptum/core/__init__.py +12 -0
- promptum/core/metrics.py +16 -0
- promptum/core/result.py +17 -0
- promptum/core/retry.py +19 -0
- promptum/core/test_case.py +22 -0
- promptum/execution/__init__.py +3 -0
- promptum/execution/runner.py +75 -0
- promptum/providers/__init__.py +7 -0
- promptum/providers/openrouter.py +123 -0
- promptum/providers/protocol.py +22 -0
- promptum/py.typed +0 -0
- promptum/serialization/__init__.py +11 -0
- promptum/serialization/base.py +48 -0
- promptum/serialization/html.py +52 -0
- promptum/serialization/json.py +28 -0
- promptum/serialization/protocol.py +13 -0
- promptum/serialization/report_template.html +293 -0
- promptum/serialization/yaml.py +17 -0
- promptum/storage/__init__.py +7 -0
- promptum/storage/file.py +157 -0
- promptum/storage/protocol.py +23 -0
- promptum/validation/__init__.py +15 -0
- promptum/validation/protocol.py +16 -0
- promptum/validation/validators.py +108 -0
- promptum-0.0.1.dist-info/METADATA +280 -0
- promptum-0.0.1.dist-info/RECORD +32 -0
- promptum-0.0.1.dist-info/WHEEL +4 -0
- promptum-0.0.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
{# Jinja2 template for the self-contained HTML benchmark report.
   Rendered with `summary`, `results` and `results_json` in the context. #}
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>LLM Benchmark Report</title>
    <script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.0/dist/chart.umd.js"></script>
    <style>
        * { margin: 0; padding: 0; box-sizing: border-box; }
        {# Light-theme palette; dark values override these via the media query below. #}
        :root {
            --bg: #ffffff;
            --surface: #f5f5f5;
            --text: #1a1a1a;
            --text-muted: #666;
            --border: #ddd;
            --success: #22c55e;
            --error: #ef4444;
            --warning: #f59e0b;
        }
        @media (prefers-color-scheme: dark) {
            :root {
                --bg: #0a0a0a;
                --surface: #1a1a1a;
                --text: #e5e5e5;
                --text-muted: #a3a3a3;
                --border: #333;
            }
        }
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
            background: var(--bg);
            color: var(--text);
            line-height: 1.6;
        }
        .container { max-width: 1400px; margin: 0 auto; padding: 2rem; }
        h1 { font-size: 2rem; margin-bottom: 0.5rem; }
        h2 { font-size: 1.5rem; margin: 2rem 0 1rem; }
        .summary { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
        .card {
            background: var(--surface);
            border: 1px solid var(--border);
            border-radius: 8px;
            padding: 1.5rem;
        }
        .card-title { font-size: 0.875rem; color: var(--text-muted); margin-bottom: 0.5rem; }
        .card-value { font-size: 2rem; font-weight: 700; }
        .chart-container { height: 300px; margin-bottom: 2rem; }
        table {
            width: 100%;
            border-collapse: collapse;
            background: var(--surface);
            border-radius: 8px;
            overflow: hidden;
        }
        th, td {
            text-align: left;
            padding: 1rem;
            border-bottom: 1px solid var(--border);
        }
        th {
            background: var(--surface);
            font-weight: 600;
            position: sticky;
            top: 0;
        }
        tr:hover { background: var(--bg); }
        .badge {
            display: inline-block;
            padding: 0.25rem 0.75rem;
            border-radius: 12px;
            font-size: 0.75rem;
            font-weight: 600;
        }
        .badge-success { background: var(--success); color: white; }
        .badge-error { background: var(--error); color: white; }
        .tag {
            display: inline-block;
            padding: 0.125rem 0.5rem;
            background: var(--border);
            border-radius: 4px;
            font-size: 0.75rem;
            margin-right: 0.25rem;
        }
        .search {
            width: 100%;
            padding: 0.75rem;
            margin-bottom: 1rem;
            background: var(--surface);
            border: 1px solid var(--border);
            border-radius: 8px;
            color: var(--text);
            font-size: 1rem;
        }
        .truncate {
            max-width: 300px;
            white-space: nowrap;
            overflow: hidden;
            text-overflow: ellipsis;
        }
        button {
            background: var(--surface);
            border: 1px solid var(--border);
            color: var(--text);
            padding: 0.5rem 1rem;
            border-radius: 6px;
            cursor: pointer;
            font-size: 0.875rem;
        }
        button:hover { background: var(--border); }
        .modal {
            display: none;
            position: fixed;
            top: 0;
            left: 0;
            width: 100%;
            height: 100%;
            background: rgba(0, 0, 0, 0.7);
            z-index: 1000;
            overflow: auto;
        }
        .modal-content {
            background: var(--surface);
            margin: 2rem auto;
            padding: 2rem;
            max-width: 800px;
            border-radius: 12px;
            position: relative;
        }
        .modal-close {
            position: absolute;
            top: 1rem;
            right: 1rem;
            font-size: 1.5rem;
            cursor: pointer;
        }
        pre {
            background: var(--bg);
            padding: 1rem;
            border-radius: 6px;
            overflow-x: auto;
            margin: 0.5rem 0;
        }
        code { font-family: 'Courier New', monospace; font-size: 0.875rem; }
    </style>
</head>
<body>
    <div class="container">
        <h1>LLM Benchmark Report</h1>
        <p style="color: var(--text-muted); margin-bottom: 2rem;">{{ summary.total }} tests executed</p>

        {# Headline metric cards, computed server-side from the report summary. #}
        <div class="summary">
            <div class="card">
                <div class="card-title">Pass Rate</div>
                <div class="card-value" style="color: var(--success);">{{ "%.1f"|format(summary.pass_rate * 100) }}%</div>
            </div>
            <div class="card">
                <div class="card-title">Avg Latency</div>
                <div class="card-value">{{ "%.0f"|format(summary.avg_latency_ms) }}ms</div>
            </div>
            <div class="card">
                <div class="card-title">Total Cost</div>
                <div class="card-value">${{ "%.6f"|format(summary.total_cost_usd) }}</div>
            </div>
            <div class="card">
                <div class="card-title">Total Tokens</div>
                <div class="card-value">{{ "{:,}".format(summary.total_tokens) }}</div>
            </div>
        </div>

        {# Bar chart of per-test latency, drawn client-side by Chart.js. #}
        <div class="card chart-container">
            <canvas id="latencyChart"></canvas>
        </div>

        <h2>Test Results</h2>
        <input type="text" class="search" id="searchInput" placeholder="Search tests...">

        <table id="resultsTable">
            <thead>
                <tr>
                    <th>Status</th>
                    <th>Name</th>
                    <th>Model</th>
                    <th>Latency</th>
                    <th>Cost</th>
                    <th>Tags</th>
                    <th>Actions</th>
                </tr>
            </thead>
            <tbody>
                {% for result in results %}
                <tr class="result-row">
                    <td>
                        {% if result.passed %}
                        <span class="badge badge-success">PASS</span>
                        {% else %}
                        <span class="badge badge-error">FAIL</span>
                        {% endif %}
                    </td>
                    <td>{{ result.test_case.name }}</td>
                    <td>{{ result.test_case.model }}</td>
                    <td>{{ "%.0f"|format(result.metrics.latency_ms if result.metrics else 0) }}ms</td>
                    <td>${{ "%.6f"|format(result.metrics.cost_usd if result.metrics and result.metrics.cost_usd else 0) }}</td>
                    <td>
                        {% for tag in result.test_case.tags %}
                        <span class="tag">{{ tag }}</span>
                        {% endfor %}
                    </td>
                    <td><button onclick="showDetails({{ loop.index0 }})">Details</button></td>
                </tr>
                {% endfor %}
            </tbody>
        </table>
    </div>

    {# Hidden modal; filled and shown by showDetails() below. #}
    <div id="detailsModal" class="modal">
        <div class="modal-content">
            <span class="modal-close" onclick="closeModal()">×</span>
            <div id="modalBody"></div>
        </div>
    </div>

    <script>
        {# Same result list, serialized as JSON for client-side rendering. #}
        const results = {{ results_json }};

        new Chart(document.getElementById('latencyChart'), {
            type: 'bar',
            data: {
                labels: results.map((r, i) => r.test_case.name),
                datasets: [{
                    label: 'Latency (ms)',
                    data: results.map(r => r.metrics ? r.metrics.latency_ms : 0),
                    backgroundColor: results.map(r => r.passed ? '#22c55e' : '#ef4444')
                }]
            },
            options: {
                responsive: true,
                maintainAspectRatio: false,
                plugins: { legend: { display: false } }
            }
        });

        {# Case-insensitive substring filter over the table rows. #}
        document.getElementById('searchInput').addEventListener('input', function(e) {
            const term = e.target.value.toLowerCase();
            document.querySelectorAll('.result-row').forEach(row => {
                const text = row.textContent.toLowerCase();
                row.style.display = text.includes(term) ? '' : 'none';
            });
        });

        function showDetails(index) {
            const result = results[index];
            const html = `
                <h2>${result.test_case.name}</h2>
                <p><strong>Status:</strong> <span class="badge ${result.passed ? 'badge-success' : 'badge-error'}">${result.passed ? 'PASS' : 'FAIL'}</span></p>
                <p><strong>Model:</strong> ${result.test_case.model}</p>
                <p><strong>Validator:</strong> ${result.test_case.validator}</p>
                <h3>Prompt</h3>
                <pre><code>${escapeHtml(result.test_case.prompt)}</code></pre>
                ${result.test_case.system_prompt ? `<h3>System Prompt</h3><pre><code>${escapeHtml(result.test_case.system_prompt)}</code></pre>` : ''}
                <h3>Response</h3>
                <pre><code>${escapeHtml(result.response || 'No response')}</code></pre>
                ${result.execution_error ? `<h3>Error</h3><pre style="color: var(--error);"><code>${escapeHtml(result.execution_error)}</code></pre>` : ''}
                ${result.metrics ? `
                <h3>Metrics</h3>
                <ul>
                    <li>Latency: ${result.metrics.latency_ms.toFixed(0)}ms</li>
                    <li>Tokens: ${result.metrics.total_tokens || 'N/A'}</li>
                    <li>Cost: $${(result.metrics.cost_usd || 0).toFixed(6)}</li>
                    <li>Attempts: ${result.metrics.total_attempts}</li>
                </ul>
                ` : ''}
            `;
            document.getElementById('modalBody').innerHTML = html;
            document.getElementById('detailsModal').style.display = 'block';
        }

        function closeModal() {
            document.getElementById('detailsModal').style.display = 'none';
        }

        {# Escape untrusted prompt/response text via the DOM before injecting it. #}
        function escapeHtml(text) {
            const div = document.createElement('div');
            div.textContent = text;
            return div.innerHTML;
        }

        {# Clicking the backdrop (outside the modal content) dismisses the modal. #}
        window.onclick = function(event) {
            const modal = document.getElementById('detailsModal');
            if (event.target === modal) closeModal();
        }
    </script>
</body>
</html>
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import yaml
|
|
2
|
+
|
|
3
|
+
from promptum.benchmark.report import Report
|
|
4
|
+
from promptum.serialization.base import BaseSerializer
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class YAMLSerializer(BaseSerializer):
    """Serializes benchmark reports as YAML documents."""

    def serialize(self, report: Report) -> str:
        """Render *report* — metadata, summary, and per-test results — as YAML."""
        payload = {
            "metadata": report.metadata,
            "summary": report.get_summary(),
            "results": [self._serialize_result(item) for item in report.results],
        }
        # Block style, insertion order preserved (matches the dict above).
        return yaml.dump(payload, default_flow_style=False, sort_keys=False)

    def get_file_extension(self) -> str:
        """File extension (without the dot) used for YAML output files."""
        return "yaml"
|
promptum/storage/file.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import tempfile
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from promptum.benchmark.report import Report
|
|
8
|
+
from promptum.core.metrics import Metrics
|
|
9
|
+
from promptum.core.result import TestResult
|
|
10
|
+
from promptum.core.test_case import TestCase
|
|
11
|
+
from promptum.validation.validators import PlaceholderValidator
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class FileStorage:
    """Filesystem-backed report storage.

    Reports are written as JSON files under ``<base_dir>/reports``; an index of
    saved reports lives in ``<base_dir>/metadata.json``. All writes go through
    a temp-file-then-rename sequence so a crash mid-write never leaves readers
    with a truncated JSON file.
    """

    def __init__(self, base_dir: str = "results"):
        self.base_dir = Path(base_dir)
        self.reports_dir = self.base_dir / "reports"
        self.metadata_file = self.base_dir / "metadata.json"

        self.reports_dir.mkdir(parents=True, exist_ok=True)

    def save(self, report: Report, name: str) -> str:
        """Persist *report* under a timestamped identifier and return that id."""
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        identifier = f"{timestamp}_{name}"
        filepath = self.reports_dir / f"{identifier}.json"

        self._atomic_write_json(self._serialize_report(report), filepath)
        self._update_metadata(identifier, name, str(filepath))

        return identifier

    def load(self, identifier: str) -> Report:
        """Load a previously saved report by its identifier.

        Raises:
            FileNotFoundError: if no report file exists for *identifier*.
        """
        filepath = self.reports_dir / f"{identifier}.json"

        if not filepath.exists():
            raise FileNotFoundError(f"Report not found: {identifier}")

        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)

        return self._deserialize_report(data)

    def list_reports(self) -> list[dict[str, Any]]:
        """Return index entries (id, name, path, timestamp) for all saved reports."""
        if not self.metadata_file.exists():
            return []

        with open(self.metadata_file, encoding="utf-8") as f:
            return json.load(f)

    def _update_metadata(self, identifier: str, name: str, path: str) -> None:
        """Append an index entry for a newly saved report and rewrite the index."""
        metadata = self.list_reports()

        metadata.append(
            {
                "id": identifier,
                "name": name,
                "path": path,
                "timestamp": datetime.now().isoformat(),
            }
        )

        self._atomic_write_json(metadata, self.metadata_file)

    @staticmethod
    def _atomic_write_json(data: Any, destination: Path) -> None:
        """Write *data* as JSON to *destination* via temp-file-then-rename.

        The temp file is created in the destination's directory so the final
        rename stays on one filesystem (Path.replace is atomic there).
        """
        with tempfile.NamedTemporaryFile(
            mode="w",
            delete=False,
            dir=destination.parent,
            suffix=".tmp",
            encoding="utf-8",
        ) as tmp:
            json.dump(data, tmp, indent=2)
            tmp_path = Path(tmp.name)

        tmp_path.replace(destination)

    @staticmethod
    def _serialize_report(report: Report) -> dict[str, Any]:
        """Convert a Report into a JSON-compatible dict.

        The validator object itself is not serialized — only its
        human-readable description is kept (see _deserialize_report).
        """
        return {
            "metadata": report.metadata,
            "results": [
                {
                    "test_case": {
                        "name": r.test_case.name,
                        "prompt": r.test_case.prompt,
                        "model": r.test_case.model,
                        "tags": list(r.test_case.tags),
                        "system_prompt": r.test_case.system_prompt,
                        "temperature": r.test_case.temperature,
                        "max_tokens": r.test_case.max_tokens,
                        "metadata": r.test_case.metadata,
                        "validator_description": r.test_case.validator.describe(),
                    },
                    "response": r.response,
                    "passed": r.passed,
                    "metrics": {
                        "latency_ms": r.metrics.latency_ms,
                        "prompt_tokens": r.metrics.prompt_tokens,
                        "completion_tokens": r.metrics.completion_tokens,
                        "total_tokens": r.metrics.total_tokens,
                        "cost_usd": r.metrics.cost_usd,
                        "retry_delays": list(r.metrics.retry_delays),
                    }
                    if r.metrics
                    else None,
                    "validation_details": r.validation_details,
                    "execution_error": r.execution_error,
                    "timestamp": r.timestamp.isoformat(),
                }
                for r in report.results
            ],
        }

    @staticmethod
    def _deserialize_report(data: dict[str, Any]) -> Report:
        """Rebuild a Report from its serialized dict.

        The original validator cannot be reconstructed from storage, so each
        test case receives a PlaceholderValidator carrying only the stored
        description (loaded reports cannot re-validate responses).
        """
        results = []
        for r in data["results"]:
            test_case = TestCase(
                name=r["test_case"]["name"],
                prompt=r["test_case"]["prompt"],
                model=r["test_case"]["model"],
                validator=PlaceholderValidator(
                    description=r["test_case"]["validator_description"],
                ),
                tags=tuple(r["test_case"]["tags"]),
                system_prompt=r["test_case"]["system_prompt"],
                temperature=r["test_case"]["temperature"],
                max_tokens=r["test_case"]["max_tokens"],
                metadata=r["test_case"]["metadata"],
            )

            metrics = None
            if r["metrics"]:
                metrics = Metrics(
                    latency_ms=r["metrics"]["latency_ms"],
                    prompt_tokens=r["metrics"]["prompt_tokens"],
                    completion_tokens=r["metrics"]["completion_tokens"],
                    total_tokens=r["metrics"]["total_tokens"],
                    cost_usd=r["metrics"]["cost_usd"],
                    retry_delays=tuple(r["metrics"]["retry_delays"]),
                )

            result = TestResult(
                test_case=test_case,
                response=r["response"],
                passed=r["passed"],
                metrics=metrics,
                validation_details=r["validation_details"],
                execution_error=r["execution_error"],
                timestamp=datetime.fromisoformat(r["timestamp"]),
            )
            results.append(result)

        return Report(results=results, metadata=data["metadata"])
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from typing import Any, Protocol
|
|
2
|
+
|
|
3
|
+
from promptum.benchmark.report import Report
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ResultStorage(Protocol):
    """Structural interface for report persistence backends.

    Any object with matching save/load/list_reports methods satisfies this
    protocol (duck typing); no inheritance is required.
    """

    def save(self, report: Report, name: str) -> str:
        """
        Saves a report and returns its identifier.
        """
        ...

    def load(self, identifier: str) -> Report:
        """
        Loads a report by its identifier.
        """
        ...

    def list_reports(self) -> list[dict[str, Any]]:
        """
        Returns metadata for all stored reports.
        """
        ...
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from typing import Any, Protocol
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class Validator(Protocol):
    """Structural interface for response validators.

    Implementations judge a model response and report pass/fail plus
    diagnostics; no inheritance is required (structural typing).
    """

    def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
        """
        Validates a response string.

        Returns:
            (passed, details) where details contains diagnostic information
        """
        ...

    def describe(self) -> str:
        """Returns a human-readable description of validation criteria."""
        ...
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import re
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass(frozen=True, slots=True)
|
|
8
|
+
class ExactMatch:
|
|
9
|
+
expected: str
|
|
10
|
+
case_sensitive: bool = True
|
|
11
|
+
|
|
12
|
+
def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
|
|
13
|
+
if self.case_sensitive:
|
|
14
|
+
passed = response == self.expected
|
|
15
|
+
else:
|
|
16
|
+
passed = response.lower() == self.expected.lower()
|
|
17
|
+
|
|
18
|
+
return passed, {
|
|
19
|
+
"expected": self.expected,
|
|
20
|
+
"actual": response,
|
|
21
|
+
"case_sensitive": self.case_sensitive,
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
def describe(self) -> str:
|
|
25
|
+
mode = "case-sensitive" if self.case_sensitive else "case-insensitive"
|
|
26
|
+
return f"Exact match ({mode}): {self.expected!r}"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True, slots=True)
|
|
30
|
+
class Contains:
|
|
31
|
+
substring: str
|
|
32
|
+
case_sensitive: bool = True
|
|
33
|
+
|
|
34
|
+
def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
|
|
35
|
+
if self.case_sensitive:
|
|
36
|
+
passed = self.substring in response
|
|
37
|
+
else:
|
|
38
|
+
passed = self.substring.lower() in response.lower()
|
|
39
|
+
|
|
40
|
+
return passed, {
|
|
41
|
+
"substring": self.substring,
|
|
42
|
+
"case_sensitive": self.case_sensitive,
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
def describe(self) -> str:
|
|
46
|
+
mode = "case-sensitive" if self.case_sensitive else "case-insensitive"
|
|
47
|
+
return f"Contains ({mode}): {self.substring!r}"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass(frozen=True, slots=True)
|
|
51
|
+
class Regex:
|
|
52
|
+
pattern: str
|
|
53
|
+
flags: int = 0
|
|
54
|
+
|
|
55
|
+
def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
|
|
56
|
+
match = re.search(self.pattern, response, self.flags)
|
|
57
|
+
return match is not None, {
|
|
58
|
+
"pattern": self.pattern,
|
|
59
|
+
"matched": match.group(0) if match else None,
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
def describe(self) -> str:
|
|
63
|
+
return f"Regex: {self.pattern!r}"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass(frozen=True, slots=True)
|
|
67
|
+
class JsonSchema:
|
|
68
|
+
required_keys: tuple[str, ...] = ()
|
|
69
|
+
|
|
70
|
+
def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
|
|
71
|
+
try:
|
|
72
|
+
data = json.loads(response)
|
|
73
|
+
if not isinstance(data, dict):
|
|
74
|
+
return False, {"error": "Response is not a JSON object"}
|
|
75
|
+
|
|
76
|
+
missing_keys = [key for key in self.required_keys if key not in data]
|
|
77
|
+
passed = len(missing_keys) == 0
|
|
78
|
+
|
|
79
|
+
return passed, {
|
|
80
|
+
"parsed": data,
|
|
81
|
+
"missing_keys": missing_keys,
|
|
82
|
+
}
|
|
83
|
+
except json.JSONDecodeError as e:
|
|
84
|
+
return False, {"error": f"Invalid JSON: {e}"}
|
|
85
|
+
|
|
86
|
+
def describe(self) -> str:
|
|
87
|
+
if self.required_keys:
|
|
88
|
+
keys = ", ".join(self.required_keys)
|
|
89
|
+
return f"Valid JSON with keys: {keys}"
|
|
90
|
+
return "Valid JSON object"
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
@dataclass(frozen=True, slots=True)
|
|
94
|
+
class PlaceholderValidator:
|
|
95
|
+
"""
|
|
96
|
+
Placeholder validator for deserialized reports.
|
|
97
|
+
|
|
98
|
+
Used when original validator cannot be reconstructed from storage.
|
|
99
|
+
Always returns True. Original validator logic is not preserved.
|
|
100
|
+
"""
|
|
101
|
+
|
|
102
|
+
description: str
|
|
103
|
+
|
|
104
|
+
def validate(self, response: str) -> tuple[bool, dict[str, Any]]:
|
|
105
|
+
return True, {"placeholder": True, "note": "Original validator could not be reconstructed"}
|
|
106
|
+
|
|
107
|
+
def describe(self) -> str:
|
|
108
|
+
return self.description
|