themis-eval 0.2.3__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +5 -2
- themis/_version.py +14 -1
- themis/api.py +83 -145
- themis/backends/storage.py +5 -0
- themis/cli/commands/info.py +2 -11
- themis/cli/main.py +231 -40
- themis/comparison/engine.py +7 -13
- themis/core/entities.py +4 -0
- themis/evaluation/metric_pipeline.py +12 -0
- themis/evaluation/pipeline.py +22 -0
- themis/evaluation/pipelines/__init__.py +4 -0
- themis/evaluation/pipelines/composable_pipeline.py +55 -0
- themis/evaluation/pipelines/standard_pipeline.py +16 -0
- themis/experiment/__init__.py +2 -2
- themis/experiment/cache_manager.py +15 -1
- themis/experiment/definitions.py +1 -1
- themis/experiment/orchestrator.py +21 -11
- themis/experiment/share.py +264 -0
- themis/experiment/storage.py +345 -298
- themis/generation/router.py +22 -4
- themis/generation/runner.py +16 -1
- themis/presets/benchmarks.py +602 -17
- themis/server/app.py +38 -26
- themis/session.py +125 -0
- themis/specs/__init__.py +7 -0
- themis/specs/execution.py +26 -0
- themis/specs/experiment.py +33 -0
- themis/specs/storage.py +18 -0
- themis/storage/__init__.py +6 -0
- themis/storage/experiment_storage.py +7 -0
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/METADATA +47 -34
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/RECORD +35 -28
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/WHEEL +1 -1
- themis/experiment/builder.py +0 -151
- themis/experiment/export_csv.py +0 -159
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -137,8 +137,8 @@ class ExperimentOrchestrator:
|
|
|
137
137
|
|
|
138
138
|
# Initialize run in storage (if storage exists and run doesn't exist)
|
|
139
139
|
if self._cache.has_storage:
|
|
140
|
-
if not resume or not self._cache.
|
|
141
|
-
self._cache.
|
|
140
|
+
if not resume or not self._cache.run_metadata_exists(run_identifier):
|
|
141
|
+
self._cache.start_run(run_identifier, experiment_id="default")
|
|
142
142
|
|
|
143
143
|
# Cache dataset for resumability
|
|
144
144
|
if dataset_list:
|
|
@@ -351,25 +351,35 @@ class ExperimentOrchestrator:
|
|
|
351
351
|
Returns:
|
|
352
352
|
Dictionary with evaluation configuration
|
|
353
353
|
"""
|
|
354
|
+
if hasattr(self._evaluation, "evaluation_fingerprint"):
|
|
355
|
+
try:
|
|
356
|
+
return dict(self._evaluation.evaluation_fingerprint())
|
|
357
|
+
except Exception:
|
|
358
|
+
pass
|
|
359
|
+
|
|
354
360
|
config = {}
|
|
355
|
-
|
|
361
|
+
|
|
356
362
|
# Add metric names/types
|
|
357
363
|
if hasattr(self._evaluation, "_metrics"):
|
|
358
|
-
config["metrics"] = sorted(
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
364
|
+
config["metrics"] = sorted(
|
|
365
|
+
[
|
|
366
|
+
f"{metric.__class__.__module__}.{metric.__class__.__name__}:{metric.name}"
|
|
367
|
+
for metric in self._evaluation._metrics
|
|
368
|
+
]
|
|
369
|
+
)
|
|
370
|
+
|
|
363
371
|
# Add extractor type
|
|
364
372
|
if hasattr(self._evaluation, "_extractor"):
|
|
365
373
|
extractor = self._evaluation._extractor
|
|
366
|
-
extractor_type =
|
|
374
|
+
extractor_type = (
|
|
375
|
+
f"{extractor.__class__.__module__}.{extractor.__class__.__name__}"
|
|
376
|
+
)
|
|
367
377
|
config["extractor"] = extractor_type
|
|
368
|
-
|
|
378
|
+
|
|
369
379
|
# Include extractor-specific configuration if available
|
|
370
380
|
if hasattr(extractor, "field_name"):
|
|
371
381
|
config["extractor_field"] = extractor.field_name
|
|
372
|
-
|
|
382
|
+
|
|
373
383
|
return config
|
|
374
384
|
|
|
375
385
|
def _resolve_dataset(
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
"""Shareable assets for experiment runs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
import html
|
|
8
|
+
import json
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from themis.experiment.storage import ExperimentStorage
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True)
class ShareSummary:
    """Normalized summary details for sharing."""

    # Identifier of the run this summary describes.
    run_id: str
    # Metric name -> statistics mapping; when built from report.json the
    # inner dict has "mean" and "count" keys (summary.json metrics are
    # passed through as-is — TODO confirm they share the same shape).
    metrics: dict[str, dict[str, float | int | None]]
    # Number of evaluated samples, when recorded.
    total_samples: int | None
    # Model identifier recovered from stored metadata, when available.
    model: str | None
    # Total run cost in USD, when recorded.
    cost_usd: float | None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass(frozen=True)
class SharePack:
    """Generated share assets."""

    # Path of the rendered SVG badge.
    svg_path: Path
    # Path of the Markdown file containing the embed snippet.
    markdown_path: Path
    # The Markdown snippet itself (also written to markdown_path).
    markdown_snippet: str
    # Path of the share-event JSONL log, or None if logging failed.
    event_log_path: Path | None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def create_share_pack(
    *,
    run_id: str,
    storage_root: Path,
    output_dir: Path,
    metric: str | None = None,
) -> SharePack:
    """Generate a shareable SVG badge + Markdown snippet for a run.

    Args:
        run_id: Identifier of the stored experiment run.
        storage_root: Root directory of the experiment storage.
        output_dir: Directory where the SVG/Markdown assets are written
            (created if missing).
        metric: Metric to feature on the badge; defaults to the
            alphabetically first metric available for the run.

    Returns:
        SharePack describing the generated assets.

    Raises:
        FileNotFoundError: If the run has neither summary.json nor report.json.
        ValueError: If the run has no metrics or *metric* is unknown.
    """
    summary = _load_share_summary(run_id=run_id, storage_root=storage_root)
    metric_name, metric_value = _select_metric(summary.metrics, metric)

    svg = _render_share_svg(
        run_id=summary.run_id,
        metric_name=metric_name,
        metric_value=metric_value,
        model=summary.model,
        total_samples=summary.total_samples,
        cost_usd=summary.cost_usd,
    )

    safe_run_id = _sanitize_filename(summary.run_id)
    output_dir.mkdir(parents=True, exist_ok=True)
    svg_path = output_dir / f"themis-share-{safe_run_id}.svg"
    svg_path.write_text(svg, encoding="utf-8")

    # Bug fix: the snippet was previously emitted as the truncated string
    # f"})" — embed a real Markdown image reference to the generated badge
    # (this is also what the otherwise-unused _relative_markdown_path
    # helper exists for).
    markdown_snippet = (
        f"![Themis result for {summary.run_id}]"
        f"({_relative_markdown_path(svg_path)})"
    )
    markdown_path = output_dir / f"themis-share-{safe_run_id}.md"
    markdown_path.write_text(markdown_snippet + "\n", encoding="utf-8")

    # Best-effort telemetry; returns None when the log cannot be written.
    event_log_path = _log_share_event(
        storage_root=storage_root,
        event_name="share_pack_generated",
        payload={
            "run_id": summary.run_id,
            "metric": metric_name,
            "metric_value": metric_value,
            "output_dir": str(output_dir),
            "svg_path": str(svg_path),
        },
    )

    return SharePack(
        svg_path=svg_path,
        markdown_path=markdown_path,
        markdown_snippet=markdown_snippet,
        event_log_path=event_log_path,
    )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _load_share_summary(*, run_id: str, storage_root: Path) -> ShareSummary:
    """Load a run's shareable summary, preferring summary.json over report.json."""

    def from_summary(data: dict[str, Any]) -> ShareSummary:
        # summary.json already carries pre-aggregated metric statistics.
        return ShareSummary(
            run_id=str(data.get("run_id") or run_id),
            metrics=data.get("metrics", {}) or {},
            total_samples=_safe_int(data.get("total_samples")),
            model=_safe_str(data.get("metadata", {}).get("model")),
            cost_usd=_safe_float(data.get("cost_usd")),
        )

    def from_report(report: dict[str, Any]) -> ShareSummary:
        # Rebuild per-metric mean/count statistics from the full report.
        metric_stats = {
            entry.get("name"): {
                "mean": entry.get("mean"),
                "count": entry.get("count"),
            }
            for entry in report.get("metrics", [])
            if entry.get("name")
        }
        overview = report.get("summary", {}) or {}

        # Model identifier comes from the first sample's metadata, if any.
        model = None
        samples = report.get("samples", [])
        if samples:
            sample_meta = samples[0].get("metadata", {}) or {}
            model = _safe_str(
                sample_meta.get("model_identifier") or sample_meta.get("model")
            )

        # Cost lives under summary.cost.total_cost when present.
        cost_usd = None
        cost = overview.get("cost")
        if isinstance(cost, dict):
            cost_usd = _safe_float(cost.get("total_cost"))

        # Prefer the summary's sample count, falling back to the report root.
        total = _safe_int(overview.get("total_samples"))
        if total is None:
            total = _safe_int(report.get("total_samples"))

        return ShareSummary(
            run_id=str(overview.get("run_id") or run_id),
            metrics=metric_stats,
            total_samples=total,
            model=model,
            cost_usd=cost_usd,
        )

    run_path = ExperimentStorage(storage_root).get_run_path(run_id)

    summary_path = run_path / "summary.json"
    if summary_path.exists():
        return from_summary(json.loads(summary_path.read_text(encoding="utf-8")))

    report_path = run_path / "report.json"
    if report_path.exists():
        return from_report(json.loads(report_path.read_text(encoding="utf-8")))

    raise FileNotFoundError(
        f"Run {run_id} is missing summary.json or report.json at {run_path}"
    )
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _select_metric(
    metrics: dict[str, dict[str, float | int | None]],
    metric: str | None,
) -> tuple[str, float | None]:
    """Resolve which metric to feature and return (name, mean-as-float)."""
    if not metrics:
        raise ValueError("No metrics found for this run")

    # Default to the alphabetically first metric name.
    chosen = min(metrics) if metric is None else metric

    if chosen not in metrics:
        available = ", ".join(sorted(metrics.keys()))
        raise ValueError(f"Metric '{chosen}' not found. Available: {available}")

    return chosen, _safe_float(metrics[chosen].get("mean"))
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _render_share_svg(
|
|
158
|
+
*,
|
|
159
|
+
run_id: str,
|
|
160
|
+
metric_name: str,
|
|
161
|
+
metric_value: float | None,
|
|
162
|
+
model: str | None,
|
|
163
|
+
total_samples: int | None,
|
|
164
|
+
cost_usd: float | None,
|
|
165
|
+
) -> str:
|
|
166
|
+
title = "Themis Result"
|
|
167
|
+
metric_display = (
|
|
168
|
+
f"{metric_name}: {metric_value:.4f}" if metric_value is not None else "N/A"
|
|
169
|
+
)
|
|
170
|
+
meta_lines = []
|
|
171
|
+
meta_lines.append(f"Model: {model or 'unknown'}")
|
|
172
|
+
meta_lines.append(f"Samples: {total_samples if total_samples is not None else 'N/A'}")
|
|
173
|
+
if cost_usd is not None:
|
|
174
|
+
meta_lines.append(f"Cost: ${cost_usd:.4f}")
|
|
175
|
+
|
|
176
|
+
line_height = 18
|
|
177
|
+
base_y = 64
|
|
178
|
+
run_line_y = base_y + (len(meta_lines) * line_height) + 10
|
|
179
|
+
height = run_line_y + 24
|
|
180
|
+
|
|
181
|
+
meta_svg = "\n".join(
|
|
182
|
+
f'<text class="meta" x="24" y="{base_y + (idx * line_height)}">'
|
|
183
|
+
f"{html.escape(line)}</text>"
|
|
184
|
+
for idx, line in enumerate(meta_lines)
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
return (
|
|
188
|
+
f"""<svg xmlns="http://www.w3.org/2000/svg" width="640" height="{height}" viewBox="0 0 640 {height}">
|
|
189
|
+
<defs>
|
|
190
|
+
<style>
|
|
191
|
+
.title {{ font: 600 18px 'Segoe UI', 'Helvetica Neue', Arial, sans-serif; fill: #0f172a; }}
|
|
192
|
+
.metric {{ font: 700 22px 'Segoe UI', 'Helvetica Neue', Arial, sans-serif; fill: #0f172a; }}
|
|
193
|
+
.meta {{ font: 400 14px 'Segoe UI', 'Helvetica Neue', Arial, sans-serif; fill: #334155; }}
|
|
194
|
+
.run {{ font: 400 12px 'Segoe UI', 'Helvetica Neue', Arial, sans-serif; fill: #64748b; }}
|
|
195
|
+
</style>
|
|
196
|
+
<linearGradient id="bg" x1="0" y1="0" x2="1" y2="1">
|
|
197
|
+
<stop offset="0%" stop-color="#f8fafc"/>
|
|
198
|
+
<stop offset="100%" stop-color="#e2e8f0"/>
|
|
199
|
+
</linearGradient>
|
|
200
|
+
</defs>
|
|
201
|
+
<rect x="0" y="0" width="640" height="{height}" rx="16" fill="url(#bg)"/>
|
|
202
|
+
<rect x="16" y="14" width="608" height="{height - 28}" rx="12" fill="#ffffff" stroke="#e2e8f0"/>
|
|
203
|
+
<text class="title" x="24" y="38">{html.escape(title)}</text>
|
|
204
|
+
<text class="metric" x="24" y="60">{html.escape(metric_display)}</text>
|
|
205
|
+
{meta_svg}
|
|
206
|
+
<text class="run" x="24" y="{run_line_y}">Run: {html.escape(run_id)}</text>
|
|
207
|
+
</svg>"""
|
|
208
|
+
)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _log_share_event(
|
|
212
|
+
*,
|
|
213
|
+
storage_root: Path,
|
|
214
|
+
event_name: str,
|
|
215
|
+
payload: dict[str, Any],
|
|
216
|
+
) -> Path | None:
|
|
217
|
+
try:
|
|
218
|
+
events_path = storage_root / "share_events.jsonl"
|
|
219
|
+
event = {
|
|
220
|
+
"event": event_name,
|
|
221
|
+
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
222
|
+
**payload,
|
|
223
|
+
}
|
|
224
|
+
with events_path.open("a", encoding="utf-8") as handle:
|
|
225
|
+
handle.write(json.dumps(event) + "\n")
|
|
226
|
+
return events_path
|
|
227
|
+
except OSError:
|
|
228
|
+
return None
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _sanitize_filename(value: str) -> str:
|
|
232
|
+
return "".join(ch if ch.isalnum() or ch in ("-", "_") else "_" for ch in value)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _relative_markdown_path(path: Path) -> str:
|
|
236
|
+
try:
|
|
237
|
+
relative = path.relative_to(Path.cwd())
|
|
238
|
+
except ValueError:
|
|
239
|
+
relative = path
|
|
240
|
+
return relative.as_posix()
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _safe_float(value: Any) -> float | None:
|
|
244
|
+
try:
|
|
245
|
+
if value is None:
|
|
246
|
+
return None
|
|
247
|
+
return float(value)
|
|
248
|
+
except (TypeError, ValueError):
|
|
249
|
+
return None
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _safe_int(value: Any) -> int | None:
|
|
253
|
+
try:
|
|
254
|
+
if value is None:
|
|
255
|
+
return None
|
|
256
|
+
return int(value)
|
|
257
|
+
except (TypeError, ValueError):
|
|
258
|
+
return None
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _safe_str(value: Any) -> str | None:
|
|
262
|
+
if value is None:
|
|
263
|
+
return None
|
|
264
|
+
return str(value)
|