themis-eval 0.2.3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. themis/__init__.py +5 -2
  2. themis/_version.py +14 -1
  3. themis/api.py +83 -145
  4. themis/backends/storage.py +5 -0
  5. themis/cli/commands/info.py +2 -11
  6. themis/cli/main.py +231 -40
  7. themis/comparison/engine.py +7 -13
  8. themis/core/entities.py +4 -0
  9. themis/evaluation/metric_pipeline.py +12 -0
  10. themis/evaluation/pipeline.py +22 -0
  11. themis/evaluation/pipelines/__init__.py +4 -0
  12. themis/evaluation/pipelines/composable_pipeline.py +55 -0
  13. themis/evaluation/pipelines/standard_pipeline.py +16 -0
  14. themis/experiment/__init__.py +2 -2
  15. themis/experiment/cache_manager.py +15 -1
  16. themis/experiment/definitions.py +1 -1
  17. themis/experiment/orchestrator.py +21 -11
  18. themis/experiment/share.py +264 -0
  19. themis/experiment/storage.py +345 -298
  20. themis/generation/router.py +22 -4
  21. themis/generation/runner.py +16 -1
  22. themis/presets/benchmarks.py +602 -17
  23. themis/server/app.py +38 -26
  24. themis/session.py +125 -0
  25. themis/specs/__init__.py +7 -0
  26. themis/specs/execution.py +26 -0
  27. themis/specs/experiment.py +33 -0
  28. themis/specs/storage.py +18 -0
  29. themis/storage/__init__.py +6 -0
  30. themis/storage/experiment_storage.py +7 -0
  31. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/METADATA +47 -34
  32. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/RECORD +35 -28
  33. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/WHEEL +1 -1
  34. themis/experiment/builder.py +0 -151
  35. themis/experiment/export_csv.py +0 -159
  36. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE +0 -0
  37. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt +0 -0
@@ -137,8 +137,8 @@ class ExperimentOrchestrator:
137
137
 
138
138
  # Initialize run in storage (if storage exists and run doesn't exist)
139
139
  if self._cache.has_storage:
140
- if not resume or not self._cache._storage._run_metadata_exists(run_identifier):
141
- self._cache._storage.start_run(run_identifier, experiment_id="default")
140
+ if not resume or not self._cache.run_metadata_exists(run_identifier):
141
+ self._cache.start_run(run_identifier, experiment_id="default")
142
142
 
143
143
  # Cache dataset for resumability
144
144
  if dataset_list:
@@ -351,25 +351,35 @@ class ExperimentOrchestrator:
351
351
  Returns:
352
352
  Dictionary with evaluation configuration
353
353
  """
354
+ if hasattr(self._evaluation, "evaluation_fingerprint"):
355
+ try:
356
+ return dict(self._evaluation.evaluation_fingerprint())
357
+ except Exception:
358
+ pass
359
+
354
360
  config = {}
355
-
361
+
356
362
  # Add metric names/types
357
363
  if hasattr(self._evaluation, "_metrics"):
358
- config["metrics"] = sorted([
359
- f"{metric.__class__.__module__}.{metric.__class__.__name__}:{metric.name}"
360
- for metric in self._evaluation._metrics
361
- ])
362
-
364
+ config["metrics"] = sorted(
365
+ [
366
+ f"{metric.__class__.__module__}.{metric.__class__.__name__}:{metric.name}"
367
+ for metric in self._evaluation._metrics
368
+ ]
369
+ )
370
+
363
371
  # Add extractor type
364
372
  if hasattr(self._evaluation, "_extractor"):
365
373
  extractor = self._evaluation._extractor
366
- extractor_type = f"{extractor.__class__.__module__}.{extractor.__class__.__name__}"
374
+ extractor_type = (
375
+ f"{extractor.__class__.__module__}.{extractor.__class__.__name__}"
376
+ )
367
377
  config["extractor"] = extractor_type
368
-
378
+
369
379
  # Include extractor-specific configuration if available
370
380
  if hasattr(extractor, "field_name"):
371
381
  config["extractor_field"] = extractor.field_name
372
-
382
+
373
383
  return config
374
384
 
375
385
  def _resolve_dataset(
@@ -0,0 +1,264 @@
1
+ """Shareable assets for experiment runs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from datetime import datetime, timezone
7
+ import html
8
+ import json
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from themis.experiment.storage import ExperimentStorage
13
+
14
+
15
@dataclass(frozen=True)
class ShareSummary:
    """Normalized summary details for sharing.

    Produced by ``_load_share_summary`` from either ``summary.json`` or
    ``report.json`` in the run directory; fields that cannot be recovered
    from the stored files are ``None``.
    """

    # Identifier of the experiment run this summary describes.
    run_id: str
    # Metric name -> stats mapping (e.g. {"mean": ..., "count": ...}).
    metrics: dict[str, dict[str, float | int | None]]
    # Number of evaluated samples, when recorded.
    total_samples: int | None
    # Model identifier, when recoverable from stored metadata.
    model: str | None
    # Total run cost in USD, when recorded.
    cost_usd: float | None
24
+
25
+
26
@dataclass(frozen=True)
class SharePack:
    """Generated share assets."""

    # Rendered SVG badge written to disk.
    svg_path: Path
    # Markdown file containing ``markdown_snippet``.
    markdown_path: Path
    # Markdown image snippet that embeds the SVG badge.
    markdown_snippet: str
    # JSONL log the share event was appended to; None if logging failed.
    event_log_path: Path | None
34
+
35
+
36
def create_share_pack(
    *,
    run_id: str,
    storage_root: Path,
    output_dir: Path,
    metric: str | None = None,
) -> SharePack:
    """Generate a shareable SVG badge + Markdown snippet for a run.

    Args:
        run_id: Run to summarize; its results are read from *storage_root*.
        storage_root: Root directory of experiment storage.
        output_dir: Directory the badge and snippet files are written to
            (created if missing).
        metric: Metric to feature on the badge; defaults to the first metric
            in sorted order.

    Returns:
        SharePack with the paths of the generated assets.
    """
    summary = _load_share_summary(run_id=run_id, storage_root=storage_root)
    chosen_name, chosen_value = _select_metric(summary.metrics, metric)

    output_dir.mkdir(parents=True, exist_ok=True)
    stem = f"themis-share-{_sanitize_filename(summary.run_id)}"

    badge_svg = _render_share_svg(
        run_id=summary.run_id,
        metric_name=chosen_name,
        metric_value=chosen_value,
        model=summary.model,
        total_samples=summary.total_samples,
        cost_usd=summary.cost_usd,
    )
    svg_path = output_dir / f"{stem}.svg"
    svg_path.write_text(badge_svg, encoding="utf-8")

    snippet = f"![Themis result]({_relative_markdown_path(svg_path)})"
    markdown_path = output_dir / f"{stem}.md"
    markdown_path.write_text(snippet + "\n", encoding="utf-8")

    # Best-effort telemetry; returns None if the log could not be written.
    log_path = _log_share_event(
        storage_root=storage_root,
        event_name="share_pack_generated",
        payload={
            "run_id": summary.run_id,
            "metric": chosen_name,
            "metric_value": chosen_value,
            "output_dir": str(output_dir),
            "svg_path": str(svg_path),
        },
    )

    return SharePack(
        svg_path=svg_path,
        markdown_path=markdown_path,
        markdown_snippet=snippet,
        event_log_path=log_path,
    )
83
+
84
+
85
def _load_share_summary(*, run_id: str, storage_root: Path) -> ShareSummary:
    """Load a run's results from storage and normalize them into a ShareSummary.

    Prefers ``summary.json`` in the run directory; falls back to reconstructing
    the same shape from ``report.json`` when no summary file exists.

    Raises:
        FileNotFoundError: if the run has neither summary.json nor report.json.
    """
    storage = ExperimentStorage(storage_root)
    run_path = storage.get_run_path(run_id)

    # Preferred source: the pre-aggregated summary file.
    summary_path = run_path / "summary.json"
    if summary_path.exists():
        summary = json.loads(summary_path.read_text(encoding="utf-8"))
        # "or {}" guards against an explicit null in the JSON.
        metrics = summary.get("metrics", {}) or {}
        return ShareSummary(
            run_id=str(summary.get("run_id") or run_id),
            metrics=metrics,
            total_samples=_safe_int(summary.get("total_samples")),
            model=_safe_str(summary.get("metadata", {}).get("model")),
            cost_usd=_safe_float(summary.get("cost_usd")),
        )

    # Fallback source: the full report; rebuild the metrics mapping from its
    # per-metric entries, skipping unnamed entries.
    report_path = run_path / "report.json"
    if report_path.exists():
        report = json.loads(report_path.read_text(encoding="utf-8"))
        metrics = {
            entry.get("name"): {
                "mean": entry.get("mean"),
                "count": entry.get("count"),
            }
            for entry in report.get("metrics", [])
            if entry.get("name")
        }
        summary = report.get("summary", {}) or {}
        # Model id is only recorded per-sample; take it from the first sample.
        model = None
        samples = report.get("samples", [])
        if samples:
            metadata = samples[0].get("metadata", {}) or {}
            model = _safe_str(
                metadata.get("model_identifier") or metadata.get("model")
            )
        cost_usd = None
        cost = summary.get("cost")
        if isinstance(cost, dict):
            cost_usd = _safe_float(cost.get("total_cost"))
        # Sample count may live in the summary or at the report top level.
        total_samples = _safe_int(summary.get("total_samples"))
        if total_samples is None:
            total_samples = _safe_int(report.get("total_samples"))
        return ShareSummary(
            run_id=str(summary.get("run_id") or run_id),
            metrics=metrics,
            total_samples=total_samples,
            model=model,
            cost_usd=cost_usd,
        )

    raise FileNotFoundError(
        f"Run {run_id} is missing summary.json or report.json at {run_path}"
    )
137
+
138
+
139
def _select_metric(
    metrics: dict[str, dict[str, float | int | None]],
    metric: str | None,
) -> tuple[str, float | None]:
    """Pick the metric to feature and return its name and mean value.

    When *metric* is None, the alphabetically first metric name is used.

    Raises:
        ValueError: if *metrics* is empty or *metric* is not present.
    """
    if not metrics:
        raise ValueError("No metrics found for this run")

    # Default to the first metric name in sorted order.
    chosen = min(metrics) if metric is None else metric

    if chosen not in metrics:
        available = ", ".join(sorted(metrics))
        raise ValueError(f"Metric '{chosen}' not found. Available: {available}")

    return chosen, _safe_float(metrics[chosen].get("mean"))
155
+
156
+
157
def _render_share_svg(
    *,
    run_id: str,
    metric_name: str,
    metric_value: float | None,
    model: str | None,
    total_samples: int | None,
    cost_usd: float | None,
) -> str:
    """Render the share badge as a standalone SVG document string.

    All dynamic text is passed through ``html.escape`` so the SVG stays
    well-formed regardless of run/model names.
    """
    title = "Themis Result"
    # NOTE(review): when metric_value is None the metric name is dropped as
    # well and the badge shows just "N/A" — confirm that is intended.
    metric_display = (
        f"{metric_name}: {metric_value:.4f}" if metric_value is not None else "N/A"
    )
    meta_lines = []
    meta_lines.append(f"Model: {model or 'unknown'}")
    meta_lines.append(f"Samples: {total_samples if total_samples is not None else 'N/A'}")
    # Cost line is optional; omitting it shrinks the badge accordingly.
    if cost_usd is not None:
        meta_lines.append(f"Cost: ${cost_usd:.4f}")

    # Badge height grows with the number of metadata lines.
    line_height = 18
    base_y = 64
    run_line_y = base_y + (len(meta_lines) * line_height) + 10
    height = run_line_y + 24

    # One <text> element per metadata line, positioned by index.
    meta_svg = "\n".join(
        f'<text class="meta" x="24" y="{base_y + (idx * line_height)}">'
        f"{html.escape(line)}</text>"
        for idx, line in enumerate(meta_lines)
    )

    return (
        f"""<svg xmlns="http://www.w3.org/2000/svg" width="640" height="{height}" viewBox="0 0 640 {height}">
<defs>
<style>
.title {{ font: 600 18px 'Segoe UI', 'Helvetica Neue', Arial, sans-serif; fill: #0f172a; }}
.metric {{ font: 700 22px 'Segoe UI', 'Helvetica Neue', Arial, sans-serif; fill: #0f172a; }}
.meta {{ font: 400 14px 'Segoe UI', 'Helvetica Neue', Arial, sans-serif; fill: #334155; }}
.run {{ font: 400 12px 'Segoe UI', 'Helvetica Neue', Arial, sans-serif; fill: #64748b; }}
</style>
<linearGradient id="bg" x1="0" y1="0" x2="1" y2="1">
<stop offset="0%" stop-color="#f8fafc"/>
<stop offset="100%" stop-color="#e2e8f0"/>
</linearGradient>
</defs>
<rect x="0" y="0" width="640" height="{height}" rx="16" fill="url(#bg)"/>
<rect x="16" y="14" width="608" height="{height - 28}" rx="12" fill="#ffffff" stroke="#e2e8f0"/>
<text class="title" x="24" y="38">{html.escape(title)}</text>
<text class="metric" x="24" y="60">{html.escape(metric_display)}</text>
{meta_svg}
<text class="run" x="24" y="{run_line_y}">Run: {html.escape(run_id)}</text>
</svg>"""
    )
209
+
210
+
211
+ def _log_share_event(
212
+ *,
213
+ storage_root: Path,
214
+ event_name: str,
215
+ payload: dict[str, Any],
216
+ ) -> Path | None:
217
+ try:
218
+ events_path = storage_root / "share_events.jsonl"
219
+ event = {
220
+ "event": event_name,
221
+ "timestamp": datetime.now(timezone.utc).isoformat(),
222
+ **payload,
223
+ }
224
+ with events_path.open("a", encoding="utf-8") as handle:
225
+ handle.write(json.dumps(event) + "\n")
226
+ return events_path
227
+ except OSError:
228
+ return None
229
+
230
+
231
+ def _sanitize_filename(value: str) -> str:
232
+ return "".join(ch if ch.isalnum() or ch in ("-", "_") else "_" for ch in value)
233
+
234
+
235
+ def _relative_markdown_path(path: Path) -> str:
236
+ try:
237
+ relative = path.relative_to(Path.cwd())
238
+ except ValueError:
239
+ relative = path
240
+ return relative.as_posix()
241
+
242
+
243
+ def _safe_float(value: Any) -> float | None:
244
+ try:
245
+ if value is None:
246
+ return None
247
+ return float(value)
248
+ except (TypeError, ValueError):
249
+ return None
250
+
251
+
252
+ def _safe_int(value: Any) -> int | None:
253
+ try:
254
+ if value is None:
255
+ return None
256
+ return int(value)
257
+ except (TypeError, ValueError):
258
+ return None
259
+
260
+
261
+ def _safe_str(value: Any) -> str | None:
262
+ if value is None:
263
+ return None
264
+ return str(value)