themis-eval 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the content of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Files changed (41)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/commands/results.py +252 -0
  8. themis/cli/main.py +427 -57
  9. themis/comparison/__init__.py +25 -0
  10. themis/comparison/engine.py +348 -0
  11. themis/comparison/reports.py +283 -0
  12. themis/comparison/statistics.py +402 -0
  13. themis/core/entities.py +23 -3
  14. themis/evaluation/metrics/code/__init__.py +19 -0
  15. themis/evaluation/metrics/code/codebleu.py +144 -0
  16. themis/evaluation/metrics/code/execution.py +280 -0
  17. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  18. themis/evaluation/metrics/nlp/__init__.py +21 -0
  19. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  20. themis/evaluation/metrics/nlp/bleu.py +129 -0
  21. themis/evaluation/metrics/nlp/meteor.py +153 -0
  22. themis/evaluation/metrics/nlp/rouge.py +136 -0
  23. themis/evaluation/pipelines/standard_pipeline.py +68 -8
  24. themis/experiment/cache_manager.py +8 -3
  25. themis/experiment/export.py +110 -2
  26. themis/experiment/orchestrator.py +48 -6
  27. themis/experiment/storage.py +1313 -110
  28. themis/integrations/huggingface.py +12 -1
  29. themis/integrations/wandb.py +13 -1
  30. themis/interfaces/__init__.py +86 -0
  31. themis/presets/__init__.py +10 -0
  32. themis/presets/benchmarks.py +354 -0
  33. themis/presets/models.py +190 -0
  34. themis/server/__init__.py +28 -0
  35. themis/server/app.py +337 -0
  36. themis_eval-0.2.0.dist-info/METADATA +596 -0
  37. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/RECORD +40 -17
  38. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  39. themis_eval-0.1.1.dist-info/METADATA +0 -758
  40. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  41. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
themis/backends/storage.py (new file)
@@ -0,0 +1,260 @@
+"""Storage backend interface for custom storage implementations.
+
+This module defines the abstract interface for storage backends, allowing
+users to implement custom storage solutions (cloud storage, databases, etc.)
+without modifying Themis core code.
+
+Example implementations:
+- S3Backend: Store results in AWS S3
+- GCSBackend: Store results in Google Cloud Storage
+- PostgresBackend: Store results in PostgreSQL
+- RedisBackend: Use Redis for distributed caching
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any, Dict, List
+
+from themis.core.entities import (
+    EvaluationRecord,
+    ExperimentReport,
+    GenerationRecord,
+)
+
+
+class StorageBackend(ABC):
+    """Abstract interface for storage backends.
+
+    Implement this interface to create custom storage solutions.
+    All methods should be thread-safe if used with concurrent workers.
+
+    Example:
+        >>> class S3StorageBackend(StorageBackend):
+        ...     def __init__(self, bucket: str):
+        ...         self.bucket = bucket
+        ...         self.s3_client = boto3.client('s3')
+        ...
+        ...     def save_run_metadata(self, run_id: str, metadata: Dict[str, Any]) -> None:
+        ...         key = f"runs/{run_id}/metadata.json"
+        ...         self.s3_client.put_object(
+        ...             Bucket=self.bucket,
+        ...             Key=key,
+        ...             Body=json.dumps(metadata),
+        ...         )
+        ...     # ... implement other methods
+    """
+
+    @abstractmethod
+    def save_run_metadata(self, run_id: str, metadata: Dict[str, Any]) -> None:
+        """Save run metadata.
+
+        Args:
+            run_id: Unique identifier for the run
+            metadata: Run metadata to save (as dictionary)
+        """
+        pass
+
+    @abstractmethod
+    def load_run_metadata(self, run_id: str) -> Dict[str, Any]:
+        """Load run metadata.
+
+        Args:
+            run_id: Unique identifier for the run
+
+        Returns:
+            Run metadata as dictionary
+
+        Raises:
+            FileNotFoundError: If run metadata doesn't exist
+        """
+        pass
+
+    @abstractmethod
+    def save_generation_record(self, run_id: str, record: GenerationRecord) -> None:
+        """Save a generation record.
+
+        Args:
+            run_id: Unique identifier for the run
+            record: Generation record to save
+
+        Note:
+            This method should be atomic and thread-safe.
+        """
+        pass
+
+    @abstractmethod
+    def load_generation_records(self, run_id: str) -> List[GenerationRecord]:
+        """Load all generation records for a run.
+
+        Args:
+            run_id: Unique identifier for the run
+
+        Returns:
+            List of generation records
+        """
+        pass
+
+    @abstractmethod
+    def save_evaluation_record(self, run_id: str, record: EvaluationRecord) -> None:
+        """Save an evaluation record.
+
+        Args:
+            run_id: Unique identifier for the run
+            record: Evaluation record to save
+
+        Note:
+            This method should be atomic and thread-safe.
+        """
+        pass
+
+    @abstractmethod
+    def load_evaluation_records(self, run_id: str) -> Dict[str, EvaluationRecord]:
+        """Load all evaluation records for a run.
+
+        Args:
+            run_id: Unique identifier for the run
+
+        Returns:
+            Dictionary mapping cache_key to EvaluationRecord
+        """
+        pass
+
+    @abstractmethod
+    def save_report(self, run_id: str, report: ExperimentReport) -> None:
+        """Save experiment report.
+
+        Args:
+            run_id: Unique identifier for the run
+            report: Experiment report to save
+        """
+        pass
+
+    @abstractmethod
+    def load_report(self, run_id: str) -> ExperimentReport:
+        """Load experiment report.
+
+        Args:
+            run_id: Unique identifier for the run
+
+        Returns:
+            Experiment report
+
+        Raises:
+            FileNotFoundError: If report doesn't exist
+        """
+        pass
+
+    @abstractmethod
+    def list_runs(self) -> List[str]:
+        """List all run IDs in storage.
+
+        Returns:
+            List of run IDs
+        """
+        pass
+
+    @abstractmethod
+    def run_exists(self, run_id: str) -> bool:
+        """Check if a run exists in storage.
+
+        Args:
+            run_id: Unique identifier for the run
+
+        Returns:
+            True if run exists, False otherwise
+        """
+        pass
+
+    @abstractmethod
+    def delete_run(self, run_id: str) -> None:
+        """Delete all data for a run.
+
+        Args:
+            run_id: Unique identifier for the run
+        """
+        pass
+
+    def close(self) -> None:
+        """Close the storage backend and release resources.
+
+        Optional method for cleanup. Called when storage is no longer needed.
+        """
+        pass
+
+
+class LocalFileStorageBackend(StorageBackend):
+    """Adapter for the existing ExperimentStorage implementation.
+
+    This class wraps the current file-based storage implementation
+    to conform to the StorageBackend interface.
+
+    Note:
+        This is a compatibility layer. New code should use the interface,
+        but existing storage logic is preserved.
+    """
+
+    def __init__(self, storage_path: str | Path):
+        """Initialize with path to storage directory.
+
+        Args:
+            storage_path: Path to storage directory
+        """
+        from themis.experiment.storage import ExperimentStorage
+        self._storage = ExperimentStorage(storage_path)
+
+    def save_run_metadata(self, run_id: str, metadata: Dict[str, Any]) -> None:
+        """Save run metadata."""
+        experiment_id = metadata.get("experiment_id", "default")
+        self._storage.start_run(run_id, experiment_id=experiment_id)
+
+    def load_run_metadata(self, run_id: str) -> Dict[str, Any]:
+        """Load run metadata."""
+        # Note: Current storage doesn't have a direct method for this
+        # This is a limitation of the adapter pattern
+        raise NotImplementedError("Use ExperimentStorage directly for now")
+
+    def save_generation_record(self, run_id: str, record: GenerationRecord) -> None:
+        """Save generation record."""
+        self._storage.append_record(run_id, record)
+
+    def load_generation_records(self, run_id: str) -> List[GenerationRecord]:
+        """Load generation records."""
+        cached = self._storage.load_cached_records(run_id)
+        return list(cached.values())
+
+    def save_evaluation_record(self, run_id: str, record: EvaluationRecord) -> None:
+        """Save evaluation record."""
+        self._storage.append_evaluation(run_id, record)
+
+    def load_evaluation_records(self, run_id: str) -> Dict[str, EvaluationRecord]:
+        """Load evaluation records."""
+        return self._storage.load_cached_evaluations(run_id)
+
+    def save_report(self, run_id: str, report: ExperimentReport) -> None:
+        """Save report."""
+        self._storage.save_report(run_id, report)
+
+    def load_report(self, run_id: str) -> ExperimentReport:
+        """Load report."""
+        return self._storage.load_report(run_id)
+
+    def list_runs(self) -> List[str]:
+        """List runs."""
+        return self._storage.list_runs()
+
+    def run_exists(self, run_id: str) -> bool:
+        """Check if run exists."""
+        return run_id in self._storage.list_runs()
+
+    def delete_run(self, run_id: str) -> None:
+        """Delete run."""
+        # Note: Current storage doesn't have delete functionality
+        raise NotImplementedError("Delete not implemented in current storage")
+
+
+__all__ = [
+    "StorageBackend",
+    "LocalFileStorageBackend",
+]
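
For orientation, a minimal custom backend against this interface might look like the sketch below. It is illustrative only and not part of the wheel: an in-memory store suitable for tests, single-threaded (it skips the thread-safety the docstrings ask for), and the assumption that EvaluationRecord carries a cache_key attribute is flagged in a comment.

    # Illustrative sketch only (not in the package): an in-memory StorageBackend.
    # State lives in the process, so it is test-only and not thread-safe.
    from typing import Any, Dict, List

    from themis.backends.storage import StorageBackend
    from themis.core.entities import (
        EvaluationRecord,
        ExperimentReport,
        GenerationRecord,
    )


    class InMemoryStorageBackend(StorageBackend):
        def __init__(self) -> None:
            self._runs: Dict[str, Dict[str, Any]] = {}

        def _run(self, run_id: str) -> Dict[str, Any]:
            # Create the per-run container on first use.
            return self._runs.setdefault(
                run_id,
                {"metadata": {}, "generations": [], "evaluations": {}, "report": None},
            )

        def save_run_metadata(self, run_id: str, metadata: Dict[str, Any]) -> None:
            self._run(run_id)["metadata"] = dict(metadata)

        def load_run_metadata(self, run_id: str) -> Dict[str, Any]:
            if run_id not in self._runs:
                raise FileNotFoundError(run_id)  # contract from the ABC docstring
            return self._runs[run_id]["metadata"]

        def save_generation_record(self, run_id: str, record: GenerationRecord) -> None:
            self._run(run_id)["generations"].append(record)

        def load_generation_records(self, run_id: str) -> List[GenerationRecord]:
            return list(self._run(run_id)["generations"])

        def save_evaluation_record(self, run_id: str, record: EvaluationRecord) -> None:
            # Evaluations are keyed by cache_key per the ABC; whether the record
            # exposes it as an attribute is an assumption of this sketch.
            evaluations = self._run(run_id)["evaluations"]
            key = getattr(record, "cache_key", None) or str(len(evaluations))
            evaluations[key] = record

        def load_evaluation_records(self, run_id: str) -> Dict[str, EvaluationRecord]:
            return dict(self._run(run_id)["evaluations"])

        def save_report(self, run_id: str, report: ExperimentReport) -> None:
            self._run(run_id)["report"] = report

        def load_report(self, run_id: str) -> ExperimentReport:
            report = self._run(run_id)["report"]
            if report is None:
                raise FileNotFoundError(f"No report for run: {run_id}")
            return report

        def list_runs(self) -> List[str]:
            return list(self._runs)

        def run_exists(self, run_id: str) -> bool:
            return run_id in self._runs

        def delete_run(self, run_id: str) -> None:
            self._runs.pop(run_id, None)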
themis/cli/commands/results.py (new file)
@@ -0,0 +1,252 @@
+"""Quick results viewing commands for experiment summaries."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Annotated
+
+from cyclopts import Parameter
+
+
+def summary_command(
+    *,
+    run_id: Annotated[
+        str,
+        Parameter(
+            help="Run ID to view summary for",
+        ),
+    ],
+    storage: Annotated[
+        Path,
+        Parameter(
+            help="Storage directory containing experiment results",
+        ),
+    ] = Path(".cache/runs"),
+) -> int:
+    """View quick summary of a single experiment run.
+
+    This command reads the lightweight summary.json file (~1KB) instead of
+    the full report.json (~1.6MB), making it much faster for quick checks.
+
+    Examples:
+        # View summary for a specific run
+        uv run python -m themis.cli results summary \\
+            --run-id run-20260118-032014 \\
+            --storage outputs/evaluation
+
+        # Quick check of latest run
+        uv run python -m themis.cli results summary \\
+            --run-id $(ls -t outputs/evaluation | head -1)
+    """
+    try:
+        # Try to find summary.json
+        run_dir = storage / run_id
+        summary_path = run_dir / "summary.json"
+
+        if not summary_path.exists():
+            print(f"Error: Summary file not found at {summary_path}")
+            print("\nNote: summary.json is only available for runs created with")
+            print("the updated export functionality. For older runs, use the")
+            print("'compare' command which reads full report.json files.")
+            return 1
+
+        # Load summary
+        with summary_path.open("r", encoding="utf-8") as f:
+            summary = json.load(f)
+
+        # Display summary
+        print("=" * 80)
+        print(f"Experiment Summary: {run_id}")
+        print("=" * 80)
+
+        # Basic info
+        print(f"\nRun ID: {summary.get('run_id', 'N/A')}")
+        print(f"Total Samples: {summary.get('total_samples', 0)}")
+
+        # Metadata
+        metadata = summary.get("metadata", {})
+        if metadata:
+            print("\nConfiguration:")
+            print(f"  Model: {metadata.get('model', 'N/A')}")
+            print(f"  Prompt: {metadata.get('prompt_template', 'N/A')}")
+            sampling = metadata.get("sampling", {})
+            if sampling:
+                print(f"  Temperature: {sampling.get('temperature', 'N/A')}")
+                print(f"  Max Tokens: {sampling.get('max_tokens', 'N/A')}")
+
+        # Metrics
+        metrics = summary.get("metrics", {})
+        if metrics:
+            print("\nMetrics:")
+            for name, data in metrics.items():
+                mean = data.get("mean", 0)
+                count = data.get("count", 0)
+                print(f"  {name}: {mean:.4f} (n={count})")
+
+        # Cost
+        cost = summary.get("cost_usd")
+        if cost is not None:
+            print(f"\nCost: ${cost:.4f}")
+
+        # Failures
+        failures = summary.get("failures", 0)
+        failure_rate = summary.get("failure_rate", 0)
+        if failures > 0:
+            print(f"\nFailures: {failures} ({failure_rate:.2%})")
+
+        print("\n" + "=" * 80)
+        return 0
+
+    except FileNotFoundError:
+        print(f"Error: Run directory not found: {run_dir}")
+        return 1
+    except json.JSONDecodeError as e:
+        print(f"Error: Invalid JSON in summary file: {e}")
+        return 1
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+        import traceback
+
+        traceback.print_exc()
+        return 1
+
+
+def list_command(
+    *,
+    storage: Annotated[
+        Path,
+        Parameter(
+            help="Storage directory containing experiment results",
+        ),
+    ] = Path(".cache/runs"),
+    limit: Annotated[
+        int | None,
+        Parameter(
+            help="Maximum number of runs to display",
+        ),
+    ] = None,
+    sort_by: Annotated[
+        str,
+        Parameter(
+            help="Sort runs by: time (newest first) or metric name",
+        ),
+    ] = "time",
+) -> int:
+    """List all experiment runs with quick summaries.
+
+    This command scans for summary.json files and displays a table of all runs.
+    Much faster than loading full report.json files.
+
+    Examples:
+        # List all runs
+        uv run python -m themis.cli results list
+
+        # List 10 most recent runs
+        uv run python -m themis.cli results list --limit 10
+
+        # List runs sorted by accuracy
+        uv run python -m themis.cli results list --sort-by accuracy
+    """
+    try:
+        if not storage.exists():
+            print(f"Error: Storage directory not found: {storage}")
+            return 1
+
+        # Find all summary.json files
+        summaries = []
+        for run_dir in storage.iterdir():
+            if not run_dir.is_dir():
+                continue
+            summary_path = run_dir / "summary.json"
+            if summary_path.exists():
+                try:
+                    with summary_path.open("r", encoding="utf-8") as f:
+                        summary = json.load(f)
+                    summary["_run_dir"] = run_dir.name
+                    summary["_mtime"] = summary_path.stat().st_mtime
+                    summaries.append(summary)
+                except Exception:
+                    continue
+
+        if not summaries:
+            print(f"No experiment runs found in {storage}")
+            print("\nNote: Only runs with summary.json files are shown.")
+            return 0
+
+        # Sort summaries
+        if sort_by == "time":
+            summaries.sort(key=lambda s: s.get("_mtime", 0), reverse=True)
+        else:
+            # Sort by metric value
+            summaries.sort(
+                key=lambda s: s.get("metrics", {}).get(sort_by, {}).get("mean", 0),
+                reverse=True,
+            )
+
+        # Apply limit
+        if limit:
+            summaries = summaries[:limit]
+
+        # Display table
+        print("=" * 120)
+        print(f"Found {len(summaries)} experiment run(s)")
+        print("=" * 120)
+
+        # Collect all metric names
+        all_metrics = set()
+        for s in summaries:
+            all_metrics.update(s.get("metrics", {}).keys())
+        metric_names = sorted(all_metrics)
+
+        # Header
+        header_cols = ["Run ID", "Model", "Samples"] + metric_names + ["Cost ($)"]
+        col_widths = [25, 30, 8] + [12] * len(metric_names) + [10]
+
+        header = " | ".join(
+            col.ljust(width)[:width] for col, width in zip(header_cols, col_widths)
+        )
+        print(header)
+        print("-" * len(header))
+
+        # Rows
+        for summary in summaries:
+            run_id = summary.get("_run_dir", "N/A")[:25]
+            model = summary.get("metadata", {}).get("model", "N/A")[:30]
+            samples = str(summary.get("total_samples", 0))
+            cost = summary.get("cost_usd")
+
+            row_values = [run_id, model, samples]
+
+            # Add metric values
+            for metric_name in metric_names:
+                metric_data = summary.get("metrics", {}).get(metric_name, {})
+                mean = metric_data.get("mean")
+                if mean is not None:
+                    row_values.append(f"{mean:.4f}")
+                else:
+                    row_values.append("N/A")
+
+            # Add cost
+            if cost is not None:
+                row_values.append(f"{cost:.4f}")
+            else:
+                row_values.append("N/A")
+
+            row = " | ".join(
+                val.ljust(width)[:width] for val, width in zip(row_values, col_widths)
+            )
+            print(row)
+
+        print("=" * 120)
+        return 0
+
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+        import traceback
+
+        traceback.print_exc()
+        return 1
+
+
+__all__ = ["summary_command", "list_command"]
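
For reference, the shape of summary.json that both commands expect can be read off the code above. The sketch below writes a conforming file; the field names are taken from the reader code, while the concrete values (model name, metric numbers) are invented for illustration.

    # Sketch of the summary.json shape read by `results summary` and `results list`.
    # Field names come from the reader code above; the values are invented.
    import json
    from pathlib import Path

    summary = {
        "run_id": "run-20260118-032014",
        "total_samples": 500,
        "metadata": {
            "model": "example-model",  # shown in the list table's Model column
            "prompt_template": "zero-shot",
            "sampling": {"temperature": 0.0, "max_tokens": 1024},
        },
        "metrics": {
            # each metric needs "mean" and "count"; `list --sort-by <name>` sorts on "mean"
            "accuracy": {"mean": 0.8120, "count": 500},
        },
        "cost_usd": 1.2345,  # optional; when absent, no Cost line is printed
        "failures": 3,       # the Failures line is printed only when > 0
        "failure_rate": 0.006,
    }

    run_dir = Path(".cache/runs") / summary["run_id"]  # default --storage location
    run_dir.mkdir(parents=True, exist_ok=True)
    (run_dir / "summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")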