themis-eval 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/commands/results.py +252 -0
  8. themis/cli/main.py +427 -57
  9. themis/comparison/__init__.py +25 -0
  10. themis/comparison/engine.py +348 -0
  11. themis/comparison/reports.py +283 -0
  12. themis/comparison/statistics.py +402 -0
  13. themis/core/entities.py +23 -3
  14. themis/evaluation/metrics/code/__init__.py +19 -0
  15. themis/evaluation/metrics/code/codebleu.py +144 -0
  16. themis/evaluation/metrics/code/execution.py +280 -0
  17. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  18. themis/evaluation/metrics/nlp/__init__.py +21 -0
  19. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  20. themis/evaluation/metrics/nlp/bleu.py +129 -0
  21. themis/evaluation/metrics/nlp/meteor.py +153 -0
  22. themis/evaluation/metrics/nlp/rouge.py +136 -0
  23. themis/evaluation/pipelines/standard_pipeline.py +68 -8
  24. themis/experiment/cache_manager.py +8 -3
  25. themis/experiment/export.py +110 -2
  26. themis/experiment/orchestrator.py +48 -6
  27. themis/experiment/storage.py +1313 -110
  28. themis/integrations/huggingface.py +12 -1
  29. themis/integrations/wandb.py +13 -1
  30. themis/interfaces/__init__.py +86 -0
  31. themis/presets/__init__.py +10 -0
  32. themis/presets/benchmarks.py +354 -0
  33. themis/presets/models.py +190 -0
  34. themis/server/__init__.py +28 -0
  35. themis/server/app.py +337 -0
  36. themis_eval-0.2.0.dist-info/METADATA +596 -0
  37. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/RECORD +40 -17
  38. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  39. themis_eval-0.1.1.dist-info/METADATA +0 -758
  40. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  41. {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
@@ -146,19 +146,122 @@ def export_report_json(
     return path
 
 
+def export_summary_json(
+    report: orchestrator.ExperimentReport,
+    path: str | Path,
+    *,
+    run_id: str | None = None,
+    indent: int = 2,
+) -> Path:
+    """Export a lightweight summary JSON file for quick results viewing.
+
+    This creates a small summary file (~1KB) containing only the essential
+    metrics and metadata, without the full sample-level details. This is
+    ideal for quickly comparing multiple runs without parsing large report files.
+
+    Args:
+        report: Experiment report to summarize
+        path: Output path for summary.json
+        run_id: Optional run identifier to include in summary
+        indent: JSON indentation level
+
+    Returns:
+        Path to the created summary file
+
+    Example:
+        >>> export_summary_json(report, "outputs/run-123/summary.json", run_id="run-123")
+        >>> # Quick comparison: cat outputs/*/summary.json | jq '.accuracy'
+
+    Note:
+        The summary file is typically ~1KB compared to ~1.6MB for the full report.
+        This makes it 1000x faster to view and compare results across runs.
+    """
+    # Extract key metrics
+    metrics_summary = {}
+    for name, aggregate in report.evaluation_report.metrics.items():
+        metrics_summary[name] = {
+            "mean": aggregate.mean,
+            "count": aggregate.count,
+        }
+
+    # Extract metadata from first generation record
+    metadata = {}
+    if report.generation_results:
+        first_record = report.generation_results[0]
+        metadata = {
+            "model": first_record.task.model.identifier,
+            "prompt_template": first_record.task.prompt.spec.name,
+            "sampling": {
+                "temperature": first_record.task.sampling.temperature,
+                "top_p": first_record.task.sampling.top_p,
+                "max_tokens": first_record.task.sampling.max_tokens,
+            },
+        }
+
+    # Calculate total cost if available
+    total_cost = 0.0
+    for record in report.generation_results:
+        if "cost_usd" in record.metrics:
+            total_cost += record.metrics["cost_usd"]
+
+    # Count failures
+    failure_count = len(report.evaluation_report.failures)
+
+    # Build summary
+    summary = {
+        "run_id": run_id,
+        "total_samples": len(report.generation_results),
+        "metrics": metrics_summary,
+        "metadata": metadata,
+        "cost_usd": round(total_cost, 4) if total_cost > 0 else None,
+        "failures": failure_count,
+        "failure_rate": (
+            round(failure_count / len(report.generation_results), 4)
+            if report.generation_results
+            else 0.0
+        ),
+    }
+
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(summary, indent=indent), encoding="utf-8")
+    return path
+
+
 def export_report_bundle(
     report: orchestrator.ExperimentReport,
     *,
     csv_path: str | Path | None = None,
     html_path: str | Path | None = None,
     json_path: str | Path | None = None,
+    summary_path: str | Path | None = None,
+    run_id: str | None = None,
     charts: Sequence[ChartLike] | None = None,
     title: str = "Experiment report",
     sample_limit: int = 100,
     indent: int = 2,
 ) -> OrderedDict[str, Path]:
-    """Convenience helper that writes multiple export formats at once."""
-
+    """Convenience helper that writes multiple export formats at once.
+
+    Args:
+        report: Experiment report to export
+        csv_path: Optional path for CSV export
+        html_path: Optional path for HTML export
+        json_path: Optional path for full JSON export
+        summary_path: Optional path for lightweight summary JSON export
+        run_id: Optional run identifier for summary
+        charts: Optional charts to include in visualizations
+        title: Report title
+        sample_limit: Maximum samples to include in detailed exports
+        indent: JSON indentation level
+
+    Returns:
+        Ordered dict of format -> path for created files
+
+    Note:
+        The summary export is highly recommended as it provides quick access
+        to key metrics without parsing large report files.
+    """
     outputs: OrderedDict[str, Path] = OrderedDict()
     if csv_path is not None:
         outputs["csv"] = export_report_csv(report, csv_path)
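For orientation, export_summary_json above serializes a flat dictionary. An illustrative summary.json, shown as the Python dict it is built from (the keys mirror the `summary` dict in the hunk above; the model name, metric name, and numbers are invented for the example):

    # Illustrative only: values are invented, keys follow the code above.
    {
        "run_id": "run-123",
        "total_samples": 500,
        "metrics": {"exact_match": {"mean": 0.81, "count": 500}},
        "metadata": {
            "model": "gpt-4o-mini",
            "prompt_template": "default",
            "sampling": {"temperature": 0.0, "top_p": 1.0, "max_tokens": 512},
        },
        "cost_usd": 1.2345,
        "failures": 7,
        "failure_rate": 0.014,
    }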
@@ -179,6 +282,10 @@ def export_report_bundle(
             sample_limit=sample_limit,
             indent=indent,
         )
+    if summary_path is not None:
+        outputs["summary"] = export_summary_json(
+            report, summary_path, run_id=run_id, indent=indent
+        )
     return outputs
 
 
@@ -684,6 +791,7 @@ __all__ = [
     "export_report_csv",
     "export_html_report",
     "export_report_json",
+    "export_summary_json",
     "export_report_bundle",
     "render_html_report",
     "build_json_report",
@@ -120,6 +120,11 @@ class ExperimentOrchestrator:
         )
         run_identifier = run_id or self._default_run_id()
 
+        # Initialize run in storage (if storage exists and run doesn't exist)
+        if self._cache.has_storage:
+            if not resume or not self._cache._storage._run_metadata_exists(run_identifier):
+                self._cache._storage.start_run(run_identifier, experiment_id="default")
+
         # Cache dataset for resumability
         if dataset_list:
             self._cache.cache_dataset(run_identifier, dataset_list)
@@ -127,12 +132,15 @@ class ExperimentOrchestrator:
         # Expand dataset into generation tasks
         tasks = list(self._plan.expand(selected_dataset))
 
+        # Build evaluation configuration for cache invalidation
+        evaluation_config = self._build_evaluation_config()
+
         # Load cached results if resuming
         cached_records = (
             self._cache.load_cached_records(run_identifier) if resume else {}
         )
         cached_evaluations = (
-            self._cache.load_cached_evaluations(run_identifier) if resume else {}
+            self._cache.load_cached_evaluations(run_identifier, evaluation_config) if resume else {}
         )
 
         # Process tasks: use cached or run new generations
@@ -144,8 +152,8 @@ class ExperimentOrchestrator:
         cached_eval_records: list[EvaluationRecord] = []
 
         for task in tasks:
-            cache_key = experiment_storage.task_cache_key(task)
-            cached = cached_records.get(cache_key)
+            task_cache_key = experiment_storage.task_cache_key(task)
+            cached = cached_records.get(task_cache_key)
             if cached is not None:
                 generation_results.append(cached)
                 if cached.error:
@@ -155,12 +163,14 @@ class ExperimentOrchestrator:
                             message=cached.error.message,
                         )
                     )
-                evaluation = cached_evaluations.get(cache_key)
+                # Use evaluation_cache_key that includes evaluation config
+                eval_cache_key = experiment_storage.evaluation_cache_key(task, evaluation_config)
+                evaluation = cached_evaluations.get(eval_cache_key)
                 if evaluation is not None:
                     cached_eval_records.append(evaluation)
                 else:
                     pending_records.append(cached)
-                    pending_keys.append(cache_key)
+                    pending_keys.append(eval_cache_key)
                 if on_result:
                     on_result(cached)
             else:
@@ -214,7 +224,9 @@ class ExperimentOrchestrator:
 
         # Cache evaluation results
         for record, evaluation in zip(pending_records, new_evaluation_report.records):
-            self._cache.save_evaluation_record(run_identifier, record, evaluation)
+            self._cache.save_evaluation_record(
+                run_identifier, record, evaluation, evaluation_config
+            )
 
         # Combine cached and new evaluations
         evaluation_report = self._combine_evaluations(
@@ -274,6 +286,36 @@ class ExperimentOrchestrator:
     def _default_run_id(self) -> str:
         return datetime.now(timezone.utc).strftime("run-%Y%m%d-%H%M%S")
 
+    def _build_evaluation_config(self) -> dict:
+        """Build evaluation configuration for cache key generation.
+
+        This configuration includes all evaluation settings that affect results,
+        so changing metrics or extractors will invalidate the cache.
+
+        Returns:
+            Dictionary with evaluation configuration
+        """
+        config = {}
+
+        # Add metric names/types
+        if hasattr(self._evaluation, "_metrics"):
+            config["metrics"] = sorted([
+                f"{metric.__class__.__module__}.{metric.__class__.__name__}:{metric.name}"
+                for metric in self._evaluation._metrics
+            ])
+
+        # Add extractor type
+        if hasattr(self._evaluation, "_extractor"):
+            extractor = self._evaluation._extractor
+            extractor_type = f"{extractor.__class__.__module__}.{extractor.__class__.__name__}"
+            config["extractor"] = extractor_type
+
+            # Include extractor-specific configuration if available
+            if hasattr(extractor, "field_name"):
+                config["extractor_field"] = extractor.field_name
+
+        return config
+
     def _resolve_dataset(
         self,
         *,
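The orchestrator now keys cached evaluations on both the task and the evaluation configuration via experiment_storage.evaluation_cache_key(task, evaluation_config). Its implementation lives in themis/experiment/storage.py and is not shown in this excerpt; a rough, purely illustrative sketch of one way such a key could be derived (the function name and hashing scheme below are assumptions, not the package's code):

    import hashlib
    import json


    def evaluation_cache_key_sketch(task_key: str, evaluation_config: dict) -> str:
        # Hash the config so that changing metrics or extractors produces a new key,
        # which invalidates previously cached evaluation records.
        config_blob = json.dumps(evaluation_config, sort_keys=True)
        config_hash = hashlib.sha256(config_blob.encode("utf-8")).hexdigest()[:12]
        return f"{task_key}:{config_hash}"

Whatever the real scheme is, the orchestrator-side contract is visible in the diff: generation records stay keyed by task_cache_key alone, while evaluation records are looked up and saved with the config-aware key, so editing metrics or the extractor forces re-evaluation without discarding cached generations.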