themis-eval 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +429 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/commands/results.py +252 -0
  8. themis/cli/main.py +427 -57
  9. themis/comparison/__init__.py +25 -0
  10. themis/comparison/engine.py +348 -0
  11. themis/comparison/reports.py +283 -0
  12. themis/comparison/statistics.py +402 -0
  13. themis/core/entities.py +23 -3
  14. themis/evaluation/metrics/code/__init__.py +19 -0
  15. themis/evaluation/metrics/code/codebleu.py +144 -0
  16. themis/evaluation/metrics/code/execution.py +280 -0
  17. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  18. themis/evaluation/metrics/nlp/__init__.py +21 -0
  19. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  20. themis/evaluation/metrics/nlp/bleu.py +129 -0
  21. themis/evaluation/metrics/nlp/meteor.py +153 -0
  22. themis/evaluation/metrics/nlp/rouge.py +136 -0
  23. themis/evaluation/pipelines/standard_pipeline.py +68 -8
  24. themis/experiment/cache_manager.py +8 -3
  25. themis/experiment/export.py +110 -2
  26. themis/experiment/orchestrator.py +109 -11
  27. themis/experiment/storage.py +1457 -110
  28. themis/generation/providers/litellm_provider.py +46 -0
  29. themis/generation/runner.py +22 -6
  30. themis/integrations/huggingface.py +12 -1
  31. themis/integrations/wandb.py +13 -1
  32. themis/interfaces/__init__.py +86 -0
  33. themis/presets/__init__.py +10 -0
  34. themis/presets/benchmarks.py +354 -0
  35. themis/presets/models.py +190 -0
  36. themis/server/__init__.py +28 -0
  37. themis/server/app.py +337 -0
  38. themis_eval-0.2.1.dist-info/METADATA +596 -0
  39. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/RECORD +42 -19
  40. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/WHEEL +1 -1
  41. themis_eval-0.1.1.dist-info/METADATA +0 -758
  42. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/licenses/LICENSE +0 -0
  43. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/top_level.txt +0 -0
themis/experiment/export.py

@@ -146,19 +146,122 @@ def export_report_json(
     return path


+def export_summary_json(
+    report: orchestrator.ExperimentReport,
+    path: str | Path,
+    *,
+    run_id: str | None = None,
+    indent: int = 2,
+) -> Path:
+    """Export a lightweight summary JSON file for quick results viewing.
+
+    This creates a small summary file (~1KB) containing only the essential
+    metrics and metadata, without the full sample-level details. This is
+    ideal for quickly comparing multiple runs without parsing large report files.
+
+    Args:
+        report: Experiment report to summarize
+        path: Output path for summary.json
+        run_id: Optional run identifier to include in summary
+        indent: JSON indentation level
+
+    Returns:
+        Path to the created summary file
+
+    Example:
+        >>> export_summary_json(report, "outputs/run-123/summary.json", run_id="run-123")
+        >>> # Quick comparison: cat outputs/*/summary.json | jq '.accuracy'
+
+    Note:
+        The summary file is typically ~1KB compared to ~1.6MB for the full report.
+        This makes it 1000x faster to view and compare results across runs.
+    """
+    # Extract key metrics
+    metrics_summary = {}
+    for name, aggregate in report.evaluation_report.metrics.items():
+        metrics_summary[name] = {
+            "mean": aggregate.mean,
+            "count": aggregate.count,
+        }
+
+    # Extract metadata from first generation record
+    metadata = {}
+    if report.generation_results:
+        first_record = report.generation_results[0]
+        metadata = {
+            "model": first_record.task.model.identifier,
+            "prompt_template": first_record.task.prompt.spec.name,
+            "sampling": {
+                "temperature": first_record.task.sampling.temperature,
+                "top_p": first_record.task.sampling.top_p,
+                "max_tokens": first_record.task.sampling.max_tokens,
+            },
+        }
+
+    # Calculate total cost if available
+    total_cost = 0.0
+    for record in report.generation_results:
+        if "cost_usd" in record.metrics:
+            total_cost += record.metrics["cost_usd"]
+
+    # Count failures
+    failure_count = len(report.evaluation_report.failures)
+
+    # Build summary
+    summary = {
+        "run_id": run_id,
+        "total_samples": len(report.generation_results),
+        "metrics": metrics_summary,
+        "metadata": metadata,
+        "cost_usd": round(total_cost, 4) if total_cost > 0 else None,
+        "failures": failure_count,
+        "failure_rate": (
+            round(failure_count / len(report.generation_results), 4)
+            if report.generation_results
+            else 0.0
+        ),
+    }
+
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(summary, indent=indent), encoding="utf-8")
+    return path
+
+
 def export_report_bundle(
     report: orchestrator.ExperimentReport,
     *,
     csv_path: str | Path | None = None,
     html_path: str | Path | None = None,
     json_path: str | Path | None = None,
+    summary_path: str | Path | None = None,
+    run_id: str | None = None,
     charts: Sequence[ChartLike] | None = None,
     title: str = "Experiment report",
     sample_limit: int = 100,
     indent: int = 2,
 ) -> OrderedDict[str, Path]:
-    """Convenience helper that writes multiple export formats at once."""
-
+    """Convenience helper that writes multiple export formats at once.
+
+    Args:
+        report: Experiment report to export
+        csv_path: Optional path for CSV export
+        html_path: Optional path for HTML export
+        json_path: Optional path for full JSON export
+        summary_path: Optional path for lightweight summary JSON export
+        run_id: Optional run identifier for summary
+        charts: Optional charts to include in visualizations
+        title: Report title
+        sample_limit: Maximum samples to include in detailed exports
+        indent: JSON indentation level
+
+    Returns:
+        Ordered dict of format -> path for created files
+
+    Note:
+        The summary export is highly recommended as it provides quick access
+        to key metrics without parsing large report files.
+    """
     outputs: OrderedDict[str, Path] = OrderedDict()
     if csv_path is not None:
         outputs["csv"] = export_report_csv(report, csv_path)
@@ -179,6 +282,10 @@ def export_report_bundle(
             sample_limit=sample_limit,
             indent=indent,
         )
+    if summary_path is not None:
+        outputs["summary"] = export_summary_json(
+            report, summary_path, run_id=run_id, indent=indent
+        )
     return outputs


@@ -684,6 +791,7 @@ __all__ = [
     "export_report_csv",
     "export_html_report",
     "export_report_json",
+    "export_summary_json",
     "export_report_bundle",
     "render_html_report",
     "build_json_report",
themis/experiment/orchestrator.py

@@ -2,10 +2,13 @@

 from __future__ import annotations

+import logging
 from datetime import datetime, timezone
 from typing import Callable, Sequence

 from themis.config.schema import IntegrationsConfig
+
+logger = logging.getLogger(__name__)
 from themis.core.entities import (
     EvaluationRecord,
     ExperimentFailure,
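
The new module-level logger means the orchestrator's progress messages go through the standard logging module, so a caller only sees them if logging is configured. A minimal sketch using only the standard library; the logger name follows the module path themis/experiment/orchestrator.py from the file list above.

    import logging

    # Show the INFO-level "Orchestrator: ..." progress messages added below.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(name)s %(levelname)s %(message)s",
    )

    # Optionally raise verbosity for just this module to also see the
    # per-record DEBUG messages.
    logging.getLogger("themis.experiment.orchestrator").setLevel(logging.DEBUG)
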
@@ -102,6 +105,8 @@ class ExperimentOrchestrator:
         Returns:
             ExperimentReport with generation results, evaluation, and metadata
         """
+        logger.info("Orchestrator: Initializing experiment run")
+
         # Initialize integrations
         self._integrations.initialize_run(
             {
@@ -112,28 +117,58 @@ class ExperimentOrchestrator:
         )

         # Prepare dataset
-        dataset_list = self._resolve_dataset(
-            dataset=dataset, dataset_loader=dataset_loader, run_id=run_id
-        )
+        logger.info("Orchestrator: Loading dataset...")
+        try:
+            dataset_list = self._resolve_dataset(
+                dataset=dataset, dataset_loader=dataset_loader, run_id=run_id
+            )
+            logger.info(f"Orchestrator: Dataset loaded ({len(dataset_list)} total samples)")
+        except Exception as e:
+            logger.error(f"Orchestrator: ❌ Failed to load dataset: {e}")
+            raise
+
         selected_dataset = (
             dataset_list[:max_samples] if max_samples is not None else dataset_list
         )
         run_identifier = run_id or self._default_run_id()
+
+        logger.info(f"Orchestrator: Processing {len(selected_dataset)} samples")
+        logger.info(f"Orchestrator: Run ID = {run_identifier}")
+
+        # Initialize run in storage (if storage exists and run doesn't exist)
+        if self._cache.has_storage:
+            if not resume or not self._cache._storage._run_metadata_exists(run_identifier):
+                self._cache._storage.start_run(run_identifier, experiment_id="default")

         # Cache dataset for resumability
         if dataset_list:
             self._cache.cache_dataset(run_identifier, dataset_list)

         # Expand dataset into generation tasks
-        tasks = list(self._plan.expand(selected_dataset))
+        logger.info("Orchestrator: Expanding dataset into generation tasks...")
+        try:
+            tasks = list(self._plan.expand(selected_dataset))
+            logger.info(f"Orchestrator: Created {len(tasks)} generation tasks")
+        except Exception as e:
+            logger.error(f"Orchestrator: ❌ Failed to expand dataset: {e}")
+            raise
+
+        # Build evaluation configuration for cache invalidation
+        evaluation_config = self._build_evaluation_config()

         # Load cached results if resuming
+        if resume:
+            logger.info("Orchestrator: Loading cached results...")
         cached_records = (
             self._cache.load_cached_records(run_identifier) if resume else {}
         )
         cached_evaluations = (
-            self._cache.load_cached_evaluations(run_identifier) if resume else {}
+            self._cache.load_cached_evaluations(run_identifier, evaluation_config) if resume else {}
         )
+        if resume and cached_records:
+            logger.info(f"Orchestrator: Found {len(cached_records)} cached generation records")
+        if resume and cached_evaluations:
+            logger.info(f"Orchestrator: Found {len(cached_evaluations)} cached evaluation records")

         # Process tasks: use cached or run new generations
         generation_results: list[GenerationRecord] = []
@@ -144,8 +179,8 @@ class ExperimentOrchestrator:
         cached_eval_records: list[EvaluationRecord] = []

         for task in tasks:
-            cache_key = experiment_storage.task_cache_key(task)
-            cached = cached_records.get(cache_key)
+            task_cache_key = experiment_storage.task_cache_key(task)
+            cached = cached_records.get(task_cache_key)
             if cached is not None:
                 generation_results.append(cached)
                 if cached.error:
@@ -155,12 +190,14 @@ class ExperimentOrchestrator:
                             message=cached.error.message,
                         )
                     )
-                evaluation = cached_evaluations.get(cache_key)
+                # Use evaluation_cache_key that includes evaluation config
+                eval_cache_key = experiment_storage.evaluation_cache_key(task, evaluation_config)
+                evaluation = cached_evaluations.get(eval_cache_key)
                 if evaluation is not None:
                     cached_eval_records.append(evaluation)
                 else:
                     pending_records.append(cached)
-                    pending_keys.append(cache_key)
+                    pending_keys.append(eval_cache_key)
                 if on_result:
                     on_result(cached)
             else:
@@ -168,9 +205,18 @@ class ExperimentOrchestrator:

         # Run pending generation tasks
         if pending_tasks:
+            logger.info(f"Orchestrator: Running {len(pending_tasks)} generation tasks...")
+            completed = 0
             for record in self._runner.run(pending_tasks):
+                logger.debug(f"Orchestrator: Received generation record")
                 generation_results.append(record)
+                completed += 1
+
+                # Log progress every 10 samples or at key milestones
+                if completed % 10 == 0 or completed == len(pending_tasks):
+                    logger.info(f"Orchestrator: Generation progress: {completed}/{len(pending_tasks)} ({100*completed//len(pending_tasks)}%)")

+                logger.debug(f"Orchestrator: Processing record (cost tracking...)")
                 # Track cost for successful generations
                 if record.output and record.output.usage:
                     usage = record.output.usage
@@ -187,6 +233,7 @@ class ExperimentOrchestrator:
                         cost=cost,
                     )

+                logger.debug(f"Orchestrator: Processing record (error handling...)")
                 if record.error:
                     failures.append(
                         ExperimentFailure(
@@ -194,35 +241,56 @@ class ExperimentOrchestrator:
                             message=record.error.message,
                         )
                     )
+
+                logger.debug(f"Orchestrator: Processing record (caching...)")
                 cache_key = experiment_storage.task_cache_key(record.task)
                 if cache_results:
                     self._cache.save_generation_record(
                         run_identifier, record, cache_key
                     )
+
+                logger.debug(f"Orchestrator: Processing record (adding to pending...)")
                 pending_records.append(record)
                 pending_keys.append(cache_key)
+
+                logger.debug(f"Orchestrator: Processing record (callback...)")
                 if on_result:
                     on_result(record)
+                logger.debug(f"Orchestrator: Record processing complete")

         # Evaluate pending records
+        logger.info(f"Orchestrator: Preparing to evaluate {len(pending_records)} pending records...")
         if pending_records:
-            new_evaluation_report = self._evaluation.evaluate(pending_records)
+            logger.info(f"Orchestrator: Starting evaluation of {len(pending_records)} records...")
+            try:
+                new_evaluation_report = self._evaluation.evaluate(pending_records)
+                logger.info(f"Orchestrator: ✅ Evaluation complete - got {len(new_evaluation_report.records)} results")
+            except Exception as e:
+                logger.error(f"Orchestrator: ❌ Evaluation failed: {e}")
+                raise
         else:
+            logger.info("Orchestrator: No new records to evaluate (all cached)")
             new_evaluation_report = evaluation_pipeline.EvaluationReport(
                 metrics={}, failures=[], records=[]
             )

         # Cache evaluation results
         for record, evaluation in zip(pending_records, new_evaluation_report.records):
-            self._cache.save_evaluation_record(run_identifier, record, evaluation)
+            self._cache.save_evaluation_record(
+                run_identifier, record, evaluation, evaluation_config
+            )

         # Combine cached and new evaluations
+        logger.info("Orchestrator: Combining cached and new evaluations...")
         evaluation_report = self._combine_evaluations(
             cached_eval_records, new_evaluation_report
         )
+        logger.info(f"Orchestrator: Total evaluation records: {len(evaluation_report.records)}")

         # Get cost breakdown
         cost_breakdown = self._cost_tracker.get_breakdown()
+        if cost_breakdown.total_cost > 0:
+            logger.info(f"Orchestrator: Total cost: ${cost_breakdown.total_cost:.4f}")

         # Build metadata
         metadata = {
@@ -274,6 +342,36 @@ class ExperimentOrchestrator:
     def _default_run_id(self) -> str:
         return datetime.now(timezone.utc).strftime("run-%Y%m%d-%H%M%S")

+    def _build_evaluation_config(self) -> dict:
+        """Build evaluation configuration for cache key generation.
+
+        This configuration includes all evaluation settings that affect results,
+        so changing metrics or extractors will invalidate the cache.
+
+        Returns:
+            Dictionary with evaluation configuration
+        """
+        config = {}
+
+        # Add metric names/types
+        if hasattr(self._evaluation, "_metrics"):
+            config["metrics"] = sorted([
+                f"{metric.__class__.__module__}.{metric.__class__.__name__}:{metric.name}"
+                for metric in self._evaluation._metrics
+            ])
+
+        # Add extractor type
+        if hasattr(self._evaluation, "_extractor"):
+            extractor = self._evaluation._extractor
+            extractor_type = f"{extractor.__class__.__module__}.{extractor.__class__.__name__}"
+            config["extractor"] = extractor_type
+
+            # Include extractor-specific configuration if available
+            if hasattr(extractor, "field_name"):
+                config["extractor_field"] = extractor.field_name
+
+        return config
+
     def _resolve_dataset(
         self,
         *,
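
The _build_evaluation_config hunk above, together with the evaluation_cache_key call earlier in the diff, separates two cache keys: generation records are keyed by the task alone, while evaluation records are also keyed by the metric and extractor configuration, so changing either invalidates only the evaluation cache. The implementation of experiment_storage.evaluation_cache_key is not part of this diff; the sketch below is an illustrative assumption of how such a key could be derived, not the package's actual code.

    import hashlib
    import json

    def illustrative_evaluation_cache_key(task_cache_key: str, evaluation_config: dict) -> str:
        # Canonicalize the config dict (as returned by _build_evaluation_config)
        # so identical metric/extractor settings always hash to the same value.
        config_blob = json.dumps(evaluation_config, sort_keys=True)
        config_hash = hashlib.sha256(config_blob.encode("utf-8")).hexdigest()[:16]
        # A changed metric list or extractor changes config_hash, invalidating
        # cached evaluations while cached generations remain reusable.
        return f"{task_cache_key}:{config_hash}"
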