themis-eval 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +429 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/main.py +427 -57
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/core/entities.py +23 -3
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/pipelines/standard_pipeline.py +68 -8
- themis/experiment/cache_manager.py +8 -3
- themis/experiment/export.py +110 -2
- themis/experiment/orchestrator.py +109 -11
- themis/experiment/storage.py +1457 -110
- themis/generation/providers/litellm_provider.py +46 -0
- themis/generation/runner.py +22 -6
- themis/integrations/huggingface.py +12 -1
- themis/integrations/wandb.py +13 -1
- themis/interfaces/__init__.py +86 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis_eval-0.2.1.dist-info/METADATA +596 -0
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/RECORD +42 -19
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/WHEEL +1 -1
- themis_eval-0.1.1.dist-info/METADATA +0 -758
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/top_level.txt +0 -0
themis/experiment/export.py
CHANGED
@@ -146,19 +146,122 @@ def export_report_json(
     return path
 
 
+def export_summary_json(
+    report: orchestrator.ExperimentReport,
+    path: str | Path,
+    *,
+    run_id: str | None = None,
+    indent: int = 2,
+) -> Path:
+    """Export a lightweight summary JSON file for quick results viewing.
+
+    This creates a small summary file (~1KB) containing only the essential
+    metrics and metadata, without the full sample-level details. This is
+    ideal for quickly comparing multiple runs without parsing large report files.
+
+    Args:
+        report: Experiment report to summarize
+        path: Output path for summary.json
+        run_id: Optional run identifier to include in summary
+        indent: JSON indentation level
+
+    Returns:
+        Path to the created summary file
+
+    Example:
+        >>> export_summary_json(report, "outputs/run-123/summary.json", run_id="run-123")
+        >>> # Quick comparison: cat outputs/*/summary.json | jq '.accuracy'
+
+    Note:
+        The summary file is typically ~1KB compared to ~1.6MB for the full report.
+        This makes it 1000x faster to view and compare results across runs.
+    """
+    # Extract key metrics
+    metrics_summary = {}
+    for name, aggregate in report.evaluation_report.metrics.items():
+        metrics_summary[name] = {
+            "mean": aggregate.mean,
+            "count": aggregate.count,
+        }
+
+    # Extract metadata from first generation record
+    metadata = {}
+    if report.generation_results:
+        first_record = report.generation_results[0]
+        metadata = {
+            "model": first_record.task.model.identifier,
+            "prompt_template": first_record.task.prompt.spec.name,
+            "sampling": {
+                "temperature": first_record.task.sampling.temperature,
+                "top_p": first_record.task.sampling.top_p,
+                "max_tokens": first_record.task.sampling.max_tokens,
+            },
+        }
+
+    # Calculate total cost if available
+    total_cost = 0.0
+    for record in report.generation_results:
+        if "cost_usd" in record.metrics:
+            total_cost += record.metrics["cost_usd"]
+
+    # Count failures
+    failure_count = len(report.evaluation_report.failures)
+
+    # Build summary
+    summary = {
+        "run_id": run_id,
+        "total_samples": len(report.generation_results),
+        "metrics": metrics_summary,
+        "metadata": metadata,
+        "cost_usd": round(total_cost, 4) if total_cost > 0 else None,
+        "failures": failure_count,
+        "failure_rate": (
+            round(failure_count / len(report.generation_results), 4)
+            if report.generation_results
+            else 0.0
+        ),
+    }
+
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(summary, indent=indent), encoding="utf-8")
+    return path
+
+
 def export_report_bundle(
     report: orchestrator.ExperimentReport,
     *,
     csv_path: str | Path | None = None,
     html_path: str | Path | None = None,
     json_path: str | Path | None = None,
+    summary_path: str | Path | None = None,
+    run_id: str | None = None,
     charts: Sequence[ChartLike] | None = None,
     title: str = "Experiment report",
     sample_limit: int = 100,
     indent: int = 2,
 ) -> OrderedDict[str, Path]:
-    """Convenience helper that writes multiple export formats at once.
-
+    """Convenience helper that writes multiple export formats at once.
+
+    Args:
+        report: Experiment report to export
+        csv_path: Optional path for CSV export
+        html_path: Optional path for HTML export
+        json_path: Optional path for full JSON export
+        summary_path: Optional path for lightweight summary JSON export
+        run_id: Optional run identifier for summary
+        charts: Optional charts to include in visualizations
+        title: Report title
+        sample_limit: Maximum samples to include in detailed exports
+        indent: JSON indentation level
+
+    Returns:
+        Ordered dict of format -> path for created files
+
+    Note:
+        The summary export is highly recommended as it provides quick access
+        to key metrics without parsing large report files.
+    """
     outputs: OrderedDict[str, Path] = OrderedDict()
     if csv_path is not None:
         outputs["csv"] = export_report_csv(report, csv_path)
@@ -179,6 +282,10 @@ def export_report_bundle(
             sample_limit=sample_limit,
             indent=indent,
         )
+    if summary_path is not None:
+        outputs["summary"] = export_summary_json(
+            report, summary_path, run_id=run_id, indent=indent
+        )
     return outputs
 
 
@@ -684,6 +791,7 @@ __all__ = [
     "export_report_csv",
     "export_html_report",
     "export_report_json",
+    "export_summary_json",
     "export_report_bundle",
     "render_html_report",
     "build_json_report",
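For orientation, here is a minimal usage sketch of the summary export added above. The `report` variable and the output paths are illustrative placeholders (a real report comes from an orchestrator run); only export_report_bundle, its summary_path/run_id parameters, and export_summary_json are taken from the diff itself.

from themis.experiment.export import export_report_bundle

# Hypothetical example: `report` is an ExperimentReport returned by an orchestrator run.
outputs = export_report_bundle(
    report,
    json_path="outputs/run-123/report.json",      # full report, can be large
    summary_path="outputs/run-123/summary.json",  # lightweight ~1KB summary
    run_id="run-123",
)
print(outputs["summary"])  # path to summary.json, handy for jq-style comparisons across runs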
themis/experiment/orchestrator.py
CHANGED
@@ -2,10 +2,13 @@
 
 from __future__ import annotations
 
+import logging
 from datetime import datetime, timezone
 from typing import Callable, Sequence
 
 from themis.config.schema import IntegrationsConfig
+
+logger = logging.getLogger(__name__)
 from themis.core.entities import (
     EvaluationRecord,
     ExperimentFailure,
@@ -102,6 +105,8 @@ class ExperimentOrchestrator:
         Returns:
             ExperimentReport with generation results, evaluation, and metadata
         """
+        logger.info("Orchestrator: Initializing experiment run")
+
         # Initialize integrations
         self._integrations.initialize_run(
             {
@@ -112,28 +117,58 @@ class ExperimentOrchestrator:
         )
 
         # Prepare dataset
-        dataset_list = self._resolve_dataset(
-            dataset=dataset, dataset_loader=dataset_loader, run_id=run_id
-        )
+        logger.info("Orchestrator: Loading dataset...")
+        try:
+            dataset_list = self._resolve_dataset(
+                dataset=dataset, dataset_loader=dataset_loader, run_id=run_id
+            )
+            logger.info(f"Orchestrator: Dataset loaded ({len(dataset_list)} total samples)")
+        except Exception as e:
+            logger.error(f"Orchestrator: ❌ Failed to load dataset: {e}")
+            raise
+
         selected_dataset = (
             dataset_list[:max_samples] if max_samples is not None else dataset_list
         )
         run_identifier = run_id or self._default_run_id()
+
+        logger.info(f"Orchestrator: Processing {len(selected_dataset)} samples")
+        logger.info(f"Orchestrator: Run ID = {run_identifier}")
+
+        # Initialize run in storage (if storage exists and run doesn't exist)
+        if self._cache.has_storage:
+            if not resume or not self._cache._storage._run_metadata_exists(run_identifier):
+                self._cache._storage.start_run(run_identifier, experiment_id="default")
 
         # Cache dataset for resumability
         if dataset_list:
             self._cache.cache_dataset(run_identifier, dataset_list)
 
         # Expand dataset into generation tasks
-        tasks = list(self._plan.expand(selected_dataset))
+        logger.info("Orchestrator: Expanding dataset into generation tasks...")
+        try:
+            tasks = list(self._plan.expand(selected_dataset))
+            logger.info(f"Orchestrator: Created {len(tasks)} generation tasks")
+        except Exception as e:
+            logger.error(f"Orchestrator: ❌ Failed to expand dataset: {e}")
+            raise
+
+        # Build evaluation configuration for cache invalidation
+        evaluation_config = self._build_evaluation_config()
 
         # Load cached results if resuming
+        if resume:
+            logger.info("Orchestrator: Loading cached results...")
         cached_records = (
             self._cache.load_cached_records(run_identifier) if resume else {}
         )
         cached_evaluations = (
-            self._cache.load_cached_evaluations(run_identifier) if resume else {}
+            self._cache.load_cached_evaluations(run_identifier, evaluation_config) if resume else {}
         )
+        if resume and cached_records:
+            logger.info(f"Orchestrator: Found {len(cached_records)} cached generation records")
+        if resume and cached_evaluations:
+            logger.info(f"Orchestrator: Found {len(cached_evaluations)} cached evaluation records")
 
         # Process tasks: use cached or run new generations
         generation_results: list[GenerationRecord] = []
@@ -144,8 +179,8 @@ class ExperimentOrchestrator:
         cached_eval_records: list[EvaluationRecord] = []
 
         for task in tasks:
-
-            cached = cached_records.get(
+            task_cache_key = experiment_storage.task_cache_key(task)
+            cached = cached_records.get(task_cache_key)
             if cached is not None:
                 generation_results.append(cached)
                 if cached.error:
@@ -155,12 +190,14 @@ class ExperimentOrchestrator:
                             message=cached.error.message,
                         )
                     )
-                evaluation
+                # Use evaluation_cache_key that includes evaluation config
+                eval_cache_key = experiment_storage.evaluation_cache_key(task, evaluation_config)
+                evaluation = cached_evaluations.get(eval_cache_key)
                 if evaluation is not None:
                     cached_eval_records.append(evaluation)
                 else:
                     pending_records.append(cached)
-                    pending_keys.append(
+                    pending_keys.append(eval_cache_key)
                 if on_result:
                     on_result(cached)
             else:
@@ -168,9 +205,18 @@ class ExperimentOrchestrator:
 
         # Run pending generation tasks
         if pending_tasks:
+            logger.info(f"Orchestrator: Running {len(pending_tasks)} generation tasks...")
+            completed = 0
             for record in self._runner.run(pending_tasks):
+                logger.debug(f"Orchestrator: Received generation record")
                 generation_results.append(record)
+                completed += 1
+
+                # Log progress every 10 samples or at key milestones
+                if completed % 10 == 0 or completed == len(pending_tasks):
+                    logger.info(f"Orchestrator: Generation progress: {completed}/{len(pending_tasks)} ({100*completed//len(pending_tasks)}%)")
 
+                logger.debug(f"Orchestrator: Processing record (cost tracking...)")
                 # Track cost for successful generations
                 if record.output and record.output.usage:
                     usage = record.output.usage
@@ -187,6 +233,7 @@ class ExperimentOrchestrator:
                         cost=cost,
                     )
 
+                logger.debug(f"Orchestrator: Processing record (error handling...)")
                 if record.error:
                     failures.append(
                         ExperimentFailure(
@@ -194,35 +241,56 @@ class ExperimentOrchestrator:
                             message=record.error.message,
                         )
                     )
+
+                logger.debug(f"Orchestrator: Processing record (caching...)")
                 cache_key = experiment_storage.task_cache_key(record.task)
                 if cache_results:
                     self._cache.save_generation_record(
                         run_identifier, record, cache_key
                     )
+
+                logger.debug(f"Orchestrator: Processing record (adding to pending...)")
                 pending_records.append(record)
                 pending_keys.append(cache_key)
+
+                logger.debug(f"Orchestrator: Processing record (callback...)")
                 if on_result:
                     on_result(record)
+                logger.debug(f"Orchestrator: Record processing complete")
 
         # Evaluate pending records
+        logger.info(f"Orchestrator: Preparing to evaluate {len(pending_records)} pending records...")
        if pending_records:
-            new_evaluation_report = self._evaluation.evaluate(pending_records)
+            logger.info(f"Orchestrator: Starting evaluation of {len(pending_records)} records...")
+            try:
+                new_evaluation_report = self._evaluation.evaluate(pending_records)
+                logger.info(f"Orchestrator: ✅ Evaluation complete - got {len(new_evaluation_report.records)} results")
+            except Exception as e:
+                logger.error(f"Orchestrator: ❌ Evaluation failed: {e}")
+                raise
         else:
+            logger.info("Orchestrator: No new records to evaluate (all cached)")
             new_evaluation_report = evaluation_pipeline.EvaluationReport(
                 metrics={}, failures=[], records=[]
             )
 
         # Cache evaluation results
         for record, evaluation in zip(pending_records, new_evaluation_report.records):
-            self._cache.save_evaluation_record(
+            self._cache.save_evaluation_record(
+                run_identifier, record, evaluation, evaluation_config
+            )
 
         # Combine cached and new evaluations
+        logger.info("Orchestrator: Combining cached and new evaluations...")
         evaluation_report = self._combine_evaluations(
             cached_eval_records, new_evaluation_report
         )
+        logger.info(f"Orchestrator: Total evaluation records: {len(evaluation_report.records)}")
 
         # Get cost breakdown
         cost_breakdown = self._cost_tracker.get_breakdown()
+        if cost_breakdown.total_cost > 0:
+            logger.info(f"Orchestrator: Total cost: ${cost_breakdown.total_cost:.4f}")
 
         # Build metadata
         metadata = {
@@ -274,6 +342,36 @@ class ExperimentOrchestrator:
     def _default_run_id(self) -> str:
         return datetime.now(timezone.utc).strftime("run-%Y%m%d-%H%M%S")
 
+    def _build_evaluation_config(self) -> dict:
+        """Build evaluation configuration for cache key generation.
+
+        This configuration includes all evaluation settings that affect results,
+        so changing metrics or extractors will invalidate the cache.
+
+        Returns:
+            Dictionary with evaluation configuration
+        """
+        config = {}
+
+        # Add metric names/types
+        if hasattr(self._evaluation, "_metrics"):
+            config["metrics"] = sorted([
+                f"{metric.__class__.__module__}.{metric.__class__.__name__}:{metric.name}"
+                for metric in self._evaluation._metrics
+            ])
+
+        # Add extractor type
+        if hasattr(self._evaluation, "_extractor"):
+            extractor = self._evaluation._extractor
+            extractor_type = f"{extractor.__class__.__module__}.{extractor.__class__.__name__}"
+            config["extractor"] = extractor_type
+
+            # Include extractor-specific configuration if available
+            if hasattr(extractor, "field_name"):
+                config["extractor_field"] = extractor.field_name
+
+        return config
+
     def _resolve_dataset(
         self,
         *,