themis-eval 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +343 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/main.py +427 -57
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/core/entities.py +23 -3
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/pipelines/standard_pipeline.py +68 -8
- themis/experiment/cache_manager.py +8 -3
- themis/experiment/export.py +110 -2
- themis/experiment/orchestrator.py +48 -6
- themis/experiment/storage.py +1313 -110
- themis/integrations/huggingface.py +12 -1
- themis/integrations/wandb.py +13 -1
- themis/interfaces/__init__.py +86 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis_eval-0.2.0.dist-info/METADATA +596 -0
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/RECORD +40 -17
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
- themis_eval-0.1.1.dist-info/METADATA +0 -758
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
themis/experiment/export.py
CHANGED
@@ -146,19 +146,122 @@ def export_report_json(
     return path
 
 
+def export_summary_json(
+    report: orchestrator.ExperimentReport,
+    path: str | Path,
+    *,
+    run_id: str | None = None,
+    indent: int = 2,
+) -> Path:
+    """Export a lightweight summary JSON file for quick results viewing.
+
+    This creates a small summary file (~1KB) containing only the essential
+    metrics and metadata, without the full sample-level details. This is
+    ideal for quickly comparing multiple runs without parsing large report files.
+
+    Args:
+        report: Experiment report to summarize
+        path: Output path for summary.json
+        run_id: Optional run identifier to include in summary
+        indent: JSON indentation level
+
+    Returns:
+        Path to the created summary file
+
+    Example:
+        >>> export_summary_json(report, "outputs/run-123/summary.json", run_id="run-123")
+        >>> # Quick comparison: cat outputs/*/summary.json | jq '.accuracy'
+
+    Note:
+        The summary file is typically ~1KB compared to ~1.6MB for the full report.
+        This makes it 1000x faster to view and compare results across runs.
+    """
+    # Extract key metrics
+    metrics_summary = {}
+    for name, aggregate in report.evaluation_report.metrics.items():
+        metrics_summary[name] = {
+            "mean": aggregate.mean,
+            "count": aggregate.count,
+        }
+
+    # Extract metadata from first generation record
+    metadata = {}
+    if report.generation_results:
+        first_record = report.generation_results[0]
+        metadata = {
+            "model": first_record.task.model.identifier,
+            "prompt_template": first_record.task.prompt.spec.name,
+            "sampling": {
+                "temperature": first_record.task.sampling.temperature,
+                "top_p": first_record.task.sampling.top_p,
+                "max_tokens": first_record.task.sampling.max_tokens,
+            },
+        }
+
+    # Calculate total cost if available
+    total_cost = 0.0
+    for record in report.generation_results:
+        if "cost_usd" in record.metrics:
+            total_cost += record.metrics["cost_usd"]
+
+    # Count failures
+    failure_count = len(report.evaluation_report.failures)
+
+    # Build summary
+    summary = {
+        "run_id": run_id,
+        "total_samples": len(report.generation_results),
+        "metrics": metrics_summary,
+        "metadata": metadata,
+        "cost_usd": round(total_cost, 4) if total_cost > 0 else None,
+        "failures": failure_count,
+        "failure_rate": (
+            round(failure_count / len(report.generation_results), 4)
+            if report.generation_results
+            else 0.0
+        ),
+    }
+
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(summary, indent=indent), encoding="utf-8")
+    return path
+
+
 def export_report_bundle(
     report: orchestrator.ExperimentReport,
     *,
     csv_path: str | Path | None = None,
     html_path: str | Path | None = None,
     json_path: str | Path | None = None,
+    summary_path: str | Path | None = None,
+    run_id: str | None = None,
     charts: Sequence[ChartLike] | None = None,
     title: str = "Experiment report",
     sample_limit: int = 100,
     indent: int = 2,
 ) -> OrderedDict[str, Path]:
-    """Convenience helper that writes multiple export formats at once.
-
+    """Convenience helper that writes multiple export formats at once.
+
+    Args:
+        report: Experiment report to export
+        csv_path: Optional path for CSV export
+        html_path: Optional path for HTML export
+        json_path: Optional path for full JSON export
+        summary_path: Optional path for lightweight summary JSON export
+        run_id: Optional run identifier for summary
+        charts: Optional charts to include in visualizations
+        title: Report title
+        sample_limit: Maximum samples to include in detailed exports
+        indent: JSON indentation level
+
+    Returns:
+        Ordered dict of format -> path for created files
+
+    Note:
+        The summary export is highly recommended as it provides quick access
+        to key metrics without parsing large report files.
+    """
     outputs: OrderedDict[str, Path] = OrderedDict()
     if csv_path is not None:
         outputs["csv"] = export_report_csv(report, csv_path)
@@ -179,6 +282,10 @@ def export_report_bundle(
             sample_limit=sample_limit,
             indent=indent,
         )
+    if summary_path is not None:
+        outputs["summary"] = export_summary_json(
+            report, summary_path, run_id=run_id, indent=indent
+        )
     return outputs
 
 
@@ -684,6 +791,7 @@ __all__ = [
     "export_report_csv",
     "export_html_report",
     "export_report_json",
+    "export_summary_json",
     "export_report_bundle",
     "render_html_report",
     "build_json_report",
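For orientation, a minimal usage sketch of the new summary export follows, written against the signatures shown in the hunks above. It assumes a report object (an orchestrator.ExperimentReport) produced elsewhere and the outputs/<run-id>/ layout from the docstring example; the helper names export_with_summary and compare_runs are illustrative only, not part of the package.

import json
from pathlib import Path

from themis.experiment.export import export_report_bundle


def export_with_summary(report, run_id: str) -> None:
    # Write the full report.json plus the ~1KB summary.json for one run.
    export_report_bundle(
        report,
        json_path=f"outputs/{run_id}/report.json",
        summary_path=f"outputs/{run_id}/summary.json",
        run_id=run_id,
    )


def compare_runs(root: str = "outputs") -> None:
    # Read only the small summary files, never the full reports.
    for summary_file in sorted(Path(root).glob("*/summary.json")):
        data = json.loads(summary_file.read_text(encoding="utf-8"))
        means = {name: agg["mean"] for name, agg in data["metrics"].items()}
        print(data["run_id"], means, "failure_rate:", data["failure_rate"])

The fields read here (metrics.<name>.mean, failure_rate, run_id) are the keys that export_summary_json writes in the hunk above.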
themis/experiment/orchestrator.py
CHANGED

@@ -120,6 +120,11 @@ class ExperimentOrchestrator:
         )
         run_identifier = run_id or self._default_run_id()
 
+        # Initialize run in storage (if storage exists and run doesn't exist)
+        if self._cache.has_storage:
+            if not resume or not self._cache._storage._run_metadata_exists(run_identifier):
+                self._cache._storage.start_run(run_identifier, experiment_id="default")
+
         # Cache dataset for resumability
         if dataset_list:
             self._cache.cache_dataset(run_identifier, dataset_list)
@@ -127,12 +132,15 @@ class ExperimentOrchestrator:
         # Expand dataset into generation tasks
         tasks = list(self._plan.expand(selected_dataset))
 
+        # Build evaluation configuration for cache invalidation
+        evaluation_config = self._build_evaluation_config()
+
         # Load cached results if resuming
         cached_records = (
             self._cache.load_cached_records(run_identifier) if resume else {}
         )
         cached_evaluations = (
-            self._cache.load_cached_evaluations(run_identifier) if resume else {}
+            self._cache.load_cached_evaluations(run_identifier, evaluation_config) if resume else {}
         )
 
         # Process tasks: use cached or run new generations
@@ -144,8 +152,8 @@ class ExperimentOrchestrator:
         cached_eval_records: list[EvaluationRecord] = []
 
         for task in tasks:
-
-            cached = cached_records.get(
+            task_cache_key = experiment_storage.task_cache_key(task)
+            cached = cached_records.get(task_cache_key)
             if cached is not None:
                 generation_results.append(cached)
                 if cached.error:
@@ -155,12 +163,14 @@ class ExperimentOrchestrator:
                             message=cached.error.message,
                         )
                     )
-                evaluation
+                # Use evaluation_cache_key that includes evaluation config
+                eval_cache_key = experiment_storage.evaluation_cache_key(task, evaluation_config)
+                evaluation = cached_evaluations.get(eval_cache_key)
                 if evaluation is not None:
                     cached_eval_records.append(evaluation)
                 else:
                     pending_records.append(cached)
-                    pending_keys.append(
+                    pending_keys.append(eval_cache_key)
                 if on_result:
                     on_result(cached)
             else:
@@ -214,7 +224,9 @@ class ExperimentOrchestrator:
 
         # Cache evaluation results
         for record, evaluation in zip(pending_records, new_evaluation_report.records):
-            self._cache.save_evaluation_record(
+            self._cache.save_evaluation_record(
+                run_identifier, record, evaluation, evaluation_config
+            )
 
         # Combine cached and new evaluations
         evaluation_report = self._combine_evaluations(
@@ -274,6 +286,36 @@ class ExperimentOrchestrator:
     def _default_run_id(self) -> str:
         return datetime.now(timezone.utc).strftime("run-%Y%m%d-%H%M%S")
 
+    def _build_evaluation_config(self) -> dict:
+        """Build evaluation configuration for cache key generation.
+
+        This configuration includes all evaluation settings that affect results,
+        so changing metrics or extractors will invalidate the cache.
+
+        Returns:
+            Dictionary with evaluation configuration
+        """
+        config = {}
+
+        # Add metric names/types
+        if hasattr(self._evaluation, "_metrics"):
+            config["metrics"] = sorted([
+                f"{metric.__class__.__module__}.{metric.__class__.__name__}:{metric.name}"
+                for metric in self._evaluation._metrics
+            ])
+
+        # Add extractor type
+        if hasattr(self._evaluation, "_extractor"):
+            extractor = self._evaluation._extractor
+            extractor_type = f"{extractor.__class__.__module__}.{extractor.__class__.__name__}"
+            config["extractor"] = extractor_type
+
+            # Include extractor-specific configuration if available
+            if hasattr(extractor, "field_name"):
+                config["extractor_field"] = extractor.field_name
+
+        return config
+
     def _resolve_dataset(
         self,
         *,