themis-eval 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/__init__.py +5 -0
  8. themis/cli/__main__.py +6 -0
  9. themis/cli/commands/__init__.py +19 -0
  10. themis/cli/commands/benchmarks.py +221 -0
  11. themis/cli/commands/comparison.py +394 -0
  12. themis/cli/commands/config_commands.py +244 -0
  13. themis/cli/commands/cost.py +214 -0
  14. themis/cli/commands/demo.py +68 -0
  15. themis/cli/commands/info.py +90 -0
  16. themis/cli/commands/leaderboard.py +362 -0
  17. themis/cli/commands/math_benchmarks.py +318 -0
  18. themis/cli/commands/mcq_benchmarks.py +207 -0
  19. themis/cli/commands/results.py +252 -0
  20. themis/cli/commands/sample_run.py +244 -0
  21. themis/cli/commands/visualize.py +299 -0
  22. themis/cli/main.py +463 -0
  23. themis/cli/new_project.py +33 -0
  24. themis/cli/utils.py +51 -0
  25. themis/comparison/__init__.py +25 -0
  26. themis/comparison/engine.py +348 -0
  27. themis/comparison/reports.py +283 -0
  28. themis/comparison/statistics.py +402 -0
  29. themis/config/__init__.py +19 -0
  30. themis/config/loader.py +27 -0
  31. themis/config/registry.py +34 -0
  32. themis/config/runtime.py +214 -0
  33. themis/config/schema.py +112 -0
  34. themis/core/__init__.py +5 -0
  35. themis/core/conversation.py +354 -0
  36. themis/core/entities.py +184 -0
  37. themis/core/serialization.py +231 -0
  38. themis/core/tools.py +393 -0
  39. themis/core/types.py +141 -0
  40. themis/datasets/__init__.py +273 -0
  41. themis/datasets/base.py +264 -0
  42. themis/datasets/commonsense_qa.py +174 -0
  43. themis/datasets/competition_math.py +265 -0
  44. themis/datasets/coqa.py +133 -0
  45. themis/datasets/gpqa.py +190 -0
  46. themis/datasets/gsm8k.py +123 -0
  47. themis/datasets/gsm_symbolic.py +124 -0
  48. themis/datasets/math500.py +122 -0
  49. themis/datasets/med_qa.py +179 -0
  50. themis/datasets/medmcqa.py +169 -0
  51. themis/datasets/mmlu_pro.py +262 -0
  52. themis/datasets/piqa.py +146 -0
  53. themis/datasets/registry.py +201 -0
  54. themis/datasets/schema.py +245 -0
  55. themis/datasets/sciq.py +150 -0
  56. themis/datasets/social_i_qa.py +151 -0
  57. themis/datasets/super_gpqa.py +263 -0
  58. themis/evaluation/__init__.py +1 -0
  59. themis/evaluation/conditional.py +410 -0
  60. themis/evaluation/extractors/__init__.py +19 -0
  61. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  62. themis/evaluation/extractors/exceptions.py +7 -0
  63. themis/evaluation/extractors/identity_extractor.py +29 -0
  64. themis/evaluation/extractors/json_field_extractor.py +45 -0
  65. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  66. themis/evaluation/extractors/regex_extractor.py +43 -0
  67. themis/evaluation/math_verify_utils.py +87 -0
  68. themis/evaluation/metrics/__init__.py +21 -0
  69. themis/evaluation/metrics/code/__init__.py +19 -0
  70. themis/evaluation/metrics/code/codebleu.py +144 -0
  71. themis/evaluation/metrics/code/execution.py +280 -0
  72. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  73. themis/evaluation/metrics/composite_metric.py +47 -0
  74. themis/evaluation/metrics/consistency_metric.py +80 -0
  75. themis/evaluation/metrics/exact_match.py +51 -0
  76. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  77. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  78. themis/evaluation/metrics/nlp/__init__.py +21 -0
  79. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  80. themis/evaluation/metrics/nlp/bleu.py +129 -0
  81. themis/evaluation/metrics/nlp/meteor.py +153 -0
  82. themis/evaluation/metrics/nlp/rouge.py +136 -0
  83. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  84. themis/evaluation/metrics/response_length.py +33 -0
  85. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  86. themis/evaluation/pipeline.py +49 -0
  87. themis/evaluation/pipelines/__init__.py +15 -0
  88. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  89. themis/evaluation/pipelines/standard_pipeline.py +348 -0
  90. themis/evaluation/reports.py +293 -0
  91. themis/evaluation/statistics/__init__.py +53 -0
  92. themis/evaluation/statistics/bootstrap.py +79 -0
  93. themis/evaluation/statistics/confidence_intervals.py +121 -0
  94. themis/evaluation/statistics/distributions.py +207 -0
  95. themis/evaluation/statistics/effect_sizes.py +124 -0
  96. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  97. themis/evaluation/statistics/types.py +139 -0
  98. themis/evaluation/strategies/__init__.py +13 -0
  99. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  100. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  101. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  102. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  103. themis/experiment/__init__.py +5 -0
  104. themis/experiment/builder.py +151 -0
  105. themis/experiment/cache_manager.py +134 -0
  106. themis/experiment/comparison.py +631 -0
  107. themis/experiment/cost.py +310 -0
  108. themis/experiment/definitions.py +62 -0
  109. themis/experiment/export.py +798 -0
  110. themis/experiment/export_csv.py +159 -0
  111. themis/experiment/integration_manager.py +104 -0
  112. themis/experiment/math.py +192 -0
  113. themis/experiment/mcq.py +169 -0
  114. themis/experiment/orchestrator.py +415 -0
  115. themis/experiment/pricing.py +317 -0
  116. themis/experiment/storage.py +1458 -0
  117. themis/experiment/visualization.py +588 -0
  118. themis/generation/__init__.py +1 -0
  119. themis/generation/agentic_runner.py +420 -0
  120. themis/generation/batching.py +254 -0
  121. themis/generation/clients.py +143 -0
  122. themis/generation/conversation_runner.py +236 -0
  123. themis/generation/plan.py +456 -0
  124. themis/generation/providers/litellm_provider.py +221 -0
  125. themis/generation/providers/vllm_provider.py +135 -0
  126. themis/generation/router.py +34 -0
  127. themis/generation/runner.py +207 -0
  128. themis/generation/strategies.py +98 -0
  129. themis/generation/templates.py +71 -0
  130. themis/generation/turn_strategies.py +393 -0
  131. themis/generation/types.py +9 -0
  132. themis/integrations/__init__.py +0 -0
  133. themis/integrations/huggingface.py +72 -0
  134. themis/integrations/wandb.py +77 -0
  135. themis/interfaces/__init__.py +169 -0
  136. themis/presets/__init__.py +10 -0
  137. themis/presets/benchmarks.py +354 -0
  138. themis/presets/models.py +190 -0
  139. themis/project/__init__.py +20 -0
  140. themis/project/definitions.py +98 -0
  141. themis/project/patterns.py +230 -0
  142. themis/providers/__init__.py +5 -0
  143. themis/providers/registry.py +39 -0
  144. themis/server/__init__.py +28 -0
  145. themis/server/app.py +337 -0
  146. themis/utils/api_generator.py +379 -0
  147. themis/utils/cost_tracking.py +376 -0
  148. themis/utils/dashboard.py +452 -0
  149. themis/utils/logging_utils.py +41 -0
  150. themis/utils/progress.py +58 -0
  151. themis/utils/tracing.py +320 -0
  152. themis_eval-0.2.0.dist-info/METADATA +596 -0
  153. themis_eval-0.2.0.dist-info/RECORD +157 -0
  154. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  155. themis_eval-0.1.0.dist-info/METADATA +0 -758
  156. themis_eval-0.1.0.dist-info/RECORD +0 -8
  157. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  158. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
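
As a point of reference, a file-level comparison like the listing above can be reproduced locally with only the Python standard library, since .whl files are plain zip archives. This is a minimal sketch under the assumption that both wheels (named per the standard wheel convention) sit in the current directory:

```python
# Compare the member lists of two wheels (.whl files are zip archives).
import zipfile

def wheel_members(path: str) -> set[str]:
    """Return the set of file paths stored in a wheel."""
    with zipfile.ZipFile(path) as wheel:
        return set(wheel.namelist())

old = wheel_members("themis_eval-0.1.0-py3-none-any.whl")
new = wheel_members("themis_eval-0.2.0-py3-none-any.whl")

for name in sorted(new - old):
    print("added:  ", name)
for name in sorted(old - new):
    print("removed:", name)
```

Per-line counts (the +/- columns above) additionally require diffing the contents of members present in both archives.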
themis/experiment/orchestrator.py (added)
@@ -0,0 +1,415 @@
+ """Experiment orchestrator primitives."""
+
+ from __future__ import annotations
+
+ from datetime import datetime, timezone
+ from typing import Callable, Sequence
+
+ from themis.config.schema import IntegrationsConfig
+ from themis.core.entities import (
+     EvaluationRecord,
+     ExperimentFailure,
+     ExperimentReport,
+     GenerationRecord,
+     GenerationTask,
+     MetricScore,
+ )
+ from themis.evaluation import pipeline as evaluation_pipeline
+ from themis.evaluation.reports import EvaluationFailure
+ from themis.experiment import storage as experiment_storage
+ from themis.experiment.cache_manager import CacheManager
+ from themis.experiment.cost import CostTracker
+ from themis.experiment.integration_manager import IntegrationManager
+ from themis.experiment.pricing import calculate_cost, get_provider_pricing
+ from themis.generation import plan as generation_plan
+ from themis.generation import runner as generation_runner
+
+
+ class ExperimentOrchestrator:
+     """Orchestrates experiment execution: generation → evaluation → reporting.
+
+     This class coordinates the experiment workflow using focused managers:
+     - CacheManager: Handles storage and resumability
+     - IntegrationManager: Handles WandB and HuggingFace Hub
+
+     Single Responsibility: Orchestration of experiment flow
+     """
+
+     def __init__(
+         self,
+         *,
+         generation_plan: generation_plan.GenerationPlan,
+         generation_runner: generation_runner.GenerationRunner,
+         evaluation_pipeline: evaluation_pipeline.EvaluationPipeline,
+         storage: experiment_storage.ExperimentStorage | None = None,
+         integrations_config: IntegrationsConfig | None = None,
+         cache_manager: CacheManager | None = None,
+         integration_manager: IntegrationManager | None = None,
+     ) -> None:
+         """Initialize experiment orchestrator.
+
+         Args:
+             generation_plan: Plan for expanding dataset into tasks
+             generation_runner: Runner for executing generation tasks
+             evaluation_pipeline: Pipeline for evaluating outputs
+             storage: Optional storage backend (deprecated, use cache_manager)
+             integrations_config: Integration config (deprecated, use integration_manager)
+             cache_manager: Manager for caching and resumability
+             integration_manager: Manager for external integrations
+         """
+         self._plan = generation_plan
+         self._runner = generation_runner
+         self._evaluation = evaluation_pipeline
+
+         # Support both new managers and legacy direct parameters for backward compatibility
+         self._cache = cache_manager or CacheManager(
+             storage=storage,
+             enable_resume=True,
+             enable_cache=True,
+         )
+         self._integrations = integration_manager or IntegrationManager(
+             config=integrations_config or IntegrationsConfig()
+         )
+
+         # Initialize cost tracker
+         self._cost_tracker = CostTracker()
+
+         # Keep legacy references for backward compatibility
+         self._storage = storage
+
+     def run(
+         self,
+         dataset: Sequence[dict[str, object]] | None = None,
+         *,
+         dataset_loader: Callable[[], Sequence[dict[str, object]]] | None = None,
+         max_samples: int | None = None,
+         run_id: str | None = None,
+         resume: bool = True,
+         cache_results: bool = True,
+         on_result: Callable[[GenerationRecord], None] | None = None,
+     ) -> ExperimentReport:
+         """Run experiment: generate responses, evaluate, and report results.
+
+         Args:
+             dataset: Optional dataset samples to use
+             dataset_loader: Optional callable to load dataset
+             max_samples: Optional limit on number of samples
+             run_id: Optional run identifier for caching
+             resume: Whether to resume from cached results
+             cache_results: Whether to cache new results
+             on_result: Optional callback for each generation result
+
+         Returns:
+             ExperimentReport with generation results, evaluation, and metadata
+         """
+         # Initialize integrations
+         self._integrations.initialize_run(
+             {
+                 "max_samples": max_samples,
+                 "run_id": run_id,
+                 "resume": resume,
+             }
+         )
+
+         # Prepare dataset
+         dataset_list = self._resolve_dataset(
+             dataset=dataset, dataset_loader=dataset_loader, run_id=run_id
+         )
+         selected_dataset = (
+             dataset_list[:max_samples] if max_samples is not None else dataset_list
+         )
+         run_identifier = run_id or self._default_run_id()
+
+         # Initialize run in storage (if storage exists and run doesn't exist)
+         if self._cache.has_storage:
+             if not resume or not self._cache._storage._run_metadata_exists(run_identifier):
+                 self._cache._storage.start_run(run_identifier, experiment_id="default")
+
+         # Cache dataset for resumability
+         if dataset_list:
+             self._cache.cache_dataset(run_identifier, dataset_list)
+
+         # Expand dataset into generation tasks
+         tasks = list(self._plan.expand(selected_dataset))
+
+         # Build evaluation configuration for cache invalidation
+         evaluation_config = self._build_evaluation_config()
+
+         # Load cached results if resuming
+         cached_records = (
+             self._cache.load_cached_records(run_identifier) if resume else {}
+         )
+         cached_evaluations = (
+             self._cache.load_cached_evaluations(run_identifier, evaluation_config) if resume else {}
+         )
+
+         # Process tasks: use cached or run new generations
+         generation_results: list[GenerationRecord] = []
+         failures: list[ExperimentFailure] = []
+         pending_tasks: list[GenerationTask] = []
+         pending_records: list[GenerationRecord] = []
+         pending_keys: list[str] = []
+         cached_eval_records: list[EvaluationRecord] = []
+
+         for task in tasks:
+             task_cache_key = experiment_storage.task_cache_key(task)
+             cached = cached_records.get(task_cache_key)
+             if cached is not None:
+                 generation_results.append(cached)
+                 if cached.error:
+                     failures.append(
+                         ExperimentFailure(
+                             sample_id=cached.task.metadata.get("dataset_id"),
+                             message=cached.error.message,
+                         )
+                     )
+                 # Use evaluation_cache_key that includes evaluation config
+                 eval_cache_key = experiment_storage.evaluation_cache_key(task, evaluation_config)
+                 evaluation = cached_evaluations.get(eval_cache_key)
+                 if evaluation is not None:
+                     cached_eval_records.append(evaluation)
+                 else:
+                     pending_records.append(cached)
+                     pending_keys.append(eval_cache_key)
+                 if on_result:
+                     on_result(cached)
+             else:
+                 pending_tasks.append(task)
+
+         # Run pending generation tasks
+         if pending_tasks:
+             for record in self._runner.run(pending_tasks):
+                 generation_results.append(record)
+
+                 # Track cost for successful generations
+                 if record.output and record.output.usage:
+                     usage = record.output.usage
+                     prompt_tokens = usage.get("prompt_tokens", 0)
+                     completion_tokens = usage.get("completion_tokens", 0)
+                     model = record.task.model.identifier
+
+                     # Calculate cost using pricing database
+                     cost = calculate_cost(model, prompt_tokens, completion_tokens)
+                     self._cost_tracker.record_generation(
+                         model=model,
+                         prompt_tokens=prompt_tokens,
+                         completion_tokens=completion_tokens,
+                         cost=cost,
+                     )
+
+                 if record.error:
+                     failures.append(
+                         ExperimentFailure(
+                             sample_id=record.task.metadata.get("dataset_id"),
+                             message=record.error.message,
+                         )
+                     )
+                 cache_key = experiment_storage.task_cache_key(record.task)
+                 if cache_results:
+                     self._cache.save_generation_record(
+                         run_identifier, record, cache_key
+                     )
+                 pending_records.append(record)
+                 pending_keys.append(cache_key)
+                 if on_result:
+                     on_result(record)
+
+         # Evaluate pending records
+         if pending_records:
+             new_evaluation_report = self._evaluation.evaluate(pending_records)
+         else:
+             new_evaluation_report = evaluation_pipeline.EvaluationReport(
+                 metrics={}, failures=[], records=[]
+             )
+
+         # Cache evaluation results
+         for record, evaluation in zip(pending_records, new_evaluation_report.records):
+             self._cache.save_evaluation_record(
+                 run_identifier, record, evaluation, evaluation_config
+             )
+
+         # Combine cached and new evaluations
+         evaluation_report = self._combine_evaluations(
+             cached_eval_records, new_evaluation_report
+         )
+
+         # Get cost breakdown
+         cost_breakdown = self._cost_tracker.get_breakdown()
+
+         # Build metadata
+         metadata = {
+             "total_samples": len(selected_dataset),
+             "successful_generations": sum(
+                 1 for result in generation_results if not result.error
+             ),
+             "failed_generations": sum(
+                 1 for result in generation_results if result.error
+             ),
+             "run_id": run_identifier,
+             "evaluation_failures": sum(
+                 1 for record in evaluation_report.records if record.failures
+             )
+             + len(evaluation_report.failures),
+             # Cost tracking
+             "cost": {
+                 "total_cost": cost_breakdown.total_cost,
+                 "generation_cost": cost_breakdown.generation_cost,
+                 "evaluation_cost": cost_breakdown.evaluation_cost,
+                 "currency": cost_breakdown.currency,
+                 "token_counts": cost_breakdown.token_counts,
+                 "api_calls": cost_breakdown.api_calls,
+                 "per_model_costs": cost_breakdown.per_model_costs,
+             },
+         }
+
+         # Create final report
+         report = ExperimentReport(
+             generation_results=generation_results,
+             evaluation_report=evaluation_report,
+             failures=failures,
+             metadata=metadata,
+         )
+
+         # Log to integrations
+         self._integrations.log_results(report)
+
+         # Upload to HuggingFace Hub if enabled
+         run_path = self._cache.get_run_path(run_identifier)
+         self._integrations.upload_results(report, run_path)
+
+         # Save report.json for multi-experiment comparison
+         if cache_results:
+             self._save_report_json(report, run_identifier)
+
+         return report
+
+     def _default_run_id(self) -> str:
+         return datetime.now(timezone.utc).strftime("run-%Y%m%d-%H%M%S")
+
+     def _build_evaluation_config(self) -> dict:
+         """Build evaluation configuration for cache key generation.
+
+         This configuration includes all evaluation settings that affect results,
+         so changing metrics or extractors will invalidate the cache.
+
+         Returns:
+             Dictionary with evaluation configuration
+         """
+         config = {}
+
+         # Add metric names/types
+         if hasattr(self._evaluation, "_metrics"):
+             config["metrics"] = sorted([
+                 f"{metric.__class__.__module__}.{metric.__class__.__name__}:{metric.name}"
+                 for metric in self._evaluation._metrics
+             ])
+
+         # Add extractor type
+         if hasattr(self._evaluation, "_extractor"):
+             extractor = self._evaluation._extractor
+             extractor_type = f"{extractor.__class__.__module__}.{extractor.__class__.__name__}"
+             config["extractor"] = extractor_type
+
+             # Include extractor-specific configuration if available
+             if hasattr(extractor, "field_name"):
+                 config["extractor_field"] = extractor.field_name
+
+         return config
+
+     def _resolve_dataset(
+         self,
+         *,
+         dataset: Sequence[dict[str, object]] | None,
+         dataset_loader: Callable[[], Sequence[dict[str, object]]] | None,
+         run_id: str | None,
+     ) -> list[dict[str, object]]:
+         """Resolve dataset from various sources.
+
+         Args:
+             dataset: Direct dataset samples
+             dataset_loader: Callable to load dataset
+             run_id: Run ID to load cached dataset
+
+         Returns:
+             List of dataset samples
+
+         Raises:
+             ValueError: If no dataset source is available
+         """
+         if dataset is not None:
+             return list(dataset)
+         if dataset_loader is not None:
+             return list(dataset_loader())
+         # Try to load from cache (for backward compatibility, still use _storage directly)
+         if self._storage is not None and run_id is not None:
+             return self._storage.load_dataset(run_id)
+         raise ValueError(
+             "No dataset provided. Supply `dataset=` rows, a `dataset_loader`, "
+             "or set `run_id` with storage configured so cached data can be reloaded."
+         )
+
+     def _combine_evaluations(
+         self,
+         cached_records: list[EvaluationRecord],
+         new_report: evaluation_pipeline.EvaluationReport,
+     ) -> evaluation_pipeline.EvaluationReport:
+         all_records = list(cached_records) + list(new_report.records)
+         per_metric: dict[str, list[MetricScore]] = {}
+         for record in all_records:
+             for score in record.scores:
+                 per_metric.setdefault(score.metric_name, []).append(score)
+
+         aggregates: dict[str, evaluation_pipeline.MetricAggregate] = {}
+         metric_names = set(per_metric.keys()) | set(new_report.metrics.keys())
+         for name in metric_names:
+             scores = per_metric.get(name, [])
+             mean = sum(score.value for score in scores) / len(scores) if scores else 0.0
+             aggregates[name] = evaluation_pipeline.MetricAggregate(
+                 name=name,
+                 count=len(scores),
+                 mean=mean,
+                 per_sample=scores,
+             )
+
+         failures = list(new_report.failures)
+         for record in cached_records:
+             for message in record.failures:
+                 failures.append(
+                     EvaluationFailure(sample_id=record.sample_id, message=message)
+                 )
+
+         return evaluation_pipeline.EvaluationReport(
+             metrics=aggregates,
+             failures=failures,
+             records=all_records,
+         )
+
+     def _save_report_json(self, report: ExperimentReport, run_id: str) -> None:
+         """Save experiment report as JSON for multi-experiment comparison.
+
+         Args:
+             report: Experiment report to save
+             run_id: Run identifier
+         """
+         from pathlib import Path
+
+         from themis.experiment.export import build_json_report
+
+         # Get run path from cache manager
+         run_path_str = self._cache.get_run_path(run_id)
+         if run_path_str is None:
+             # No storage configured, skip saving report.json
+             return
+
+         run_path = Path(run_path_str)
+         report_path = run_path / "report.json"
+
+         # Build JSON report
+         json_data = build_json_report(report, title=f"Experiment {run_id}")
+
+         # Save to file
+         import json
+
+         report_path.parent.mkdir(parents=True, exist_ok=True)
+         with report_path.open("w", encoding="utf-8") as f:
+             json.dump(json_data, f, indent=2)
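
To illustrate the new entry point, here is a minimal usage sketch of the orchestrator added above. Only the `ExperimentOrchestrator` constructor and the `run()` signature are taken from the diff; `build_plan`, `build_runner`, and `build_pipeline` are hypothetical stand-ins, since how `GenerationPlan`, `GenerationRunner`, and `EvaluationPipeline` are constructed depends on other modules in the package, and the dataset row schema depends on the plan:

```python
# Hypothetical wiring; plan/runner/pipeline construction is package-specific.
from themis.experiment.orchestrator import ExperimentOrchestrator

plan = build_plan()          # assumed: exposes .expand(dataset) -> iterable of GenerationTask
runner = build_runner()      # assumed: exposes .run(tasks) -> iterable of GenerationRecord
pipeline = build_pipeline()  # assumed: exposes .evaluate(records) -> EvaluationReport

orchestrator = ExperimentOrchestrator(
    generation_plan=plan,
    generation_runner=runner,
    evaluation_pipeline=pipeline,
)

report = orchestrator.run(
    dataset=[{"question": "2 + 2 = ?", "answer": "4"}],  # row keys depend on the plan
    max_samples=1,
    run_id="run-demo",
    resume=True,         # reuse cached generations/evaluations when present
    cache_results=True,  # persist new results for later resumes
    on_result=lambda record: print("finished:", record.task.metadata.get("dataset_id")),
)

# These metadata keys are written by ExperimentOrchestrator.run() above.
print(report.metadata["successful_generations"])
print(report.metadata["cost"]["total_cost"])
```

Note that with no `cache_manager` or `storage` argument, the orchestrator still builds a default `CacheManager` with `storage=None`, and `_save_report_json` returns early when no run path is available, so the sketch runs without any storage backend configured.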