themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/__init__.py +5 -0
  8. themis/cli/__main__.py +6 -0
  9. themis/cli/commands/__init__.py +19 -0
  10. themis/cli/commands/benchmarks.py +221 -0
  11. themis/cli/commands/comparison.py +394 -0
  12. themis/cli/commands/config_commands.py +244 -0
  13. themis/cli/commands/cost.py +214 -0
  14. themis/cli/commands/demo.py +68 -0
  15. themis/cli/commands/info.py +90 -0
  16. themis/cli/commands/leaderboard.py +362 -0
  17. themis/cli/commands/math_benchmarks.py +318 -0
  18. themis/cli/commands/mcq_benchmarks.py +207 -0
  19. themis/cli/commands/results.py +252 -0
  20. themis/cli/commands/sample_run.py +244 -0
  21. themis/cli/commands/visualize.py +299 -0
  22. themis/cli/main.py +463 -0
  23. themis/cli/new_project.py +33 -0
  24. themis/cli/utils.py +51 -0
  25. themis/comparison/__init__.py +25 -0
  26. themis/comparison/engine.py +348 -0
  27. themis/comparison/reports.py +283 -0
  28. themis/comparison/statistics.py +402 -0
  29. themis/config/__init__.py +19 -0
  30. themis/config/loader.py +27 -0
  31. themis/config/registry.py +34 -0
  32. themis/config/runtime.py +214 -0
  33. themis/config/schema.py +112 -0
  34. themis/core/__init__.py +5 -0
  35. themis/core/conversation.py +354 -0
  36. themis/core/entities.py +184 -0
  37. themis/core/serialization.py +231 -0
  38. themis/core/tools.py +393 -0
  39. themis/core/types.py +141 -0
  40. themis/datasets/__init__.py +273 -0
  41. themis/datasets/base.py +264 -0
  42. themis/datasets/commonsense_qa.py +174 -0
  43. themis/datasets/competition_math.py +265 -0
  44. themis/datasets/coqa.py +133 -0
  45. themis/datasets/gpqa.py +190 -0
  46. themis/datasets/gsm8k.py +123 -0
  47. themis/datasets/gsm_symbolic.py +124 -0
  48. themis/datasets/math500.py +122 -0
  49. themis/datasets/med_qa.py +179 -0
  50. themis/datasets/medmcqa.py +169 -0
  51. themis/datasets/mmlu_pro.py +262 -0
  52. themis/datasets/piqa.py +146 -0
  53. themis/datasets/registry.py +201 -0
  54. themis/datasets/schema.py +245 -0
  55. themis/datasets/sciq.py +150 -0
  56. themis/datasets/social_i_qa.py +151 -0
  57. themis/datasets/super_gpqa.py +263 -0
  58. themis/evaluation/__init__.py +1 -0
  59. themis/evaluation/conditional.py +410 -0
  60. themis/evaluation/extractors/__init__.py +19 -0
  61. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  62. themis/evaluation/extractors/exceptions.py +7 -0
  63. themis/evaluation/extractors/identity_extractor.py +29 -0
  64. themis/evaluation/extractors/json_field_extractor.py +45 -0
  65. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  66. themis/evaluation/extractors/regex_extractor.py +43 -0
  67. themis/evaluation/math_verify_utils.py +87 -0
  68. themis/evaluation/metrics/__init__.py +21 -0
  69. themis/evaluation/metrics/code/__init__.py +19 -0
  70. themis/evaluation/metrics/code/codebleu.py +144 -0
  71. themis/evaluation/metrics/code/execution.py +280 -0
  72. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  73. themis/evaluation/metrics/composite_metric.py +47 -0
  74. themis/evaluation/metrics/consistency_metric.py +80 -0
  75. themis/evaluation/metrics/exact_match.py +51 -0
  76. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  77. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  78. themis/evaluation/metrics/nlp/__init__.py +21 -0
  79. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  80. themis/evaluation/metrics/nlp/bleu.py +129 -0
  81. themis/evaluation/metrics/nlp/meteor.py +153 -0
  82. themis/evaluation/metrics/nlp/rouge.py +136 -0
  83. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  84. themis/evaluation/metrics/response_length.py +33 -0
  85. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  86. themis/evaluation/pipeline.py +49 -0
  87. themis/evaluation/pipelines/__init__.py +15 -0
  88. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  89. themis/evaluation/pipelines/standard_pipeline.py +348 -0
  90. themis/evaluation/reports.py +293 -0
  91. themis/evaluation/statistics/__init__.py +53 -0
  92. themis/evaluation/statistics/bootstrap.py +79 -0
  93. themis/evaluation/statistics/confidence_intervals.py +121 -0
  94. themis/evaluation/statistics/distributions.py +207 -0
  95. themis/evaluation/statistics/effect_sizes.py +124 -0
  96. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  97. themis/evaluation/statistics/types.py +139 -0
  98. themis/evaluation/strategies/__init__.py +13 -0
  99. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  100. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  101. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  102. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  103. themis/experiment/__init__.py +5 -0
  104. themis/experiment/builder.py +151 -0
  105. themis/experiment/cache_manager.py +134 -0
  106. themis/experiment/comparison.py +631 -0
  107. themis/experiment/cost.py +310 -0
  108. themis/experiment/definitions.py +62 -0
  109. themis/experiment/export.py +798 -0
  110. themis/experiment/export_csv.py +159 -0
  111. themis/experiment/integration_manager.py +104 -0
  112. themis/experiment/math.py +192 -0
  113. themis/experiment/mcq.py +169 -0
  114. themis/experiment/orchestrator.py +415 -0
  115. themis/experiment/pricing.py +317 -0
  116. themis/experiment/storage.py +1458 -0
  117. themis/experiment/visualization.py +588 -0
  118. themis/generation/__init__.py +1 -0
  119. themis/generation/agentic_runner.py +420 -0
  120. themis/generation/batching.py +254 -0
  121. themis/generation/clients.py +143 -0
  122. themis/generation/conversation_runner.py +236 -0
  123. themis/generation/plan.py +456 -0
  124. themis/generation/providers/litellm_provider.py +221 -0
  125. themis/generation/providers/vllm_provider.py +135 -0
  126. themis/generation/router.py +34 -0
  127. themis/generation/runner.py +207 -0
  128. themis/generation/strategies.py +98 -0
  129. themis/generation/templates.py +71 -0
  130. themis/generation/turn_strategies.py +393 -0
  131. themis/generation/types.py +9 -0
  132. themis/integrations/__init__.py +0 -0
  133. themis/integrations/huggingface.py +72 -0
  134. themis/integrations/wandb.py +77 -0
  135. themis/interfaces/__init__.py +169 -0
  136. themis/presets/__init__.py +10 -0
  137. themis/presets/benchmarks.py +354 -0
  138. themis/presets/models.py +190 -0
  139. themis/project/__init__.py +20 -0
  140. themis/project/definitions.py +98 -0
  141. themis/project/patterns.py +230 -0
  142. themis/providers/__init__.py +5 -0
  143. themis/providers/registry.py +39 -0
  144. themis/server/__init__.py +28 -0
  145. themis/server/app.py +337 -0
  146. themis/utils/api_generator.py +379 -0
  147. themis/utils/cost_tracking.py +376 -0
  148. themis/utils/dashboard.py +452 -0
  149. themis/utils/logging_utils.py +41 -0
  150. themis/utils/progress.py +58 -0
  151. themis/utils/tracing.py +320 -0
  152. themis_eval-0.2.0.dist-info/METADATA +596 -0
  153. themis_eval-0.2.0.dist-info/RECORD +157 -0
  154. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  155. themis_eval-0.1.0.dist-info/METADATA +0 -758
  156. themis_eval-0.1.0.dist-info/RECORD +0 -8
  157. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  158. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1458 @@
1
+ """Robust storage architecture with lifecycle management, atomic operations, and integrity checks.
2
+
3
+ This is a rewrite of the storage layer to address:
4
+ - Run lifecycle management (in_progress, completed, failed)
5
+ - Atomic write operations
6
+ - File locking for concurrent access
7
+ - Index persistence
8
+ - Experiment-level organization
9
+ - Separate evaluation tracking
10
+ - Data integrity validation
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import contextlib
16
+ import gzip
17
+ import hashlib
18
+ import json
19
+ import os
20
+ import sqlite3
21
+ import sys
22
+ import tempfile
23
+ from dataclasses import dataclass, field
24
+ import shutil
25
+ from datetime import datetime, timedelta
26
+ from enum import Enum
27
+ from pathlib import Path
28
+ from typing import Dict, Iterable, List, Literal
29
+
30
+ # fcntl is Unix-only, use msvcrt on Windows
31
+ if sys.platform == "win32":
32
+ import msvcrt
33
+ FCNTL_AVAILABLE = False
34
+ else:
35
+ try:
36
+ import fcntl
37
+ FCNTL_AVAILABLE = True
38
+ except ImportError:
39
+ FCNTL_AVAILABLE = False
40
+
41
+ from themis.core import entities as core_entities
42
+ from themis.core import serialization as core_serialization
43
+
44
+ STORAGE_FORMAT_VERSION = "2.0.0"
45
+
46
+
47
class RunStatus(str, Enum):
    """Lifecycle state recorded for a run.

    Members subclass ``str``, so each one compares equal to its raw value
    and can be rebuilt from that value with ``RunStatus(value)``.
    """

    # The run has been started and not yet finished.
    IN_PROGRESS = "in_progress"
    # The run finished normally.
    COMPLETED = "completed"
    # The run stopped because of an error.
    FAILED = "failed"
    # The run was stopped before finishing.
    CANCELLED = "cancelled"
54
+
55
+
56
@dataclass
class RetentionPolicy:
    """Limits that describe which runs automatic cleanup should keep.

    Every limit is optional; ``None`` means that limit is not set.
    """

    # Maximum number of runs to keep per experiment.
    max_runs_per_experiment: int | None = None
    # Maximum age of a run, in days.
    max_age_days: int | None = None
    # Maximum total storage, in gigabytes.
    max_storage_gb: float | None = None
    # Keep only runs that completed.
    keep_completed_only: bool = True
    # Number of most recent runs that are always kept.
    keep_latest_n: int = 5
73
+
74
+
75
@dataclass
class StorageConfig:
    """Switches that control how experiment data is written to disk."""

    # Save full API responses (default: False).
    save_raw_responses: bool = False
    # Save a copy of the dataset alongside the run (default: True).
    save_dataset: bool = True
    # Compression applied to JSONL files: "gzip" or "none" (default: "gzip").
    compression: Literal["none", "gzip"] = "gzip"
    # Store templates once instead of per record (default: True).
    deduplicate_templates: bool = True
    # Add integrity checksums (default: True).
    enable_checksums: bool = True
    # Mirror metadata into a SQLite database (default: True).
    use_sqlite_metadata: bool = True
    # Save a checkpoint every N records (default: 100).
    checkpoint_interval: int = 100
    # Automatic cleanup policy; None leaves it unset (default: None).
    retention_policy: RetentionPolicy | None = None
98
+
99
+
100
@dataclass
class RunMetadata:
    """Bookkeeping record describing a single run."""

    # Identifier of the run.
    run_id: str
    # Identifier of the experiment the run belongs to.
    experiment_id: str
    # Current lifecycle state.
    status: RunStatus
    # Creation timestamp, stored as a string.
    created_at: str
    # Timestamp of the most recent update, stored as a string.
    updated_at: str
    # Completion timestamp; None until one is recorded.
    completed_at: str | None = None
    # Progress counters.
    total_samples: int = 0
    successful_generations: int = 0
    failed_generations: int = 0
    # Configuration captured for the run; each instance gets its own dict.
    config_snapshot: dict = field(default_factory=dict)
    # Error description; None when no error was recorded.
    error_message: str | None = None
115
+
116
+
117
@dataclass
class EvaluationMetadata:
    """Bookkeeping record describing one evaluation pass over a run."""

    # Identifier of the evaluation.
    eval_id: str
    # Identifier of the run that was evaluated.
    run_id: str
    # Name given to the evaluation.
    eval_name: str
    # Creation timestamp, stored as a string.
    created_at: str
    # Metric configuration; each instance gets its own dict.
    metrics_config: dict = field(default_factory=dict)
    # Counters for evaluated items and failures.
    total_evaluated: int = 0
    total_failures: int = 0
128
+
129
+
130
class DataIntegrityError(Exception):
    """Signals that a data integrity check did not pass."""
134
+
135
+
136
class ConcurrentAccessError(Exception):
    """Signals that a conflict between concurrent accessors was detected."""
140
+
141
+
142
+ class ExperimentStorage:
143
+ """Robust storage with lifecycle management, locking, and integrity checks.
144
+
145
+ Features:
146
+ - Atomic write operations
147
+ - File locking for concurrent access
148
+ - Run lifecycle tracking (in_progress, completed, failed)
149
+ - Experiment-level organization
150
+ - Separate evaluation tracking
151
+ - Persistent indexes
152
+ - Data integrity validation
153
+ - SQLite metadata database
154
+
155
+ Example:
156
+ >>> config = StorageConfig()
157
+ >>> storage = ExperimentStorage("outputs/experiments", config=config)
158
+ >>>
159
+ >>> # Start a run
160
+ >>> metadata = storage.start_run("run-1", "experiment-1", config={})
161
+ >>>
162
+ >>> # Append records with locking
163
+ >>> storage.append_record("run-1", record)
164
+ >>>
165
+ >>> # Complete the run
166
+ >>> storage.complete_run("run-1")
167
+ """
168
+
169
+ def __init__(
170
+ self, root: str | Path, config: StorageConfig | None = None
171
+ ) -> None:
172
+ self._root = Path(root)
173
+ self._root.mkdir(parents=True, exist_ok=True)
174
+ self._config = config or StorageConfig()
175
+
176
+ # Create experiments directory
177
+ self._experiments_dir = self._root / "experiments"
178
+ self._experiments_dir.mkdir(exist_ok=True)
179
+
180
+ # Initialize SQLite database
181
+ if self._config.use_sqlite_metadata:
182
+ self._init_database()
183
+
184
+ # In-memory caches
185
+ self._task_index: dict[str, set[str]] = {}
186
+ self._template_index: dict[str, dict[str, str]] = {}
187
+ self._locks: dict[str, int] = {} # fd for lock files
188
+
189
+ def _init_database(self):
190
+ """Initialize SQLite metadata database."""
191
+ db_path = self._root / "experiments.db"
192
+ conn = sqlite3.connect(db_path)
193
+
194
+ conn.execute("""
195
+ CREATE TABLE IF NOT EXISTS experiments (
196
+ experiment_id TEXT PRIMARY KEY,
197
+ name TEXT NOT NULL,
198
+ description TEXT,
199
+ created_at TEXT NOT NULL,
200
+ updated_at TEXT NOT NULL,
201
+ config TEXT,
202
+ tags TEXT
203
+ )
204
+ """)
205
+
206
+ conn.execute("""
207
+ CREATE TABLE IF NOT EXISTS runs (
208
+ run_id TEXT PRIMARY KEY,
209
+ experiment_id TEXT NOT NULL,
210
+ status TEXT NOT NULL,
211
+ created_at TEXT NOT NULL,
212
+ updated_at TEXT NOT NULL,
213
+ completed_at TEXT,
214
+ total_samples INTEGER DEFAULT 0,
215
+ successful_generations INTEGER DEFAULT 0,
216
+ failed_generations INTEGER DEFAULT 0,
217
+ config_snapshot TEXT,
218
+ error_message TEXT,
219
+ FOREIGN KEY (experiment_id) REFERENCES experiments(experiment_id)
220
+ )
221
+ """)
222
+
223
+ conn.execute("""
224
+ CREATE TABLE IF NOT EXISTS evaluations (
225
+ eval_id TEXT PRIMARY KEY,
226
+ run_id TEXT NOT NULL,
227
+ eval_name TEXT NOT NULL,
228
+ created_at TEXT NOT NULL,
229
+ metrics_config TEXT,
230
+ total_evaluated INTEGER DEFAULT 0,
231
+ total_failures INTEGER DEFAULT 0,
232
+ FOREIGN KEY (run_id) REFERENCES runs(run_id)
233
+ )
234
+ """)
235
+
236
+ conn.execute("""
237
+ CREATE INDEX IF NOT EXISTS idx_runs_experiment
238
+ ON runs(experiment_id)
239
+ """)
240
+
241
+ conn.execute("""
242
+ CREATE INDEX IF NOT EXISTS idx_runs_status
243
+ ON runs(status)
244
+ """)
245
+
246
+ conn.execute("""
247
+ CREATE INDEX IF NOT EXISTS idx_evaluations_run
248
+ ON evaluations(run_id)
249
+ """)
250
+
251
+ conn.commit()
252
+ conn.close()
253
+
254
@contextlib.contextmanager
def _acquire_lock(self, run_id: str):
    """Hold an exclusive, re-entrant lock on a run's directory.

    The lock is a ``.lock`` file inside the run directory, locked with
    ``msvcrt.locking`` on Windows or ``fcntl.flock`` elsewhere. When
    neither is available the body runs without locking (single-process
    use only).

    Re-entrancy: if the calling thread already holds the lock for
    ``run_id`` through this instance, the body runs immediately. Without
    this, a method that holds the lock and calls another locking method
    (as ``append_record`` does with ``update_run_progress``) would open a
    second descriptor and block forever on its own lock.

    Args:
        run_id: Run whose directory is locked.
    """
    import threading

    # Owner bookkeeping is created lazily so it does not depend on __init__.
    owners = self.__dict__.setdefault("_lock_owners", {})
    me = threading.get_ident()

    if owners.get(run_id) == me:
        # Nested acquisition by the thread that already holds the lock.
        yield
        return

    lock_path = self._get_run_dir(run_id) / ".lock"
    lock_path.parent.mkdir(parents=True, exist_ok=True)
    lock_fd = os.open(lock_path, os.O_CREAT | os.O_RDWR)

    locked = False
    try:
        # Acquire exclusive lock (blocking).
        if sys.platform == "win32":
            msvcrt.locking(lock_fd, msvcrt.LK_LOCK, 1)
            locked = True
        elif FCNTL_AVAILABLE:
            fcntl.flock(lock_fd, fcntl.LOCK_EX)
            locked = True
        # If neither is available, proceed without locking.

        self._locks[run_id] = lock_fd
        owners[run_id] = me
        yield
    finally:
        # Drop the bookkeeping before unlocking so a waiter that acquires
        # right after the unlock cannot have its own entry removed.
        if owners.get(run_id) == me:
            owners.pop(run_id, None)
            self._locks.pop(run_id, None)
        try:
            # Only unlock what was actually locked; unlocking after a
            # failed acquisition could mask the original error.
            if locked:
                if sys.platform == "win32":
                    msvcrt.locking(lock_fd, msvcrt.LK_UNLCK, 1)
                else:
                    fcntl.flock(lock_fd, fcntl.LOCK_UN)
        finally:
            os.close(lock_fd)
284
+
285
def start_run(
    self,
    run_id: str,
    experiment_id: str,
    config: dict | None = None,
) -> RunMetadata:
    """Register a new run and persist it with ``in_progress`` status.

    Args:
        run_id: Unique run identifier.
        experiment_id: Experiment the run is recorded under.
        config: Configuration snapshot to store; an empty dict when omitted.

    Returns:
        The freshly saved RunMetadata.

    Raises:
        ValueError: If metadata for ``run_id`` already exists.
    """
    with self._acquire_lock(run_id):
        # Refuse to overwrite an existing run.
        if self._run_metadata_exists(run_id):
            raise ValueError(f"Run {run_id} already exists")

        self._get_run_dir(run_id).mkdir(parents=True, exist_ok=True)

        created = datetime.now().isoformat()
        updated = datetime.now().isoformat()
        snapshot = config or {}
        metadata = RunMetadata(
            run_id=run_id,
            experiment_id=experiment_id,
            status=RunStatus.IN_PROGRESS,
            created_at=created,
            updated_at=updated,
            config_snapshot=snapshot,
        )

        self._save_run_metadata(metadata)
        return metadata
327
+
328
def complete_run(self, run_id: str):
    """Flip a run to ``completed`` and stamp its completion time.

    Args:
        run_id: Run identifier.

    Raises:
        FileNotFoundError: Propagated from ``_load_run_metadata`` when the
            run has no stored metadata.
    """
    with self._acquire_lock(run_id):
        run_meta = self._load_run_metadata(run_id)
        run_meta.status = RunStatus.COMPLETED
        # Two separate clock reads, matching the stored field pair.
        finished_at = datetime.now().isoformat()
        run_meta.completed_at = finished_at
        touched_at = datetime.now().isoformat()
        run_meta.updated_at = touched_at
        self._save_run_metadata(run_meta)
343
+
344
def fail_run(self, run_id: str, error_message: str):
    """Flip a run to ``failed`` and record why.

    Args:
        run_id: Run identifier.
        error_message: Description of the error, stored on the metadata.

    Raises:
        FileNotFoundError: Propagated from ``_load_run_metadata`` when the
            run has no stored metadata.
    """
    with self._acquire_lock(run_id):
        run_meta = self._load_run_metadata(run_id)
        run_meta.status = RunStatus.FAILED
        run_meta.error_message = error_message
        touched_at = datetime.now().isoformat()
        run_meta.updated_at = touched_at
        self._save_run_metadata(run_meta)
357
+
358
def update_run_progress(
    self,
    run_id: str,
    total_samples: int | None = None,
    successful_generations: int | None = None,
    failed_generations: int | None = None,
):
    """Overwrite the progress counters that were supplied and save.

    Counters passed as ``None`` keep their stored value; ``updated_at`` is
    refreshed in every case.

    Args:
        run_id: Run identifier.
        total_samples: New total sample count, if given.
        successful_generations: New success count, if given.
        failed_generations: New failure count, if given.
    """
    with self._acquire_lock(run_id):
        run_meta = self._load_run_metadata(run_id)

        supplied = (
            ("total_samples", total_samples),
            ("successful_generations", successful_generations),
            ("failed_generations", failed_generations),
        )
        for attr_name, new_value in supplied:
            if new_value is None:
                continue
            setattr(run_meta, attr_name, new_value)

        run_meta.updated_at = datetime.now().isoformat()
        self._save_run_metadata(run_meta)
385
+
386
def append_record(
    self,
    run_id: str,
    record: core_entities.GenerationRecord,
    *,
    cache_key: str | None = None,
) -> None:
    """Append a generation record to the run and update its counters.

    The record is written to ``records.jsonl`` in the run's generation
    directory (a header line is written first when the file is new), then
    the run metadata counters are incremented and saved.

    The counters are updated inline rather than through
    ``update_run_progress``: that method takes the run lock itself, and
    the lock is already held here, so calling it would block on a second
    descriptor for the same lock file. For the same reason the optional
    checkpoint is written after the lock has been released.

    Args:
        run_id: Run identifier.
        record: Generation record to append.
        cache_key: Key stored with the record; derived from the record's
            task when omitted.
    """
    with self._acquire_lock(run_id):
        # Ensure generation directory exists.
        gen_dir = self._get_generation_dir(run_id)
        gen_dir.mkdir(parents=True, exist_ok=True)

        path = gen_dir / "records.jsonl"

        # Initialize file with header if needed.
        if not self._file_exists_any_compression(path):
            self._write_jsonl_with_header(path, [], file_type="records")

        payload = self._serialize_record(run_id, record)
        payload["cache_key"] = cache_key or self._task_cache_key(record.task)

        self._atomic_append(path, payload)

        # Update progress counters on the metadata loaded under the lock.
        metadata = self._load_run_metadata(run_id)
        new_successful = metadata.successful_generations + (1 if record.output else 0)
        new_failed = metadata.failed_generations + (1 if record.error else 0)

        metadata.total_samples = metadata.total_samples + 1
        metadata.successful_generations = new_successful
        metadata.failed_generations = new_failed
        metadata.updated_at = datetime.now().isoformat()
        self._save_run_metadata(metadata)

    # Auto-checkpoint if configured (outside the lock, see docstring).
    interval = self._config.checkpoint_interval
    if interval > 0:
        total = new_successful + new_failed
        if total % interval == 0:
            checkpoint_data = {
                "total_samples": total,
                "successful": new_successful,
                "failed": new_failed,
                "timestamp": datetime.now().isoformat(),
            }
            self.save_checkpoint(run_id, checkpoint_data)
441
+
442
+ def _atomic_append(self, path: Path, data: dict):
443
+ """Append data atomically using temp file.
444
+
445
+ Args:
446
+ path: Target file path
447
+ data: Data to append (will be JSON serialized)
448
+ """
449
+ json_line = json.dumps(data) + "\n"
450
+
451
+ # Write to temp file
452
+ temp_fd, temp_path = tempfile.mkstemp(
453
+ dir=path.parent, prefix=".tmp_", suffix=".json"
454
+ )
455
+ temp_path = Path(temp_path)
456
+
457
+ try:
458
+ if self._config.compression == "gzip":
459
+ with gzip.open(temp_path, "wt", encoding="utf-8") as f:
460
+ f.write(json_line)
461
+ f.flush()
462
+ os.fsync(f.fileno())
463
+ else:
464
+ with open(temp_fd, "w", encoding="utf-8") as f:
465
+ f.write(json_line)
466
+ f.flush()
467
+ os.fsync(f.fileno())
468
+ os.close(temp_fd)
469
+
470
+ # Get target path with compression
471
+ target_path = (
472
+ path.with_suffix(path.suffix + ".gz")
473
+ if self._config.compression == "gzip"
474
+ else path
475
+ )
476
+
477
+ # Append to existing file
478
+ if target_path.exists():
479
+ with open(target_path, "ab") as dest:
480
+ with open(temp_path, "rb") as src:
481
+ dest.write(src.read())
482
+ dest.flush()
483
+ os.fsync(dest.fileno())
484
+ else:
485
+ # No existing file, just rename
486
+ temp_path.rename(target_path)
487
+ return
488
+
489
+ finally:
490
+ # Clean up temp file if still exists
491
+ if temp_path.exists():
492
+ temp_path.unlink()
493
+
494
+ def _save_run_metadata(self, metadata: RunMetadata):
495
+ """Save run metadata to both JSON and SQLite.
496
+
497
+ Args:
498
+ metadata: Run metadata to save
499
+ """
500
+ # Save to JSON file
501
+ metadata_path = self._get_run_dir(metadata.run_id) / "metadata.json"
502
+ metadata_dict = {
503
+ "run_id": metadata.run_id,
504
+ "experiment_id": metadata.experiment_id,
505
+ "status": metadata.status.value,
506
+ "created_at": metadata.created_at,
507
+ "updated_at": metadata.updated_at,
508
+ "completed_at": metadata.completed_at,
509
+ "total_samples": metadata.total_samples,
510
+ "successful_generations": metadata.successful_generations,
511
+ "failed_generations": metadata.failed_generations,
512
+ "config_snapshot": metadata.config_snapshot,
513
+ "error_message": metadata.error_message,
514
+ }
515
+ metadata_path.write_text(json.dumps(metadata_dict, indent=2))
516
+
517
+ # Save to SQLite
518
+ if self._config.use_sqlite_metadata:
519
+ self._save_run_metadata_to_db(metadata)
520
+
521
+ def _save_run_metadata_to_db(self, metadata: RunMetadata):
522
+ """Save run metadata to SQLite database."""
523
+ db_path = self._root / "experiments.db"
524
+ conn = sqlite3.connect(db_path)
525
+
526
+ # Ensure experiment exists
527
+ conn.execute(
528
+ """
529
+ INSERT OR IGNORE INTO experiments (experiment_id, name, created_at, updated_at)
530
+ VALUES (?, ?, ?, ?)
531
+ """,
532
+ (
533
+ metadata.experiment_id,
534
+ metadata.experiment_id,
535
+ metadata.created_at,
536
+ metadata.updated_at,
537
+ ),
538
+ )
539
+
540
+ # Upsert run
541
+ conn.execute(
542
+ """
543
+ INSERT OR REPLACE INTO runs (
544
+ run_id, experiment_id, status, created_at, updated_at, completed_at,
545
+ total_samples, successful_generations, failed_generations,
546
+ config_snapshot, error_message
547
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
548
+ """,
549
+ (
550
+ metadata.run_id,
551
+ metadata.experiment_id,
552
+ metadata.status.value,
553
+ metadata.created_at,
554
+ metadata.updated_at,
555
+ metadata.completed_at,
556
+ metadata.total_samples,
557
+ metadata.successful_generations,
558
+ metadata.failed_generations,
559
+ json.dumps(metadata.config_snapshot),
560
+ metadata.error_message,
561
+ ),
562
+ )
563
+
564
+ conn.commit()
565
+ conn.close()
566
+
567
+ def _load_run_metadata(self, run_id: str) -> RunMetadata:
568
+ """Load run metadata from JSON file.
569
+
570
+ Args:
571
+ run_id: Run identifier
572
+
573
+ Returns:
574
+ RunMetadata
575
+
576
+ Raises:
577
+ FileNotFoundError: If metadata doesn't exist
578
+ """
579
+ metadata_path = self._get_run_dir(run_id) / "metadata.json"
580
+ if not metadata_path.exists():
581
+ raise FileNotFoundError(f"Run metadata not found for {run_id}")
582
+
583
+ data = json.loads(metadata_path.read_text())
584
+ return RunMetadata(
585
+ run_id=data["run_id"],
586
+ experiment_id=data["experiment_id"],
587
+ status=RunStatus(data["status"]),
588
+ created_at=data["created_at"],
589
+ updated_at=data["updated_at"],
590
+ completed_at=data.get("completed_at"),
591
+ total_samples=data.get("total_samples", 0),
592
+ successful_generations=data.get("successful_generations", 0),
593
+ failed_generations=data.get("failed_generations", 0),
594
+ config_snapshot=data.get("config_snapshot", {}),
595
+ error_message=data.get("error_message"),
596
+ )
597
+
598
+ def _run_metadata_exists(self, run_id: str) -> bool:
599
+ """Check if run metadata exists."""
600
+ metadata_path = self._get_run_dir(run_id) / "metadata.json"
601
+ return metadata_path.exists()
602
+
603
+ def _get_run_dir(self, run_id: str) -> Path:
604
+ """Get run directory path.
605
+
606
+ Uses hierarchical structure: experiments/<experiment_id>/runs/<run_id>/
607
+ Falls back to experiments/default/runs/<run_id>/ if experiment_id unknown.
608
+ """
609
+ # Check if we already have metadata
610
+ for exp_dir in self._experiments_dir.iterdir():
611
+ if not exp_dir.is_dir():
612
+ continue
613
+ runs_dir = exp_dir / "runs"
614
+ if not runs_dir.exists():
615
+ continue
616
+ candidate_path = runs_dir / run_id / "metadata.json"
617
+ if candidate_path.exists():
618
+ return runs_dir / run_id
619
+
620
+ # Default location for new runs
621
+ return self._experiments_dir / "default" / "runs" / run_id
622
+
623
+ def _get_generation_dir(self, run_id: str) -> Path:
624
+ """Get generation data directory."""
625
+ return self._get_run_dir(run_id) / "generation"
626
+
627
+ def _get_evaluation_dir(self, run_id: str, eval_id: str = "default") -> Path:
628
+ """Get evaluation directory."""
629
+ return self._get_run_dir(run_id) / "evaluations" / eval_id
630
+
631
+ def _file_exists_any_compression(self, path: Path) -> bool:
632
+ """Check if file exists with any compression suffix."""
633
+ return path.exists() or path.with_suffix(path.suffix + ".gz").exists()
634
+
635
+ def _open_for_read(self, path: Path):
636
+ """Open file for reading with automatic compression detection.
637
+
638
+ Args:
639
+ path: File path
640
+
641
+ Returns:
642
+ File handle (text mode)
643
+ """
644
+ # Try .gz version first
645
+ gz_path = path.with_suffix(path.suffix + ".gz")
646
+ if gz_path.exists():
647
+ return gzip.open(gz_path, "rt", encoding="utf-8")
648
+ if path.exists():
649
+ return path.open("r", encoding="utf-8")
650
+ raise FileNotFoundError(f"File not found: {path}")
651
+
652
def _write_jsonl_with_header(
    self, path: Path, items: Iterable[dict], file_type: str
):
    """Create (or overwrite) a JSONL file that starts with a header line.

    The header records the storage format version and ``file_type``. With
    gzip compression the output goes to ``<path>.gz`` instead of ``path``.

    Args:
        path: Target path without the ``.gz`` suffix.
        items: Dicts written one per line after the header.
        file_type: Label stored in the header's ``_file_type`` field.
    """
    compress = self._config.compression == "gzip"
    if compress:
        destination = path.with_suffix(path.suffix + ".gz")
        stream = gzip.open(destination, "wt", encoding="utf-8")
    else:
        destination = path
        stream = open(destination, "w", encoding="utf-8")

    with stream:
        header_line = json.dumps(
            {
                "_type": "header",
                "_format_version": STORAGE_FORMAT_VERSION,
                "_file_type": file_type,
            }
        )
        stream.write(header_line + "\n")

        for entry in items:
            stream.write(json.dumps(entry) + "\n")

        # Push buffered text down to the OS, then to disk where possible.
        stream.flush()
        if hasattr(stream, "fileno"):
            os.fsync(stream.fileno())
680
+
681
+ def cache_dataset(self, run_id: str, dataset: Iterable[dict[str, object]]) -> None:
682
+ """Cache dataset samples to storage.
683
+
684
+ Args:
685
+ run_id: Unique run identifier
686
+ dataset: Iterable of dataset samples
687
+ """
688
+ if not self._config.save_dataset:
689
+ return
690
+
691
+ with self._acquire_lock(run_id):
692
+ gen_dir = self._get_generation_dir(run_id)
693
+ gen_dir.mkdir(parents=True, exist_ok=True)
694
+ path = gen_dir / "dataset.jsonl"
695
+
696
+ self._write_jsonl_with_header(path, dataset, file_type="dataset")
697
+
698
+ def load_dataset(self, run_id: str) -> List[dict[str, object]]:
699
+ """Load cached dataset.
700
+
701
+ Args:
702
+ run_id: Run identifier
703
+
704
+ Returns:
705
+ List of dataset samples
706
+ """
707
+ gen_dir = self._get_generation_dir(run_id)
708
+ path = gen_dir / "dataset.jsonl"
709
+
710
+ rows: list[dict[str, object]] = []
711
+ with self._open_for_read(path) as handle:
712
+ for line in handle:
713
+ if not line.strip():
714
+ continue
715
+ data = json.loads(line)
716
+ if data.get("_type") == "header":
717
+ continue
718
+ rows.append(data)
719
+ return rows
720
+
721
+ def load_cached_records(
722
+ self, run_id: str
723
+ ) -> Dict[str, core_entities.GenerationRecord]:
724
+ """Load cached generation records.
725
+
726
+ Args:
727
+ run_id: Run identifier
728
+
729
+ Returns:
730
+ Dict mapping cache_key to GenerationRecord
731
+ """
732
+ gen_dir = self._get_generation_dir(run_id)
733
+ path = gen_dir / "records.jsonl"
734
+
735
+ try:
736
+ handle = self._open_for_read(path)
737
+ except FileNotFoundError:
738
+ return {}
739
+
740
+ tasks = self._load_tasks(run_id)
741
+ records: dict[str, core_entities.GenerationRecord] = {}
742
+
743
+ with handle:
744
+ for line in handle:
745
+ if not line.strip():
746
+ continue
747
+ data = json.loads(line)
748
+ if data.get("_type") == "header":
749
+ continue
750
+
751
+ key = data.get("cache_key")
752
+ if not key:
753
+ continue
754
+
755
+ record = self._deserialize_record(data, tasks)
756
+ records[key] = record
757
+
758
+ return records
759
+
760
def append_evaluation(
    self,
    run_id: str,
    record: core_entities.GenerationRecord,
    evaluation: core_entities.EvaluationRecord,
    *,
    eval_id: str = "default",
    evaluation_config: dict | None = None,
) -> None:
    """Append one evaluation result to the run's evaluation file.

    The entry is keyed by ``evaluation_cache_key(record.task,
    evaluation_config)`` and written to ``evaluation.jsonl`` under the
    evaluation directory for ``eval_id`` (a header line is written first
    when the file is new).

    Args:
        run_id: Run identifier.
        record: Generation record that was evaluated.
        evaluation: Evaluation record to store.
        eval_id: Evaluation identifier (default: "default").
        evaluation_config: Evaluation configuration passed into the cache
            key.
    """
    with self._acquire_lock(run_id):
        target_dir = self._get_evaluation_dir(run_id, eval_id)
        target_dir.mkdir(parents=True, exist_ok=True)
        target = target_dir / "evaluation.jsonl"

        file_is_new = not self._file_exists_any_compression(target)
        if file_is_new:
            self._write_jsonl_with_header(target, [], file_type="evaluation")

        key = evaluation_cache_key(record.task, evaluation_config)
        serialized = core_serialization.serialize_evaluation_record(evaluation)
        self._atomic_append(target, {"cache_key": key, "evaluation": serialized})
795
+
796
+ def load_cached_evaluations(
797
+ self, run_id: str, eval_id: str = "default", evaluation_config: dict | None = None
798
+ ) -> Dict[str, core_entities.EvaluationRecord]:
799
+ """Load cached evaluation records.
800
+
801
+ Args:
802
+ run_id: Run identifier
803
+ eval_id: Evaluation identifier
804
+ evaluation_config: Evaluation configuration for cache key matching
805
+
806
+ Returns:
807
+ Dict mapping cache_key to EvaluationRecord
808
+
809
+ Note:
810
+ If evaluation_config is provided, only evaluations matching that config
811
+ will be loaded. This ensures that changing metrics invalidates the cache.
812
+ """
813
+ eval_dir = self._get_evaluation_dir(run_id, eval_id)
814
+ path = eval_dir / "evaluation.jsonl"
815
+
816
+ try:
817
+ handle = self._open_for_read(path)
818
+ except FileNotFoundError:
819
+ return {}
820
+
821
+ evaluations: dict[str, core_entities.EvaluationRecord] = {}
822
+
823
+ with handle:
824
+ for line in handle:
825
+ if not line.strip():
826
+ continue
827
+ data = json.loads(line)
828
+ if data.get("_type") == "header":
829
+ continue
830
+
831
+ key = data.get("cache_key")
832
+ if not key:
833
+ continue
834
+
835
+ evaluations[key] = core_serialization.deserialize_evaluation_record(
836
+ data["evaluation"]
837
+ )
838
+
839
+ return evaluations
840
+
841
+ def get_run_path(self, run_id: str) -> Path:
842
+ """Get the filesystem path for a run's storage directory.
843
+
844
+ Args:
845
+ run_id: Unique run identifier
846
+
847
+ Returns:
848
+ Path to the run's storage directory
849
+ """
850
+ return self._get_run_dir(run_id)
851
+
852
+ def _serialize_record(
853
+ self, run_id: str, record: core_entities.GenerationRecord
854
+ ) -> dict:
855
+ """Serialize generation record."""
856
+ task_key = self._persist_task(run_id, record.task)
857
+
858
+ # Prepare output data
859
+ output_data = None
860
+ if record.output:
861
+ output_data = {"text": record.output.text}
862
+ if self._config.save_raw_responses:
863
+ output_data["raw"] = record.output.raw
864
+
865
+ return {
866
+ "task_key": task_key,
867
+ "output": output_data,
868
+ "error": {
869
+ "message": record.error.message,
870
+ "kind": record.error.kind,
871
+ "details": record.error.details,
872
+ }
873
+ if record.error
874
+ else None,
875
+ "metrics": record.metrics,
876
+ "attempts": [
877
+ self._serialize_record(run_id, attempt) for attempt in record.attempts
878
+ ],
879
+ }
880
+
881
+ def _deserialize_record(
882
+ self, payload: dict, tasks: dict[str, core_entities.GenerationTask]
883
+ ) -> core_entities.GenerationRecord:
884
+ """Deserialize generation record."""
885
+ task_key = payload["task_key"]
886
+ task = tasks[task_key]
887
+ output_data = payload.get("output")
888
+ error_data = payload.get("error")
889
+
890
+ record = core_entities.GenerationRecord(
891
+ task=task,
892
+ output=core_entities.ModelOutput(
893
+ text=output_data["text"],
894
+ raw=output_data.get("raw")
895
+ )
896
+ if output_data
897
+ else None,
898
+ error=core_entities.ModelError(
899
+ message=error_data["message"],
900
+ kind=error_data.get("kind", "model_error"),
901
+ details=error_data.get("details", {}),
902
+ )
903
+ if error_data
904
+ else None,
905
+ metrics=payload.get("metrics", {}),
906
+ )
907
+
908
+ record.attempts = [
909
+ self._deserialize_record(attempt, tasks)
910
+ for attempt in payload.get("attempts", [])
911
+ ]
912
+
913
+ return record
914
+
915
+ def _persist_task(self, run_id: str, task: core_entities.GenerationTask) -> str:
916
+ """Persist task and return cache key."""
917
+ # Implementation similar to original but with atomic writes
918
+ # and proper locking (already have lock from append_record)
919
+ key = self._task_cache_key(task)
920
+ index = self._load_task_index(run_id)
921
+
922
+ if key in index:
923
+ return key
924
+
925
+ gen_dir = self._get_generation_dir(run_id)
926
+ gen_dir.mkdir(parents=True, exist_ok=True)
927
+ path = gen_dir / "tasks.jsonl"
928
+
929
+ # Initialize if needed
930
+ if not self._file_exists_any_compression(path):
931
+ self._write_jsonl_with_header(path, [], file_type="tasks")
932
+
933
+ # Serialize task
934
+ if self._config.deduplicate_templates:
935
+ template_id = self._persist_template(run_id, task.prompt.spec)
936
+ task_data = core_serialization.serialize_generation_task(task)
937
+ task_data["prompt"]["spec"] = {"_template_ref": template_id}
938
+ else:
939
+ task_data = core_serialization.serialize_generation_task(task)
940
+
941
+ payload = {"task_key": key, "task": task_data}
942
+ self._atomic_append(path, payload)
943
+
944
+ index.add(key)
945
+ self._save_task_index(run_id, index)
946
+
947
+ return key
948
+
949
+ def _persist_template(
950
+ self, run_id: str, spec: core_entities.PromptSpec
951
+ ) -> str:
952
+ """Persist prompt template."""
953
+ template_content = f"{spec.name}:{spec.template}"
954
+ template_id = hashlib.sha256(template_content.encode("utf-8")).hexdigest()[:16]
955
+
956
+ if run_id not in self._template_index:
957
+ self._template_index[run_id] = {}
958
+ self._load_templates(run_id)
959
+
960
+ if template_id in self._template_index[run_id]:
961
+ return template_id
962
+
963
+ gen_dir = self._get_generation_dir(run_id)
964
+ path = gen_dir / "templates.jsonl"
965
+
966
+ if not self._file_exists_any_compression(path):
967
+ self._write_jsonl_with_header(path, [], file_type="templates")
968
+
969
+ payload = {
970
+ "template_id": template_id,
971
+ "spec": core_serialization.serialize_prompt_spec(spec),
972
+ }
973
+ self._atomic_append(path, payload)
974
+
975
+ self._template_index[run_id][template_id] = spec.template
976
+ return template_id
977
+
978
+ def _load_task_index(self, run_id: str) -> set[str]:
979
+ """Load task index from disk cache or rebuild."""
980
+ if run_id in self._task_index:
981
+ return self._task_index[run_id]
982
+
983
+ # Try to load from persisted index
984
+ index_path = self._get_run_dir(run_id) / ".index.json"
985
+ if index_path.exists():
986
+ index_data = json.loads(index_path.read_text())
987
+ self._task_index[run_id] = set(index_data.get("task_keys", []))
988
+ return self._task_index[run_id]
989
+
990
+ # Rebuild from tasks file
991
+ self._task_index[run_id] = set()
992
+ return self._task_index[run_id]
993
+
994
+ def _save_task_index(self, run_id: str, index: set[str]):
995
+ """Save task index to disk."""
996
+ index_path = self._get_run_dir(run_id) / ".index.json"
997
+ index_data = {
998
+ "task_keys": list(index),
999
+ "template_ids": self._template_index.get(run_id, {}),
1000
+ "last_updated": datetime.now().isoformat(),
1001
+ }
1002
+ index_path.write_text(json.dumps(index_data))
1003
+
1004
+ def _load_templates(self, run_id: str) -> dict[str, core_entities.PromptSpec]:
1005
+ """Load templates from disk.
1006
+
1007
+ Args:
1008
+ run_id: Run identifier
1009
+
1010
+ Returns:
1011
+ Dict mapping template_id to PromptSpec
1012
+ """
1013
+ gen_dir = self._get_generation_dir(run_id)
1014
+ path = gen_dir / "templates.jsonl"
1015
+
1016
+ templates: dict[str, core_entities.PromptSpec] = {}
1017
+ try:
1018
+ handle = self._open_for_read(path)
1019
+ except FileNotFoundError:
1020
+ return templates
1021
+
1022
+ with handle:
1023
+ for line in handle:
1024
+ if not line.strip():
1025
+ continue
1026
+ data = json.loads(line)
1027
+ if data.get("_type") == "header":
1028
+ continue
1029
+
1030
+ template_id = data["template_id"]
1031
+ templates[template_id] = core_serialization.deserialize_prompt_spec(
1032
+ data["spec"]
1033
+ )
1034
+
1035
+ return templates
1036
+
1037
+ def _load_tasks(self, run_id: str) -> dict[str, core_entities.GenerationTask]:
1038
+ """Load tasks from disk.
1039
+
1040
+ Args:
1041
+ run_id: Run identifier
1042
+
1043
+ Returns:
1044
+ Dict mapping task_key to GenerationTask
1045
+ """
1046
+ gen_dir = self._get_generation_dir(run_id)
1047
+ path = gen_dir / "tasks.jsonl"
1048
+
1049
+ tasks: dict[str, core_entities.GenerationTask] = {}
1050
+ try:
1051
+ handle = self._open_for_read(path)
1052
+ except FileNotFoundError:
1053
+ return tasks
1054
+
1055
+ # Load templates if deduplication enabled
1056
+ templates = self._load_templates(run_id) if self._config.deduplicate_templates else {}
1057
+
1058
+ with handle:
1059
+ for line in handle:
1060
+ if not line.strip():
1061
+ continue
1062
+ data = json.loads(line)
1063
+ if data.get("_type") == "header":
1064
+ continue
1065
+
1066
+ task_key = data["task_key"]
1067
+ task_data = data["task"]
1068
+
1069
+ # Restore template from reference if needed
1070
+ if (
1071
+ self._config.deduplicate_templates
1072
+ and "_template_ref" in task_data.get("prompt", {}).get("spec", {})
1073
+ ):
1074
+ template_id = task_data["prompt"]["spec"]["_template_ref"]
1075
+ if template_id in templates:
1076
+ task_data["prompt"]["spec"] = core_serialization.serialize_prompt_spec(
1077
+ templates[template_id]
1078
+ )
1079
+
1080
+ tasks[task_key] = core_serialization.deserialize_generation_task(task_data)
1081
+
1082
+ self._task_index[run_id] = set(tasks.keys())
1083
+ return tasks
1084
+
1085
+ def _task_cache_key(self, task: core_entities.GenerationTask) -> str:
1086
+ """Generate cache key for task."""
1087
+ dataset_raw = task.metadata.get("dataset_id") or task.metadata.get("sample_id")
1088
+ dataset_id = str(dataset_raw) if dataset_raw is not None else ""
1089
+ prompt_hash = hashlib.sha256(task.prompt.text.encode("utf-8")).hexdigest()[:12]
1090
+ sampling = task.sampling
1091
+ sampling_key = (
1092
+ f"{sampling.temperature:.3f}-{sampling.top_p:.3f}-{sampling.max_tokens}"
1093
+ )
1094
+ template = task.prompt.spec.name
1095
+ model = task.model.identifier
1096
+ return "::".join(
1097
+ filter(None, [dataset_id, template, model, sampling_key, prompt_hash])
1098
+ )
1099
+
1100
+ # ===== Phase 3 Features =====
1101
+
1102
+ def save_checkpoint(self, run_id: str, checkpoint_data: dict):
1103
+ """Save checkpoint for resumability.
1104
+
1105
+ Args:
1106
+ run_id: Run identifier
1107
+ checkpoint_data: Checkpoint data to save
1108
+ """
1109
+ with self._acquire_lock(run_id):
1110
+ checkpoint_dir = self._get_run_dir(run_id) / "checkpoints"
1111
+ checkpoint_dir.mkdir(exist_ok=True)
1112
+
1113
+ # Use timestamp for checkpoint filename
1114
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
1115
+ checkpoint_path = checkpoint_dir / f"checkpoint_{timestamp}.json"
1116
+
1117
+ checkpoint_path.write_text(json.dumps(checkpoint_data, indent=2))
1118
+
1119
+ def load_latest_checkpoint(self, run_id: str) -> dict | None:
1120
+ """Load most recent checkpoint.
1121
+
1122
+ Args:
1123
+ run_id: Run identifier
1124
+
1125
+ Returns:
1126
+ Checkpoint data or None if no checkpoints exist
1127
+ """
1128
+ checkpoint_dir = self._get_run_dir(run_id) / "checkpoints"
1129
+ if not checkpoint_dir.exists():
1130
+ return None
1131
+
1132
+ # Find latest checkpoint
1133
+ checkpoints = sorted(checkpoint_dir.glob("checkpoint_*.json"), reverse=True)
1134
+ if not checkpoints:
1135
+ return None
1136
+
1137
+ return json.loads(checkpoints[0].read_text())
1138
+
1139
+ def apply_retention_policy(self, policy: RetentionPolicy | None = None):
1140
+ """Apply retention policy to clean up old runs.
1141
+
1142
+ Args:
1143
+ policy: Retention policy (uses config if not provided)
1144
+ """
1145
+ policy = policy or self._config.retention_policy
1146
+ if not policy:
1147
+ return
1148
+
1149
+ # Get all experiments
1150
+ for exp_dir in self._experiments_dir.iterdir():
1151
+ if not exp_dir.is_dir():
1152
+ continue
1153
+
1154
+ runs_dir = exp_dir / "runs"
1155
+ if not runs_dir.exists():
1156
+ continue
1157
+
1158
+ # Load all run metadata
1159
+ runs = []
1160
+ for run_dir in runs_dir.iterdir():
1161
+ if not run_dir.is_dir():
1162
+ continue
1163
+ metadata_path = run_dir / "metadata.json"
1164
+ if not metadata_path.exists():
1165
+ continue
1166
+
1167
+ try:
1168
+ metadata = self._load_run_metadata(run_dir.name)
1169
+ runs.append((run_dir, metadata))
1170
+ except Exception:
1171
+ continue
1172
+
1173
+ # Sort by creation time (newest first)
1174
+ runs.sort(key=lambda x: x[1].created_at, reverse=True)
1175
+
1176
+ # Apply policies
1177
+ runs_to_delete = []
1178
+
1179
+ for i, (run_dir, metadata) in enumerate(runs):
1180
+ # Always keep latest N runs
1181
+ if i < policy.keep_latest_n:
1182
+ continue
1183
+
1184
+ # Check if should keep based on status
1185
+ if policy.keep_completed_only and metadata.status != RunStatus.COMPLETED:
1186
+ runs_to_delete.append(run_dir)
1187
+ continue
1188
+
1189
+ # Check age policy
1190
+ if policy.max_age_days:
1191
+ created = datetime.fromisoformat(metadata.created_at)
1192
+ age = datetime.now() - created
1193
+ if age > timedelta(days=policy.max_age_days):
1194
+ runs_to_delete.append(run_dir)
1195
+ continue
1196
+
1197
+ # Check max runs policy
1198
+ if policy.max_runs_per_experiment:
1199
+ if i >= policy.max_runs_per_experiment:
1200
+ runs_to_delete.append(run_dir)
1201
+
1202
+ # Delete runs
1203
+ for run_dir in runs_to_delete:
1204
+ self._delete_run_dir(run_dir)
1205
+
1206
+ def _delete_run_dir(self, run_dir: Path):
1207
+ """Delete run directory and update database.
1208
+
1209
+ Args:
1210
+ run_dir: Run directory to delete
1211
+ """
1212
+ run_id = run_dir.name
1213
+
1214
+ # Remove from SQLite
1215
+ if self._config.use_sqlite_metadata:
1216
+ db_path = self._root / "experiments.db"
1217
+ conn = sqlite3.connect(db_path)
1218
+ conn.execute("DELETE FROM runs WHERE run_id = ?", (run_id,))
1219
+ conn.commit()
1220
+ conn.close()
1221
+
1222
+ # Remove directory
1223
+ shutil.rmtree(run_dir, ignore_errors=True)
1224
+
1225
+ def get_storage_size(self, experiment_id: str | None = None) -> int:
1226
+ """Get total storage size in bytes.
1227
+
1228
+ Args:
1229
+ experiment_id: Optional experiment to check (all if None)
1230
+
1231
+ Returns:
1232
+ Total size in bytes
1233
+ """
1234
+ if experiment_id:
1235
+ exp_dir = self._experiments_dir / experiment_id
1236
+ if not exp_dir.exists():
1237
+ return 0
1238
+ return sum(f.stat().st_size for f in exp_dir.rglob("*") if f.is_file())
1239
+ else:
1240
+ return sum(f.stat().st_size for f in self._experiments_dir.rglob("*") if f.is_file())
1241
+
1242
+ def list_runs(
1243
+ self,
1244
+ experiment_id: str | None = None,
1245
+ status: RunStatus | None = None,
1246
+ limit: int | None = None
1247
+ ) -> list[RunMetadata]:
1248
+ """List runs with optional filtering.
1249
+
1250
+ Args:
1251
+ experiment_id: Filter by experiment
1252
+ status: Filter by status
1253
+ limit: Maximum number of runs to return
1254
+
1255
+ Returns:
1256
+ List of run metadata
1257
+ """
1258
+ if not self._config.use_sqlite_metadata:
1259
+ # Fallback to file-based listing
1260
+ return self._list_runs_from_files(experiment_id, status, limit)
1261
+
1262
+ # Query SQLite
1263
+ db_path = self._root / "experiments.db"
1264
+ conn = sqlite3.connect(db_path)
1265
+
1266
+ query = "SELECT * FROM runs WHERE 1=1"
1267
+ params = []
1268
+
1269
+ if experiment_id:
1270
+ query += " AND experiment_id = ?"
1271
+ params.append(experiment_id)
1272
+
1273
+ if status:
1274
+ query += " AND status = ?"
1275
+ params.append(status.value)
1276
+
1277
+ query += " ORDER BY created_at DESC"
1278
+
1279
+ if limit:
1280
+ query += " LIMIT ?"
1281
+ params.append(limit)
1282
+
1283
+ cursor = conn.execute(query, params)
1284
+ rows = cursor.fetchall()
1285
+ conn.close()
1286
+
1287
+ # Convert to RunMetadata
1288
+ runs = []
1289
+ for row in rows:
1290
+ runs.append(RunMetadata(
1291
+ run_id=row[0],
1292
+ experiment_id=row[1],
1293
+ status=RunStatus(row[2]),
1294
+ created_at=row[3],
1295
+ updated_at=row[4],
1296
+ completed_at=row[5],
1297
+ total_samples=row[6] or 0,
1298
+ successful_generations=row[7] or 0,
1299
+ failed_generations=row[8] or 0,
1300
+ config_snapshot=json.loads(row[9]) if row[9] else {},
1301
+ error_message=row[10],
1302
+ ))
1303
+
1304
+ return runs
1305
+
1306
+ def _list_runs_from_files(
1307
+ self,
1308
+ experiment_id: str | None,
1309
+ status: RunStatus | None,
1310
+ limit: int | None
1311
+ ) -> list[RunMetadata]:
1312
+ """List runs by scanning files (fallback)."""
1313
+ runs = []
1314
+
1315
+ # Scan experiment directories
1316
+ exp_dirs = [self._experiments_dir / experiment_id] if experiment_id else list(self._experiments_dir.iterdir())
1317
+
1318
+ for exp_dir in exp_dirs:
1319
+ if not exp_dir.is_dir():
1320
+ continue
1321
+
1322
+ runs_dir = exp_dir / "runs"
1323
+ if not runs_dir.exists():
1324
+ continue
1325
+
1326
+ for run_dir in runs_dir.iterdir():
1327
+ if not run_dir.is_dir():
1328
+ continue
1329
+
1330
+ try:
1331
+ metadata = self._load_run_metadata(run_dir.name)
1332
+ if status and metadata.status != status:
1333
+ continue
1334
+ runs.append(metadata)
1335
+ except Exception:
1336
+ continue
1337
+
1338
+ # Sort by creation time
1339
+ runs.sort(key=lambda r: r.created_at, reverse=True)
1340
+
1341
+ if limit:
1342
+ runs = runs[:limit]
1343
+
1344
+ return runs
1345
+
1346
+ def validate_integrity(self, run_id: str) -> dict:
1347
+ """Validate data integrity for a run.
1348
+
1349
+ Args:
1350
+ run_id: Run identifier
1351
+
1352
+ Returns:
1353
+ Dict with validation results
1354
+ """
1355
+ results = {
1356
+ "run_id": run_id,
1357
+ "valid": True,
1358
+ "errors": [],
1359
+ "warnings": [],
1360
+ }
1361
+
1362
+ run_dir = self._get_run_dir(run_id)
1363
+ if not run_dir.exists():
1364
+ results["valid"] = False
1365
+ results["errors"].append(f"Run directory not found: {run_dir}")
1366
+ return results
1367
+
1368
+ # Check metadata
1369
+ metadata_path = run_dir / "metadata.json"
1370
+ if not metadata_path.exists():
1371
+ results["valid"] = False
1372
+ results["errors"].append("Missing metadata.json")
1373
+
1374
+ # Check generation directory
1375
+ gen_dir = run_dir / "generation"
1376
+ if not gen_dir.exists():
1377
+ results["warnings"].append("No generation directory")
1378
+ else:
1379
+ # Check for required files
1380
+ for filename in ["records.jsonl", "tasks.jsonl"]:
1381
+ if not self._file_exists_any_compression(gen_dir / filename):
1382
+ results["warnings"].append(f"Missing {filename}")
1383
+
1384
+ # Check lock file
1385
+ lock_path = run_dir / ".lock"
1386
+ if not lock_path.exists():
1387
+ results["warnings"].append("No lock file (may not have been used)")
1388
+
1389
+ return results
1390
+
1391
+
1392
def task_cache_key(task: core_entities.GenerationTask) -> str:
    """Derive a stable cache key for a generation task.

    Module-level function kept for backward compatibility. The key joins,
    with ``::``, the dataset/sample id, template name, model identifier,
    sampling settings and a 12-character SHA-256 prefix of the prompt text;
    empty components are dropped.
    """
    components = []

    identifier = task.metadata.get("dataset_id") or task.metadata.get("sample_id")
    components.append(str(identifier) if identifier is not None else "")

    prompt_digest = hashlib.sha256(task.prompt.text.encode("utf-8")).hexdigest()[:12]

    settings = task.sampling
    sampling_component = "-".join(
        [
            f"{settings.temperature:.3f}",
            f"{settings.top_p:.3f}",
            f"{settings.max_tokens}",
        ]
    )

    components.append(task.prompt.spec.name)
    components.append(task.model.identifier)
    components.append(sampling_component)
    components.append(prompt_digest)

    return "::".join([component for component in components if component])
1406
+
1407
+
1408
def evaluation_cache_key(
    task: core_entities.GenerationTask,
    evaluation_config: dict | None = None,
) -> str:
    """Derive a stable cache key covering both the task and its evaluation setup.

    Because the evaluation configuration is part of the key, changing metrics
    or other evaluation settings yields a different key and so triggers
    re-evaluation even when the generation itself is cached.

    Args:
        task: Generation task
        evaluation_config: Dictionary with evaluation configuration:
            - metrics: List of metric names/types
            - extractor: Extractor type/configuration
            - Any other evaluation settings

    Returns:
        The plain task key when no (or an empty) config is given, kept for
        backward compatibility. Otherwise the task key followed by
        ``::eval:`` and a 12-character hash of the config.

    Example:
        >>> config = {
        ...     "metrics": ["exact_match", "f1_score"],
        ...     "extractor": "json_field_extractor:answer"
        ... }
        >>> key = evaluation_cache_key(task, config)
    """
    base_key = task_cache_key(task)

    if evaluation_config:
        # Sorted keys make the JSON text, and so the hash, independent of
        # dict insertion order.
        canonical = json.dumps(evaluation_config, sort_keys=True)
        digest = hashlib.sha256(canonical.encode("utf-8")).hexdigest()
        return f"{base_key}::eval:{digest[:12]}"

    return base_key
1445
+
1446
+
1447
# Names exported by ``from <module> import *``.
__all__ = [
    "ExperimentStorage",
    "StorageConfig",
    "RunMetadata",
    "EvaluationMetadata",
    "RunStatus",
    "RetentionPolicy",
    "DataIntegrityError",
    "ConcurrentAccessError",
    "task_cache_key",
    "evaluation_cache_key",
]