themis-eval 0.2.3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. themis/__init__.py +5 -2
  2. themis/_version.py +14 -1
  3. themis/api.py +83 -145
  4. themis/backends/storage.py +5 -0
  5. themis/cli/commands/info.py +2 -11
  6. themis/cli/main.py +231 -40
  7. themis/comparison/engine.py +7 -13
  8. themis/core/entities.py +4 -0
  9. themis/evaluation/metric_pipeline.py +12 -0
  10. themis/evaluation/pipeline.py +22 -0
  11. themis/evaluation/pipelines/__init__.py +4 -0
  12. themis/evaluation/pipelines/composable_pipeline.py +55 -0
  13. themis/evaluation/pipelines/standard_pipeline.py +16 -0
  14. themis/experiment/__init__.py +2 -2
  15. themis/experiment/cache_manager.py +15 -1
  16. themis/experiment/definitions.py +1 -1
  17. themis/experiment/orchestrator.py +21 -11
  18. themis/experiment/share.py +264 -0
  19. themis/experiment/storage.py +345 -298
  20. themis/generation/router.py +22 -4
  21. themis/generation/runner.py +16 -1
  22. themis/presets/benchmarks.py +602 -17
  23. themis/server/app.py +38 -26
  24. themis/session.py +125 -0
  25. themis/specs/__init__.py +7 -0
  26. themis/specs/execution.py +26 -0
  27. themis/specs/experiment.py +33 -0
  28. themis/specs/storage.py +18 -0
  29. themis/storage/__init__.py +6 -0
  30. themis/storage/experiment_storage.py +7 -0
  31. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/METADATA +47 -34
  32. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/RECORD +35 -28
  33. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/WHEEL +1 -1
  34. themis/experiment/builder.py +0 -151
  35. themis/experiment/export_csv.py +0 -159
  36. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE +0 -0
  37. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt +0 -0
themis/experiment/storage.py
@@ -20,6 +20,8 @@ import os
  import sqlite3
  import sys
  import tempfile
+ import threading
+ import time
  from dataclasses import dataclass, field
  import shutil
  from datetime import datetime, timedelta
@@ -27,13 +29,13 @@ from enum import Enum
  from pathlib import Path
  from typing import Dict, Iterable, List, Literal

- # fcntl is Unix-only, use msvcrt on Windows
+ # fcntl is Unix-only
  if sys.platform == "win32":
-     import msvcrt
      FCNTL_AVAILABLE = False
  else:
      try:
          import fcntl
+
          FCNTL_AVAILABLE = True
      except ImportError:
          FCNTL_AVAILABLE = False
@@ -56,7 +58,7 @@ class RunStatus(str, Enum):
  @dataclass
  class RetentionPolicy:
      """Retention policy for automatic cleanup.
-
+
      Attributes:
          max_runs_per_experiment: Maximum runs to keep per experiment
          max_age_days: Maximum age in days for runs
@@ -64,7 +66,7 @@ class RetentionPolicy:
          keep_completed_only: Only keep completed runs
          keep_latest_n: Always keep N most recent runs
      """
-
+
      max_runs_per_experiment: int | None = None
      max_age_days: int | None = None
      max_storage_gb: float | None = None
@@ -155,20 +157,18 @@ class ExperimentStorage:
      Example:
          >>> config = StorageConfig()
          >>> storage = ExperimentStorage("outputs/experiments", config=config)
-         >>>
+         >>>
          >>> # Start a run
          >>> metadata = storage.start_run("run-1", "experiment-1", config={})
-         >>>
+         >>>
          >>> # Append records with locking
          >>> storage.append_record("run-1", record)
-         >>>
+         >>>
          >>> # Complete the run
          >>> storage.complete_run("run-1")
      """

-     def __init__(
-         self, root: str | Path, config: StorageConfig | None = None
-     ) -> None:
+     def __init__(self, root: str | Path, config: StorageConfig | None = None) -> None:
          self._root = Path(root)
          self._root.mkdir(parents=True, exist_ok=True)
          self._config = config or StorageConfig()
@@ -178,6 +178,7 @@ class ExperimentStorage:
          self._experiments_dir.mkdir(exist_ok=True)

          # Initialize SQLite database
+         self._db_write_lock = threading.RLock()
          if self._config.use_sqlite_metadata:
              self._init_database()

@@ -188,93 +189,98 @@ class ExperimentStorage:

      def _init_database(self):
          """Initialize SQLite metadata database."""
+         with self._db_write_lock:
+             with self._connect_db() as conn:
+                 # WAL allows concurrent readers with a single writer and
+                 # significantly reduces lock contention in threaded CI runs.
+                 conn.execute("PRAGMA journal_mode=WAL")
+                 conn.execute("""
+                     CREATE TABLE IF NOT EXISTS experiments (
+                         experiment_id TEXT PRIMARY KEY,
+                         name TEXT NOT NULL,
+                         description TEXT,
+                         created_at TEXT NOT NULL,
+                         updated_at TEXT NOT NULL,
+                         config TEXT,
+                         tags TEXT
+                     )
+                 """)
+
+                 conn.execute("""
+                     CREATE TABLE IF NOT EXISTS runs (
+                         run_id TEXT PRIMARY KEY,
+                         experiment_id TEXT NOT NULL,
+                         status TEXT NOT NULL,
+                         created_at TEXT NOT NULL,
+                         updated_at TEXT NOT NULL,
+                         completed_at TEXT,
+                         total_samples INTEGER DEFAULT 0,
+                         successful_generations INTEGER DEFAULT 0,
+                         failed_generations INTEGER DEFAULT 0,
+                         config_snapshot TEXT,
+                         error_message TEXT,
+                         FOREIGN KEY (experiment_id) REFERENCES experiments(experiment_id)
+                     )
+                 """)
+
+                 conn.execute("""
+                     CREATE TABLE IF NOT EXISTS evaluations (
+                         eval_id TEXT PRIMARY KEY,
+                         run_id TEXT NOT NULL,
+                         eval_name TEXT NOT NULL,
+                         created_at TEXT NOT NULL,
+                         metrics_config TEXT,
+                         total_evaluated INTEGER DEFAULT 0,
+                         total_failures INTEGER DEFAULT 0,
+                         FOREIGN KEY (run_id) REFERENCES runs(run_id)
+                     )
+                 """)
+
+                 conn.execute("""
+                     CREATE INDEX IF NOT EXISTS idx_runs_experiment
+                     ON runs(experiment_id)
+                 """)
+
+                 conn.execute("""
+                     CREATE INDEX IF NOT EXISTS idx_runs_status
+                     ON runs(status)
+                 """)
+
+                 conn.execute("""
+                     CREATE INDEX IF NOT EXISTS idx_evaluations_run
+                     ON evaluations(run_id)
+                 """)
+                 conn.commit()
+
+     def _connect_db(self) -> sqlite3.Connection:
+         """Create a SQLite connection configured for concurrent access."""
          db_path = self._root / "experiments.db"
-         conn = sqlite3.connect(db_path)
-
-         conn.execute("""
-             CREATE TABLE IF NOT EXISTS experiments (
-                 experiment_id TEXT PRIMARY KEY,
-                 name TEXT NOT NULL,
-                 description TEXT,
-                 created_at TEXT NOT NULL,
-                 updated_at TEXT NOT NULL,
-                 config TEXT,
-                 tags TEXT
-             )
-         """)
-
-         conn.execute("""
-             CREATE TABLE IF NOT EXISTS runs (
-                 run_id TEXT PRIMARY KEY,
-                 experiment_id TEXT NOT NULL,
-                 status TEXT NOT NULL,
-                 created_at TEXT NOT NULL,
-                 updated_at TEXT NOT NULL,
-                 completed_at TEXT,
-                 total_samples INTEGER DEFAULT 0,
-                 successful_generations INTEGER DEFAULT 0,
-                 failed_generations INTEGER DEFAULT 0,
-                 config_snapshot TEXT,
-                 error_message TEXT,
-                 FOREIGN KEY (experiment_id) REFERENCES experiments(experiment_id)
-             )
-         """)
-
-         conn.execute("""
-             CREATE TABLE IF NOT EXISTS evaluations (
-                 eval_id TEXT PRIMARY KEY,
-                 run_id TEXT NOT NULL,
-                 eval_name TEXT NOT NULL,
-                 created_at TEXT NOT NULL,
-                 metrics_config TEXT,
-                 total_evaluated INTEGER DEFAULT 0,
-                 total_failures INTEGER DEFAULT 0,
-                 FOREIGN KEY (run_id) REFERENCES runs(run_id)
-             )
-         """)
-
-         conn.execute("""
-             CREATE INDEX IF NOT EXISTS idx_runs_experiment
-             ON runs(experiment_id)
-         """)
-
-         conn.execute("""
-             CREATE INDEX IF NOT EXISTS idx_runs_status
-             ON runs(status)
-         """)
-
-         conn.execute("""
-             CREATE INDEX IF NOT EXISTS idx_evaluations_run
-             ON evaluations(run_id)
-         """)
-
-         conn.commit()
-         conn.close()
+         conn = sqlite3.connect(db_path, timeout=30.0)
+         conn.execute("PRAGMA busy_timeout=30000")
+         return conn

      @contextlib.contextmanager
      def _acquire_lock(self, run_id: str):
          """Acquire exclusive lock for run directory with timeout (reentrant).
-
+
          This lock is reentrant within the same thread to prevent deadlocks when
-         the same process acquires the lock multiple times (e.g., start_run()
+         the same process acquires the lock multiple times (e.g., start_run()
          followed by append_record()).
-
+
          The lock uses OS-specific file locking:
          - Unix/Linux/macOS: fcntl.flock with non-blocking retry
          - Windows: msvcrt.locking
          - Fallback: No locking (single-process mode)
-
+
          Args:
              run_id: Unique run identifier
-
+
          Yields:
              Context manager that holds the lock
-
+
          Raises:
              TimeoutError: If lock cannot be acquired within 30 seconds
          """
-         import time
-
          # Check if we already hold the lock (reentrant)
          if run_id in self._locks:
              lock_fd, count = self._locks[run_id]
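
Note on the hunk above: the new `_connect_db` helper pairs a 30-second driver-level `timeout` with the matching `busy_timeout` pragma, while `_init_database` switches the database to WAL mode once. A minimal self-contained sketch of the same recipe (the file and table names here are illustrative, not from this package):

import sqlite3

def connect(db_path: str) -> sqlite3.Connection:
    # timeout= makes the driver wait on a busy database instead of raising
    # immediately; the pragma enforces the same wait inside SQLite itself.
    conn = sqlite3.connect(db_path, timeout=30.0)
    conn.execute("PRAGMA busy_timeout=30000")
    return conn

conn = connect("example.db")
conn.execute("PRAGMA journal_mode=WAL")  # WAL persists; setting it once is enough
conn.execute("CREATE TABLE IF NOT EXISTS kv (k TEXT PRIMARY KEY, v TEXT)")
conn.commit()
conn.close()
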
@@ -291,7 +297,7 @@ class ExperimentStorage:
                  # Last unlock - release the actual lock
                  self._release_os_lock(lock_fd, run_id)
                  return
-
+
          # First time acquiring lock for this run_id
          lock_path = self._get_run_dir(run_id) / ".lock"
          lock_path.parent.mkdir(parents=True, exist_ok=True)
@@ -302,7 +308,7 @@ class ExperimentStorage:
          try:
              # Acquire exclusive lock with timeout
              self._acquire_os_lock(lock_fd, run_id, lock_path, timeout=30)
-
+
              self._locks[run_id] = (lock_fd, 1)
              yield
          finally:
@@ -314,27 +320,23 @@ class ExperimentStorage:
              else:
                  # Decrement count
                  self._locks[run_id] = (lock_fd, count - 1)
-
+
      def _acquire_os_lock(
-         self,
-         lock_fd: int,
-         run_id: str,
-         lock_path: Path,
-         timeout: int = 30
+         self, lock_fd: int, run_id: str, lock_path: Path, timeout: int = 30
      ) -> None:
          """Acquire OS-specific file lock with timeout.
-
+
          Args:
              lock_fd: File descriptor for lock file
              run_id: Run identifier (for error messages)
              lock_path: Path to lock file (for error messages)
              timeout: Timeout in seconds
-
+
          Raises:
              TimeoutError: If lock cannot be acquired within timeout
          """
          import time
-
+
          if sys.platform == "win32":
              # Windows file locking with retry
              try:
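
The `(lock_fd, count)` pair kept in `self._locks` is what makes this lock reentrant: nested acquisitions in the same process only bump a counter, and the OS-level lock is released when the outermost holder exits. A toy sketch of that counting scheme, assuming the real OS lock is taken and released where the comments indicate:

import contextlib

_held: dict[str, int] = {}  # run_id -> nesting depth

@contextlib.contextmanager
def acquire(run_id: str):
    if run_id in _held:
        # Reentrant path: no second OS-level lock, just track the nesting.
        _held[run_id] += 1
        try:
            yield
        finally:
            _held[run_id] -= 1
        return
    _held[run_id] = 1  # first acquisition: take the real lock here
    try:
        yield
    finally:
        _held[run_id] -= 1
        if not _held[run_id]:
            del _held[run_id]  # last exit: release the real lock here
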
@@ -342,10 +344,11 @@ class ExperimentStorage:
              except ImportError:
                  # msvcrt not available - single-process mode
                  import logging
+
                  logger = logging.getLogger(__name__)
                  logger.debug("msvcrt not available. Single-process mode only.")
                  return
-
+
          start_time = time.time()
          while True:
              try:
@@ -356,7 +359,7 @@ class ExperimentStorage:
                      if time.time() - start_time > timeout:
                          try:
                              os.close(lock_fd)
-                         except:
+                         except OSError:
                              pass
                          raise TimeoutError(
                              f"Failed to acquire lock for run {run_id} after {timeout}s on Windows. "
@@ -376,7 +379,7 @@ class ExperimentStorage:
                      if time.time() - start_time > timeout:
                          try:
                              os.close(lock_fd)
-                         except:
+                         except OSError:
                              pass
                          raise TimeoutError(
                              f"Failed to acquire lock for run {run_id} after {timeout}s. "
@@ -388,15 +391,16 @@ class ExperimentStorage:
              # No locking available - single-process mode
              # This is safe for single-process usage (most common case)
              import logging
+
              logger = logging.getLogger(__name__)
              logger.debug(
-                 f"File locking not available on this platform. "
-                 f"Storage will work in single-process mode only."
+                 "File locking not available on this platform. "
+                 "Storage will work in single-process mode only."
              )
-
+
      def _release_os_lock(self, lock_fd: int, run_id: str) -> None:
          """Release OS-specific file lock.
-
+
          Args:
              lock_fd: File descriptor to close
              run_id: Run identifier (for cleanup)
@@ -405,6 +409,7 @@ class ExperimentStorage:
          if sys.platform == "win32":
              try:
                  import msvcrt
+
                  msvcrt.locking(lock_fd, msvcrt.LK_UNLCK, 1)
              except (ImportError, OSError):
                  pass  # Lock may already be released
@@ -413,13 +418,13 @@ class ExperimentStorage:
                  fcntl.flock(lock_fd, fcntl.LOCK_UN)
              except (IOError, OSError):
                  pass  # Lock may already be released
-
+
          # Close file descriptor
          try:
              os.close(lock_fd)
          except OSError:
              pass  # FD may already be closed
-
+
          # Clean up tracking
          self._locks.pop(run_id, None)

@@ -558,9 +563,11 @@ class ExperimentStorage:

              # Update progress
              metadata = self._load_run_metadata(run_id)
-             new_successful = metadata.successful_generations + (1 if record.output else 0)
+             new_successful = metadata.successful_generations + (
+                 1 if record.output else 0
+             )
              new_failed = metadata.failed_generations + (1 if record.error else 0)
-
+
              self.update_run_progress(
                  run_id,
                  total_samples=metadata.total_samples + 1,
@@ -664,49 +671,57 @@ class ExperimentStorage:

      def _save_run_metadata_to_db(self, metadata: RunMetadata):
          """Save run metadata to SQLite database."""
-         db_path = self._root / "experiments.db"
-         conn = sqlite3.connect(db_path)
-
-         # Ensure experiment exists
-         conn.execute(
-             """
-             INSERT OR IGNORE INTO experiments (experiment_id, name, created_at, updated_at)
-             VALUES (?, ?, ?, ?)
-             """,
-             (
-                 metadata.experiment_id,
-                 metadata.experiment_id,
-                 metadata.created_at,
-                 metadata.updated_at,
-             ),
-         )
-
-         # Upsert run
-         conn.execute(
-             """
-             INSERT OR REPLACE INTO runs (
-                 run_id, experiment_id, status, created_at, updated_at, completed_at,
-                 total_samples, successful_generations, failed_generations,
-                 config_snapshot, error_message
-             ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
-             """,
-             (
-                 metadata.run_id,
-                 metadata.experiment_id,
-                 metadata.status.value,
-                 metadata.created_at,
-                 metadata.updated_at,
-                 metadata.completed_at,
-                 metadata.total_samples,
-                 metadata.successful_generations,
-                 metadata.failed_generations,
-                 json.dumps(metadata.config_snapshot),
-                 metadata.error_message,
-             ),
-         )
+         # Serialise process-local writers to avoid lock thrash on Windows CI.
+         with self._db_write_lock:
+             retry_delay = 0.05
+             max_attempts = 5
+             for attempt in range(max_attempts):
+                 try:
+                     with self._connect_db() as conn:
+                         # Ensure experiment exists
+                         conn.execute(
+                             """
+                             INSERT OR IGNORE INTO experiments (experiment_id, name, created_at, updated_at)
+                             VALUES (?, ?, ?, ?)
+                             """,
+                             (
+                                 metadata.experiment_id,
+                                 metadata.experiment_id,
+                                 metadata.created_at,
+                                 metadata.updated_at,
+                             ),
+                         )

-         conn.commit()
-         conn.close()
+                         # Upsert run
+                         conn.execute(
+                             """
+                             INSERT OR REPLACE INTO runs (
+                                 run_id, experiment_id, status, created_at, updated_at, completed_at,
+                                 total_samples, successful_generations, failed_generations,
+                                 config_snapshot, error_message
+                             ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                             """,
+                             (
+                                 metadata.run_id,
+                                 metadata.experiment_id,
+                                 metadata.status.value,
+                                 metadata.created_at,
+                                 metadata.updated_at,
+                                 metadata.completed_at,
+                                 metadata.total_samples,
+                                 metadata.successful_generations,
+                                 metadata.failed_generations,
+                                 json.dumps(metadata.config_snapshot),
+                                 metadata.error_message,
+                             ),
+                         )
+                         conn.commit()
+                         return
+                 except sqlite3.OperationalError as exc:
+                     if "locked" not in str(exc).lower() or attempt == max_attempts - 1:
+                         raise
+                     time.sleep(retry_delay)
+                     retry_delay *= 2
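
Writes are now guarded twice: a process-wide `threading.RLock` serialises threads, and a retry loop with exponential backoff absorbs SQLite's transient "database is locked" errors from other processes. The retry skeleton in isolation (the statement being executed is a placeholder):

import sqlite3
import time

def execute_with_retry(conn: sqlite3.Connection, sql: str, params: tuple = (),
                       max_attempts: int = 5) -> None:
    delay = 0.05
    for attempt in range(max_attempts):
        try:
            conn.execute(sql, params)
            conn.commit()
            return
        except sqlite3.OperationalError as exc:
            # Retry only the transient lock error; give up on the last attempt.
            if "locked" not in str(exc).lower() or attempt == max_attempts - 1:
                raise
            time.sleep(delay)
            delay *= 2  # backoff: 0.05s, 0.1s, 0.2s, 0.4s
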
 
      def _load_run_metadata(self, run_id: str) -> RunMetadata:
          """Load run metadata from JSON file.
@@ -744,6 +759,10 @@ class ExperimentStorage:
          metadata_path = self._get_run_dir(run_id) / "metadata.json"
          return metadata_path.exists()

+     def run_metadata_exists(self, run_id: str) -> bool:
+         """Check if run metadata exists (public API)."""
+         return self._run_metadata_exists(run_id)
+
      def _get_run_dir(self, run_id: str) -> Path:
          """Get run directory path.

@@ -778,10 +797,10 @@ class ExperimentStorage:

      def _open_for_read(self, path: Path):
          """Open file for reading with automatic compression detection.
-
+
          Args:
              path: File path
-
+
          Returns:
              File handle (text mode)
          """
@@ -841,16 +860,16 @@ class ExperimentStorage:

      def load_dataset(self, run_id: str) -> List[dict[str, object]]:
          """Load cached dataset.
-
+
          Args:
              run_id: Run identifier
-
+
          Returns:
              List of dataset samples
          """
          gen_dir = self._get_generation_dir(run_id)
          path = gen_dir / "dataset.jsonl"
-
+
          rows: list[dict[str, object]] = []
          with self._open_for_read(path) as handle:
              for line in handle:
@@ -866,16 +885,16 @@ class ExperimentStorage:
          self, run_id: str
      ) -> Dict[str, core_entities.GenerationRecord]:
          """Load cached generation records.
-
+
          Args:
              run_id: Run identifier
-
+
          Returns:
              Dict mapping cache_key to GenerationRecord
          """
          gen_dir = self._get_generation_dir(run_id)
          path = gen_dir / "records.jsonl"
-
+
          try:
              handle = self._open_for_read(path)
          except FileNotFoundError:
@@ -883,7 +902,7 @@ class ExperimentStorage:

          tasks = self._load_tasks(run_id)
          records: dict[str, core_entities.GenerationRecord] = {}
-
+
          with handle:
              for line in handle:
                  if not line.strip():
@@ -891,14 +910,14 @@ class ExperimentStorage:
                  data = json.loads(line)
                  if data.get("_type") == "header":
                      continue
-
+
                  key = data.get("cache_key")
                  if not key:
                      continue
-
+
                  record = self._deserialize_record(data, tasks)
                  records[key] = record
-
+
          return records

      def append_evaluation(
@@ -911,7 +930,7 @@ class ExperimentStorage:
          evaluation_config: dict | None = None,
      ) -> None:
          """Append evaluation result.
-
+
          Args:
              run_id: Run identifier
              record: Generation record being evaluated
@@ -922,48 +941,53 @@ class ExperimentStorage:
          with self._acquire_lock(run_id):
              eval_dir = self._get_evaluation_dir(run_id, eval_id)
              eval_dir.mkdir(parents=True, exist_ok=True)
-
+
              path = eval_dir / "evaluation.jsonl"
-
+
              if not self._file_exists_any_compression(path):
                  self._write_jsonl_with_header(path, [], file_type="evaluation")
-
+
              # Use evaluation_cache_key that includes evaluation config
              cache_key = evaluation_cache_key(record.task, evaluation_config)
-
+
              payload = {
                  "cache_key": cache_key,
-                 "evaluation": core_serialization.serialize_evaluation_record(evaluation),
+                 "evaluation": core_serialization.serialize_evaluation_record(
+                     evaluation
+                 ),
              }
              self._atomic_append(path, payload)

      def load_cached_evaluations(
-         self, run_id: str, eval_id: str = "default", evaluation_config: dict | None = None
+         self,
+         run_id: str,
+         eval_id: str = "default",
+         evaluation_config: dict | None = None,
      ) -> Dict[str, core_entities.EvaluationRecord]:
          """Load cached evaluation records.
-
+
          Args:
              run_id: Run identifier
              eval_id: Evaluation identifier
              evaluation_config: Evaluation configuration for cache key matching
-
+
          Returns:
              Dict mapping cache_key to EvaluationRecord
-
+
          Note:
              If evaluation_config is provided, only evaluations matching that config
              will be loaded. This ensures that changing metrics invalidates the cache.
          """
          eval_dir = self._get_evaluation_dir(run_id, eval_id)
          path = eval_dir / "evaluation.jsonl"
-
+
          try:
              handle = self._open_for_read(path)
          except FileNotFoundError:
              return {}
-
+
          evaluations: dict[str, core_entities.EvaluationRecord] = {}
-
+
          with handle:
              for line in handle:
                  if not line.strip():
@@ -971,15 +995,15 @@ class ExperimentStorage:
                  data = json.loads(line)
                  if data.get("_type") == "header":
                      continue
-
+
                  key = data.get("cache_key")
                  if not key:
                      continue
-
+
                  evaluations[key] = core_serialization.deserialize_evaluation_record(
                      data["evaluation"]
                  )
-
+
          return evaluations

      def get_run_path(self, run_id: str) -> Path:
@@ -1030,12 +1054,11 @@ class ExperimentStorage:
          task = tasks[task_key]
          output_data = payload.get("output")
          error_data = payload.get("error")
-
+
          record = core_entities.GenerationRecord(
              task=task,
              output=core_entities.ModelOutput(
-                 text=output_data["text"],
-                 raw=output_data.get("raw")
+                 text=output_data["text"], raw=output_data.get("raw")
              )
              if output_data
              else None,
@@ -1048,12 +1071,12 @@ class ExperimentStorage:
              else None,
              metrics=payload.get("metrics", {}),
          )
-
+
          record.attempts = [
              self._deserialize_record(attempt, tasks)
              for attempt in payload.get("attempts", [])
          ]
-
+
          return record

      def _persist_task(self, run_id: str, task: core_entities.GenerationTask) -> str:
@@ -1090,9 +1113,7 @@ class ExperimentStorage:

          return key

-     def _persist_template(
-         self, run_id: str, spec: core_entities.PromptSpec
-     ) -> str:
+     def _persist_template(self, run_id: str, spec: core_entities.PromptSpec) -> str:
          """Persist prompt template."""
          template_content = f"{spec.name}:{spec.template}"
          template_id = hashlib.sha256(template_content.encode("utf-8")).hexdigest()[:16]
@@ -1147,22 +1168,22 @@ class ExperimentStorage:

      def _load_templates(self, run_id: str) -> dict[str, core_entities.PromptSpec]:
          """Load templates from disk.
-
+
          Args:
              run_id: Run identifier
-
+
          Returns:
              Dict mapping template_id to PromptSpec
          """
          gen_dir = self._get_generation_dir(run_id)
          path = gen_dir / "templates.jsonl"
-
+
          templates: dict[str, core_entities.PromptSpec] = {}
          try:
              handle = self._open_for_read(path)
          except FileNotFoundError:
              return templates
-
+
          with handle:
              for line in handle:
                  if not line.strip():
@@ -1170,35 +1191,37 @@ class ExperimentStorage:
                  data = json.loads(line)
                  if data.get("_type") == "header":
                      continue
-
+
                  template_id = data["template_id"]
                  templates[template_id] = core_serialization.deserialize_prompt_spec(
                      data["spec"]
                  )
-
+
          return templates

      def _load_tasks(self, run_id: str) -> dict[str, core_entities.GenerationTask]:
          """Load tasks from disk.
-
+
          Args:
              run_id: Run identifier
-
+
          Returns:
              Dict mapping task_key to GenerationTask
          """
          gen_dir = self._get_generation_dir(run_id)
          path = gen_dir / "tasks.jsonl"
-
+
          tasks: dict[str, core_entities.GenerationTask] = {}
          try:
              handle = self._open_for_read(path)
          except FileNotFoundError:
              return tasks
-
+
          # Load templates if deduplication enabled
-         templates = self._load_templates(run_id) if self._config.deduplicate_templates else {}
-
+         templates = (
+             self._load_templates(run_id) if self._config.deduplicate_templates else {}
+         )
+
          with handle:
              for line in handle:
                  if not line.strip():
@@ -1206,10 +1229,10 @@ class ExperimentStorage:
                  data = json.loads(line)
                  if data.get("_type") == "header":
                      continue
-
+
                  task_key = data["task_key"]
                  task_data = data["task"]
-
+
                  # Restore template from reference if needed
                  if (
                      self._config.deduplicate_templates
@@ -1217,12 +1240,16 @@ class ExperimentStorage:
                  ):
                      template_id = task_data["prompt"]["spec"]["_template_ref"]
                      if template_id in templates:
-                         task_data["prompt"]["spec"] = core_serialization.serialize_prompt_spec(
-                             templates[template_id]
+                         task_data["prompt"]["spec"] = (
+                             core_serialization.serialize_prompt_spec(
+                                 templates[template_id]
+                             )
                          )
-
-                 tasks[task_key] = core_serialization.deserialize_generation_task(task_data)
-
+
+                 tasks[task_key] = core_serialization.deserialize_generation_task(
+                     task_data
+                 )
+
          self._task_index[run_id] = set(tasks.keys())
          return tasks
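
Template deduplication stores each prompt spec once under a short content hash (`_persist_template` hashes f"{spec.name}:{spec.template}" with SHA-256, truncated to 16 hex chars) and leaves a `_template_ref` marker in the task payload; `_load_tasks` swaps the full spec back in, as above. A simplified dict-based round trip of that idea (the real code goes through `core_serialization`):

import hashlib

def dedupe(task: dict, templates: dict[str, dict]) -> dict:
    # Store the template once, keyed by a short content hash.
    spec = task["prompt"]["spec"]
    template_id = hashlib.sha256(
        f"{spec['name']}:{spec['template']}".encode("utf-8")
    ).hexdigest()[:16]
    templates[template_id] = spec
    task["prompt"]["spec"] = {"_template_ref": template_id}
    return task

def restore(task: dict, templates: dict[str, dict]) -> dict:
    # Swap the reference back for the full spec, mirroring _load_tasks.
    ref = task["prompt"]["spec"].get("_template_ref")
    if ref in templates:
        task["prompt"]["spec"] = templates[ref]
    return task
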
 
@@ -1245,7 +1272,7 @@ class ExperimentStorage:

      def save_checkpoint(self, run_id: str, checkpoint_data: dict):
          """Save checkpoint for resumability.
-
+
          Args:
              run_id: Run identifier
              checkpoint_data: Checkpoint data to save
@@ -1253,52 +1280,52 @@ class ExperimentStorage:
          with self._acquire_lock(run_id):
              checkpoint_dir = self._get_run_dir(run_id) / "checkpoints"
              checkpoint_dir.mkdir(exist_ok=True)
-
+
              # Use timestamp for checkpoint filename
              timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
              checkpoint_path = checkpoint_dir / f"checkpoint_{timestamp}.json"
-
+
              checkpoint_path.write_text(json.dumps(checkpoint_data, indent=2))

      def load_latest_checkpoint(self, run_id: str) -> dict | None:
          """Load most recent checkpoint.
-
+
          Args:
              run_id: Run identifier
-
+
          Returns:
              Checkpoint data or None if no checkpoints exist
          """
          checkpoint_dir = self._get_run_dir(run_id) / "checkpoints"
          if not checkpoint_dir.exists():
              return None
-
+
          # Find latest checkpoint
          checkpoints = sorted(checkpoint_dir.glob("checkpoint_*.json"), reverse=True)
          if not checkpoints:
              return None
-
+
          return json.loads(checkpoints[0].read_text())

      def apply_retention_policy(self, policy: RetentionPolicy | None = None):
          """Apply retention policy to clean up old runs.
-
+
          Args:
              policy: Retention policy (uses config if not provided)
          """
          policy = policy or self._config.retention_policy
          if not policy:
              return
-
+
          # Get all experiments
          for exp_dir in self._experiments_dir.iterdir():
              if not exp_dir.is_dir():
                  continue
-
+
              runs_dir = exp_dir / "runs"
              if not runs_dir.exists():
                  continue
-
+
              # Load all run metadata
              runs = []
              for run_dir in runs_dir.iterdir():
@@ -1307,29 +1334,32 @@ class ExperimentStorage:
                  metadata_path = run_dir / "metadata.json"
                  if not metadata_path.exists():
                      continue
-
+
                  try:
                      metadata = self._load_run_metadata(run_dir.name)
                      runs.append((run_dir, metadata))
                  except Exception:
                      continue
-
+
              # Sort by creation time (newest first)
              runs.sort(key=lambda x: x[1].created_at, reverse=True)
-
+
              # Apply policies
              runs_to_delete = []
-
+
              for i, (run_dir, metadata) in enumerate(runs):
                  # Always keep latest N runs
                  if i < policy.keep_latest_n:
                      continue
-
+
                  # Check if should keep based on status
-                 if policy.keep_completed_only and metadata.status != RunStatus.COMPLETED:
+                 if (
+                     policy.keep_completed_only
+                     and metadata.status != RunStatus.COMPLETED
+                 ):
                      runs_to_delete.append(run_dir)
                      continue
-
+
                  # Check age policy
                  if policy.max_age_days:
                      created = datetime.fromisoformat(metadata.created_at)
@@ -1337,41 +1367,54 @@ class ExperimentStorage:
                      if age > timedelta(days=policy.max_age_days):
                          runs_to_delete.append(run_dir)
                          continue
-
+
                  # Check max runs policy
                  if policy.max_runs_per_experiment:
                      if i >= policy.max_runs_per_experiment:
                          runs_to_delete.append(run_dir)
-
+
              # Delete runs
              for run_dir in runs_to_delete:
                  self._delete_run_dir(run_dir)

      def _delete_run_dir(self, run_dir: Path):
          """Delete run directory and update database.
-
+
          Args:
              run_dir: Run directory to delete
          """
          run_id = run_dir.name
-
+
          # Remove from SQLite
          if self._config.use_sqlite_metadata:
-             db_path = self._root / "experiments.db"
-             conn = sqlite3.connect(db_path)
-             conn.execute("DELETE FROM runs WHERE run_id = ?", (run_id,))
-             conn.commit()
-             conn.close()
-
+             with self._db_write_lock:
+                 with self._connect_db() as conn:
+                     conn.execute("DELETE FROM runs WHERE run_id = ?", (run_id,))
+                     conn.commit()
+
          # Remove directory
          shutil.rmtree(run_dir, ignore_errors=True)

+     def delete_run(self, run_id: str) -> None:
+         """Delete a run and its stored artifacts.
+
+         Args:
+             run_id: Run identifier to delete
+
+         Raises:
+             FileNotFoundError: If the run does not exist
+         """
+         run_dir = self._get_run_dir(run_id)
+         if not run_dir.exists():
+             raise FileNotFoundError(f"Run not found: {run_id}")
+         self._delete_run_dir(run_dir)
+
      def get_storage_size(self, experiment_id: str | None = None) -> int:
          """Get total storage size in bytes.
-
+
          Args:
              experiment_id: Optional experiment to check (all if None)
-
+
          Returns:
              Total size in bytes
          """
@@ -1381,96 +1424,100 @@ class ExperimentStorage:
                  return 0
              return sum(f.stat().st_size for f in exp_dir.rglob("*") if f.is_file())
          else:
-             return sum(f.stat().st_size for f in self._experiments_dir.rglob("*") if f.is_file())
+             return sum(
+                 f.stat().st_size
+                 for f in self._experiments_dir.rglob("*")
+                 if f.is_file()
+             )

      def list_runs(
          self,
          experiment_id: str | None = None,
          status: RunStatus | None = None,
-         limit: int | None = None
+         limit: int | None = None,
      ) -> list[RunMetadata]:
          """List runs with optional filtering.
-
+
          Args:
              experiment_id: Filter by experiment
              status: Filter by status
              limit: Maximum number of runs to return
-
+
          Returns:
              List of run metadata
          """
          if not self._config.use_sqlite_metadata:
              # Fallback to file-based listing
              return self._list_runs_from_files(experiment_id, status, limit)
-
+
          # Query SQLite
-         db_path = self._root / "experiments.db"
-         conn = sqlite3.connect(db_path)
-
-         query = "SELECT * FROM runs WHERE 1=1"
-         params = []
-
-         if experiment_id:
-             query += " AND experiment_id = ?"
-             params.append(experiment_id)
-
-         if status:
-             query += " AND status = ?"
-             params.append(status.value)
-
-         query += " ORDER BY created_at DESC"
-
-         if limit:
-             query += " LIMIT ?"
-             params.append(limit)
-
-         cursor = conn.execute(query, params)
-         rows = cursor.fetchall()
-         conn.close()
-
+         with self._connect_db() as conn:
+             query = "SELECT * FROM runs WHERE 1=1"
+             params = []
+
+             if experiment_id:
+                 query += " AND experiment_id = ?"
+                 params.append(experiment_id)
+
+             if status:
+                 query += " AND status = ?"
+                 params.append(status.value)
+
+             query += " ORDER BY created_at DESC"
+
+             if limit:
+                 query += " LIMIT ?"
+                 params.append(limit)
+
+             cursor = conn.execute(query, params)
+             rows = cursor.fetchall()
+
          # Convert to RunMetadata
          runs = []
          for row in rows:
-             runs.append(RunMetadata(
-                 run_id=row[0],
-                 experiment_id=row[1],
-                 status=RunStatus(row[2]),
-                 created_at=row[3],
-                 updated_at=row[4],
-                 completed_at=row[5],
-                 total_samples=row[6] or 0,
-                 successful_generations=row[7] or 0,
-                 failed_generations=row[8] or 0,
-                 config_snapshot=json.loads(row[9]) if row[9] else {},
-                 error_message=row[10],
-             ))
-
+             runs.append(
+                 RunMetadata(
+                     run_id=row[0],
+                     experiment_id=row[1],
+                     status=RunStatus(row[2]),
+                     created_at=row[3],
+                     updated_at=row[4],
+                     completed_at=row[5],
+                     total_samples=row[6] or 0,
+                     successful_generations=row[7] or 0,
+                     failed_generations=row[8] or 0,
+                     config_snapshot=json.loads(row[9]) if row[9] else {},
+                     error_message=row[10],
+                 )
+             )
+
          return runs

      def _list_runs_from_files(
-         self,
-         experiment_id: str | None,
-         status: RunStatus | None,
-         limit: int | None
+         self, experiment_id: str | None, status: RunStatus | None, limit: int | None
      ) -> list[RunMetadata]:
          """List runs by scanning files (fallback)."""
          runs = []
-
+
          # Scan experiment directories
-         exp_dirs = [self._experiments_dir / experiment_id] if experiment_id else list(self._experiments_dir.iterdir())
-
+         exp_dirs = (
+             [self._experiments_dir / experiment_id]
+             if experiment_id
+             else list(self._experiments_dir.iterdir())
+         )
+
          for exp_dir in exp_dirs:
              if not exp_dir.is_dir():
                  continue
-
+
              runs_dir = exp_dir / "runs"
              if not runs_dir.exists():
                  continue
-
+
              for run_dir in runs_dir.iterdir():
                  if not run_dir.is_dir():
                      continue
-
+
                  try:
                      metadata = self._load_run_metadata(run_dir.name)
                      if status and metadata.status != status:
@@ -1478,21 +1525,21 @@ class ExperimentStorage:
                      runs.append(metadata)
                  except Exception:
                      continue
-
+
          # Sort by creation time
          runs.sort(key=lambda r: r.created_at, reverse=True)
-
+
          if limit:
              runs = runs[:limit]
-
+
          return runs

      def validate_integrity(self, run_id: str) -> dict:
          """Validate data integrity for a run.
-
+
          Args:
              run_id: Run identifier
-
+
          Returns:
              Dict with validation results
          """
@@ -1502,19 +1549,19 @@ class ExperimentStorage:
              "errors": [],
              "warnings": [],
          }
-
+
          run_dir = self._get_run_dir(run_id)
          if not run_dir.exists():
              results["valid"] = False
              results["errors"].append(f"Run directory not found: {run_dir}")
              return results
-
+
          # Check metadata
          metadata_path = run_dir / "metadata.json"
          if not metadata_path.exists():
              results["valid"] = False
              results["errors"].append("Missing metadata.json")
-
+
          # Check generation directory
          gen_dir = run_dir / "generation"
          if not gen_dir.exists():
@@ -1524,12 +1571,12 @@ class ExperimentStorage:
              for filename in ["records.jsonl", "tasks.jsonl"]:
                  if not self._file_exists_any_compression(gen_dir / filename):
                      results["warnings"].append(f"Missing {filename}")
-
+
          # Check lock file
          lock_path = run_dir / ".lock"
          if not lock_path.exists():
              results["warnings"].append("No lock file (may not have been used)")
-
+
          return results


@@ -1554,20 +1601,20 @@ def evaluation_cache_key(
      evaluation_config: dict | None = None,
  ) -> str:
      """Derive a stable cache key for an evaluation that includes both task and evaluation configuration.
-
+
      This ensures that changing metrics or evaluation settings will invalidate the cache
      and trigger re-evaluation, even if the generation is cached.
-
+
      Args:
          task: Generation task
          evaluation_config: Dictionary with evaluation configuration:
              - metrics: List of metric names/types
              - extractor: Extractor type/configuration
              - Any other evaluation settings
-
+
      Returns:
          Cache key string that includes both task and evaluation config
-
+
      Example:
          >>> config = {
          ...     "metrics": ["exact_match", "f1_score"],
@@ -1576,15 +1623,15 @@ def evaluation_cache_key(
          >>> key = evaluation_cache_key(task, config)
      """
      task_key = task_cache_key(task)
-
+
      if not evaluation_config:
          # No config provided, use task key only (for backward compatibility)
          return task_key
-
+
      # Create deterministic hash of evaluation configuration
      config_str = json.dumps(evaluation_config, sort_keys=True)
      config_hash = hashlib.sha256(config_str.encode("utf-8")).hexdigest()[:12]
-
+
      return f"{task_key}::eval:{config_hash}"
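
Because `evaluation_cache_key` folds a sorted-JSON hash of the evaluation config into the key, changing the metric list re-keys (and therefore invalidates) cached evaluations while leaving generation caches untouched. A self-contained illustration of the key shape (the task key is stubbed in place of `task_cache_key`):

import hashlib
import json

def evaluation_key(task_key: str, evaluation_config: dict | None) -> str:
    if not evaluation_config:
        return task_key  # backward compatible: no config, task key only
    config_str = json.dumps(evaluation_config, sort_keys=True)
    config_hash = hashlib.sha256(config_str.encode("utf-8")).hexdigest()[:12]
    return f"{task_key}::eval:{config_hash}"

# Different metric sets yield different keys for the same task.
assert evaluation_key("task-abc", {"metrics": ["exact_match"]}) != evaluation_key(
    "task-abc", {"metrics": ["f1_score"]}
)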