themis-eval 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +429 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/commands/results.py +252 -0
  8. themis/cli/main.py +427 -57
  9. themis/comparison/__init__.py +25 -0
  10. themis/comparison/engine.py +348 -0
  11. themis/comparison/reports.py +283 -0
  12. themis/comparison/statistics.py +402 -0
  13. themis/core/entities.py +23 -3
  14. themis/evaluation/metrics/code/__init__.py +19 -0
  15. themis/evaluation/metrics/code/codebleu.py +144 -0
  16. themis/evaluation/metrics/code/execution.py +280 -0
  17. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  18. themis/evaluation/metrics/nlp/__init__.py +21 -0
  19. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  20. themis/evaluation/metrics/nlp/bleu.py +129 -0
  21. themis/evaluation/metrics/nlp/meteor.py +153 -0
  22. themis/evaluation/metrics/nlp/rouge.py +136 -0
  23. themis/evaluation/pipelines/standard_pipeline.py +68 -8
  24. themis/experiment/cache_manager.py +8 -3
  25. themis/experiment/export.py +110 -2
  26. themis/experiment/orchestrator.py +109 -11
  27. themis/experiment/storage.py +1457 -110
  28. themis/generation/providers/litellm_provider.py +46 -0
  29. themis/generation/runner.py +22 -6
  30. themis/integrations/huggingface.py +12 -1
  31. themis/integrations/wandb.py +13 -1
  32. themis/interfaces/__init__.py +86 -0
  33. themis/presets/__init__.py +10 -0
  34. themis/presets/benchmarks.py +354 -0
  35. themis/presets/models.py +190 -0
  36. themis/server/__init__.py +28 -0
  37. themis/server/app.py +337 -0
  38. themis_eval-0.2.1.dist-info/METADATA +596 -0
  39. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/RECORD +42 -19
  40. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/WHEEL +1 -1
  41. themis_eval-0.1.1.dist-info/METADATA +0 -758
  42. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/licenses/LICENSE +0 -0
  43. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/top_level.txt +0 -0
@@ -1,57 +1,528 @@
1
- """Local storage helpers for experiment datasets and cached records."""
1
+ """Robust storage architecture with lifecycle management, atomic operations, and integrity checks.
2
+
3
+ This is a rewrite of the storage layer to address:
4
+ - Run lifecycle management (in_progress, completed, failed)
5
+ - Atomic write operations
6
+ - File locking for concurrent access
7
+ - Index persistence
8
+ - Experiment-level organization
9
+ - Separate evaluation tracking
10
+ - Data integrity validation
11
+ """
2
12
 
3
13
  from __future__ import annotations
4
14
 
15
+ import contextlib
16
+ import gzip
5
17
  import hashlib
6
18
  import json
19
+ import os
20
+ import sqlite3
21
+ import sys
22
+ import tempfile
23
+ from dataclasses import dataclass, field
24
+ import shutil
25
+ from datetime import datetime, timedelta
26
+ from enum import Enum
7
27
  from pathlib import Path
8
- from typing import Dict, Iterable, List
28
+ from typing import Dict, Iterable, List, Literal
29
+
30
+ # fcntl is Unix-only, use msvcrt on Windows
31
+ if sys.platform == "win32":
32
+ import msvcrt
33
+ FCNTL_AVAILABLE = False
34
+ else:
35
+ try:
36
+ import fcntl
37
+ FCNTL_AVAILABLE = True
38
+ except ImportError:
39
+ FCNTL_AVAILABLE = False
9
40
 
10
41
  from themis.core import entities as core_entities
11
42
  from themis.core import serialization as core_serialization
12
43
 
44
+ STORAGE_FORMAT_VERSION = "2.0.0"
13
45
 
14
- def task_cache_key(task: core_entities.GenerationTask) -> str:
15
- """Derive a stable cache key for a generation task."""
16
46
 
17
- dataset_raw = task.metadata.get("dataset_id") or task.metadata.get("sample_id")
18
- dataset_id = str(dataset_raw) if dataset_raw is not None else ""
19
- prompt_hash = hashlib.sha256(task.prompt.text.encode("utf-8")).hexdigest()[:12]
20
- sampling = task.sampling
21
- sampling_key = (
22
- f"{sampling.temperature:.3f}-{sampling.top_p:.3f}-{sampling.max_tokens}"
23
- )
24
- template = task.prompt.spec.name
25
- model = task.model.identifier
26
- return "::".join(
27
- filter(None, [dataset_id, template, model, sampling_key, prompt_hash])
28
- )
47
+ class RunStatus(str, Enum):
48
+ """Status of a run."""
49
+
50
+ IN_PROGRESS = "in_progress"
51
+ COMPLETED = "completed"
52
+ FAILED = "failed"
53
+ CANCELLED = "cancelled"
54
+
55
+
56
+ @dataclass
57
+ class RetentionPolicy:
58
+ """Retention policy for automatic cleanup.
59
+
60
+ Attributes:
61
+ max_runs_per_experiment: Maximum runs to keep per experiment
62
+ max_age_days: Maximum age in days for runs
63
+ max_storage_gb: Maximum total storage in GB
64
+ keep_completed_only: Only keep completed runs
65
+ keep_latest_n: Always keep N most recent runs
66
+ """
67
+
68
+ max_runs_per_experiment: int | None = None
69
+ max_age_days: int | None = None
70
+ max_storage_gb: float | None = None
71
+ keep_completed_only: bool = True
72
+ keep_latest_n: int = 5
73
+
74
+
75
+ @dataclass
76
+ class StorageConfig:
77
+ """Configuration for experiment storage behavior.
78
+
79
+ Attributes:
80
+ save_raw_responses: Save full API responses (default: False)
81
+ save_dataset: Save dataset copy (default: True)
82
+ compression: Compression format - "gzip" | "none" (default: "gzip")
83
+ deduplicate_templates: Store templates once (default: True)
84
+ enable_checksums: Add integrity checksums (default: True)
85
+ use_sqlite_metadata: Use SQLite for metadata (default: True)
86
+ checkpoint_interval: Save checkpoint every N records (default: 100)
87
+ retention_policy: Automatic cleanup policy (default: None)
88
+ """
89
+
90
+ save_raw_responses: bool = False
91
+ save_dataset: bool = True
92
+ compression: Literal["none", "gzip"] = "gzip"
93
+ deduplicate_templates: bool = True
94
+ enable_checksums: bool = True
95
+ use_sqlite_metadata: bool = True
96
+ checkpoint_interval: int = 100
97
+ retention_policy: RetentionPolicy | None = None
98
+
99
+
100
+ @dataclass
101
+ class RunMetadata:
102
+ """Metadata for a run."""
103
+
104
+ run_id: str
105
+ experiment_id: str
106
+ status: RunStatus
107
+ created_at: str
108
+ updated_at: str
109
+ completed_at: str | None = None
110
+ total_samples: int = 0
111
+ successful_generations: int = 0
112
+ failed_generations: int = 0
113
+ config_snapshot: dict = field(default_factory=dict)
114
+ error_message: str | None = None
115
+
116
+
117
+ @dataclass
118
+ class EvaluationMetadata:
119
+ """Metadata for an evaluation run."""
120
+
121
+ eval_id: str
122
+ run_id: str
123
+ eval_name: str
124
+ created_at: str
125
+ metrics_config: dict = field(default_factory=dict)
126
+ total_evaluated: int = 0
127
+ total_failures: int = 0
128
+
129
+
130
+ class DataIntegrityError(Exception):
131
+ """Raised when data integrity check fails."""
132
+
133
+ pass
134
+
135
+
136
+ class ConcurrentAccessError(Exception):
137
+ """Raised when concurrent access conflict detected."""
138
+
139
+ pass
29
140
 
30
141
 
31
142
  class ExperimentStorage:
32
- """Persists datasets and generation records for resumability/caching."""
143
+ """Robust storage with lifecycle management, locking, and integrity checks.
144
+
145
+ Features:
146
+ - Atomic write operations
147
+ - File locking for concurrent access
148
+ - Run lifecycle tracking (in_progress, completed, failed)
149
+ - Experiment-level organization
150
+ - Separate evaluation tracking
151
+ - Persistent indexes
152
+ - Data integrity validation
153
+ - SQLite metadata database
33
154
 
34
- def __init__(self, root: str | Path) -> None:
155
+ Example:
156
+ >>> config = StorageConfig()
157
+ >>> storage = ExperimentStorage("outputs/experiments", config=config)
158
+ >>>
159
+ >>> # Start a run
160
+ >>> metadata = storage.start_run("run-1", "experiment-1", config={})
161
+ >>>
162
+ >>> # Append records with locking
163
+ >>> storage.append_record("run-1", record)
164
+ >>>
165
+ >>> # Complete the run
166
+ >>> storage.complete_run("run-1")
167
+ """
168
+
169
+ def __init__(
170
+ self, root: str | Path, config: StorageConfig | None = None
171
+ ) -> None:
35
172
  self._root = Path(root)
36
173
  self._root.mkdir(parents=True, exist_ok=True)
174
+ self._config = config or StorageConfig()
175
+
176
+ # Create experiments directory
177
+ self._experiments_dir = self._root / "experiments"
178
+ self._experiments_dir.mkdir(exist_ok=True)
179
+
180
+ # Initialize SQLite database
181
+ if self._config.use_sqlite_metadata:
182
+ self._init_database()
183
+
184
+ # In-memory caches
37
185
  self._task_index: dict[str, set[str]] = {}
186
+ self._template_index: dict[str, dict[str, str]] = {}
187
+ self._locks: dict[str, tuple[int, int]] = {} # (fd, count) for reentrant locks
38
188
 
39
- def cache_dataset(self, run_id: str, dataset: Iterable[dict[str, object]]) -> None:
40
- path = self._dataset_path(run_id)
41
- path.parent.mkdir(parents=True, exist_ok=True)
42
- with path.open("w", encoding="utf-8") as handle:
43
- for row in dataset:
44
- handle.write(json.dumps(row) + "\n")
189
+ def _init_database(self):
190
+ """Initialize SQLite metadata database."""
191
+ db_path = self._root / "experiments.db"
192
+ conn = sqlite3.connect(db_path)
45
193
 
46
- def load_dataset(self, run_id: str) -> List[dict[str, object]]:
47
- path = self._dataset_path(run_id)
48
- if not path.exists():
49
- raise FileNotFoundError(f"Dataset cache not found for run '{run_id}'")
50
- rows: list[dict[str, object]] = []
51
- with path.open("r", encoding="utf-8") as handle:
52
- for line in handle:
53
- rows.append(json.loads(line))
54
- return rows
194
+ conn.execute("""
195
+ CREATE TABLE IF NOT EXISTS experiments (
196
+ experiment_id TEXT PRIMARY KEY,
197
+ name TEXT NOT NULL,
198
+ description TEXT,
199
+ created_at TEXT NOT NULL,
200
+ updated_at TEXT NOT NULL,
201
+ config TEXT,
202
+ tags TEXT
203
+ )
204
+ """)
205
+
206
+ conn.execute("""
207
+ CREATE TABLE IF NOT EXISTS runs (
208
+ run_id TEXT PRIMARY KEY,
209
+ experiment_id TEXT NOT NULL,
210
+ status TEXT NOT NULL,
211
+ created_at TEXT NOT NULL,
212
+ updated_at TEXT NOT NULL,
213
+ completed_at TEXT,
214
+ total_samples INTEGER DEFAULT 0,
215
+ successful_generations INTEGER DEFAULT 0,
216
+ failed_generations INTEGER DEFAULT 0,
217
+ config_snapshot TEXT,
218
+ error_message TEXT,
219
+ FOREIGN KEY (experiment_id) REFERENCES experiments(experiment_id)
220
+ )
221
+ """)
222
+
223
+ conn.execute("""
224
+ CREATE TABLE IF NOT EXISTS evaluations (
225
+ eval_id TEXT PRIMARY KEY,
226
+ run_id TEXT NOT NULL,
227
+ eval_name TEXT NOT NULL,
228
+ created_at TEXT NOT NULL,
229
+ metrics_config TEXT,
230
+ total_evaluated INTEGER DEFAULT 0,
231
+ total_failures INTEGER DEFAULT 0,
232
+ FOREIGN KEY (run_id) REFERENCES runs(run_id)
233
+ )
234
+ """)
235
+
236
+ conn.execute("""
237
+ CREATE INDEX IF NOT EXISTS idx_runs_experiment
238
+ ON runs(experiment_id)
239
+ """)
240
+
241
+ conn.execute("""
242
+ CREATE INDEX IF NOT EXISTS idx_runs_status
243
+ ON runs(status)
244
+ """)
245
+
246
+ conn.execute("""
247
+ CREATE INDEX IF NOT EXISTS idx_evaluations_run
248
+ ON evaluations(run_id)
249
+ """)
250
+
251
+ conn.commit()
252
+ conn.close()
253
+
254
+ @contextlib.contextmanager
255
+ def _acquire_lock(self, run_id: str):
256
+ """Acquire exclusive lock for run directory with timeout (reentrant).
257
+
258
+ This lock is reentrant within the same thread to prevent deadlocks when
259
+ the same process acquires the lock multiple times (e.g., start_run()
260
+ followed by append_record()).
261
+
262
+ The lock uses OS-specific file locking:
263
+ - Unix/Linux/macOS: fcntl.flock with non-blocking retry
264
+ - Windows: msvcrt.locking
265
+ - Fallback: No locking (single-process mode)
266
+
267
+ Args:
268
+ run_id: Unique run identifier
269
+
270
+ Yields:
271
+ Context manager that holds the lock
272
+
273
+ Raises:
274
+ TimeoutError: If lock cannot be acquired within 30 seconds
275
+ """
276
+ import time
277
+
278
+ # Check if we already hold the lock (reentrant)
279
+ if run_id in self._locks:
280
+ lock_fd, count = self._locks[run_id]
281
+ self._locks[run_id] = (lock_fd, count + 1)
282
+ try:
283
+ yield
284
+ finally:
285
+ # Check if lock still exists (might have been cleaned up by another thread)
286
+ if run_id in self._locks:
287
+ lock_fd, count = self._locks[run_id]
288
+ if count > 1:
289
+ self._locks[run_id] = (lock_fd, count - 1)
290
+ else:
291
+ # Last unlock - release the actual lock
292
+ self._release_os_lock(lock_fd, run_id)
293
+ return
294
+
295
+ # First time acquiring lock for this run_id
296
+ lock_path = self._get_run_dir(run_id) / ".lock"
297
+ lock_path.parent.mkdir(parents=True, exist_ok=True)
298
+
299
+ # Open lock file (OS-independent flags)
300
+ lock_fd = os.open(str(lock_path), os.O_CREAT | os.O_RDWR)
301
+
302
+ try:
303
+ # Acquire exclusive lock with timeout
304
+ self._acquire_os_lock(lock_fd, run_id, lock_path, timeout=30)
305
+
306
+ self._locks[run_id] = (lock_fd, 1)
307
+ yield
308
+ finally:
309
+ # Release lock (only if this was the outermost lock)
310
+ if run_id in self._locks:
311
+ lock_fd, count = self._locks[run_id]
312
+ if count == 1:
313
+ self._release_os_lock(lock_fd, run_id)
314
+ else:
315
+ # Decrement count
316
+ self._locks[run_id] = (lock_fd, count - 1)
317
+
318
+ def _acquire_os_lock(
319
+ self,
320
+ lock_fd: int,
321
+ run_id: str,
322
+ lock_path: Path,
323
+ timeout: int = 30
324
+ ) -> None:
325
+ """Acquire OS-specific file lock with timeout.
326
+
327
+ Args:
328
+ lock_fd: File descriptor for lock file
329
+ run_id: Run identifier (for error messages)
330
+ lock_path: Path to lock file (for error messages)
331
+ timeout: Timeout in seconds
332
+
333
+ Raises:
334
+ TimeoutError: If lock cannot be acquired within timeout
335
+ """
336
+ import time
337
+
338
+ if sys.platform == "win32":
339
+ # Windows file locking with retry
340
+ try:
341
+ import msvcrt
342
+ except ImportError:
343
+ # msvcrt not available - single-process mode
344
+ import logging
345
+ logger = logging.getLogger(__name__)
346
+ logger.debug("msvcrt not available. Single-process mode only.")
347
+ return
348
+
349
+ start_time = time.time()
350
+ while True:
351
+ try:
352
+ msvcrt.locking(lock_fd, msvcrt.LK_NBLCK, 1)
353
+ break # Lock acquired
354
+ except OSError as e:
355
+ # Lock is held by another thread/process (errno 13 Permission denied)
356
+ if time.time() - start_time > timeout:
357
+ try:
358
+ os.close(lock_fd)
359
+ except:
360
+ pass
361
+ raise TimeoutError(
362
+ f"Failed to acquire lock for run {run_id} after {timeout}s on Windows. "
363
+ f"This usually means another process is holding the lock or a previous process crashed. "
364
+ f"Try deleting: {lock_path}"
365
+ ) from e
366
+ time.sleep(0.1) # Wait 100ms before retry
367
+ elif FCNTL_AVAILABLE:
368
+ # Unix file locking with non-blocking retry
369
+ start_time = time.time()
370
+ while True:
371
+ try:
372
+ fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
373
+ break # Lock acquired
374
+ except (IOError, OSError) as e:
375
+ # Lock is held by another process
376
+ if time.time() - start_time > timeout:
377
+ try:
378
+ os.close(lock_fd)
379
+ except:
380
+ pass
381
+ raise TimeoutError(
382
+ f"Failed to acquire lock for run {run_id} after {timeout}s. "
383
+ f"This usually means another process is holding the lock or a previous process crashed. "
384
+ f"Try: rm -f {lock_path}"
385
+ ) from e
386
+ time.sleep(0.1) # Wait 100ms before retry
387
+ else:
388
+ # No locking available - single-process mode
389
+ # This is safe for single-process usage (most common case)
390
+ import logging
391
+ logger = logging.getLogger(__name__)
392
+ logger.debug(
393
+ f"File locking not available on this platform. "
394
+ f"Storage will work in single-process mode only."
395
+ )
396
+
397
+ def _release_os_lock(self, lock_fd: int, run_id: str) -> None:
398
+ """Release OS-specific file lock.
399
+
400
+ Args:
401
+ lock_fd: File descriptor to close
402
+ run_id: Run identifier (for cleanup)
403
+ """
404
+ # Release lock
405
+ if sys.platform == "win32":
406
+ try:
407
+ import msvcrt
408
+ msvcrt.locking(lock_fd, msvcrt.LK_UNLCK, 1)
409
+ except (ImportError, OSError):
410
+ pass # Lock may already be released
411
+ elif FCNTL_AVAILABLE:
412
+ try:
413
+ fcntl.flock(lock_fd, fcntl.LOCK_UN)
414
+ except (IOError, OSError):
415
+ pass # Lock may already be released
416
+
417
+ # Close file descriptor
418
+ try:
419
+ os.close(lock_fd)
420
+ except OSError:
421
+ pass # FD may already be closed
422
+
423
+ # Clean up tracking
424
+ self._locks.pop(run_id, None)
425
+
426
+ def start_run(
427
+ self,
428
+ run_id: str,
429
+ experiment_id: str,
430
+ config: dict | None = None,
431
+ ) -> RunMetadata:
432
+ """Start a new run with in_progress status.
433
+
434
+ Args:
435
+ run_id: Unique run identifier
436
+ experiment_id: Experiment this run belongs to
437
+ config: Configuration snapshot for this run
438
+
439
+ Returns:
440
+ RunMetadata with in_progress status
441
+
442
+ Raises:
443
+ ValueError: If run already exists
444
+ """
445
+ with self._acquire_lock(run_id):
446
+ # Check if run already exists
447
+ if self._run_metadata_exists(run_id):
448
+ raise ValueError(f"Run {run_id} already exists")
449
+
450
+ # Create run directory
451
+ run_dir = self._get_run_dir(run_id)
452
+ run_dir.mkdir(parents=True, exist_ok=True)
453
+
454
+ # Create metadata
455
+ metadata = RunMetadata(
456
+ run_id=run_id,
457
+ experiment_id=experiment_id,
458
+ status=RunStatus.IN_PROGRESS,
459
+ created_at=datetime.now().isoformat(),
460
+ updated_at=datetime.now().isoformat(),
461
+ config_snapshot=config or {},
462
+ )
463
+
464
+ # Save metadata
465
+ self._save_run_metadata(metadata)
466
+
467
+ return metadata
468
+
469
+ def complete_run(self, run_id: str):
470
+ """Mark run as completed.
471
+
472
+ Args:
473
+ run_id: Run identifier
474
+
475
+ Raises:
476
+ ValueError: If run doesn't exist
477
+ """
478
+ with self._acquire_lock(run_id):
479
+ metadata = self._load_run_metadata(run_id)
480
+ metadata.status = RunStatus.COMPLETED
481
+ metadata.completed_at = datetime.now().isoformat()
482
+ metadata.updated_at = datetime.now().isoformat()
483
+ self._save_run_metadata(metadata)
484
+
485
+ def fail_run(self, run_id: str, error_message: str):
486
+ """Mark run as failed with error message.
487
+
488
+ Args:
489
+ run_id: Run identifier
490
+ error_message: Error description
491
+ """
492
+ with self._acquire_lock(run_id):
493
+ metadata = self._load_run_metadata(run_id)
494
+ metadata.status = RunStatus.FAILED
495
+ metadata.error_message = error_message
496
+ metadata.updated_at = datetime.now().isoformat()
497
+ self._save_run_metadata(metadata)
498
+
499
+ def update_run_progress(
500
+ self,
501
+ run_id: str,
502
+ total_samples: int | None = None,
503
+ successful_generations: int | None = None,
504
+ failed_generations: int | None = None,
505
+ ):
506
+ """Update run progress counters.
507
+
508
+ Args:
509
+ run_id: Run identifier
510
+ total_samples: Total samples (if provided)
511
+ successful_generations: Successful count (if provided)
512
+ failed_generations: Failed count (if provided)
513
+ """
514
+ with self._acquire_lock(run_id):
515
+ metadata = self._load_run_metadata(run_id)
516
+
517
+ if total_samples is not None:
518
+ metadata.total_samples = total_samples
519
+ if successful_generations is not None:
520
+ metadata.successful_generations = successful_generations
521
+ if failed_generations is not None:
522
+ metadata.failed_generations = failed_generations
523
+
524
+ metadata.updated_at = datetime.now().isoformat()
525
+ self._save_run_metadata(metadata)
55
526
 
56
527
  def append_record(
57
528
  self,
@@ -60,31 +531,374 @@ class ExperimentStorage:
60
531
  *,
61
532
  cache_key: str | None = None,
62
533
  ) -> None:
63
- path = self._records_path(run_id)
64
- path.parent.mkdir(parents=True, exist_ok=True)
65
- payload = self._serialize_record(run_id, record)
66
- payload["cache_key"] = cache_key or task_cache_key(record.task)
67
- with path.open("a", encoding="utf-8") as handle:
68
- handle.write(json.dumps(payload) + "\n")
534
+ """Append record with atomic write and locking.
535
+
536
+ Args:
537
+ run_id: Run identifier
538
+ record: Generation record to append
539
+ cache_key: Optional cache key (generated if not provided)
540
+ """
541
+ with self._acquire_lock(run_id):
542
+ # Ensure generation directory exists
543
+ gen_dir = self._get_generation_dir(run_id)
544
+ gen_dir.mkdir(parents=True, exist_ok=True)
545
+
546
+ path = gen_dir / "records.jsonl"
547
+
548
+ # Initialize file with header if needed
549
+ if not self._file_exists_any_compression(path):
550
+ self._write_jsonl_with_header(path, [], file_type="records")
551
+
552
+ # Serialize record
553
+ payload = self._serialize_record(run_id, record)
554
+ payload["cache_key"] = cache_key or self._task_cache_key(record.task)
555
+
556
+ # Atomic append
557
+ self._atomic_append(path, payload)
558
+
559
+ # Update progress
560
+ metadata = self._load_run_metadata(run_id)
561
+ new_successful = metadata.successful_generations + (1 if record.output else 0)
562
+ new_failed = metadata.failed_generations + (1 if record.error else 0)
563
+
564
+ self.update_run_progress(
565
+ run_id,
566
+ total_samples=metadata.total_samples + 1,
567
+ successful_generations=new_successful,
568
+ failed_generations=new_failed,
569
+ )
570
+
571
+ # Auto-checkpoint if configured
572
+ if self._config.checkpoint_interval > 0:
573
+ total = new_successful + new_failed
574
+ if total % self._config.checkpoint_interval == 0:
575
+ checkpoint_data = {
576
+ "total_samples": total,
577
+ "successful": new_successful,
578
+ "failed": new_failed,
579
+ "timestamp": datetime.now().isoformat(),
580
+ }
581
+ self.save_checkpoint(run_id, checkpoint_data)
582
+
583
+ def _atomic_append(self, path: Path, data: dict):
584
+ """Append data atomically using temp file.
585
+
586
+ Args:
587
+ path: Target file path
588
+ data: Data to append (will be JSON serialized)
589
+ """
590
+ json_line = json.dumps(data) + "\n"
591
+
592
+ # Write to temp file
593
+ temp_fd, temp_path = tempfile.mkstemp(
594
+ dir=path.parent, prefix=".tmp_", suffix=".json"
595
+ )
596
+ temp_path = Path(temp_path)
597
+
598
+ try:
599
+ if self._config.compression == "gzip":
600
+ # Close the fd first since gzip.open will open by path
601
+ os.close(temp_fd)
602
+ with gzip.open(temp_path, "wt", encoding="utf-8") as f:
603
+ f.write(json_line)
604
+ f.flush()
605
+ os.fsync(f.fileno())
606
+ else:
607
+ # Use the fd directly
608
+ with open(temp_fd, "w", encoding="utf-8") as f:
609
+ f.write(json_line)
610
+ f.flush()
611
+ os.fsync(f.fileno())
612
+ # fd is closed by context manager, don't close again
613
+
614
+ # Get target path with compression
615
+ target_path = (
616
+ path.with_suffix(path.suffix + ".gz")
617
+ if self._config.compression == "gzip"
618
+ else path
619
+ )
620
+
621
+ # Append to existing file
622
+ if target_path.exists():
623
+ with open(target_path, "ab") as dest:
624
+ with open(temp_path, "rb") as src:
625
+ dest.write(src.read())
626
+ dest.flush()
627
+ os.fsync(dest.fileno())
628
+ else:
629
+ # No existing file, just rename
630
+ temp_path.rename(target_path)
631
+ return
632
+
633
+ finally:
634
+ # Clean up temp file if still exists
635
+ if temp_path.exists():
636
+ temp_path.unlink()
637
+
638
+ def _save_run_metadata(self, metadata: RunMetadata):
639
+ """Save run metadata to both JSON and SQLite.
640
+
641
+ Args:
642
+ metadata: Run metadata to save
643
+ """
644
+ # Save to JSON file
645
+ metadata_path = self._get_run_dir(metadata.run_id) / "metadata.json"
646
+ metadata_dict = {
647
+ "run_id": metadata.run_id,
648
+ "experiment_id": metadata.experiment_id,
649
+ "status": metadata.status.value,
650
+ "created_at": metadata.created_at,
651
+ "updated_at": metadata.updated_at,
652
+ "completed_at": metadata.completed_at,
653
+ "total_samples": metadata.total_samples,
654
+ "successful_generations": metadata.successful_generations,
655
+ "failed_generations": metadata.failed_generations,
656
+ "config_snapshot": metadata.config_snapshot,
657
+ "error_message": metadata.error_message,
658
+ }
659
+ metadata_path.write_text(json.dumps(metadata_dict, indent=2))
660
+
661
+ # Save to SQLite
662
+ if self._config.use_sqlite_metadata:
663
+ self._save_run_metadata_to_db(metadata)
664
+
665
+ def _save_run_metadata_to_db(self, metadata: RunMetadata):
666
+ """Save run metadata to SQLite database."""
667
+ db_path = self._root / "experiments.db"
668
+ conn = sqlite3.connect(db_path)
669
+
670
+ # Ensure experiment exists
671
+ conn.execute(
672
+ """
673
+ INSERT OR IGNORE INTO experiments (experiment_id, name, created_at, updated_at)
674
+ VALUES (?, ?, ?, ?)
675
+ """,
676
+ (
677
+ metadata.experiment_id,
678
+ metadata.experiment_id,
679
+ metadata.created_at,
680
+ metadata.updated_at,
681
+ ),
682
+ )
683
+
684
+ # Upsert run
685
+ conn.execute(
686
+ """
687
+ INSERT OR REPLACE INTO runs (
688
+ run_id, experiment_id, status, created_at, updated_at, completed_at,
689
+ total_samples, successful_generations, failed_generations,
690
+ config_snapshot, error_message
691
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
692
+ """,
693
+ (
694
+ metadata.run_id,
695
+ metadata.experiment_id,
696
+ metadata.status.value,
697
+ metadata.created_at,
698
+ metadata.updated_at,
699
+ metadata.completed_at,
700
+ metadata.total_samples,
701
+ metadata.successful_generations,
702
+ metadata.failed_generations,
703
+ json.dumps(metadata.config_snapshot),
704
+ metadata.error_message,
705
+ ),
706
+ )
707
+
708
+ conn.commit()
709
+ conn.close()
710
+
711
+ def _load_run_metadata(self, run_id: str) -> RunMetadata:
712
+ """Load run metadata from JSON file.
713
+
714
+ Args:
715
+ run_id: Run identifier
716
+
717
+ Returns:
718
+ RunMetadata
719
+
720
+ Raises:
721
+ FileNotFoundError: If metadata doesn't exist
722
+ """
723
+ metadata_path = self._get_run_dir(run_id) / "metadata.json"
724
+ if not metadata_path.exists():
725
+ raise FileNotFoundError(f"Run metadata not found for {run_id}")
726
+
727
+ data = json.loads(metadata_path.read_text())
728
+ return RunMetadata(
729
+ run_id=data["run_id"],
730
+ experiment_id=data["experiment_id"],
731
+ status=RunStatus(data["status"]),
732
+ created_at=data["created_at"],
733
+ updated_at=data["updated_at"],
734
+ completed_at=data.get("completed_at"),
735
+ total_samples=data.get("total_samples", 0),
736
+ successful_generations=data.get("successful_generations", 0),
737
+ failed_generations=data.get("failed_generations", 0),
738
+ config_snapshot=data.get("config_snapshot", {}),
739
+ error_message=data.get("error_message"),
740
+ )
741
+
742
+ def _run_metadata_exists(self, run_id: str) -> bool:
743
+ """Check if run metadata exists."""
744
+ metadata_path = self._get_run_dir(run_id) / "metadata.json"
745
+ return metadata_path.exists()
746
+
747
+ def _get_run_dir(self, run_id: str) -> Path:
748
+ """Get run directory path.
749
+
750
+ Uses hierarchical structure: experiments/<experiment_id>/runs/<run_id>/
751
+ Falls back to experiments/default/runs/<run_id>/ if experiment_id unknown.
752
+ """
753
+ # Check if we already have metadata
754
+ for exp_dir in self._experiments_dir.iterdir():
755
+ if not exp_dir.is_dir():
756
+ continue
757
+ runs_dir = exp_dir / "runs"
758
+ if not runs_dir.exists():
759
+ continue
760
+ candidate_path = runs_dir / run_id / "metadata.json"
761
+ if candidate_path.exists():
762
+ return runs_dir / run_id
763
+
764
+ # Default location for new runs
765
+ return self._experiments_dir / "default" / "runs" / run_id
766
+
767
+ def _get_generation_dir(self, run_id: str) -> Path:
768
+ """Get generation data directory."""
769
+ return self._get_run_dir(run_id) / "generation"
770
+
771
+ def _get_evaluation_dir(self, run_id: str, eval_id: str = "default") -> Path:
772
+ """Get evaluation directory."""
773
+ return self._get_run_dir(run_id) / "evaluations" / eval_id
774
+
775
+ def _file_exists_any_compression(self, path: Path) -> bool:
776
+ """Check if file exists with any compression suffix."""
777
+ return path.exists() or path.with_suffix(path.suffix + ".gz").exists()
778
+
779
+ def _open_for_read(self, path: Path):
780
+ """Open file for reading with automatic compression detection.
781
+
782
+ Args:
783
+ path: File path
784
+
785
+ Returns:
786
+ File handle (text mode)
787
+ """
788
+ # Try .gz version first
789
+ gz_path = path.with_suffix(path.suffix + ".gz")
790
+ if gz_path.exists():
791
+ return gzip.open(gz_path, "rt", encoding="utf-8")
792
+ if path.exists():
793
+ return path.open("r", encoding="utf-8")
794
+ raise FileNotFoundError(f"File not found: {path}")
795
+
796
+ def _write_jsonl_with_header(
797
+ self, path: Path, items: Iterable[dict], file_type: str
798
+ ):
799
+ """Write JSONL file with format version header."""
800
+ # Determine actual path based on compression
801
+ if self._config.compression == "gzip":
802
+ actual_path = path.with_suffix(path.suffix + ".gz")
803
+ handle = gzip.open(actual_path, "wt", encoding="utf-8")
804
+ else:
805
+ actual_path = path
806
+ handle = open(actual_path, "w", encoding="utf-8")
807
+
808
+ with handle:
809
+ # Write header
810
+ header = {
811
+ "_type": "header",
812
+ "_format_version": STORAGE_FORMAT_VERSION,
813
+ "_file_type": file_type,
814
+ }
815
+ handle.write(json.dumps(header) + "\n")
816
+
817
+ # Write items
818
+ for item in items:
819
+ handle.write(json.dumps(item) + "\n")
820
+
821
+ handle.flush()
822
+ if hasattr(handle, "fileno"):
823
+ os.fsync(handle.fileno())
824
+
825
+ def cache_dataset(self, run_id: str, dataset: Iterable[dict[str, object]]) -> None:
826
+ """Cache dataset samples to storage.
827
+
828
+ Args:
829
+ run_id: Unique run identifier
830
+ dataset: Iterable of dataset samples
831
+ """
832
+ if not self._config.save_dataset:
833
+ return
834
+
835
+ with self._acquire_lock(run_id):
836
+ gen_dir = self._get_generation_dir(run_id)
837
+ gen_dir.mkdir(parents=True, exist_ok=True)
838
+ path = gen_dir / "dataset.jsonl"
839
+
840
+ self._write_jsonl_with_header(path, dataset, file_type="dataset")
841
+
842
+ def load_dataset(self, run_id: str) -> List[dict[str, object]]:
843
+ """Load cached dataset.
844
+
845
+ Args:
846
+ run_id: Run identifier
847
+
848
+ Returns:
849
+ List of dataset samples
850
+ """
851
+ gen_dir = self._get_generation_dir(run_id)
852
+ path = gen_dir / "dataset.jsonl"
853
+
854
+ rows: list[dict[str, object]] = []
855
+ with self._open_for_read(path) as handle:
856
+ for line in handle:
857
+ if not line.strip():
858
+ continue
859
+ data = json.loads(line)
860
+ if data.get("_type") == "header":
861
+ continue
862
+ rows.append(data)
863
+ return rows
69
864
 
70
865
  def load_cached_records(
71
866
  self, run_id: str
72
867
  ) -> Dict[str, core_entities.GenerationRecord]:
73
- path = self._records_path(run_id)
74
- if not path.exists():
868
+ """Load cached generation records.
869
+
870
+ Args:
871
+ run_id: Run identifier
872
+
873
+ Returns:
874
+ Dict mapping cache_key to GenerationRecord
875
+ """
876
+ gen_dir = self._get_generation_dir(run_id)
877
+ path = gen_dir / "records.jsonl"
878
+
879
+ try:
880
+ handle = self._open_for_read(path)
881
+ except FileNotFoundError:
75
882
  return {}
883
+
76
884
  tasks = self._load_tasks(run_id)
77
885
  records: dict[str, core_entities.GenerationRecord] = {}
78
- with path.open("r", encoding="utf-8") as handle:
886
+
887
+ with handle:
79
888
  for line in handle:
80
889
  if not line.strip():
81
890
  continue
82
891
  data = json.loads(line)
892
+ if data.get("_type") == "header":
893
+ continue
894
+
83
895
  key = data.get("cache_key")
84
896
  if not key:
85
897
  continue
898
+
86
899
  record = self._deserialize_record(data, tasks)
87
900
  records[key] = record
901
+
88
902
  return records
89
903
 
90
904
  def append_evaluation(
@@ -92,34 +906,80 @@ class ExperimentStorage:
92
906
  run_id: str,
93
907
  record: core_entities.GenerationRecord,
94
908
  evaluation: core_entities.EvaluationRecord,
909
+ *,
910
+ eval_id: str = "default",
911
+ evaluation_config: dict | None = None,
95
912
  ) -> None:
96
- path = self._evaluation_path(run_id)
97
- path.parent.mkdir(parents=True, exist_ok=True)
98
- payload = {
99
- "cache_key": task_cache_key(record.task),
100
- "evaluation": core_serialization.serialize_evaluation_record(evaluation),
101
- }
102
- with path.open("a", encoding="utf-8") as handle:
103
- handle.write(json.dumps(payload) + "\n")
913
+ """Append evaluation result.
914
+
915
+ Args:
916
+ run_id: Run identifier
917
+ record: Generation record being evaluated
918
+ evaluation: Evaluation record
919
+ eval_id: Evaluation identifier (default: "default")
920
+ evaluation_config: Evaluation configuration (metrics, extractor) for cache invalidation
921
+ """
922
+ with self._acquire_lock(run_id):
923
+ eval_dir = self._get_evaluation_dir(run_id, eval_id)
924
+ eval_dir.mkdir(parents=True, exist_ok=True)
925
+
926
+ path = eval_dir / "evaluation.jsonl"
927
+
928
+ if not self._file_exists_any_compression(path):
929
+ self._write_jsonl_with_header(path, [], file_type="evaluation")
930
+
931
+ # Use evaluation_cache_key that includes evaluation config
932
+ cache_key = evaluation_cache_key(record.task, evaluation_config)
933
+
934
+ payload = {
935
+ "cache_key": cache_key,
936
+ "evaluation": core_serialization.serialize_evaluation_record(evaluation),
937
+ }
938
+ self._atomic_append(path, payload)
104
939
 
105
940
def load_cached_evaluations(
    self, run_id: str, eval_id: str = "default", evaluation_config: dict | None = None
) -> Dict[str, core_entities.EvaluationRecord]:
    """Load cached evaluation records.

    Args:
        run_id: Run identifier
        eval_id: Evaluation identifier
        evaluation_config: Evaluation configuration for cache key matching

    Returns:
        Dict mapping cache_key to EvaluationRecord; empty when the
        evaluation file does not exist

    Note:
        If evaluation_config is provided (and non-empty), only evaluations
        whose cache key carries the hash of that config are loaded. This
        ensures that changing metrics invalidates the cache. Without a
        config every stored evaluation is returned.
    """
    # Mirror the suffix produced by the module-level evaluation_cache_key()
    # so entries written under a different configuration are ignored.
    # Computed before opening the file so a failure here cannot leak a handle.
    required_suffix = None
    if evaluation_config:
        config_str = json.dumps(evaluation_config, sort_keys=True)
        config_hash = hashlib.sha256(config_str.encode("utf-8")).hexdigest()[:12]
        required_suffix = f"::eval:{config_hash}"

    path = self._get_evaluation_dir(run_id, eval_id) / "evaluation.jsonl"

    try:
        handle = self._open_for_read(path)
    except FileNotFoundError:
        return {}

    evaluations: dict[str, core_entities.EvaluationRecord] = {}

    with handle:
        for line in handle:
            if not line.strip():
                continue
            data = json.loads(line)
            if data.get("_type") == "header":
                continue

            key = data.get("cache_key")
            if not key:
                continue
            if required_suffix is not None and not key.endswith(required_suffix):
                continue

            evaluations[key] = core_serialization.deserialize_evaluation_record(
                data["evaluation"]
            )

    return evaluations
124
984
 
125
985
  def get_run_path(self, run_id: str) -> Path:
@@ -131,35 +991,24 @@ class ExperimentStorage:
131
991
  Returns:
132
992
  Path to the run's storage directory
133
993
  """
134
- return self._run_dir(run_id)
135
-
136
- def _dataset_path(self, run_id: str) -> Path:
137
- return self._run_dir(run_id) / "dataset.jsonl"
138
-
139
- def _records_path(self, run_id: str) -> Path:
140
- return self._run_dir(run_id) / "records.jsonl"
141
-
142
- def _tasks_path(self, run_id: str) -> Path:
143
- return self._run_dir(run_id) / "tasks.jsonl"
144
-
145
- def _evaluation_path(self, run_id: str) -> Path:
146
- return self._run_dir(run_id) / "evaluation.jsonl"
147
-
148
- def _run_dir(self, run_id: str) -> Path:
149
- return self._root / run_id
994
+ return self._get_run_dir(run_id)
150
995
 
151
996
  def _serialize_record(
152
997
  self, run_id: str, record: core_entities.GenerationRecord
153
- ) -> dict[str, object]:
998
+ ) -> dict:
999
+ """Serialize generation record."""
154
1000
  task_key = self._persist_task(run_id, record.task)
155
- payload = {
1001
+
1002
+ # Prepare output data
1003
+ output_data = None
1004
+ if record.output:
1005
+ output_data = {"text": record.output.text}
1006
+ if self._config.save_raw_responses:
1007
+ output_data["raw"] = record.output.raw
1008
+
1009
+ return {
156
1010
  "task_key": task_key,
157
- "output": {
158
- "text": record.output.text,
159
- "raw": record.output.raw,
160
- }
161
- if record.output
162
- else None,
1011
+ "output": output_data,
163
1012
  "error": {
164
1013
  "message": record.error.message,
165
1014
  "kind": record.error.kind,
@@ -172,19 +1021,21 @@ class ExperimentStorage:
172
1021
  self._serialize_record(run_id, attempt) for attempt in record.attempts
173
1022
  ],
174
1023
  }
175
- return payload
176
1024
 
177
1025
  def _deserialize_record(
178
- self, payload: dict[str, object], tasks: dict[str, core_entities.GenerationTask]
1026
+ self, payload: dict, tasks: dict[str, core_entities.GenerationTask]
179
1027
  ) -> core_entities.GenerationRecord:
1028
+ """Deserialize generation record."""
180
1029
  task_key = payload["task_key"]
181
1030
  task = tasks[task_key]
182
1031
  output_data = payload.get("output")
183
1032
  error_data = payload.get("error")
1033
+
184
1034
  record = core_entities.GenerationRecord(
185
1035
  task=task,
186
1036
  output=core_entities.ModelOutput(
187
- text=output_data["text"], raw=output_data.get("raw")
1037
+ text=output_data["text"],
1038
+ raw=output_data.get("raw")
188
1039
  )
189
1040
  if output_data
190
1041
  else None,
@@ -197,59 +1048,555 @@ class ExperimentStorage:
197
1048
  else None,
198
1049
  metrics=payload.get("metrics", {}),
199
1050
  )
1051
+
200
1052
  record.attempts = [
201
1053
  self._deserialize_record(attempt, tasks)
202
1054
  for attempt in payload.get("attempts", [])
203
1055
  ]
1056
+
204
1057
  return record
205
1058
 
206
1059
def _persist_task(self, run_id: str, task: core_entities.GenerationTask) -> str:
    """Store a task once per run and return its cache key.

    A task whose key is already in the run's task index is not written
    again. Otherwise the serialized task is appended to ``tasks.jsonl``
    (created with a header on first use) and the index is updated and
    saved. With template deduplication enabled, the prompt spec is replaced
    by a ``_template_ref`` pointing at the separately persisted template.

    NOTE(review): no lock is taken here; the original comment says the
    caller (append_record) already holds it - confirm for other callers.
    """
    cache_key = self._task_cache_key(task)
    known_keys = self._load_task_index(run_id)
    if cache_key in known_keys:
        return cache_key

    generation_dir = self._get_generation_dir(run_id)
    generation_dir.mkdir(parents=True, exist_ok=True)
    tasks_path = generation_dir / "tasks.jsonl"

    # First write for this run: lay down the header line.
    if not self._file_exists_any_compression(tasks_path):
        self._write_jsonl_with_header(tasks_path, [], file_type="tasks")

    dedupe = self._config.deduplicate_templates
    template_ref = (
        self._persist_template(run_id, task.prompt.spec) if dedupe else None
    )
    serialized = core_serialization.serialize_generation_task(task)
    if dedupe:
        serialized["prompt"]["spec"] = {"_template_ref": template_ref}

    self._atomic_append(tasks_path, {"task_key": cache_key, "task": serialized})

    known_keys.add(cache_key)
    self._save_task_index(run_id, known_keys)

    return cache_key
221
1092
 
1093
+ def _persist_template(
1094
+ self, run_id: str, spec: core_entities.PromptSpec
1095
+ ) -> str:
1096
+ """Persist prompt template."""
1097
+ template_content = f"{spec.name}:{spec.template}"
1098
+ template_id = hashlib.sha256(template_content.encode("utf-8")).hexdigest()[:16]
1099
+
1100
+ if run_id not in self._template_index:
1101
+ self._template_index[run_id] = {}
1102
+ self._load_templates(run_id)
1103
+
1104
+ if template_id in self._template_index[run_id]:
1105
+ return template_id
1106
+
1107
+ gen_dir = self._get_generation_dir(run_id)
1108
+ path = gen_dir / "templates.jsonl"
1109
+
1110
+ if not self._file_exists_any_compression(path):
1111
+ self._write_jsonl_with_header(path, [], file_type="templates")
1112
+
1113
+ payload = {
1114
+ "template_id": template_id,
1115
+ "spec": core_serialization.serialize_prompt_spec(spec),
1116
+ }
1117
+ self._atomic_append(path, payload)
1118
+
1119
+ self._template_index[run_id][template_id] = spec.template
1120
+ return template_id
1121
+
1122
+ def _load_task_index(self, run_id: str) -> set[str]:
1123
+ """Load task index from disk cache or rebuild."""
1124
+ if run_id in self._task_index:
1125
+ return self._task_index[run_id]
1126
+
1127
+ # Try to load from persisted index
1128
+ index_path = self._get_run_dir(run_id) / ".index.json"
1129
+ if index_path.exists():
1130
+ index_data = json.loads(index_path.read_text())
1131
+ self._task_index[run_id] = set(index_data.get("task_keys", []))
1132
+ return self._task_index[run_id]
1133
+
1134
+ # Rebuild from tasks file
1135
+ self._task_index[run_id] = set()
1136
+ return self._task_index[run_id]
1137
+
1138
+ def _save_task_index(self, run_id: str, index: set[str]):
1139
+ """Save task index to disk."""
1140
+ index_path = self._get_run_dir(run_id) / ".index.json"
1141
+ index_data = {
1142
+ "task_keys": list(index),
1143
+ "template_ids": self._template_index.get(run_id, {}),
1144
+ "last_updated": datetime.now().isoformat(),
1145
+ }
1146
+ index_path.write_text(json.dumps(index_data))
1147
+
1148
+ def _load_templates(self, run_id: str) -> dict[str, core_entities.PromptSpec]:
1149
+ """Load templates from disk.
1150
+
1151
+ Args:
1152
+ run_id: Run identifier
1153
+
1154
+ Returns:
1155
+ Dict mapping template_id to PromptSpec
1156
+ """
1157
+ gen_dir = self._get_generation_dir(run_id)
1158
+ path = gen_dir / "templates.jsonl"
1159
+
1160
+ templates: dict[str, core_entities.PromptSpec] = {}
1161
+ try:
1162
+ handle = self._open_for_read(path)
1163
+ except FileNotFoundError:
1164
+ return templates
1165
+
1166
+ with handle:
1167
+ for line in handle:
1168
+ if not line.strip():
1169
+ continue
1170
+ data = json.loads(line)
1171
+ if data.get("_type") == "header":
1172
+ continue
1173
+
1174
+ template_id = data["template_id"]
1175
+ templates[template_id] = core_serialization.deserialize_prompt_spec(
1176
+ data["spec"]
1177
+ )
1178
+
1179
+ return templates
1180
+
222
1181
  def _load_tasks(self, run_id: str) -> dict[str, core_entities.GenerationTask]:
223
- path = self._tasks_path(run_id)
1182
+ """Load tasks from disk.
1183
+
1184
+ Args:
1185
+ run_id: Run identifier
1186
+
1187
+ Returns:
1188
+ Dict mapping task_key to GenerationTask
1189
+ """
1190
+ gen_dir = self._get_generation_dir(run_id)
1191
+ path = gen_dir / "tasks.jsonl"
1192
+
224
1193
  tasks: dict[str, core_entities.GenerationTask] = {}
225
- if not path.exists():
1194
+ try:
1195
+ handle = self._open_for_read(path)
1196
+ except FileNotFoundError:
226
1197
  return tasks
227
- with path.open("r", encoding="utf-8") as handle:
1198
+
1199
+ # Load templates if deduplication enabled
1200
+ templates = self._load_templates(run_id) if self._config.deduplicate_templates else {}
1201
+
1202
+ with handle:
228
1203
  for line in handle:
229
1204
  if not line.strip():
230
1205
  continue
231
1206
  data = json.loads(line)
1207
+ if data.get("_type") == "header":
1208
+ continue
1209
+
232
1210
  task_key = data["task_key"]
233
- tasks[task_key] = core_serialization.deserialize_generation_task(
234
- data["task"]
235
- )
1211
+ task_data = data["task"]
1212
+
1213
+ # Restore template from reference if needed
1214
+ if (
1215
+ self._config.deduplicate_templates
1216
+ and "_template_ref" in task_data.get("prompt", {}).get("spec", {})
1217
+ ):
1218
+ template_id = task_data["prompt"]["spec"]["_template_ref"]
1219
+ if template_id in templates:
1220
+ task_data["prompt"]["spec"] = core_serialization.serialize_prompt_spec(
1221
+ templates[template_id]
1222
+ )
1223
+
1224
+ tasks[task_key] = core_serialization.deserialize_generation_task(task_data)
1225
+
236
1226
  self._task_index[run_id] = set(tasks.keys())
237
1227
  return tasks
238
1228
 
239
- def _load_task_index(self, run_id: str) -> set[str]:
240
- if run_id in self._task_index:
241
- return self._task_index[run_id]
242
- path = self._tasks_path(run_id)
243
- index: set[str] = set()
244
- if path.exists():
245
- with path.open("r", encoding="utf-8") as handle:
246
- for line in handle:
247
- if not line.strip():
1229
def _task_cache_key(self, task: core_entities.GenerationTask) -> str:
    """Generate cache key for task.

    Delegates to the module-level ``task_cache_key`` so the storage class
    and external callers always derive identical keys from one
    implementation instead of two copies that can drift apart.

    Args:
        task: Generation task to key

    Returns:
        Cache key string for the task
    """
    return task_cache_key(task)
1243
+
1244
+ # ===== Phase 3 Features =====
1245
+
1246
def save_checkpoint(self, run_id: str, checkpoint_data: dict) -> None:
    """Save checkpoint for resumability.

    Each call writes a new ``checkpoint_<timestamp>.json`` file under the
    run's ``checkpoints`` directory; file names sort chronologically.

    Args:
        run_id: Run identifier
        checkpoint_data: Checkpoint data to save (must be JSON-serializable)
    """
    with self._acquire_lock(run_id):
        checkpoint_dir = self._get_run_dir(run_id) / "checkpoints"
        # parents=True: do not fail when the run directory is not there yet.
        checkpoint_dir.mkdir(parents=True, exist_ok=True)

        # Microsecond resolution keeps checkpoints saved within the same
        # second from overwriting each other.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        checkpoint_path = checkpoint_dir / f"checkpoint_{timestamp}.json"

        # Coarse clocks can still repeat a timestamp; never clobber an
        # existing checkpoint. The numeric suffix sorts after the bare name.
        attempt = 0
        while checkpoint_path.exists():
            attempt += 1
            checkpoint_path = (
                checkpoint_dir / f"checkpoint_{timestamp}_{attempt:03d}.json"
            )

        checkpoint_path.write_text(json.dumps(checkpoint_data, indent=2))
1262
+
1263
def load_latest_checkpoint(self, run_id: str) -> dict | None:
    """Return the contents of the newest checkpoint file for a run.

    "Newest" is the ``checkpoint_*.json`` path that sorts last.

    Args:
        run_id: Run identifier

    Returns:
        Checkpoint data or None if no checkpoints exist
    """
    checkpoint_dir = self._get_run_dir(run_id) / "checkpoints"
    if checkpoint_dir.exists():
        newest = max(checkpoint_dir.glob("checkpoint_*.json"), default=None)
        if newest is not None:
            return json.loads(newest.read_text())
    return None
1282
+
1283
def apply_retention_policy(self, policy: RetentionPolicy | None = None):
    """Delete runs that the retention policy no longer wants to keep.

    For every experiment directory, runs with a ``metadata.json`` are
    ordered newest first by ``created_at``. The first ``keep_latest_n``
    are always kept; each remaining run is deleted when it is not
    completed (with ``keep_completed_only``), older than ``max_age_days``,
    or positioned at or beyond ``max_runs_per_experiment``. Runs whose
    metadata cannot be loaded are ignored.

    Args:
        policy: Retention policy (uses config if not provided)
    """
    active = policy or self._config.retention_policy
    if not active:
        return

    def exceeds_age(meta) -> bool:
        # Age rule applies only when a maximum age is configured.
        if not active.max_age_days:
            return False
        created = datetime.fromisoformat(meta.created_at)
        return datetime.now() - created > timedelta(days=active.max_age_days)

    for experiment_dir in self._experiments_dir.iterdir():
        if not experiment_dir.is_dir():
            continue
        runs_root = experiment_dir / "runs"
        if not runs_root.exists():
            continue

        # Collect (directory, metadata) for every run that has metadata.
        candidates = []
        for run_dir in runs_root.iterdir():
            if not run_dir.is_dir() or not (run_dir / "metadata.json").exists():
                continue
            try:
                candidates.append((run_dir, self._load_run_metadata(run_dir.name)))
            except Exception:
                continue

        # Newest first
        candidates = sorted(
            candidates, key=lambda pair: pair[1].created_at, reverse=True
        )

        doomed = []
        for position, (run_dir, meta) in enumerate(candidates):
            if position < active.keep_latest_n:
                continue

            if active.keep_completed_only and meta.status != RunStatus.COMPLETED:
                doomed.append(run_dir)
            elif exceeds_age(meta):
                doomed.append(run_dir)
            elif (
                active.max_runs_per_experiment
                and position >= active.max_runs_per_experiment
            ):
                doomed.append(run_dir)

        for run_dir in doomed:
            self._delete_run_dir(run_dir)
1349
+
1350
+ def _delete_run_dir(self, run_dir: Path):
1351
+ """Delete run directory and update database.
1352
+
1353
+ Args:
1354
+ run_dir: Run directory to delete
1355
+ """
1356
+ run_id = run_dir.name
1357
+
1358
+ # Remove from SQLite
1359
+ if self._config.use_sqlite_metadata:
1360
+ db_path = self._root / "experiments.db"
1361
+ conn = sqlite3.connect(db_path)
1362
+ conn.execute("DELETE FROM runs WHERE run_id = ?", (run_id,))
1363
+ conn.commit()
1364
+ conn.close()
1365
+
1366
+ # Remove directory
1367
+ shutil.rmtree(run_dir, ignore_errors=True)
1368
+
1369
def get_storage_size(self, experiment_id: str | None = None) -> int:
    """Sum the sizes of all files stored for one or all experiments.

    Args:
        experiment_id: Optional experiment to check (all if None)

    Returns:
        Total size in bytes (0 when the requested experiment directory
        does not exist)
    """
    scan_root = self._experiments_dir
    if experiment_id:
        scan_root = scan_root / experiment_id
        if not scan_root.exists():
            return 0

    total_bytes = 0
    for entry in scan_root.rglob("*"):
        if entry.is_file():
            total_bytes += entry.stat().st_size
    return total_bytes
1385
+
1386
def list_runs(
    self,
    experiment_id: str | None = None,
    status: RunStatus | None = None,
    limit: int | None = None,
) -> list[RunMetadata]:
    """List runs with optional filtering, newest first.

    Reads from the SQLite ``runs`` table when SQLite metadata is enabled,
    otherwise falls back to scanning the run directories.

    Args:
        experiment_id: Filter by experiment
        status: Filter by status
        limit: Maximum number of runs to return (a falsy value means no limit)

    Returns:
        List of run metadata
    """
    if not self._config.use_sqlite_metadata:
        # Fallback to file-based listing
        return self._list_runs_from_files(experiment_id, status, limit)

    # Build the parameterized query before touching the database.
    query = "SELECT * FROM runs WHERE 1=1"
    params = []

    if experiment_id:
        query += " AND experiment_id = ?"
        params.append(experiment_id)

    if status:
        query += " AND status = ?"
        params.append(status.value)

    query += " ORDER BY created_at DESC"

    if limit:
        query += " LIMIT ?"
        params.append(limit)

    conn = sqlite3.connect(self._root / "experiments.db")
    try:
        rows = conn.execute(query, params).fetchall()
    finally:
        # Close the connection even when the query fails.
        conn.close()

    # Convert to RunMetadata; columns are read by position in the table.
    return [
        RunMetadata(
            run_id=row[0],
            experiment_id=row[1],
            status=RunStatus(row[2]),
            created_at=row[3],
            updated_at=row[4],
            completed_at=row[5],
            total_samples=row[6] or 0,
            successful_generations=row[7] or 0,
            failed_generations=row[8] or 0,
            config_snapshot=json.loads(row[9]) if row[9] else {},
            error_message=row[10],
        )
        for row in rows
    ]
1449
+
1450
+ def _list_runs_from_files(
1451
+ self,
1452
+ experiment_id: str | None,
1453
+ status: RunStatus | None,
1454
+ limit: int | None
1455
+ ) -> list[RunMetadata]:
1456
+ """List runs by scanning files (fallback)."""
1457
+ runs = []
1458
+
1459
+ # Scan experiment directories
1460
+ exp_dirs = [self._experiments_dir / experiment_id] if experiment_id else list(self._experiments_dir.iterdir())
1461
+
1462
+ for exp_dir in exp_dirs:
1463
+ if not exp_dir.is_dir():
1464
+ continue
1465
+
1466
+ runs_dir = exp_dir / "runs"
1467
+ if not runs_dir.exists():
1468
+ continue
1469
+
1470
+ for run_dir in runs_dir.iterdir():
1471
+ if not run_dir.is_dir():
1472
+ continue
1473
+
1474
+ try:
1475
+ metadata = self._load_run_metadata(run_dir.name)
1476
+ if status and metadata.status != status:
248
1477
  continue
249
- data = json.loads(line)
250
- index.add(data["task_key"])
251
- self._task_index[run_id] = index
252
- return index
1478
+ runs.append(metadata)
1479
+ except Exception:
1480
+ continue
1481
+
1482
+ # Sort by creation time
1483
+ runs.sort(key=lambda r: r.created_at, reverse=True)
1484
+
1485
+ if limit:
1486
+ runs = runs[:limit]
1487
+
1488
+ return runs
1489
+
1490
def validate_integrity(self, run_id: str) -> dict:
    """Check that the expected files and directories of a run exist.

    A missing run directory or ``metadata.json`` is an error and marks the
    run invalid; a missing generation directory, records/tasks file, or
    lock file is only reported as a warning.

    Args:
        run_id: Run identifier

    Returns:
        Dict with keys ``run_id``, ``valid``, ``errors`` and ``warnings``
    """
    errors: list[str] = []
    warnings: list[str] = []
    report = {
        "run_id": run_id,
        "valid": True,
        "errors": errors,
        "warnings": warnings,
    }

    run_dir = self._get_run_dir(run_id)
    if not run_dir.exists():
        # Nothing else can be checked without the run directory.
        report["valid"] = False
        errors.append(f"Run directory not found: {run_dir}")
        return report

    if not (run_dir / "metadata.json").exists():
        report["valid"] = False
        errors.append("Missing metadata.json")

    generation_dir = run_dir / "generation"
    if generation_dir.exists():
        for required in ("records.jsonl", "tasks.jsonl"):
            if not self._file_exists_any_compression(generation_dir / required):
                warnings.append(f"Missing {required}")
    else:
        warnings.append("No generation directory")

    if not (run_dir / ".lock").exists():
        warnings.append("No lock file (may not have been used)")

    return report
1534
+
1535
+
1536
def task_cache_key(task: core_entities.GenerationTask) -> str:
    """Build the cache key for a generation task (module-level function kept for backward compatibility).

    The key joins, with ``::``, the non-empty parts among: dataset/sample
    id from the task metadata, prompt template name, model identifier,
    sampling settings (temperature, top_p, max_tokens) and the first 12
    hex characters of the SHA-256 of the prompt text.
    """
    raw_id = task.metadata.get("dataset_id") or task.metadata.get("sample_id")
    prompt_digest = hashlib.sha256(task.prompt.text.encode("utf-8")).hexdigest()[:12]
    params = task.sampling
    sampling_part = "-".join(
        [
            f"{params.temperature:.3f}",
            f"{params.top_p:.3f}",
            f"{params.max_tokens}",
        ]
    )
    segments = [
        "" if raw_id is None else str(raw_id),
        task.prompt.spec.name,
        task.model.identifier,
        sampling_part,
        prompt_digest,
    ]
    # Empty or missing segments are left out of the key entirely.
    return "::".join(segment for segment in segments if segment)
1550
+
1551
+
1552
def evaluation_cache_key(
    task: core_entities.GenerationTask,
    evaluation_config: dict | None = None,
) -> str:
    """Build a cache key covering both the task and the evaluation settings.

    Because the key embeds a hash of the evaluation configuration, changing
    metrics or other evaluation settings yields a different key and so
    triggers re-evaluation even when the generation itself is cached.

    Args:
        task: Generation task
        evaluation_config: Dictionary with evaluation configuration:
            - metrics: List of metric names/types
            - extractor: Extractor type/configuration
            - Any other evaluation settings

    Returns:
        The task cache key, followed by ``::eval:<hash>`` when a non-empty
        configuration is given

    Example:
        >>> config = {
        ...     "metrics": ["exact_match", "f1_score"],
        ...     "extractor": "json_field_extractor:answer"
        ... }
        >>> key = evaluation_cache_key(task, config)
    """
    base_key = task_cache_key(task)

    if evaluation_config:
        # sort_keys makes the hash independent of dict insertion order.
        canonical = json.dumps(evaluation_config, sort_keys=True)
        digest = hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:12]
        return f"{base_key}::eval:{digest}"

    # No config provided: plain task key (backward compatibility).
    return base_key
253
1589
 
254
1590
 
255
- __all__ = ["ExperimentStorage", "task_cache_key"]
1591
+ __all__ = [
1592
+ "ExperimentStorage",
1593
+ "StorageConfig",
1594
+ "RunMetadata",
1595
+ "EvaluationMetadata",
1596
+ "RunStatus",
1597
+ "RetentionPolicy",
1598
+ "DataIntegrityError",
1599
+ "ConcurrentAccessError",
1600
+ "task_cache_key",
1601
+ "evaluation_cache_key",
1602
+ ]