themis-eval 0.2.3__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares the contents of two package versions as they were published to a supported public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective registries.
- themis/__init__.py +5 -2
- themis/_version.py +14 -1
- themis/api.py +83 -145
- themis/backends/storage.py +5 -0
- themis/cli/commands/info.py +2 -11
- themis/cli/main.py +231 -40
- themis/comparison/engine.py +7 -13
- themis/core/entities.py +4 -0
- themis/evaluation/metric_pipeline.py +12 -0
- themis/evaluation/pipeline.py +22 -0
- themis/evaluation/pipelines/__init__.py +4 -0
- themis/evaluation/pipelines/composable_pipeline.py +55 -0
- themis/evaluation/pipelines/standard_pipeline.py +16 -0
- themis/experiment/__init__.py +2 -2
- themis/experiment/cache_manager.py +15 -1
- themis/experiment/definitions.py +1 -1
- themis/experiment/orchestrator.py +21 -11
- themis/experiment/share.py +264 -0
- themis/experiment/storage.py +345 -298
- themis/generation/router.py +22 -4
- themis/generation/runner.py +16 -1
- themis/presets/benchmarks.py +602 -17
- themis/server/app.py +38 -26
- themis/session.py +125 -0
- themis/specs/__init__.py +7 -0
- themis/specs/execution.py +26 -0
- themis/specs/experiment.py +33 -0
- themis/specs/storage.py +18 -0
- themis/storage/__init__.py +6 -0
- themis/storage/experiment_storage.py +7 -0
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/METADATA +47 -34
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/RECORD +35 -28
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/WHEEL +1 -1
- themis/experiment/builder.py +0 -151
- themis/experiment/export_csv.py +0 -159
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt +0 -0
themis/experiment/storage.py
CHANGED
@@ -20,6 +20,8 @@ import os
 import sqlite3
 import sys
 import tempfile
+import threading
+import time
 from dataclasses import dataclass, field
 import shutil
 from datetime import datetime, timedelta
@@ -27,13 +29,13 @@ from enum import Enum
 from pathlib import Path
 from typing import Dict, Iterable, List, Literal
 
-# fcntl is Unix-only
+# fcntl is Unix-only
 if sys.platform == "win32":
-    import msvcrt
     FCNTL_AVAILABLE = False
 else:
     try:
         import fcntl
+
         FCNTL_AVAILABLE = True
     except ImportError:
         FCNTL_AVAILABLE = False
@@ -56,7 +58,7 @@ class RunStatus(str, Enum):
 @dataclass
 class RetentionPolicy:
     """Retention policy for automatic cleanup.
-
+
     Attributes:
         max_runs_per_experiment: Maximum runs to keep per experiment
         max_age_days: Maximum age in days for runs
@@ -64,7 +66,7 @@ class RetentionPolicy:
         keep_completed_only: Only keep completed runs
         keep_latest_n: Always keep N most recent runs
     """
-
+
     max_runs_per_experiment: int | None = None
     max_age_days: int | None = None
     max_storage_gb: float | None = None
@@ -155,20 +157,18 @@ class ExperimentStorage:
     Example:
         >>> config = StorageConfig()
         >>> storage = ExperimentStorage("outputs/experiments", config=config)
-        >>>
+        >>>
         >>> # Start a run
         >>> metadata = storage.start_run("run-1", "experiment-1", config={})
-        >>>
+        >>>
         >>> # Append records with locking
         >>> storage.append_record("run-1", record)
-        >>>
+        >>>
         >>> # Complete the run
         >>> storage.complete_run("run-1")
     """
 
-    def __init__(
-        self, root: str | Path, config: StorageConfig | None = None
-    ) -> None:
+    def __init__(self, root: str | Path, config: StorageConfig | None = None) -> None:
         self._root = Path(root)
         self._root.mkdir(parents=True, exist_ok=True)
         self._config = config or StorageConfig()
@@ -178,6 +178,7 @@ class ExperimentStorage:
         self._experiments_dir.mkdir(exist_ok=True)
 
         # Initialize SQLite database
+        self._db_write_lock = threading.RLock()
         if self._config.use_sqlite_metadata:
             self._init_database()
 
@@ -188,93 +189,98 @@ class ExperimentStorage:
 
     def _init_database(self):
        """Initialize SQLite metadata database."""
+        with self._db_write_lock:
+            with self._connect_db() as conn:
+                # WAL allows concurrent readers with a single writer and
+                # significantly reduces lock contention in threaded CI runs.
+                conn.execute("PRAGMA journal_mode=WAL")
+                conn.execute("""
+                    CREATE TABLE IF NOT EXISTS experiments (
+                        experiment_id TEXT PRIMARY KEY,
+                        name TEXT NOT NULL,
+                        description TEXT,
+                        created_at TEXT NOT NULL,
+                        updated_at TEXT NOT NULL,
+                        config TEXT,
+                        tags TEXT
+                    )
+                """)
+
+                conn.execute("""
+                    CREATE TABLE IF NOT EXISTS runs (
+                        run_id TEXT PRIMARY KEY,
+                        experiment_id TEXT NOT NULL,
+                        status TEXT NOT NULL,
+                        created_at TEXT NOT NULL,
+                        updated_at TEXT NOT NULL,
+                        completed_at TEXT,
+                        total_samples INTEGER DEFAULT 0,
+                        successful_generations INTEGER DEFAULT 0,
+                        failed_generations INTEGER DEFAULT 0,
+                        config_snapshot TEXT,
+                        error_message TEXT,
+                        FOREIGN KEY (experiment_id) REFERENCES experiments(experiment_id)
+                    )
+                """)
+
+                conn.execute("""
+                    CREATE TABLE IF NOT EXISTS evaluations (
+                        eval_id TEXT PRIMARY KEY,
+                        run_id TEXT NOT NULL,
+                        eval_name TEXT NOT NULL,
+                        created_at TEXT NOT NULL,
+                        metrics_config TEXT,
+                        total_evaluated INTEGER DEFAULT 0,
+                        total_failures INTEGER DEFAULT 0,
+                        FOREIGN KEY (run_id) REFERENCES runs(run_id)
+                    )
+                """)
+
+                conn.execute("""
+                    CREATE INDEX IF NOT EXISTS idx_runs_experiment
+                    ON runs(experiment_id)
+                """)
+
+                conn.execute("""
+                    CREATE INDEX IF NOT EXISTS idx_runs_status
+                    ON runs(status)
+                """)
+
+                conn.execute("""
+                    CREATE INDEX IF NOT EXISTS idx_evaluations_run
+                    ON evaluations(run_id)
+                """)
+                conn.commit()
+
+    def _connect_db(self) -> sqlite3.Connection:
+        """Create a SQLite connection configured for concurrent access."""
         db_path = self._root / "experiments.db"
-        conn = sqlite3.connect(db_path)
-
-        conn.execute("""
-            CREATE TABLE IF NOT EXISTS experiments (
-                experiment_id TEXT PRIMARY KEY,
-                name TEXT NOT NULL,
-                description TEXT,
-                created_at TEXT NOT NULL,
-                updated_at TEXT NOT NULL,
-                config TEXT,
-                tags TEXT
-            )
-        """)
-
-        conn.execute("""
-            CREATE TABLE IF NOT EXISTS runs (
-                run_id TEXT PRIMARY KEY,
-                experiment_id TEXT NOT NULL,
-                status TEXT NOT NULL,
-                created_at TEXT NOT NULL,
-                updated_at TEXT NOT NULL,
-                completed_at TEXT,
-                total_samples INTEGER DEFAULT 0,
-                successful_generations INTEGER DEFAULT 0,
-                failed_generations INTEGER DEFAULT 0,
-                config_snapshot TEXT,
-                error_message TEXT,
-                FOREIGN KEY (experiment_id) REFERENCES experiments(experiment_id)
-            )
-        """)
-
-        conn.execute("""
-            CREATE TABLE IF NOT EXISTS evaluations (
-                eval_id TEXT PRIMARY KEY,
-                run_id TEXT NOT NULL,
-                eval_name TEXT NOT NULL,
-                created_at TEXT NOT NULL,
-                metrics_config TEXT,
-                total_evaluated INTEGER DEFAULT 0,
-                total_failures INTEGER DEFAULT 0,
-                FOREIGN KEY (run_id) REFERENCES runs(run_id)
-            )
-        """)
-
-        conn.execute("""
-            CREATE INDEX IF NOT EXISTS idx_runs_experiment
-            ON runs(experiment_id)
-        """)
-
-        conn.execute("""
-            CREATE INDEX IF NOT EXISTS idx_runs_status
-            ON runs(status)
-        """)
-
-        conn.execute("""
-            CREATE INDEX IF NOT EXISTS idx_evaluations_run
-            ON evaluations(run_id)
-        """)
-
-        conn.commit()
-        conn.close()
+        conn = sqlite3.connect(db_path, timeout=30.0)
+        conn.execute("PRAGMA busy_timeout=30000")
+        return conn
 
     @contextlib.contextmanager
     def _acquire_lock(self, run_id: str):
         """Acquire exclusive lock for run directory with timeout (reentrant).
-
+
         This lock is reentrant within the same thread to prevent deadlocks when
-        the same process acquires the lock multiple times (e.g., start_run()
+        the same process acquires the lock multiple times (e.g., start_run()
         followed by append_record()).
-
+
         The lock uses OS-specific file locking:
         - Unix/Linux/macOS: fcntl.flock with non-blocking retry
         - Windows: msvcrt.locking
         - Fallback: No locking (single-process mode)
-
+
         Args:
             run_id: Unique run identifier
-
+
         Yields:
             Context manager that holds the lock
-
+
         Raises:
             TimeoutError: If lock cannot be acquired within 30 seconds
         """
-        import time
-
         # Check if we already hold the lock (reentrant)
         if run_id in self._locks:
             lock_fd, count = self._locks[run_id]
@@ -291,7 +297,7 @@ class ExperimentStorage:
                 # Last unlock - release the actual lock
                 self._release_os_lock(lock_fd, run_id)
             return
-
+
         # First time acquiring lock for this run_id
         lock_path = self._get_run_dir(run_id) / ".lock"
         lock_path.parent.mkdir(parents=True, exist_ok=True)
@@ -302,7 +308,7 @@ class ExperimentStorage:
         try:
             # Acquire exclusive lock with timeout
             self._acquire_os_lock(lock_fd, run_id, lock_path, timeout=30)
-
+
             self._locks[run_id] = (lock_fd, 1)
             yield
         finally:
@@ -314,27 +320,23 @@ class ExperimentStorage:
             else:
                 # Decrement count
                 self._locks[run_id] = (lock_fd, count - 1)
-
+
     def _acquire_os_lock(
-        self,
-        lock_fd: int,
-        run_id: str,
-        lock_path: Path,
-        timeout: int = 30
+        self, lock_fd: int, run_id: str, lock_path: Path, timeout: int = 30
     ) -> None:
         """Acquire OS-specific file lock with timeout.
-
+
         Args:
             lock_fd: File descriptor for lock file
             run_id: Run identifier (for error messages)
             lock_path: Path to lock file (for error messages)
            timeout: Timeout in seconds
-
+
         Raises:
             TimeoutError: If lock cannot be acquired within timeout
         """
         import time
-
+
         if sys.platform == "win32":
             # Windows file locking with retry
             try:
@@ -342,10 +344,11 @@ class ExperimentStorage:
             except ImportError:
                 # msvcrt not available - single-process mode
                 import logging
+
                 logger = logging.getLogger(__name__)
                 logger.debug("msvcrt not available. Single-process mode only.")
                 return
-
+
             start_time = time.time()
             while True:
                 try:
@@ -356,7 +359,7 @@ class ExperimentStorage:
                     if time.time() - start_time > timeout:
                         try:
                             os.close(lock_fd)
-                        except:
+                        except OSError:
                             pass
                         raise TimeoutError(
                             f"Failed to acquire lock for run {run_id} after {timeout}s on Windows. "
@@ -376,7 +379,7 @@ class ExperimentStorage:
                    if time.time() - start_time > timeout:
                         try:
                             os.close(lock_fd)
-                        except:
+                        except OSError:
                             pass
                         raise TimeoutError(
                             f"Failed to acquire lock for run {run_id} after {timeout}s. "
@@ -388,15 +391,16 @@ class ExperimentStorage:
             # No locking available - single-process mode
             # This is safe for single-process usage (most common case)
             import logging
+
             logger = logging.getLogger(__name__)
             logger.debug(
-                "File locking not available on this platform. "
-                "Storage will work in single-process mode only."
+                "File locking not available on this platform. "
+                "Storage will work in single-process mode only."
             )
-
+
     def _release_os_lock(self, lock_fd: int, run_id: str) -> None:
         """Release OS-specific file lock.
-
+
         Args:
             lock_fd: File descriptor to close
             run_id: Run identifier (for cleanup)
@@ -405,6 +409,7 @@ class ExperimentStorage:
         if sys.platform == "win32":
             try:
                 import msvcrt
+
                 msvcrt.locking(lock_fd, msvcrt.LK_UNLCK, 1)
             except (ImportError, OSError):
                 pass  # Lock may already be released
@@ -413,13 +418,13 @@ class ExperimentStorage:
                 fcntl.flock(lock_fd, fcntl.LOCK_UN)
             except (IOError, OSError):
                 pass  # Lock may already be released
-
+
         # Close file descriptor
         try:
             os.close(lock_fd)
         except OSError:
             pass  # FD may already be closed
-
+
         # Clean up tracking
         self._locks.pop(run_id, None)
 
@@ -558,9 +563,11 @@ class ExperimentStorage:
 
         # Update progress
         metadata = self._load_run_metadata(run_id)
-        new_successful = metadata.successful_generations + (1 if record.output else 0)
+        new_successful = metadata.successful_generations + (
+            1 if record.output else 0
+        )
         new_failed = metadata.failed_generations + (1 if record.error else 0)
-
+
         self.update_run_progress(
             run_id,
             total_samples=metadata.total_samples + 1,
@@ -664,49 +671,57 @@ class ExperimentStorage:
 
     def _save_run_metadata_to_db(self, metadata: RunMetadata):
         """Save run metadata to SQLite database."""
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            INSERT OR REPLACE INTO runs (
-                run_id, experiment_id, status, created_at, updated_at, completed_at,
-                total_samples, successful_generations, failed_generations,
-                config_snapshot, error_message
-            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
-            """,
-            (
-                metadata.run_id,
-                metadata.experiment_id,
-                metadata.status.value,
-                metadata.created_at,
-                metadata.updated_at,
-                metadata.completed_at,
-                metadata.total_samples,
-                metadata.successful_generations,
-                metadata.failed_generations,
-                json.dumps(metadata.config_snapshot),
-                metadata.error_message,
-            ),
-        )
+        # Serialise process-local writers to avoid lock thrash on Windows CI.
+        with self._db_write_lock:
+            retry_delay = 0.05
+            max_attempts = 5
+            for attempt in range(max_attempts):
+                try:
+                    with self._connect_db() as conn:
+                        # Ensure experiment exists
+                        conn.execute(
+                            """
+                            INSERT OR IGNORE INTO experiments (experiment_id, name, created_at, updated_at)
+                            VALUES (?, ?, ?, ?)
+                            """,
+                            (
+                                metadata.experiment_id,
+                                metadata.experiment_id,
+                                metadata.created_at,
+                                metadata.updated_at,
+                            ),
+                        )
 
-
-
+                        # Upsert run
+                        conn.execute(
+                            """
+                            INSERT OR REPLACE INTO runs (
+                                run_id, experiment_id, status, created_at, updated_at, completed_at,
+                                total_samples, successful_generations, failed_generations,
+                                config_snapshot, error_message
+                            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                            """,
+                            (
+                                metadata.run_id,
+                                metadata.experiment_id,
+                                metadata.status.value,
+                                metadata.created_at,
+                                metadata.updated_at,
+                                metadata.completed_at,
+                                metadata.total_samples,
+                                metadata.successful_generations,
+                                metadata.failed_generations,
+                                json.dumps(metadata.config_snapshot),
+                                metadata.error_message,
+                            ),
+                        )
+                        conn.commit()
+                        return
+                except sqlite3.OperationalError as exc:
+                    if "locked" not in str(exc).lower() or attempt == max_attempts - 1:
+                        raise
+                    time.sleep(retry_delay)
+                    retry_delay *= 2
 
     def _load_run_metadata(self, run_id: str) -> RunMetadata:
         """Load run metadata from JSON file.
@@ -744,6 +759,10 @@ class ExperimentStorage:
         metadata_path = self._get_run_dir(run_id) / "metadata.json"
         return metadata_path.exists()
 
+    def run_metadata_exists(self, run_id: str) -> bool:
+        """Check if run metadata exists (public API)."""
+        return self._run_metadata_exists(run_id)
+
     def _get_run_dir(self, run_id: str) -> Path:
         """Get run directory path.
 
@@ -778,10 +797,10 @@ class ExperimentStorage:
 
     def _open_for_read(self, path: Path):
         """Open file for reading with automatic compression detection.
-
+
         Args:
             path: File path
-
+
         Returns:
             File handle (text mode)
         """
@@ -841,16 +860,16 @@ class ExperimentStorage:
 
     def load_dataset(self, run_id: str) -> List[dict[str, object]]:
         """Load cached dataset.
-
+
         Args:
             run_id: Run identifier
-
+
         Returns:
             List of dataset samples
         """
         gen_dir = self._get_generation_dir(run_id)
         path = gen_dir / "dataset.jsonl"
-
+
         rows: list[dict[str, object]] = []
         with self._open_for_read(path) as handle:
             for line in handle:
@@ -866,16 +885,16 @@ class ExperimentStorage:
         self, run_id: str
     ) -> Dict[str, core_entities.GenerationRecord]:
         """Load cached generation records.
-
+
         Args:
             run_id: Run identifier
-
+
         Returns:
             Dict mapping cache_key to GenerationRecord
         """
         gen_dir = self._get_generation_dir(run_id)
         path = gen_dir / "records.jsonl"
-
+
         try:
             handle = self._open_for_read(path)
         except FileNotFoundError:
@@ -883,7 +902,7 @@ class ExperimentStorage:
 
         tasks = self._load_tasks(run_id)
         records: dict[str, core_entities.GenerationRecord] = {}
-
+
         with handle:
             for line in handle:
                 if not line.strip():
@@ -891,14 +910,14 @@ class ExperimentStorage:
                 data = json.loads(line)
                 if data.get("_type") == "header":
                     continue
-
+
                 key = data.get("cache_key")
                 if not key:
                     continue
-
+
                 record = self._deserialize_record(data, tasks)
                 records[key] = record
-
+
         return records
 
     def append_evaluation(
@@ -911,7 +930,7 @@ class ExperimentStorage:
         evaluation_config: dict | None = None,
     ) -> None:
         """Append evaluation result.
-
+
         Args:
             run_id: Run identifier
             record: Generation record being evaluated
@@ -922,48 +941,53 @@ class ExperimentStorage:
         with self._acquire_lock(run_id):
             eval_dir = self._get_evaluation_dir(run_id, eval_id)
             eval_dir.mkdir(parents=True, exist_ok=True)
-
+
             path = eval_dir / "evaluation.jsonl"
-
+
             if not self._file_exists_any_compression(path):
                 self._write_jsonl_with_header(path, [], file_type="evaluation")
-
+
             # Use evaluation_cache_key that includes evaluation config
             cache_key = evaluation_cache_key(record.task, evaluation_config)
-
+
             payload = {
                 "cache_key": cache_key,
-                "evaluation": core_serialization.serialize_evaluation_record(evaluation),
+                "evaluation": core_serialization.serialize_evaluation_record(
+                    evaluation
+                ),
             }
             self._atomic_append(path, payload)
 
     def load_cached_evaluations(
-        self, run_id: str, eval_id: str = "default", evaluation_config: dict | None = None
+        self,
+        run_id: str,
+        eval_id: str = "default",
+        evaluation_config: dict | None = None,
    ) -> Dict[str, core_entities.EvaluationRecord]:
         """Load cached evaluation records.
-
+
         Args:
             run_id: Run identifier
             eval_id: Evaluation identifier
             evaluation_config: Evaluation configuration for cache key matching
-
+
         Returns:
             Dict mapping cache_key to EvaluationRecord
-
+
         Note:
             If evaluation_config is provided, only evaluations matching that config
             will be loaded. This ensures that changing metrics invalidates the cache.
         """
         eval_dir = self._get_evaluation_dir(run_id, eval_id)
         path = eval_dir / "evaluation.jsonl"
-
+
         try:
             handle = self._open_for_read(path)
         except FileNotFoundError:
             return {}
-
+
         evaluations: dict[str, core_entities.EvaluationRecord] = {}
-
+
         with handle:
             for line in handle:
                 if not line.strip():
@@ -971,15 +995,15 @@ class ExperimentStorage:
                 data = json.loads(line)
                 if data.get("_type") == "header":
                     continue
-
+
                 key = data.get("cache_key")
                 if not key:
                     continue
-
+
                 evaluations[key] = core_serialization.deserialize_evaluation_record(
                     data["evaluation"]
                 )
-
+
         return evaluations
 
     def get_run_path(self, run_id: str) -> Path:
@@ -1030,12 +1054,11 @@ class ExperimentStorage:
         task = tasks[task_key]
         output_data = payload.get("output")
         error_data = payload.get("error")
-
+
         record = core_entities.GenerationRecord(
             task=task,
             output=core_entities.ModelOutput(
-                text=output_data["text"],
-                raw=output_data.get("raw")
+                text=output_data["text"], raw=output_data.get("raw")
             )
             if output_data
             else None,
@@ -1048,12 +1071,12 @@ class ExperimentStorage:
             else None,
             metrics=payload.get("metrics", {}),
         )
-
+
         record.attempts = [
             self._deserialize_record(attempt, tasks)
             for attempt in payload.get("attempts", [])
         ]
-
+
         return record
 
     def _persist_task(self, run_id: str, task: core_entities.GenerationTask) -> str:
@@ -1090,9 +1113,7 @@ class ExperimentStorage:
 
         return key
 
-    def _persist_template(
-        self, run_id: str, spec: core_entities.PromptSpec
-    ) -> str:
+    def _persist_template(self, run_id: str, spec: core_entities.PromptSpec) -> str:
         """Persist prompt template."""
         template_content = f"{spec.name}:{spec.template}"
         template_id = hashlib.sha256(template_content.encode("utf-8")).hexdigest()[:16]
@@ -1147,22 +1168,22 @@ class ExperimentStorage:
 
     def _load_templates(self, run_id: str) -> dict[str, core_entities.PromptSpec]:
         """Load templates from disk.
-
+
         Args:
             run_id: Run identifier
-
+
         Returns:
             Dict mapping template_id to PromptSpec
         """
         gen_dir = self._get_generation_dir(run_id)
         path = gen_dir / "templates.jsonl"
-
+
         templates: dict[str, core_entities.PromptSpec] = {}
         try:
             handle = self._open_for_read(path)
         except FileNotFoundError:
             return templates
-
+
         with handle:
             for line in handle:
                 if not line.strip():
@@ -1170,35 +1191,37 @@ class ExperimentStorage:
                 data = json.loads(line)
                 if data.get("_type") == "header":
                     continue
-
+
                 template_id = data["template_id"]
                 templates[template_id] = core_serialization.deserialize_prompt_spec(
                     data["spec"]
                 )
-
+
         return templates
 
     def _load_tasks(self, run_id: str) -> dict[str, core_entities.GenerationTask]:
         """Load tasks from disk.
-
+
         Args:
             run_id: Run identifier
-
+
         Returns:
             Dict mapping task_key to GenerationTask
         """
         gen_dir = self._get_generation_dir(run_id)
         path = gen_dir / "tasks.jsonl"
-
+
         tasks: dict[str, core_entities.GenerationTask] = {}
         try:
             handle = self._open_for_read(path)
         except FileNotFoundError:
             return tasks
-
+
         # Load templates if deduplication enabled
-        templates =
-
+        templates = (
+            self._load_templates(run_id) if self._config.deduplicate_templates else {}
+        )
+
         with handle:
             for line in handle:
                 if not line.strip():
@@ -1206,10 +1229,10 @@ class ExperimentStorage:
                 data = json.loads(line)
                 if data.get("_type") == "header":
                     continue
-
+
                 task_key = data["task_key"]
                 task_data = data["task"]
-
+
                 # Restore template from reference if needed
                 if (
                     self._config.deduplicate_templates
@@ -1217,12 +1240,16 @@ class ExperimentStorage:
                 ):
                     template_id = task_data["prompt"]["spec"]["_template_ref"]
                     if template_id in templates:
-                        task_data["prompt"]["spec"] = core_serialization.serialize_prompt_spec(
-                            templates[template_id]
+                        task_data["prompt"]["spec"] = (
+                            core_serialization.serialize_prompt_spec(
+                                templates[template_id]
+                            )
                         )
-
-                tasks[task_key] = core_serialization.deserialize_generation_task(
-                    task_data)
+
+                tasks[task_key] = core_serialization.deserialize_generation_task(
+                    task_data
+                )
+
         self._task_index[run_id] = set(tasks.keys())
         return tasks
 
@@ -1245,7 +1272,7 @@ class ExperimentStorage:
 
     def save_checkpoint(self, run_id: str, checkpoint_data: dict):
         """Save checkpoint for resumability.
-
+
         Args:
             run_id: Run identifier
             checkpoint_data: Checkpoint data to save
@@ -1253,52 +1280,52 @@ class ExperimentStorage:
         with self._acquire_lock(run_id):
             checkpoint_dir = self._get_run_dir(run_id) / "checkpoints"
             checkpoint_dir.mkdir(exist_ok=True)
-
+
             # Use timestamp for checkpoint filename
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             checkpoint_path = checkpoint_dir / f"checkpoint_{timestamp}.json"
-
+
             checkpoint_path.write_text(json.dumps(checkpoint_data, indent=2))
 
     def load_latest_checkpoint(self, run_id: str) -> dict | None:
         """Load most recent checkpoint.
-
+
         Args:
             run_id: Run identifier
-
+
         Returns:
             Checkpoint data or None if no checkpoints exist
         """
         checkpoint_dir = self._get_run_dir(run_id) / "checkpoints"
         if not checkpoint_dir.exists():
             return None
-
+
         # Find latest checkpoint
         checkpoints = sorted(checkpoint_dir.glob("checkpoint_*.json"), reverse=True)
         if not checkpoints:
             return None
-
+
         return json.loads(checkpoints[0].read_text())
 
     def apply_retention_policy(self, policy: RetentionPolicy | None = None):
         """Apply retention policy to clean up old runs.
-
+
         Args:
             policy: Retention policy (uses config if not provided)
         """
         policy = policy or self._config.retention_policy
         if not policy:
             return
-
+
         # Get all experiments
         for exp_dir in self._experiments_dir.iterdir():
             if not exp_dir.is_dir():
                 continue
-
+
             runs_dir = exp_dir / "runs"
             if not runs_dir.exists():
                 continue
-
+
             # Load all run metadata
             runs = []
             for run_dir in runs_dir.iterdir():
@@ -1307,29 +1334,32 @@ class ExperimentStorage:
                 metadata_path = run_dir / "metadata.json"
                 if not metadata_path.exists():
                     continue
-
+
                 try:
                     metadata = self._load_run_metadata(run_dir.name)
                     runs.append((run_dir, metadata))
                 except Exception:
                     continue
-
+
             # Sort by creation time (newest first)
             runs.sort(key=lambda x: x[1].created_at, reverse=True)
-
+
             # Apply policies
             runs_to_delete = []
-
+
             for i, (run_dir, metadata) in enumerate(runs):
                 # Always keep latest N runs
                 if i < policy.keep_latest_n:
                     continue
-
+
                 # Check if should keep based on status
-                if policy.keep_completed_only and metadata.status != RunStatus.COMPLETED:
+                if (
+                    policy.keep_completed_only
+                    and metadata.status != RunStatus.COMPLETED
+                ):
                     runs_to_delete.append(run_dir)
                     continue
-
+
                 # Check age policy
                 if policy.max_age_days:
                     created = datetime.fromisoformat(metadata.created_at)
@@ -1337,41 +1367,54 @@ class ExperimentStorage:
                     if age > timedelta(days=policy.max_age_days):
                         runs_to_delete.append(run_dir)
                         continue
-
+
                 # Check max runs policy
                 if policy.max_runs_per_experiment:
                     if i >= policy.max_runs_per_experiment:
                         runs_to_delete.append(run_dir)
-
+
         # Delete runs
         for run_dir in runs_to_delete:
             self._delete_run_dir(run_dir)
 
     def _delete_run_dir(self, run_dir: Path):
         """Delete run directory and update database.
-
+
         Args:
             run_dir: Run directory to delete
         """
         run_id = run_dir.name
-
+
         # Remove from SQLite
         if self._config.use_sqlite_metadata:
-
-
-
-
-
-
+            with self._db_write_lock:
+                with self._connect_db() as conn:
+                    conn.execute("DELETE FROM runs WHERE run_id = ?", (run_id,))
+                    conn.commit()
+
         # Remove directory
         shutil.rmtree(run_dir, ignore_errors=True)
 
+    def delete_run(self, run_id: str) -> None:
+        """Delete a run and its stored artifacts.
+
+        Args:
+            run_id: Run identifier to delete
+
+        Raises:
+            FileNotFoundError: If the run does not exist
+        """
+        run_dir = self._get_run_dir(run_id)
+        if not run_dir.exists():
+            raise FileNotFoundError(f"Run not found: {run_id}")
+        self._delete_run_dir(run_dir)
+
     def get_storage_size(self, experiment_id: str | None = None) -> int:
         """Get total storage size in bytes.
-
+
         Args:
             experiment_id: Optional experiment to check (all if None)
-
+
         Returns:
             Total size in bytes
         """
@@ -1381,96 +1424,100 @@ class ExperimentStorage:
                 return 0
             return sum(f.stat().st_size for f in exp_dir.rglob("*") if f.is_file())
         else:
-            return sum(f.stat().st_size for f in self._experiments_dir.rglob("*") if f.is_file())
+            return sum(
+                f.stat().st_size
+                for f in self._experiments_dir.rglob("*")
+                if f.is_file()
+            )
 
     def list_runs(
         self,
         experiment_id: str | None = None,
         status: RunStatus | None = None,
-        limit: int | None = None
+        limit: int | None = None,
     ) -> list[RunMetadata]:
         """List runs with optional filtering.
-
+
         Args:
             experiment_id: Filter by experiment
             status: Filter by status
             limit: Maximum number of runs to return
-
+
         Returns:
             List of run metadata
         """
         if not self._config.use_sqlite_metadata:
             # Fallback to file-based listing
             return self._list_runs_from_files(experiment_id, status, limit)
-
+
         # Query SQLite
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        rows = cursor.fetchall()
-        conn.close()
-
+        with self._connect_db() as conn:
+            query = "SELECT * FROM runs WHERE 1=1"
+            params = []
+
+            if experiment_id:
+                query += " AND experiment_id = ?"
+                params.append(experiment_id)
+
+            if status:
+                query += " AND status = ?"
+                params.append(status.value)
+
+            query += " ORDER BY created_at DESC"
+
+            if limit:
+                query += " LIMIT ?"
+                params.append(limit)
+
+            cursor = conn.execute(query, params)
+            rows = cursor.fetchall()
+
         # Convert to RunMetadata
         runs = []
         for row in rows:
-            runs.append(
-
-
-
-
-
-
-
-
-
-
-
-
-
+            runs.append(
+                RunMetadata(
+                    run_id=row[0],
+                    experiment_id=row[1],
+                    status=RunStatus(row[2]),
+                    created_at=row[3],
+                    updated_at=row[4],
+                    completed_at=row[5],
+                    total_samples=row[6] or 0,
+                    successful_generations=row[7] or 0,
+                    failed_generations=row[8] or 0,
+                    config_snapshot=json.loads(row[9]) if row[9] else {},
+                    error_message=row[10],
+                )
+            )
+
         return runs
 
     def _list_runs_from_files(
-        self,
-        experiment_id: str | None,
-        status: RunStatus | None,
-        limit: int | None
+        self, experiment_id: str | None, status: RunStatus | None, limit: int | None
     ) -> list[RunMetadata]:
         """List runs by scanning files (fallback)."""
         runs = []
-
+
         # Scan experiment directories
-        exp_dirs =
-
+        exp_dirs = (
+            [self._experiments_dir / experiment_id]
+            if experiment_id
+            else list(self._experiments_dir.iterdir())
+        )
+
         for exp_dir in exp_dirs:
             if not exp_dir.is_dir():
                 continue
-
+
             runs_dir = exp_dir / "runs"
             if not runs_dir.exists():
                 continue
-
+
             for run_dir in runs_dir.iterdir():
                 if not run_dir.is_dir():
                     continue
-
+
                 try:
                     metadata = self._load_run_metadata(run_dir.name)
                     if status and metadata.status != status:
@@ -1478,21 +1525,21 @@ class ExperimentStorage:
                     runs.append(metadata)
                 except Exception:
                     continue
-
+
        # Sort by creation time
        runs.sort(key=lambda r: r.created_at, reverse=True)
-
+
        if limit:
            runs = runs[:limit]
-
+
        return runs
 
     def validate_integrity(self, run_id: str) -> dict:
         """Validate data integrity for a run.
-
+
         Args:
             run_id: Run identifier
-
+
         Returns:
             Dict with validation results
         """
@@ -1502,19 +1549,19 @@ class ExperimentStorage:
             "errors": [],
             "warnings": [],
         }
-
+
         run_dir = self._get_run_dir(run_id)
         if not run_dir.exists():
             results["valid"] = False
             results["errors"].append(f"Run directory not found: {run_dir}")
             return results
-
+
         # Check metadata
         metadata_path = run_dir / "metadata.json"
         if not metadata_path.exists():
             results["valid"] = False
             results["errors"].append("Missing metadata.json")
-
+
         # Check generation directory
         gen_dir = run_dir / "generation"
         if not gen_dir.exists():
@@ -1524,12 +1571,12 @@ class ExperimentStorage:
         for filename in ["records.jsonl", "tasks.jsonl"]:
             if not self._file_exists_any_compression(gen_dir / filename):
                 results["warnings"].append(f"Missing {filename}")
-
+
         # Check lock file
         lock_path = run_dir / ".lock"
         if not lock_path.exists():
             results["warnings"].append("No lock file (may not have been used)")
-
+
         return results
 
 
@@ -1554,20 +1601,20 @@ def evaluation_cache_key(
     evaluation_config: dict | None = None,
 ) -> str:
     """Derive a stable cache key for an evaluation that includes both task and evaluation configuration.
-
+
     This ensures that changing metrics or evaluation settings will invalidate the cache
     and trigger re-evaluation, even if the generation is cached.
-
+
     Args:
         task: Generation task
         evaluation_config: Dictionary with evaluation configuration:
            - metrics: List of metric names/types
            - extractor: Extractor type/configuration
            - Any other evaluation settings
-
+
     Returns:
         Cache key string that includes both task and evaluation config
-
+
     Example:
         >>> config = {
        ...     "metrics": ["exact_match", "f1_score"],
@@ -1576,15 +1623,15 @@ def evaluation_cache_key(
        >>> key = evaluation_cache_key(task, config)
    """
    task_key = task_cache_key(task)
-
+
    if not evaluation_config:
        # No config provided, use task key only (for backward compatibility)
        return task_key
-
+
    # Create deterministic hash of evaluation configuration
    config_str = json.dumps(evaluation_config, sort_keys=True)
    config_hash = hashlib.sha256(config_str.encode("utf-8")).hexdigest()[:12]
-
+
    return f"{task_key}::eval:{config_hash}"
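
Note: the storage.py changes above revolve around one concurrency recipe for the SQLite metadata store: a process-wide threading.RLock serialises writers, every connection is opened in WAL journal mode with a 30-second busy timeout, and writes are retried with exponential backoff when SQLite reports the database as locked. A minimal standalone sketch of that recipe follows; the helper names, schema, and parameters here are illustrative, not themis-eval's actual API.

import sqlite3
import threading
import time

_write_lock = threading.RLock()  # serialise writers within this process


def connect(db_path: str) -> sqlite3.Connection:
    # timeout= makes connect()/execute() wait on a locked database file;
    # busy_timeout applies the same grace period at the SQLite level.
    conn = sqlite3.connect(db_path, timeout=30.0)
    conn.execute("PRAGMA journal_mode=WAL")  # readers no longer block the writer
    conn.execute("PRAGMA busy_timeout=30000")
    return conn


def write_with_retry(db_path: str, sql: str, params: tuple = ()) -> None:
    """Retry transient 'database is locked' errors with exponential backoff."""
    delay, max_attempts = 0.05, 5
    with _write_lock:
        for attempt in range(max_attempts):
            try:
                with connect(db_path) as conn:
                    conn.execute(sql, params)
                    conn.commit()
                return
            except sqlite3.OperationalError as exc:
                # Only retry lock contention; re-raise real errors immediately.
                if "locked" not in str(exc).lower() or attempt == max_attempts - 1:
                    raise
                time.sleep(delay)
                delay *= 2

With WAL enabled, one writer and any number of readers can proceed in parallel, so the retry loop is mostly a safety net for the rare writer-versus-writer collision that the diff's inline comment attributes to threaded CI runs.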
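
The run-directory locking that the diff reformats follows a second pattern worth noting: a per-run reentry counter lets the same process re-enter its own lock (start_run() followed by append_record()), while fcntl.flock with non-blocking attempts and a deadline provides cross-process exclusion on Unix. A condensed sketch under those assumptions; run_lock and _release are illustrative names, and only the Unix branch is shown (per its docstring, the real module falls back to msvcrt on Windows and to a no-op single-process mode elsewhere).

import contextlib
import fcntl  # Unix-only
import os
import time

_locks: dict[str, tuple[int, int]] = {}  # run_id -> (lock fd, reentry count)


def _release(run_id: str) -> None:
    fd, count = _locks[run_id]
    if count > 1:  # still held further up the call stack
        _locks[run_id] = (fd, count - 1)
        return
    fcntl.flock(fd, fcntl.LOCK_UN)  # last exit releases the OS-level lock
    os.close(fd)
    del _locks[run_id]


@contextlib.contextmanager
def run_lock(run_id: str, lock_path: str, timeout: float = 30.0):
    if run_id in _locks:  # reentrant: bump the counter, no OS call needed
        fd, count = _locks[run_id]
        _locks[run_id] = (fd, count + 1)
    else:
        fd = os.open(lock_path, os.O_CREAT | os.O_RDWR)
        deadline = time.time() + timeout
        while True:  # non-blocking attempts until the deadline passes
            try:
                fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
                break
            except OSError:
                if time.time() > deadline:
                    os.close(fd)
                    raise TimeoutError(f"could not lock run {run_id} after {timeout}s")
                time.sleep(0.1)
        _locks[run_id] = (fd, 1)
    try:
        yield
    finally:
        _release(run_id)

The non-blocking LOCK_NB attempt plus polling is what makes the 30-second TimeoutError in the diff possible (a plain blocking flock would wait indefinitely), and the counter means nested with-blocks on the same run_id compose safely inside one process.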