themis-eval 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +343 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/main.py +427 -57
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/core/entities.py +23 -3
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/pipelines/standard_pipeline.py +68 -8
- themis/experiment/cache_manager.py +8 -3
- themis/experiment/export.py +110 -2
- themis/experiment/orchestrator.py +48 -6
- themis/experiment/storage.py +1313 -110
- themis/integrations/huggingface.py +12 -1
- themis/integrations/wandb.py +13 -1
- themis/interfaces/__init__.py +86 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis_eval-0.2.0.dist-info/METADATA +596 -0
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/RECORD +40 -17
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
- themis_eval-0.1.1.dist-info/METADATA +0 -758
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
themis/experiment/storage.py
CHANGED
@@ -1,57 +1,387 @@
-"""
+"""Robust storage architecture with lifecycle management, atomic operations, and integrity checks.
+
+This is a rewrite of the storage layer to address:
+- Run lifecycle management (in_progress, completed, failed)
+- Atomic write operations
+- File locking for concurrent access
+- Index persistence
+- Experiment-level organization
+- Separate evaluation tracking
+- Data integrity validation
+"""
 
 from __future__ import annotations
 
+import contextlib
+import gzip
 import hashlib
 import json
+import os
+import sqlite3
+import sys
+import tempfile
+from dataclasses import dataclass, field
+import shutil
+from datetime import datetime, timedelta
+from enum import Enum
 from pathlib import Path
-from typing import Dict, Iterable, List
+from typing import Dict, Iterable, List, Literal
+
+# fcntl is Unix-only, use msvcrt on Windows
+if sys.platform == "win32":
+    import msvcrt
+    FCNTL_AVAILABLE = False
+else:
+    try:
+        import fcntl
+        FCNTL_AVAILABLE = True
+    except ImportError:
+        FCNTL_AVAILABLE = False
 
 from themis.core import entities as core_entities
 from themis.core import serialization as core_serialization
 
+STORAGE_FORMAT_VERSION = "2.0.0"
 
-def task_cache_key(task: core_entities.GenerationTask) -> str:
-    """Derive a stable cache key for a generation task."""
 
-
-
-
-
-
-
-
-
-
-
-
+class RunStatus(str, Enum):
+    """Status of a run."""
+
+    IN_PROGRESS = "in_progress"
+    COMPLETED = "completed"
+    FAILED = "failed"
+    CANCELLED = "cancelled"
+
+
+@dataclass
+class RetentionPolicy:
+    """Retention policy for automatic cleanup.
+
+    Attributes:
+        max_runs_per_experiment: Maximum runs to keep per experiment
+        max_age_days: Maximum age in days for runs
+        max_storage_gb: Maximum total storage in GB
+        keep_completed_only: Only keep completed runs
+        keep_latest_n: Always keep N most recent runs
+    """
+
+    max_runs_per_experiment: int | None = None
+    max_age_days: int | None = None
+    max_storage_gb: float | None = None
+    keep_completed_only: bool = True
+    keep_latest_n: int = 5
+
+
+@dataclass
+class StorageConfig:
+    """Configuration for experiment storage behavior.
+
+    Attributes:
+        save_raw_responses: Save full API responses (default: False)
+        save_dataset: Save dataset copy (default: True)
+        compression: Compression format - "gzip" | "none" (default: "gzip")
+        deduplicate_templates: Store templates once (default: True)
+        enable_checksums: Add integrity checksums (default: True)
+        use_sqlite_metadata: Use SQLite for metadata (default: True)
+        checkpoint_interval: Save checkpoint every N records (default: 100)
+        retention_policy: Automatic cleanup policy (default: None)
+    """
+
+    save_raw_responses: bool = False
+    save_dataset: bool = True
+    compression: Literal["none", "gzip"] = "gzip"
+    deduplicate_templates: bool = True
+    enable_checksums: bool = True
+    use_sqlite_metadata: bool = True
+    checkpoint_interval: int = 100
+    retention_policy: RetentionPolicy | None = None
+
+
+@dataclass
+class RunMetadata:
+    """Metadata for a run."""
+
+    run_id: str
+    experiment_id: str
+    status: RunStatus
+    created_at: str
+    updated_at: str
+    completed_at: str | None = None
+    total_samples: int = 0
+    successful_generations: int = 0
+    failed_generations: int = 0
+    config_snapshot: dict = field(default_factory=dict)
+    error_message: str | None = None
+
+
+@dataclass
+class EvaluationMetadata:
+    """Metadata for an evaluation run."""
+
+    eval_id: str
+    run_id: str
+    eval_name: str
+    created_at: str
+    metrics_config: dict = field(default_factory=dict)
+    total_evaluated: int = 0
+    total_failures: int = 0
+
+
+class DataIntegrityError(Exception):
+    """Raised when data integrity check fails."""
+
+    pass
+
+
+class ConcurrentAccessError(Exception):
+    """Raised when concurrent access conflict detected."""
+
+    pass
 
 
 class ExperimentStorage:
-    """
+    """Robust storage with lifecycle management, locking, and integrity checks.
 
-
+    Features:
+    - Atomic write operations
+    - File locking for concurrent access
+    - Run lifecycle tracking (in_progress, completed, failed)
+    - Experiment-level organization
+    - Separate evaluation tracking
+    - Persistent indexes
+    - Data integrity validation
+    - SQLite metadata database
+
+    Example:
+        >>> config = StorageConfig()
+        >>> storage = ExperimentStorage("outputs/experiments", config=config)
+        >>>
+        >>> # Start a run
+        >>> metadata = storage.start_run("run-1", "experiment-1", config={})
+        >>>
+        >>> # Append records with locking
+        >>> storage.append_record("run-1", record)
+        >>>
+        >>> # Complete the run
+        >>> storage.complete_run("run-1")
+    """
+
+    def __init__(
+        self, root: str | Path, config: StorageConfig | None = None
+    ) -> None:
         self._root = Path(root)
         self._root.mkdir(parents=True, exist_ok=True)
+        self._config = config or StorageConfig()
+
+        # Create experiments directory
+        self._experiments_dir = self._root / "experiments"
+        self._experiments_dir.mkdir(exist_ok=True)
+
+        # Initialize SQLite database
+        if self._config.use_sqlite_metadata:
+            self._init_database()
+
+        # In-memory caches
         self._task_index: dict[str, set[str]] = {}
+        self._template_index: dict[str, dict[str, str]] = {}
+        self._locks: dict[str, int] = {}  # fd for lock files
 
-    def
-
-
-
-        for row in dataset:
-            handle.write(json.dumps(row) + "\n")
+    def _init_database(self):
+        """Initialize SQLite metadata database."""
+        db_path = self._root / "experiments.db"
+        conn = sqlite3.connect(db_path)
 
-
-
-
-
-
-
-
-
-
+        conn.execute("""
+            CREATE TABLE IF NOT EXISTS experiments (
+                experiment_id TEXT PRIMARY KEY,
+                name TEXT NOT NULL,
+                description TEXT,
+                created_at TEXT NOT NULL,
+                updated_at TEXT NOT NULL,
+                config TEXT,
+                tags TEXT
+            )
+        """)
+
+        conn.execute("""
+            CREATE TABLE IF NOT EXISTS runs (
+                run_id TEXT PRIMARY KEY,
+                experiment_id TEXT NOT NULL,
+                status TEXT NOT NULL,
+                created_at TEXT NOT NULL,
+                updated_at TEXT NOT NULL,
+                completed_at TEXT,
+                total_samples INTEGER DEFAULT 0,
+                successful_generations INTEGER DEFAULT 0,
+                failed_generations INTEGER DEFAULT 0,
+                config_snapshot TEXT,
+                error_message TEXT,
+                FOREIGN KEY (experiment_id) REFERENCES experiments(experiment_id)
+            )
+        """)
+
+        conn.execute("""
+            CREATE TABLE IF NOT EXISTS evaluations (
+                eval_id TEXT PRIMARY KEY,
+                run_id TEXT NOT NULL,
+                eval_name TEXT NOT NULL,
+                created_at TEXT NOT NULL,
+                metrics_config TEXT,
+                total_evaluated INTEGER DEFAULT 0,
+                total_failures INTEGER DEFAULT 0,
+                FOREIGN KEY (run_id) REFERENCES runs(run_id)
+            )
+        """)
+
+        conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_runs_experiment
+            ON runs(experiment_id)
+        """)
+
+        conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_runs_status
+            ON runs(status)
+        """)
+
+        conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_evaluations_run
+            ON evaluations(run_id)
+        """)
+
+        conn.commit()
+        conn.close()
+
+    @contextlib.contextmanager
+    def _acquire_lock(self, run_id: str):
+        """Acquire exclusive lock for run directory."""
+        lock_path = self._get_run_dir(run_id) / ".lock"
+        lock_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Open lock file
+        lock_fd = os.open(lock_path, os.O_CREAT | os.O_RDWR)
+
+        try:
+            # Acquire exclusive lock (blocking)
+            if sys.platform == "win32":
+                # Windows file locking
+                msvcrt.locking(lock_fd, msvcrt.LK_LOCK, 1)
+            elif FCNTL_AVAILABLE:
+                # Unix file locking
+                fcntl.flock(lock_fd, fcntl.LOCK_EX)
+            # If neither available, proceed without locking (single-process only)
+
+            self._locks[run_id] = lock_fd
+            yield
+        finally:
+            # Release lock
+            if sys.platform == "win32":
+                msvcrt.locking(lock_fd, msvcrt.LK_UNLCK, 1)
+            elif FCNTL_AVAILABLE:
+                fcntl.flock(lock_fd, fcntl.LOCK_UN)
+
+            os.close(lock_fd)
+            self._locks.pop(run_id, None)
+
+    def start_run(
+        self,
+        run_id: str,
+        experiment_id: str,
+        config: dict | None = None,
+    ) -> RunMetadata:
+        """Start a new run with in_progress status.
+
+        Args:
+            run_id: Unique run identifier
+            experiment_id: Experiment this run belongs to
+            config: Configuration snapshot for this run
+
+        Returns:
+            RunMetadata with in_progress status
+
+        Raises:
+            ValueError: If run already exists
+        """
+        with self._acquire_lock(run_id):
+            # Check if run already exists
+            if self._run_metadata_exists(run_id):
+                raise ValueError(f"Run {run_id} already exists")
+
+            # Create run directory
+            run_dir = self._get_run_dir(run_id)
+            run_dir.mkdir(parents=True, exist_ok=True)
+
+            # Create metadata
+            metadata = RunMetadata(
+                run_id=run_id,
+                experiment_id=experiment_id,
+                status=RunStatus.IN_PROGRESS,
+                created_at=datetime.now().isoformat(),
+                updated_at=datetime.now().isoformat(),
+                config_snapshot=config or {},
+            )
+
+            # Save metadata
+            self._save_run_metadata(metadata)
+
+            return metadata
+
+    def complete_run(self, run_id: str):
+        """Mark run as completed.
+
+        Args:
+            run_id: Run identifier
+
+        Raises:
+            ValueError: If run doesn't exist
+        """
+        with self._acquire_lock(run_id):
+            metadata = self._load_run_metadata(run_id)
+            metadata.status = RunStatus.COMPLETED
+            metadata.completed_at = datetime.now().isoformat()
+            metadata.updated_at = datetime.now().isoformat()
+            self._save_run_metadata(metadata)
+
+    def fail_run(self, run_id: str, error_message: str):
+        """Mark run as failed with error message.
+
+        Args:
+            run_id: Run identifier
+            error_message: Error description
+        """
+        with self._acquire_lock(run_id):
+            metadata = self._load_run_metadata(run_id)
+            metadata.status = RunStatus.FAILED
+            metadata.error_message = error_message
+            metadata.updated_at = datetime.now().isoformat()
+            self._save_run_metadata(metadata)
+
+    def update_run_progress(
+        self,
+        run_id: str,
+        total_samples: int | None = None,
+        successful_generations: int | None = None,
+        failed_generations: int | None = None,
+    ):
+        """Update run progress counters.
+
+        Args:
+            run_id: Run identifier
+            total_samples: Total samples (if provided)
+            successful_generations: Successful count (if provided)
+            failed_generations: Failed count (if provided)
+        """
+        with self._acquire_lock(run_id):
+            metadata = self._load_run_metadata(run_id)
+
+            if total_samples is not None:
+                metadata.total_samples = total_samples
+            if successful_generations is not None:
+                metadata.successful_generations = successful_generations
+            if failed_generations is not None:
+                metadata.failed_generations = failed_generations
+
+            metadata.updated_at = datetime.now().isoformat()
+            self._save_run_metadata(metadata)
 
     def append_record(
         self,
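
A minimal usage sketch of the lifecycle API introduced above, assuming themis-eval 0.2.0 is installed. The config payload and the try/except shape are illustrative; records would come from the generation pipeline via append_record (shown in the next hunk):

    from themis.experiment.storage import (
        ExperimentStorage,
        RetentionPolicy,
        StorageConfig,
    )

    config = StorageConfig(
        compression="gzip",          # records/tasks land as *.jsonl.gz
        checkpoint_interval=100,     # auto-checkpoint every 100 records
        retention_policy=RetentionPolicy(max_age_days=30, keep_latest_n=5),
    )
    storage = ExperimentStorage("outputs/experiments", config=config)

    metadata = storage.start_run("run-1", "experiment-1", config={"model": "demo"})
    try:
        # storage.append_record("run-1", record)  # one GenerationRecord at a time
        storage.complete_run("run-1")
    except Exception as exc:
        storage.fail_run("run-1", error_message=str(exc))
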
@@ -60,31 +390,371 @@ class ExperimentStorage:
         *,
         cache_key: str | None = None,
     ) -> None:
-
-
-
-
-
-
+        """Append record with atomic write and locking.
+
+        Args:
+            run_id: Run identifier
+            record: Generation record to append
+            cache_key: Optional cache key (generated if not provided)
+        """
+        with self._acquire_lock(run_id):
+            # Ensure generation directory exists
+            gen_dir = self._get_generation_dir(run_id)
+            gen_dir.mkdir(parents=True, exist_ok=True)
+
+            path = gen_dir / "records.jsonl"
+
+            # Initialize file with header if needed
+            if not self._file_exists_any_compression(path):
+                self._write_jsonl_with_header(path, [], file_type="records")
+
+            # Serialize record
+            payload = self._serialize_record(run_id, record)
+            payload["cache_key"] = cache_key or self._task_cache_key(record.task)
+
+            # Atomic append
+            self._atomic_append(path, payload)
+
+            # Update progress
+            metadata = self._load_run_metadata(run_id)
+            new_successful = metadata.successful_generations + (1 if record.output else 0)
+            new_failed = metadata.failed_generations + (1 if record.error else 0)
+
+            self.update_run_progress(
+                run_id,
+                total_samples=metadata.total_samples + 1,
+                successful_generations=new_successful,
+                failed_generations=new_failed,
+            )
+
+            # Auto-checkpoint if configured
+            if self._config.checkpoint_interval > 0:
+                total = new_successful + new_failed
+                if total % self._config.checkpoint_interval == 0:
+                    checkpoint_data = {
+                        "total_samples": total,
+                        "successful": new_successful,
+                        "failed": new_failed,
+                        "timestamp": datetime.now().isoformat(),
+                    }
+                    self.save_checkpoint(run_id, checkpoint_data)
+
+    def _atomic_append(self, path: Path, data: dict):
+        """Append data atomically using temp file.
+
+        Args:
+            path: Target file path
+            data: Data to append (will be JSON serialized)
+        """
+        json_line = json.dumps(data) + "\n"
+
+        # Write to temp file
+        temp_fd, temp_path = tempfile.mkstemp(
+            dir=path.parent, prefix=".tmp_", suffix=".json"
+        )
+        temp_path = Path(temp_path)
+
+        try:
+            if self._config.compression == "gzip":
+                with gzip.open(temp_path, "wt", encoding="utf-8") as f:
+                    f.write(json_line)
+                    f.flush()
+                    os.fsync(f.fileno())
+            else:
+                with open(temp_fd, "w", encoding="utf-8") as f:
+                    f.write(json_line)
+                    f.flush()
+                    os.fsync(f.fileno())
+            os.close(temp_fd)
+
+            # Get target path with compression
+            target_path = (
+                path.with_suffix(path.suffix + ".gz")
+                if self._config.compression == "gzip"
+                else path
+            )
+
+            # Append to existing file
+            if target_path.exists():
+                with open(target_path, "ab") as dest:
+                    with open(temp_path, "rb") as src:
+                        dest.write(src.read())
+                    dest.flush()
+                    os.fsync(dest.fileno())
+            else:
+                # No existing file, just rename
+                temp_path.rename(target_path)
+                return
+
+        finally:
+            # Clean up temp file if still exists
+            if temp_path.exists():
+                temp_path.unlink()
+
+    def _save_run_metadata(self, metadata: RunMetadata):
+        """Save run metadata to both JSON and SQLite.
+
+        Args:
+            metadata: Run metadata to save
+        """
+        # Save to JSON file
+        metadata_path = self._get_run_dir(metadata.run_id) / "metadata.json"
+        metadata_dict = {
+            "run_id": metadata.run_id,
+            "experiment_id": metadata.experiment_id,
+            "status": metadata.status.value,
+            "created_at": metadata.created_at,
+            "updated_at": metadata.updated_at,
+            "completed_at": metadata.completed_at,
+            "total_samples": metadata.total_samples,
+            "successful_generations": metadata.successful_generations,
+            "failed_generations": metadata.failed_generations,
+            "config_snapshot": metadata.config_snapshot,
+            "error_message": metadata.error_message,
+        }
+        metadata_path.write_text(json.dumps(metadata_dict, indent=2))
+
+        # Save to SQLite
+        if self._config.use_sqlite_metadata:
+            self._save_run_metadata_to_db(metadata)
+
+    def _save_run_metadata_to_db(self, metadata: RunMetadata):
+        """Save run metadata to SQLite database."""
+        db_path = self._root / "experiments.db"
+        conn = sqlite3.connect(db_path)
+
+        # Ensure experiment exists
+        conn.execute(
+            """
+            INSERT OR IGNORE INTO experiments (experiment_id, name, created_at, updated_at)
+            VALUES (?, ?, ?, ?)
+            """,
+            (
+                metadata.experiment_id,
+                metadata.experiment_id,
+                metadata.created_at,
+                metadata.updated_at,
+            ),
+        )
+
+        # Upsert run
+        conn.execute(
+            """
+            INSERT OR REPLACE INTO runs (
+                run_id, experiment_id, status, created_at, updated_at, completed_at,
+                total_samples, successful_generations, failed_generations,
+                config_snapshot, error_message
+            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+            """,
+            (
+                metadata.run_id,
+                metadata.experiment_id,
+                metadata.status.value,
+                metadata.created_at,
+                metadata.updated_at,
+                metadata.completed_at,
+                metadata.total_samples,
+                metadata.successful_generations,
+                metadata.failed_generations,
+                json.dumps(metadata.config_snapshot),
+                metadata.error_message,
+            ),
+        )
+
+        conn.commit()
+        conn.close()
+
+    def _load_run_metadata(self, run_id: str) -> RunMetadata:
+        """Load run metadata from JSON file.
+
+        Args:
+            run_id: Run identifier
+
+        Returns:
+            RunMetadata
+
+        Raises:
+            FileNotFoundError: If metadata doesn't exist
+        """
+        metadata_path = self._get_run_dir(run_id) / "metadata.json"
+        if not metadata_path.exists():
+            raise FileNotFoundError(f"Run metadata not found for {run_id}")
+
+        data = json.loads(metadata_path.read_text())
+        return RunMetadata(
+            run_id=data["run_id"],
+            experiment_id=data["experiment_id"],
+            status=RunStatus(data["status"]),
+            created_at=data["created_at"],
+            updated_at=data["updated_at"],
+            completed_at=data.get("completed_at"),
+            total_samples=data.get("total_samples", 0),
+            successful_generations=data.get("successful_generations", 0),
+            failed_generations=data.get("failed_generations", 0),
+            config_snapshot=data.get("config_snapshot", {}),
+            error_message=data.get("error_message"),
+        )
+
+    def _run_metadata_exists(self, run_id: str) -> bool:
+        """Check if run metadata exists."""
+        metadata_path = self._get_run_dir(run_id) / "metadata.json"
+        return metadata_path.exists()
+
+    def _get_run_dir(self, run_id: str) -> Path:
+        """Get run directory path.
+
+        Uses hierarchical structure: experiments/<experiment_id>/runs/<run_id>/
+        Falls back to experiments/default/runs/<run_id>/ if experiment_id unknown.
+        """
+        # Check if we already have metadata
+        for exp_dir in self._experiments_dir.iterdir():
+            if not exp_dir.is_dir():
+                continue
+            runs_dir = exp_dir / "runs"
+            if not runs_dir.exists():
+                continue
+            candidate_path = runs_dir / run_id / "metadata.json"
+            if candidate_path.exists():
+                return runs_dir / run_id
+
+        # Default location for new runs
+        return self._experiments_dir / "default" / "runs" / run_id
+
+    def _get_generation_dir(self, run_id: str) -> Path:
+        """Get generation data directory."""
+        return self._get_run_dir(run_id) / "generation"
+
+    def _get_evaluation_dir(self, run_id: str, eval_id: str = "default") -> Path:
+        """Get evaluation directory."""
+        return self._get_run_dir(run_id) / "evaluations" / eval_id
+
+    def _file_exists_any_compression(self, path: Path) -> bool:
+        """Check if file exists with any compression suffix."""
+        return path.exists() or path.with_suffix(path.suffix + ".gz").exists()
+
+    def _open_for_read(self, path: Path):
+        """Open file for reading with automatic compression detection.
+
+        Args:
+            path: File path
+
+        Returns:
+            File handle (text mode)
+        """
+        # Try .gz version first
+        gz_path = path.with_suffix(path.suffix + ".gz")
+        if gz_path.exists():
+            return gzip.open(gz_path, "rt", encoding="utf-8")
+        if path.exists():
+            return path.open("r", encoding="utf-8")
+        raise FileNotFoundError(f"File not found: {path}")
+
+    def _write_jsonl_with_header(
+        self, path: Path, items: Iterable[dict], file_type: str
+    ):
+        """Write JSONL file with format version header."""
+        # Determine actual path based on compression
+        if self._config.compression == "gzip":
+            actual_path = path.with_suffix(path.suffix + ".gz")
+            handle = gzip.open(actual_path, "wt", encoding="utf-8")
+        else:
+            actual_path = path
+            handle = open(actual_path, "w", encoding="utf-8")
+
+        with handle:
+            # Write header
+            header = {
+                "_type": "header",
+                "_format_version": STORAGE_FORMAT_VERSION,
+                "_file_type": file_type,
+            }
+            handle.write(json.dumps(header) + "\n")
+
+            # Write items
+            for item in items:
+                handle.write(json.dumps(item) + "\n")
+
+            handle.flush()
+            if hasattr(handle, "fileno"):
+                os.fsync(handle.fileno())
+
+    def cache_dataset(self, run_id: str, dataset: Iterable[dict[str, object]]) -> None:
+        """Cache dataset samples to storage.
+
+        Args:
+            run_id: Unique run identifier
+            dataset: Iterable of dataset samples
+        """
+        if not self._config.save_dataset:
+            return
+
+        with self._acquire_lock(run_id):
+            gen_dir = self._get_generation_dir(run_id)
+            gen_dir.mkdir(parents=True, exist_ok=True)
+            path = gen_dir / "dataset.jsonl"
+
+            self._write_jsonl_with_header(path, dataset, file_type="dataset")
+
+    def load_dataset(self, run_id: str) -> List[dict[str, object]]:
+        """Load cached dataset.
+
+        Args:
+            run_id: Run identifier
+
+        Returns:
+            List of dataset samples
+        """
+        gen_dir = self._get_generation_dir(run_id)
+        path = gen_dir / "dataset.jsonl"
+
+        rows: list[dict[str, object]] = []
+        with self._open_for_read(path) as handle:
+            for line in handle:
+                if not line.strip():
+                    continue
+                data = json.loads(line)
+                if data.get("_type") == "header":
+                    continue
+                rows.append(data)
+        return rows
 
     def load_cached_records(
         self, run_id: str
     ) -> Dict[str, core_entities.GenerationRecord]:
-
-
+        """Load cached generation records.
+
+        Args:
+            run_id: Run identifier
+
+        Returns:
+            Dict mapping cache_key to GenerationRecord
+        """
+        gen_dir = self._get_generation_dir(run_id)
+        path = gen_dir / "records.jsonl"
+
+        try:
+            handle = self._open_for_read(path)
+        except FileNotFoundError:
             return {}
+
         tasks = self._load_tasks(run_id)
         records: dict[str, core_entities.GenerationRecord] = {}
-
+
+        with handle:
             for line in handle:
                 if not line.strip():
                     continue
                 data = json.loads(line)
+                if data.get("_type") == "header":
+                    continue
+
                 key = data.get("cache_key")
                 if not key:
                     continue
+
                 record = self._deserialize_record(data, tasks)
                 records[key] = record
+
         return records
 
     def append_evaluation(
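
Note on the append path above: _atomic_append compresses each record into a standalone temp file, fsyncs it, and then byte-appends it to the existing records.jsonl.gz. This is safe because concatenated gzip members form a valid multi-member archive that Python's gzip module reads back as one continuous stream. A self-contained sketch of that property (independent of themis):

    import gzip
    import io
    import json

    buf = io.BytesIO()
    for i in range(2):
        member = io.BytesIO()
        with gzip.open(member, "wt", encoding="utf-8") as f:
            f.write(json.dumps({"record": i}) + "\n")
        buf.write(member.getvalue())  # byte-append a complete gzip member

    buf.seek(0)
    with gzip.open(buf, "rt", encoding="utf-8") as f:
        print(f.read())  # both JSON lines decompress, in order
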
@@ -92,34 +762,80 @@ class ExperimentStorage:
         run_id: str,
         record: core_entities.GenerationRecord,
         evaluation: core_entities.EvaluationRecord,
+        *,
+        eval_id: str = "default",
+        evaluation_config: dict | None = None,
     ) -> None:
-
-
-
-
-
-
-
-
+        """Append evaluation result.
+
+        Args:
+            run_id: Run identifier
+            record: Generation record being evaluated
+            evaluation: Evaluation record
+            eval_id: Evaluation identifier (default: "default")
+            evaluation_config: Evaluation configuration (metrics, extractor) for cache invalidation
+        """
+        with self._acquire_lock(run_id):
+            eval_dir = self._get_evaluation_dir(run_id, eval_id)
+            eval_dir.mkdir(parents=True, exist_ok=True)
+
+            path = eval_dir / "evaluation.jsonl"
+
+            if not self._file_exists_any_compression(path):
+                self._write_jsonl_with_header(path, [], file_type="evaluation")
+
+            # Use evaluation_cache_key that includes evaluation config
+            cache_key = evaluation_cache_key(record.task, evaluation_config)
+
+            payload = {
+                "cache_key": cache_key,
+                "evaluation": core_serialization.serialize_evaluation_record(evaluation),
+            }
+            self._atomic_append(path, payload)
 
     def load_cached_evaluations(
-        self, run_id: str
+        self, run_id: str, eval_id: str = "default", evaluation_config: dict | None = None
     ) -> Dict[str, core_entities.EvaluationRecord]:
-
-
+        """Load cached evaluation records.
+
+        Args:
+            run_id: Run identifier
+            eval_id: Evaluation identifier
+            evaluation_config: Evaluation configuration for cache key matching
+
+        Returns:
+            Dict mapping cache_key to EvaluationRecord
+
+        Note:
+            If evaluation_config is provided, only evaluations matching that config
+            will be loaded. This ensures that changing metrics invalidates the cache.
+        """
+        eval_dir = self._get_evaluation_dir(run_id, eval_id)
+        path = eval_dir / "evaluation.jsonl"
+
+        try:
+            handle = self._open_for_read(path)
+        except FileNotFoundError:
             return {}
+
         evaluations: dict[str, core_entities.EvaluationRecord] = {}
-
+
+        with handle:
             for line in handle:
                 if not line.strip():
                     continue
                 data = json.loads(line)
+                if data.get("_type") == "header":
+                    continue
+
                 key = data.get("cache_key")
                 if not key:
                     continue
+
                 evaluations[key] = core_serialization.deserialize_evaluation_record(
                     data["evaluation"]
                 )
+
         return evaluations
 
     def get_run_path(self, run_id: str) -> Path:
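
Evaluations now live under evaluations/<eval_id>/ per run and are keyed by both the task and the evaluation configuration. Continuing the lifecycle sketch from earlier, a hedged example of the cache round-trip; the record and evaluation objects are assumed to come from a prior run:

    eval_config = {
        "metrics": ["exact_match", "f1_score"],
        "extractor": "json_field_extractor:answer",
    }

    # storage.append_evaluation(
    #     "run-1", record, evaluation,
    #     eval_id="default", evaluation_config=eval_config,
    # )

    cached = storage.load_cached_evaluations(
        "run-1", eval_id="default", evaluation_config=eval_config
    )
    # Re-running with the same eval_config reuses these entries; changing the
    # metrics list changes the cache key and forces re-evaluation (see
    # evaluation_cache_key at the end of this file).
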
@@ -131,35 +847,24 @@ class ExperimentStorage:
         Returns:
             Path to the run's storage directory
         """
-        return self.
-
-    def _dataset_path(self, run_id: str) -> Path:
-        return self._run_dir(run_id) / "dataset.jsonl"
-
-    def _records_path(self, run_id: str) -> Path:
-        return self._run_dir(run_id) / "records.jsonl"
-
-    def _tasks_path(self, run_id: str) -> Path:
-        return self._run_dir(run_id) / "tasks.jsonl"
-
-    def _evaluation_path(self, run_id: str) -> Path:
-        return self._run_dir(run_id) / "evaluation.jsonl"
-
-    def _run_dir(self, run_id: str) -> Path:
-        return self._root / run_id
+        return self._get_run_dir(run_id)
 
     def _serialize_record(
         self, run_id: str, record: core_entities.GenerationRecord
-    ) -> dict
+    ) -> dict:
+        """Serialize generation record."""
         task_key = self._persist_task(run_id, record.task)
-
+
+        # Prepare output data
+        output_data = None
+        if record.output:
+            output_data = {"text": record.output.text}
+            if self._config.save_raw_responses:
+                output_data["raw"] = record.output.raw
+
+        return {
             "task_key": task_key,
-            "output":
-                "text": record.output.text,
-                "raw": record.output.raw,
-            }
-            if record.output
-            else None,
+            "output": output_data,
             "error": {
                 "message": record.error.message,
                 "kind": record.error.kind,
@@ -172,19 +877,21 @@ class ExperimentStorage:
                 self._serialize_record(run_id, attempt) for attempt in record.attempts
             ],
         }
-        return payload
 
     def _deserialize_record(
-        self, payload: dict
+        self, payload: dict, tasks: dict[str, core_entities.GenerationTask]
     ) -> core_entities.GenerationRecord:
+        """Deserialize generation record."""
         task_key = payload["task_key"]
         task = tasks[task_key]
        output_data = payload.get("output")
         error_data = payload.get("error")
+
         record = core_entities.GenerationRecord(
             task=task,
             output=core_entities.ModelOutput(
-                text=output_data["text"],
+                text=output_data["text"],
+                raw=output_data.get("raw")
             )
             if output_data
             else None,
@@ -197,59 +904,555 @@ class ExperimentStorage:
             else None,
             metrics=payload.get("metrics", {}),
         )
+
         record.attempts = [
             self._deserialize_record(attempt, tasks)
             for attempt in payload.get("attempts", [])
         ]
+
         return record
 
     def _persist_task(self, run_id: str, task: core_entities.GenerationTask) -> str:
-
+        """Persist task and return cache key."""
+        # Implementation similar to original but with atomic writes
+        # and proper locking (already have lock from append_record)
+        key = self._task_cache_key(task)
         index = self._load_task_index(run_id)
+
         if key in index:
             return key
-
-
-
-
-
-
-
-
+
+        gen_dir = self._get_generation_dir(run_id)
+        gen_dir.mkdir(parents=True, exist_ok=True)
+        path = gen_dir / "tasks.jsonl"
+
+        # Initialize if needed
+        if not self._file_exists_any_compression(path):
+            self._write_jsonl_with_header(path, [], file_type="tasks")
+
+        # Serialize task
+        if self._config.deduplicate_templates:
+            template_id = self._persist_template(run_id, task.prompt.spec)
+            task_data = core_serialization.serialize_generation_task(task)
+            task_data["prompt"]["spec"] = {"_template_ref": template_id}
+        else:
+            task_data = core_serialization.serialize_generation_task(task)
+
+        payload = {"task_key": key, "task": task_data}
+        self._atomic_append(path, payload)
+
         index.add(key)
+        self._save_task_index(run_id, index)
+
         return key
 
+    def _persist_template(
+        self, run_id: str, spec: core_entities.PromptSpec
+    ) -> str:
+        """Persist prompt template."""
+        template_content = f"{spec.name}:{spec.template}"
+        template_id = hashlib.sha256(template_content.encode("utf-8")).hexdigest()[:16]
+
+        if run_id not in self._template_index:
+            self._template_index[run_id] = {}
+            self._load_templates(run_id)
+
+        if template_id in self._template_index[run_id]:
+            return template_id
+
+        gen_dir = self._get_generation_dir(run_id)
+        path = gen_dir / "templates.jsonl"
+
+        if not self._file_exists_any_compression(path):
+            self._write_jsonl_with_header(path, [], file_type="templates")
+
+        payload = {
+            "template_id": template_id,
+            "spec": core_serialization.serialize_prompt_spec(spec),
+        }
+        self._atomic_append(path, payload)
+
+        self._template_index[run_id][template_id] = spec.template
+        return template_id
+
+    def _load_task_index(self, run_id: str) -> set[str]:
+        """Load task index from disk cache or rebuild."""
+        if run_id in self._task_index:
+            return self._task_index[run_id]
+
+        # Try to load from persisted index
+        index_path = self._get_run_dir(run_id) / ".index.json"
+        if index_path.exists():
+            index_data = json.loads(index_path.read_text())
+            self._task_index[run_id] = set(index_data.get("task_keys", []))
+            return self._task_index[run_id]
+
+        # Rebuild from tasks file
+        self._task_index[run_id] = set()
+        return self._task_index[run_id]
+
+    def _save_task_index(self, run_id: str, index: set[str]):
+        """Save task index to disk."""
+        index_path = self._get_run_dir(run_id) / ".index.json"
+        index_data = {
+            "task_keys": list(index),
+            "template_ids": self._template_index.get(run_id, {}),
+            "last_updated": datetime.now().isoformat(),
+        }
+        index_path.write_text(json.dumps(index_data))
+
+    def _load_templates(self, run_id: str) -> dict[str, core_entities.PromptSpec]:
+        """Load templates from disk.
+
+        Args:
+            run_id: Run identifier
+
+        Returns:
+            Dict mapping template_id to PromptSpec
+        """
+        gen_dir = self._get_generation_dir(run_id)
+        path = gen_dir / "templates.jsonl"
+
+        templates: dict[str, core_entities.PromptSpec] = {}
+        try:
+            handle = self._open_for_read(path)
+        except FileNotFoundError:
+            return templates
+
+        with handle:
+            for line in handle:
+                if not line.strip():
+                    continue
+                data = json.loads(line)
+                if data.get("_type") == "header":
+                    continue
+
+                template_id = data["template_id"]
+                templates[template_id] = core_serialization.deserialize_prompt_spec(
+                    data["spec"]
+                )
+
+        return templates
+
     def _load_tasks(self, run_id: str) -> dict[str, core_entities.GenerationTask]:
-
+        """Load tasks from disk.
+
+        Args:
+            run_id: Run identifier
+
+        Returns:
+            Dict mapping task_key to GenerationTask
+        """
+        gen_dir = self._get_generation_dir(run_id)
+        path = gen_dir / "tasks.jsonl"
+
         tasks: dict[str, core_entities.GenerationTask] = {}
-
+        try:
+            handle = self._open_for_read(path)
+        except FileNotFoundError:
             return tasks
-
+
+        # Load templates if deduplication enabled
+        templates = self._load_templates(run_id) if self._config.deduplicate_templates else {}
+
+        with handle:
             for line in handle:
                 if not line.strip():
                     continue
                 data = json.loads(line)
+                if data.get("_type") == "header":
+                    continue
+
                 task_key = data["task_key"]
-
-
-
+                task_data = data["task"]
+
+                # Restore template from reference if needed
+                if (
+                    self._config.deduplicate_templates
+                    and "_template_ref" in task_data.get("prompt", {}).get("spec", {})
+                ):
+                    template_id = task_data["prompt"]["spec"]["_template_ref"]
+                    if template_id in templates:
+                        task_data["prompt"]["spec"] = core_serialization.serialize_prompt_spec(
+                            templates[template_id]
+                        )
+
+                tasks[task_key] = core_serialization.deserialize_generation_task(task_data)
+
         self._task_index[run_id] = set(tasks.keys())
         return tasks
 
-    def
-
-
-
-
-
-
-
-
+    def _task_cache_key(self, task: core_entities.GenerationTask) -> str:
+        """Generate cache key for task."""
+        dataset_raw = task.metadata.get("dataset_id") or task.metadata.get("sample_id")
+        dataset_id = str(dataset_raw) if dataset_raw is not None else ""
+        prompt_hash = hashlib.sha256(task.prompt.text.encode("utf-8")).hexdigest()[:12]
+        sampling = task.sampling
+        sampling_key = (
+            f"{sampling.temperature:.3f}-{sampling.top_p:.3f}-{sampling.max_tokens}"
+        )
+        template = task.prompt.spec.name
+        model = task.model.identifier
+        return "::".join(
+            filter(None, [dataset_id, template, model, sampling_key, prompt_hash])
+        )
+
+    # ===== Phase 3 Features =====
+
+    def save_checkpoint(self, run_id: str, checkpoint_data: dict):
+        """Save checkpoint for resumability.
+
+        Args:
+            run_id: Run identifier
+            checkpoint_data: Checkpoint data to save
+        """
+        with self._acquire_lock(run_id):
+            checkpoint_dir = self._get_run_dir(run_id) / "checkpoints"
+            checkpoint_dir.mkdir(exist_ok=True)
+
+            # Use timestamp for checkpoint filename
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            checkpoint_path = checkpoint_dir / f"checkpoint_{timestamp}.json"
+
+            checkpoint_path.write_text(json.dumps(checkpoint_data, indent=2))
+
+    def load_latest_checkpoint(self, run_id: str) -> dict | None:
+        """Load most recent checkpoint.
+
+        Args:
+            run_id: Run identifier
+
+        Returns:
+            Checkpoint data or None if no checkpoints exist
+        """
+        checkpoint_dir = self._get_run_dir(run_id) / "checkpoints"
+        if not checkpoint_dir.exists():
+            return None
+
+        # Find latest checkpoint
+        checkpoints = sorted(checkpoint_dir.glob("checkpoint_*.json"), reverse=True)
+        if not checkpoints:
+            return None
+
+        return json.loads(checkpoints[0].read_text())
+
+    def apply_retention_policy(self, policy: RetentionPolicy | None = None):
+        """Apply retention policy to clean up old runs.
+
+        Args:
+            policy: Retention policy (uses config if not provided)
+        """
+        policy = policy or self._config.retention_policy
+        if not policy:
+            return
+
+        # Get all experiments
+        for exp_dir in self._experiments_dir.iterdir():
+            if not exp_dir.is_dir():
+                continue
+
+            runs_dir = exp_dir / "runs"
+            if not runs_dir.exists():
+                continue
+
+            # Load all run metadata
+            runs = []
+            for run_dir in runs_dir.iterdir():
+                if not run_dir.is_dir():
+                    continue
+                metadata_path = run_dir / "metadata.json"
+                if not metadata_path.exists():
+                    continue
+
+                try:
+                    metadata = self._load_run_metadata(run_dir.name)
+                    runs.append((run_dir, metadata))
+                except Exception:
+                    continue
+
+            # Sort by creation time (newest first)
+            runs.sort(key=lambda x: x[1].created_at, reverse=True)
+
+            # Apply policies
+            runs_to_delete = []
+
+            for i, (run_dir, metadata) in enumerate(runs):
+                # Always keep latest N runs
+                if i < policy.keep_latest_n:
+                    continue
+
+                # Check if should keep based on status
+                if policy.keep_completed_only and metadata.status != RunStatus.COMPLETED:
+                    runs_to_delete.append(run_dir)
+                    continue
+
+                # Check age policy
+                if policy.max_age_days:
+                    created = datetime.fromisoformat(metadata.created_at)
+                    age = datetime.now() - created
+                    if age > timedelta(days=policy.max_age_days):
+                        runs_to_delete.append(run_dir)
+                        continue
+
+                # Check max runs policy
+                if policy.max_runs_per_experiment:
+                    if i >= policy.max_runs_per_experiment:
+                        runs_to_delete.append(run_dir)
+
+            # Delete runs
+            for run_dir in runs_to_delete:
+                self._delete_run_dir(run_dir)
+
+    def _delete_run_dir(self, run_dir: Path):
+        """Delete run directory and update database.
+
+        Args:
+            run_dir: Run directory to delete
+        """
+        run_id = run_dir.name
+
+        # Remove from SQLite
+        if self._config.use_sqlite_metadata:
+            db_path = self._root / "experiments.db"
+            conn = sqlite3.connect(db_path)
+            conn.execute("DELETE FROM runs WHERE run_id = ?", (run_id,))
+            conn.commit()
+            conn.close()
+
+        # Remove directory
+        shutil.rmtree(run_dir, ignore_errors=True)
+
+    def get_storage_size(self, experiment_id: str | None = None) -> int:
+        """Get total storage size in bytes.
+
+        Args:
+            experiment_id: Optional experiment to check (all if None)
+
+        Returns:
+            Total size in bytes
+        """
+        if experiment_id:
+            exp_dir = self._experiments_dir / experiment_id
+            if not exp_dir.exists():
+                return 0
+            return sum(f.stat().st_size for f in exp_dir.rglob("*") if f.is_file())
+        else:
+            return sum(f.stat().st_size for f in self._experiments_dir.rglob("*") if f.is_file())
+
+    def list_runs(
+        self,
+        experiment_id: str | None = None,
+        status: RunStatus | None = None,
+        limit: int | None = None
+    ) -> list[RunMetadata]:
+        """List runs with optional filtering.
+
+        Args:
+            experiment_id: Filter by experiment
+            status: Filter by status
+            limit: Maximum number of runs to return
+
+        Returns:
+            List of run metadata
+        """
+        if not self._config.use_sqlite_metadata:
+            # Fallback to file-based listing
+            return self._list_runs_from_files(experiment_id, status, limit)
+
+        # Query SQLite
+        db_path = self._root / "experiments.db"
+        conn = sqlite3.connect(db_path)
+
+        query = "SELECT * FROM runs WHERE 1=1"
+        params = []
+
+        if experiment_id:
+            query += " AND experiment_id = ?"
+            params.append(experiment_id)
+
+        if status:
+            query += " AND status = ?"
+            params.append(status.value)
+
+        query += " ORDER BY created_at DESC"
+
+        if limit:
+            query += " LIMIT ?"
+            params.append(limit)
+
+        cursor = conn.execute(query, params)
+        rows = cursor.fetchall()
+        conn.close()
+
+        # Convert to RunMetadata
+        runs = []
+        for row in rows:
+            runs.append(RunMetadata(
+                run_id=row[0],
+                experiment_id=row[1],
+                status=RunStatus(row[2]),
+                created_at=row[3],
+                updated_at=row[4],
+                completed_at=row[5],
+                total_samples=row[6] or 0,
+                successful_generations=row[7] or 0,
+                failed_generations=row[8] or 0,
+                config_snapshot=json.loads(row[9]) if row[9] else {},
+                error_message=row[10],
+            ))
+
+        return runs
+
+    def _list_runs_from_files(
+        self,
+        experiment_id: str | None,
+        status: RunStatus | None,
+        limit: int | None
+    ) -> list[RunMetadata]:
+        """List runs by scanning files (fallback)."""
+        runs = []
+
+        # Scan experiment directories
+        exp_dirs = [self._experiments_dir / experiment_id] if experiment_id else list(self._experiments_dir.iterdir())
+
+        for exp_dir in exp_dirs:
+            if not exp_dir.is_dir():
+                continue
+
+            runs_dir = exp_dir / "runs"
+            if not runs_dir.exists():
+                continue
+
+            for run_dir in runs_dir.iterdir():
+                if not run_dir.is_dir():
+                    continue
+
+                try:
+                    metadata = self._load_run_metadata(run_dir.name)
+                    if status and metadata.status != status:
                         continue
-
-
-
-
+                    runs.append(metadata)
+                except Exception:
+                    continue
+
+        # Sort by creation time
+        runs.sort(key=lambda r: r.created_at, reverse=True)
+
+        if limit:
+            runs = runs[:limit]
+
+        return runs
+
+    def validate_integrity(self, run_id: str) -> dict:
+        """Validate data integrity for a run.
+
+        Args:
+            run_id: Run identifier
+
+        Returns:
+            Dict with validation results
+        """
+        results = {
+            "run_id": run_id,
+            "valid": True,
+            "errors": [],
+            "warnings": [],
+        }
+
+        run_dir = self._get_run_dir(run_id)
+        if not run_dir.exists():
+            results["valid"] = False
+            results["errors"].append(f"Run directory not found: {run_dir}")
+            return results
+
+        # Check metadata
+        metadata_path = run_dir / "metadata.json"
+        if not metadata_path.exists():
+            results["valid"] = False
+            results["errors"].append("Missing metadata.json")
+
+        # Check generation directory
+        gen_dir = run_dir / "generation"
+        if not gen_dir.exists():
+            results["warnings"].append("No generation directory")
+        else:
+            # Check for required files
+            for filename in ["records.jsonl", "tasks.jsonl"]:
+                if not self._file_exists_any_compression(gen_dir / filename):
+                    results["warnings"].append(f"Missing {filename}")
+
+        # Check lock file
+        lock_path = run_dir / ".lock"
+        if not lock_path.exists():
+            results["warnings"].append("No lock file (may not have been used)")
+
+        return results
+
+
+def task_cache_key(task: core_entities.GenerationTask) -> str:
+    """Derive a stable cache key for a generation task (module-level function for backward compatibility)."""
+    dataset_raw = task.metadata.get("dataset_id") or task.metadata.get("sample_id")
+    dataset_id = str(dataset_raw) if dataset_raw is not None else ""
+    prompt_hash = hashlib.sha256(task.prompt.text.encode("utf-8")).hexdigest()[:12]
+    sampling = task.sampling
+    sampling_key = (
+        f"{sampling.temperature:.3f}-{sampling.top_p:.3f}-{sampling.max_tokens}"
+    )
+    template = task.prompt.spec.name
+    model = task.model.identifier
+    return "::".join(
+        filter(None, [dataset_id, template, model, sampling_key, prompt_hash])
+    )
+
+
+def evaluation_cache_key(
+    task: core_entities.GenerationTask,
+    evaluation_config: dict | None = None,
+) -> str:
+    """Derive a stable cache key for an evaluation that includes both task and evaluation configuration.
+
+    This ensures that changing metrics or evaluation settings will invalidate the cache
+    and trigger re-evaluation, even if the generation is cached.
+
+    Args:
+        task: Generation task
+        evaluation_config: Dictionary with evaluation configuration:
+            - metrics: List of metric names/types
+            - extractor: Extractor type/configuration
+            - Any other evaluation settings
+
+    Returns:
+        Cache key string that includes both task and evaluation config
+
+    Example:
+        >>> config = {
+        ...     "metrics": ["exact_match", "f1_score"],
+        ...     "extractor": "json_field_extractor:answer"
+        ... }
+        >>> key = evaluation_cache_key(task, config)
+    """
+    task_key = task_cache_key(task)
+
+    if not evaluation_config:
+        # No config provided, use task key only (for backward compatibility)
+        return task_key
+
+    # Create deterministic hash of evaluation configuration
+    config_str = json.dumps(evaluation_config, sort_keys=True)
+    config_hash = hashlib.sha256(config_str.encode("utf-8")).hexdigest()[:12]
+
+    return f"{task_key}::eval:{config_hash}"
 
 
-__all__ = [
+__all__ = [
+    "ExperimentStorage",
+    "StorageConfig",
+    "RunMetadata",
+    "EvaluationMetadata",
+    "RunStatus",
+    "RetentionPolicy",
+    "DataIntegrityError",
+    "ConcurrentAccessError",
+    "task_cache_key",
+    "evaluation_cache_key",
+]