yanex 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
yanex/core/manager.py ADDED
@@ -0,0 +1,555 @@
1
+ """
2
+ Experiment Manager - Core orchestration component for yanex.
3
+
4
+ Handles experiment lifecycle management, ID generation, and coordinates
5
+ between all core components (git, config, storage, environment).
6
+ """
7
+
8
+ import secrets
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ from typing import Any, Dict, List, Optional
12
+
13
+ from ..utils.exceptions import ExperimentAlreadyRunningError
14
+ from ..utils.validation import validate_experiment_name, validate_tags
15
+ from .environment import capture_full_environment
16
+ from .git_utils import get_current_commit_info, validate_clean_working_directory
17
+ from .storage import ExperimentStorage
18
+
19
+
20
+ class ExperimentManager:
21
+ """Central manager for experiment lifecycle and orchestration."""
22
+
23
def __init__(self, experiments_dir: Optional[Path] = None):
    """Set up the manager and its backing storage.

    Args:
        experiments_dir: Root directory that holds experiment data. When
            omitted, ``~/.yanex/experiments`` is used.
    """
    # The home directory is only looked up when no explicit root is given.
    root = (
        experiments_dir
        if experiments_dir is not None
        else Path.home() / ".yanex" / "experiments"
    )

    self.experiments_dir = root
    self.storage = ExperimentStorage(root)
35
+
36
def generate_experiment_id(self) -> str:
    """Return a fresh 8-character hexadecimal experiment ID.

    Candidates are drawn with ``secrets.token_hex`` and checked against
    storage; the first candidate that storage does not already know is
    returned. At most 10 candidates are tried.

    Returns:
        An 8-character hex string that storage reported as unused.

    Raises:
        RuntimeError: If all 10 candidates collide with existing experiments.
    """
    max_attempts = 10

    # Lazy generator: a new candidate is only drawn once the previous one
    # has been rejected, so storage is queried at most ``max_attempts`` times.
    candidates = (secrets.token_hex(4) for _ in range(max_attempts))
    unused_id = next(
        (
            candidate
            for candidate in candidates
            if not self.storage.experiment_exists(candidate)
        ),
        None,
    )

    if unused_id is None:
        # Every attempt collided with an existing experiment.
        raise RuntimeError(
            f"Failed to generate unique experiment ID after {max_attempts} attempts"
        )

    return unused_id
63
+
64
def get_running_experiment(self) -> Optional[str]:
    """Find an experiment whose stored status is ``"running"``.

    Walks the entries of the experiments directory and returns the first
    sub-directory that storage recognises as an experiment and whose
    metadata reports ``status == "running"``.

    Returns:
        The ID (directory name) of a running experiment, or ``None`` when
        the experiments directory is missing or nothing is running.
    """
    root = self.experiments_dir
    if not root.exists():
        return None

    for entry in root.iterdir():
        candidate_id = entry.name

        # Only directories that storage knows about are considered.
        if not (entry.is_dir() and self.storage.experiment_exists(candidate_id)):
            continue

        try:
            is_running = (
                self.storage.load_metadata(candidate_id).get("status") == "running"
            )
        except Exception:
            # Unreadable metadata: treat the entry as not running.
            continue

        if is_running:
            return candidate_id

    return None
93
+
94
def start_experiment(self, experiment_id: str) -> None:
    """Transition an experiment from 'created' to 'running'.

    Sets ``status`` to ``"running"``, records the start time in
    ``started_at`` as a naive UTC ISO-8601 string, and saves the metadata.

    Args:
        experiment_id: Experiment identifier

    Raises:
        ExperimentNotFoundError: If storage does not know the experiment
        ValueError: If experiment is not in 'created' state
        StorageError: Documented failure mode of the storage layer; it is
            not raised directly by this method.
    """
    from datetime import timezone

    # Verify experiment exists
    if not self.storage.experiment_exists(experiment_id):
        from ..utils.exceptions import ExperimentNotFoundError

        raise ExperimentNotFoundError(experiment_id)

    metadata = self.storage.load_metadata(experiment_id)

    # Only a freshly created experiment may be started; a missing status is
    # reported as 'unknown' in the error message.
    current_status = metadata.get("status", "unknown")
    if current_status != "created":
        raise ValueError(
            f"Cannot start experiment {experiment_id}. "
            f"Expected status 'created', got '{current_status}'"
        )

    # datetime.utcnow() is deprecated since Python 3.12. Dropping tzinfo
    # keeps the stored value in the same naive-UTC format as before.
    now = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    metadata["status"] = "running"
    metadata["started_at"] = now

    self.storage.save_metadata(experiment_id, metadata)
129
+
130
def complete_experiment(self, experiment_id: str) -> None:
    """Mark experiment as completed.

    Sets ``status`` to ``"completed"``, records ``completed_at`` as a naive
    UTC ISO-8601 string, and stores ``duration`` in seconds when a parseable
    ``started_at`` is present (otherwise ``duration`` is ``None``).

    Args:
        experiment_id: Experiment identifier

    Raises:
        ExperimentNotFoundError: If storage does not know the experiment
        StorageError: Documented failure mode of the storage layer; it is
            not raised directly by this method.
    """
    from datetime import timezone

    # Verify experiment exists
    if not self.storage.experiment_exists(experiment_id):
        from ..utils.exceptions import ExperimentNotFoundError

        raise ExperimentNotFoundError(experiment_id)

    metadata = self.storage.load_metadata(experiment_id)

    # datetime.utcnow() is deprecated since Python 3.12. Dropping tzinfo
    # keeps the stored value in the same naive-UTC format as before.
    finished_at = datetime.now(timezone.utc).replace(tzinfo=None)
    metadata["status"] = "completed"
    metadata["completed_at"] = finished_at.isoformat()

    duration = None
    started_at = metadata.get("started_at")
    if started_at:
        try:
            # Use the datetime object directly instead of formatting and
            # re-parsing the end time.
            duration = (
                finished_at - datetime.fromisoformat(started_at)
            ).total_seconds()
        except (ValueError, TypeError):
            # Unparseable or timezone-aware start time: leave duration unset.
            duration = None
    metadata["duration"] = duration

    self.storage.save_metadata(experiment_id, metadata)
169
+
170
def fail_experiment(self, experiment_id: str, error_message: str) -> None:
    """Mark experiment as failed with error details.

    Sets ``status`` to ``"failed"``, records ``completed_at`` as a naive UTC
    ISO-8601 string, stores ``error_message``, and stores ``duration`` in
    seconds when a parseable ``started_at`` is present (otherwise ``None``).

    Args:
        experiment_id: Experiment identifier
        error_message: Error message describing the failure

    Raises:
        ExperimentNotFoundError: If storage does not know the experiment
        StorageError: Documented failure mode of the storage layer; it is
            not raised directly by this method.
    """
    from datetime import timezone

    # Verify experiment exists
    if not self.storage.experiment_exists(experiment_id):
        from ..utils.exceptions import ExperimentNotFoundError

        raise ExperimentNotFoundError(experiment_id)

    metadata = self.storage.load_metadata(experiment_id)

    # datetime.utcnow() is deprecated since Python 3.12. Dropping tzinfo
    # keeps the stored value in the same naive-UTC format as before.
    finished_at = datetime.now(timezone.utc).replace(tzinfo=None)
    metadata["status"] = "failed"
    metadata["completed_at"] = finished_at.isoformat()
    metadata["error_message"] = error_message

    duration = None
    started_at = metadata.get("started_at")
    if started_at:
        try:
            # Use the datetime object directly instead of formatting and
            # re-parsing the end time.
            duration = (
                finished_at - datetime.fromisoformat(started_at)
            ).total_seconds()
        except (ValueError, TypeError):
            # Unparseable or timezone-aware start time: leave duration unset.
            duration = None
    metadata["duration"] = duration

    self.storage.save_metadata(experiment_id, metadata)
210
+
211
def cancel_experiment(self, experiment_id: str, reason: str) -> None:
    """Mark experiment as cancelled with reason.

    Sets ``status`` to ``"cancelled"``, records ``completed_at`` as a naive
    UTC ISO-8601 string, stores ``cancellation_reason``, and stores
    ``duration`` in seconds when a parseable ``started_at`` is present
    (otherwise ``None``).

    Args:
        experiment_id: Experiment identifier
        reason: Reason for cancellation

    Raises:
        ExperimentNotFoundError: If storage does not know the experiment
        StorageError: Documented failure mode of the storage layer; it is
            not raised directly by this method.
    """
    from datetime import timezone

    # Verify experiment exists
    if not self.storage.experiment_exists(experiment_id):
        from ..utils.exceptions import ExperimentNotFoundError

        raise ExperimentNotFoundError(experiment_id)

    metadata = self.storage.load_metadata(experiment_id)

    # datetime.utcnow() is deprecated since Python 3.12. Dropping tzinfo
    # keeps the stored value in the same naive-UTC format as before.
    finished_at = datetime.now(timezone.utc).replace(tzinfo=None)
    metadata["status"] = "cancelled"
    metadata["completed_at"] = finished_at.isoformat()
    metadata["cancellation_reason"] = reason

    duration = None
    started_at = metadata.get("started_at")
    if started_at:
        try:
            # Use the datetime object directly instead of formatting and
            # re-parsing the end time.
            duration = (
                finished_at - datetime.fromisoformat(started_at)
            ).total_seconds()
        except (ValueError, TypeError):
            # Unparseable or timezone-aware start time: leave duration unset.
            duration = None
    metadata["duration"] = duration

    self.storage.save_metadata(experiment_id, metadata)
251
+
252
def get_experiment_status(self, experiment_id: str) -> str:
    """Look up the stored status of an experiment.

    Args:
        experiment_id: Experiment identifier

    Returns:
        The ``status`` value from the experiment's metadata, or
        ``"unknown"`` when the metadata has no ``status`` key.

    Raises:
        ExperimentNotFoundError: If storage does not know the experiment
        StorageError: Documented failure mode of the storage layer; it is
            not raised directly by this method.
    """
    store = self.storage

    if store.experiment_exists(experiment_id):
        return store.load_metadata(experiment_id).get("status", "unknown")

    # Unknown ID: the exception class is imported lazily, only when needed.
    from ..utils.exceptions import ExperimentNotFoundError

    raise ExperimentNotFoundError(experiment_id)
274
+
275
def get_experiment_metadata(self, experiment_id: str) -> Dict[str, Any]:
    """Fetch the full metadata record of an experiment.

    Args:
        experiment_id: Experiment identifier

    Returns:
        Whatever the storage layer returns from ``load_metadata`` for this ID.

    Raises:
        ExperimentNotFoundError: If storage does not know the experiment
        StorageError: Documented failure mode of the storage layer; it is
            not raised directly by this method.
    """
    store = self.storage

    if store.experiment_exists(experiment_id):
        return store.load_metadata(experiment_id)

    # Unknown ID: the exception class is imported lazily, only when needed.
    from ..utils.exceptions import ExperimentNotFoundError

    raise ExperimentNotFoundError(experiment_id)
295
+
296
def list_experiments(self, status_filter: Optional[str] = None) -> List[str]:
    """Return experiment IDs, optionally restricted to one status.

    Args:
        status_filter: When given, only IDs whose status equals this value
            are returned (e.g., 'completed', 'failed').

    Returns:
        The list from storage when no filter is given; otherwise a new list
        of the matching IDs in storage order. Experiments whose status
        lookup raises are left out.
    """
    all_ids = self.storage.list_experiments()
    if status_filter is None:
        return all_ids

    def has_wanted_status(candidate_id: str) -> bool:
        # A failing status lookup (e.g. corrupted metadata) counts as
        # "does not match" rather than aborting the listing.
        try:
            return self.get_experiment_status(candidate_id) == status_filter
        except Exception:
            return False

    return [
        candidate_id for candidate_id in all_ids if has_wanted_status(candidate_id)
    ]
321
+
322
def archive_experiment(self, experiment_id: str) -> Path:
    """Archive an experiment through the storage layer.

    Args:
        experiment_id: Experiment identifier

    Returns:
        The value returned by the storage layer's ``archive_experiment``
        (annotated as the path of the archived experiment).

    Raises:
        ExperimentNotFoundError: If storage does not know the experiment
        StorageError: Documented failure mode of the storage layer; it is
            not raised directly by this method.
    """
    store = self.storage

    if store.experiment_exists(experiment_id):
        return store.archive_experiment(experiment_id)

    # Unknown ID: the exception class is imported lazily, only when needed.
    from ..utils.exceptions import ExperimentNotFoundError

    raise ExperimentNotFoundError(experiment_id)
342
+
343
def prevent_concurrent_execution(self) -> None:
    """Refuse to proceed while another experiment is marked as running.

    Raises:
        ExperimentAlreadyRunningError: If ``get_running_experiment`` reports
            a running experiment.
    """
    active_id = self.get_running_experiment()
    if active_id is None:
        # Nothing is running, so the caller may continue.
        return

    raise ExperimentAlreadyRunningError(
        f"Experiment {active_id} is already running. "
        "Only one experiment can run at a time."
    )
355
+
356
def create_experiment(
    self,
    script_path: Path,
    name: Optional[str] = None,
    config: Optional[Dict[str, Any]] = None,
    tags: Optional[List[str]] = None,
    description: Optional[str] = None,
    allow_dirty: bool = False,
    stage_only: bool = False,
) -> str:
    """Register a new experiment and persist its metadata and config.

    Args:
        script_path: Path to the Python script to run
        name: Optional experiment name (duplicates are allowed, to support
            experiment grouping)
        config: Configuration dictionary; an empty dict is saved when omitted
        tags: List of tags for the experiment; an empty list when omitted
        description: Optional experiment description
        allow_dirty: Skip the clean-working-directory check
        stage_only: If True, create the experiment with "staged" status and
            skip the concurrent-execution check

    Returns:
        The newly generated experiment ID

    Raises:
        DirtyWorkingDirectoryError: Presumably raised by
            ``validate_clean_working_directory`` when allow_dirty=False
        ValidationError: Presumably raised by the name/tag validators
        ExperimentAlreadyRunningError: If another experiment is running
            (unless stage_only=True)
        StorageError: Documented failure mode of the storage layer; it is
            not raised directly by this method.
    """
    # Pre-flight checks, each of which can be switched off by its flag.
    if not allow_dirty:
        validate_clean_working_directory()
    if not stage_only:
        self.prevent_concurrent_execution()

    # Fill in defaults for the optional containers.
    resolved_config = {} if config is None else config
    resolved_tags = [] if tags is None else tags

    # Validate user-supplied values before anything is written to disk.
    if name is not None:
        validate_experiment_name(name)
    validate_tags(resolved_tags)

    new_id = self.generate_experiment_id()

    # Persist: directory first, then metadata, then the resolved config.
    store = self.storage
    store.create_experiment_directory(new_id)
    store.save_metadata(
        new_id,
        self.build_metadata(
            new_id, script_path, name, resolved_tags, description, stage_only
        ),
    )
    store.save_config(new_id, resolved_config)

    return new_id
423
+
424
def build_metadata(
    self,
    experiment_id: str,
    script_path: Path,
    name: Optional[str],
    tags: List[str],
    description: Optional[str],
    stage_only: bool = False,
) -> Dict[str, Any]:
    """Build the initial metadata dictionary for a new experiment.

    Args:
        experiment_id: Unique experiment identifier
        script_path: Path to the Python script; stored resolved, as a string
        name: Optional experiment name
        tags: List of experiment tags
        description: Optional experiment description
        stage_only: If True, status is "staged"; otherwise "created"

    Returns:
        Metadata dictionary with identity fields, status, a naive UTC
        ISO-8601 ``created_at`` timestamp, empty ``started_at`` /
        ``completed_at`` / ``duration`` slots, and the captured ``git`` and
        ``environment`` information.
    """
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. Dropping tzinfo
    # keeps the stored value in the same naive-UTC format as before.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    # Capture git information
    git_info = get_current_commit_info()

    # Capture environment information
    environment_info = capture_full_environment()

    status = "staged" if stage_only else "created"
    metadata = {
        "id": experiment_id,
        "name": name,
        "script_path": str(script_path.resolve()),
        "tags": tags,
        "description": description,
        "status": status,
        "created_at": timestamp,
        # Lifecycle fields are filled in by the state-transition methods.
        "started_at": None,
        "completed_at": None,
        "duration": None,
        "git": git_info,
        "environment": environment_info,
    }

    return metadata
473
+
474
def execute_staged_experiment(self, experiment_id: str) -> None:
    """Transition a staged experiment to the running state.

    Sets ``status`` to ``"running"``, records the start time in
    ``started_at`` as a naive UTC ISO-8601 string, and saves the metadata.
    This method only updates metadata; it does not launch the script.

    Args:
        experiment_id: Experiment identifier

    Raises:
        ExperimentNotFoundError: If storage does not know the experiment
        ValueError: If experiment is not in 'staged' state
        StorageError: Documented failure mode of the storage layer; it is
            not raised directly by this method.
    """
    from datetime import timezone

    # Verify experiment exists
    if not self.storage.experiment_exists(experiment_id):
        from ..utils.exceptions import ExperimentNotFoundError

        raise ExperimentNotFoundError(experiment_id)

    metadata = self.storage.load_metadata(experiment_id)

    # Only a staged experiment may be executed; a missing status is
    # reported as 'unknown' in the error message.
    current_status = metadata.get("status", "unknown")
    if current_status != "staged":
        raise ValueError(
            f"Cannot execute experiment {experiment_id}. "
            f"Expected status 'staged', got '{current_status}'"
        )

    # datetime.utcnow() is deprecated since Python 3.12. Dropping tzinfo
    # keeps the stored value in the same naive-UTC format as before.
    now = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    metadata["status"] = "running"
    metadata["started_at"] = now
    self.storage.save_metadata(experiment_id, metadata)
507
+
508
def get_staged_experiments(self) -> List[str]:
    """Collect the IDs of all non-archived experiments that are staged.

    Returns:
        IDs, in storage order, whose metadata reports ``status == "staged"``.
        Experiments whose metadata cannot be loaded are left out.
    """

    def is_staged(candidate_id: str) -> bool:
        # A loading error counts as "not staged" rather than aborting.
        try:
            return self.storage.load_metadata(candidate_id).get("status") == "staged"
        except Exception:
            return False

    candidates = self.storage.list_experiments(include_archived=False)
    return [candidate_id for candidate_id in candidates if is_staged(candidate_id)]
527
+
528
def find_experiment_by_name(self, name: str) -> Optional[str]:
    """Look up an experiment ID by its stored name.

    Walks the entries of the experiments directory and returns the first
    sub-directory that storage recognises as an experiment and whose
    metadata ``name`` equals the requested one.

    Args:
        name: Experiment name to search for

    Returns:
        The matching experiment ID, or ``None`` when the experiments
        directory is missing or no experiment carries that name.
    """
    root = self.experiments_dir
    if not root.exists():
        return None

    for entry in root.iterdir():
        candidate_id = entry.name

        # Only directories that storage knows about are considered.
        if not (entry.is_dir() and self.storage.experiment_exists(candidate_id)):
            continue

        try:
            name_matches = self.storage.load_metadata(candidate_id).get("name") == name
        except Exception:
            # Unreadable metadata: this entry cannot match.
            continue

        if name_matches:
            return candidate_id

    return None