rushti 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rushti/__init__.py ADDED
@@ -0,0 +1,33 @@
1
+ """
2
+ RushTI - Parallel TM1 TurboIntegrator Process Execution.
3
+
4
+ This package provides tools for executing TM1 TI processes in parallel
5
+ with dependency management, checkpoint/resume support, and execution logging.
6
+ """
7
+
8
+ __version__ = "2.0.0"
9
+ __app_name__ = "RushTI"
10
+
11
+ # Core exports for programmatic use
12
+ from rushti.task import Task, OptimizedTask, ExecutionMode
13
+ from rushti.dag import DAG
14
+ from rushti.checkpoint import Checkpoint, CheckpointManager
15
+ from rushti.settings import Settings, load_settings
16
+ from rushti.taskfile import Taskfile, TaskDefinition, parse_json_taskfile
17
+
18
+ __all__ = [
19
+ "__version__",
20
+ "__app_name__",
21
+ # Core classes
22
+ "Task",
23
+ "OptimizedTask",
24
+ "ExecutionMode",
25
+ "DAG",
26
+ "Checkpoint",
27
+ "CheckpointManager",
28
+ "Settings",
29
+ "load_settings",
30
+ "Taskfile",
31
+ "TaskDefinition",
32
+ "parse_json_taskfile",
33
+ ]
rushti/checkpoint.py ADDED
@@ -0,0 +1,642 @@
1
+ """Checkpoint and resume functionality for RushTI.
2
+
3
+ This module provides:
4
+ - Automatic checkpoint saving during task execution
5
+ - Checkpoint loading and validation for resume operations
6
+ - Integration with safe_retry for interrupted task handling
7
+
8
+ Checkpoint files are JSON documents that capture execution state:
9
+ - Completed tasks with their results
10
+ - In-progress tasks at the time of checkpoint
11
+ - Pending tasks yet to be executed
12
+ - Failed tasks
13
+
14
+ Usage:
15
+ # During execution
16
+ checkpoint = Checkpoint.create(taskfile_path, workflow)
17
+ checkpoint.mark_completed("task-1", success=True, duration=10.5)
18
+ save_checkpoint(checkpoint, checkpoint_path)
19
+
20
+ # On resume
21
+ checkpoint = load_checkpoint(checkpoint_path)
22
+ checkpoint.validate_against_taskfile(taskfile_path)
23
+ """
24
+
25
+ import json
26
+ import logging
27
+ import os
28
+ import tempfile
29
+ from dataclasses import dataclass, field, asdict
30
+ from datetime import datetime
31
+ from pathlib import Path
32
+ from typing import Dict, List, Optional, Set
33
+ import hashlib
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+ # Bytes to read per iteration when hashing files
38
+ _FILE_HASH_CHUNK_SIZE = 8192
39
+
40
+
41
@dataclass
class TaskResult:
    """Outcome of a single task execution, as recorded in a checkpoint."""

    task_id: str
    success: bool
    duration_seconds: float
    retry_count: int = 0
    error_message: Optional[str] = None
    completed_at: str = field(default_factory=lambda: datetime.now().isoformat())

    def to_dict(self) -> dict:
        """Serialize this result to a plain, JSON-friendly dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict) -> "TaskResult":
        """Rebuild a TaskResult from a dict produced by :meth:`to_dict`."""
        restored = cls(**data)
        return restored
58
+
59
+
60
@dataclass
class Checkpoint:
    """Execution state checkpoint for resume capability.

    Attributes:
        taskfile_path: Path to the original task file
        workflow: Workflow name (from metadata or filename)
        taskfile_hash: Hash of taskfile content for validation
        run_started: ISO timestamp when execution started
        checkpoint_created: ISO timestamp when this checkpoint was created
        completed_tasks: Dict of task_id -> TaskResult for completed tasks
        in_progress_tasks: Set of task IDs that were running when checkpoint was saved
        pending_tasks: Set of task IDs not yet executed
        failed_tasks: Set of task IDs that failed
        skipped_tasks: Set of task IDs that were skipped (e.g., predecessor failed)
        total_tasks: Total number of tasks in the taskfile
        version: Checkpoint format version for compatibility
    """

    taskfile_path: str
    workflow: str
    taskfile_hash: str
    run_started: str
    checkpoint_created: str
    completed_tasks: Dict[str, TaskResult] = field(default_factory=dict)
    in_progress_tasks: Set[str] = field(default_factory=set)
    pending_tasks: Set[str] = field(default_factory=set)
    failed_tasks: Set[str] = field(default_factory=set)
    skipped_tasks: Set[str] = field(default_factory=set)
    total_tasks: int = 0
    version: str = "1.0"

    @classmethod
    def create(
        cls,
        taskfile_path: str,
        workflow: str,
        task_ids: List[str],
    ) -> "Checkpoint":
        """Create a new checkpoint at the start of execution.

        :param taskfile_path: Path to the task file
        :param workflow: Workflow name
        :param task_ids: List of all task IDs in the taskfile
        :return: New Checkpoint instance with every task pending
        """
        taskfile_hash = _compute_file_hash(taskfile_path)
        now = datetime.now().isoformat()

        return cls(
            taskfile_path=str(Path(taskfile_path).absolute()),
            workflow=workflow,
            taskfile_hash=taskfile_hash,
            run_started=now,
            checkpoint_created=now,
            pending_tasks=set(task_ids),
            total_tasks=len(task_ids),
        )

    def mark_running(self, task_id: str) -> None:
        """Mark a task as currently running.

        :param task_id: Task ID that started running
        """
        self.pending_tasks.discard(task_id)
        self.in_progress_tasks.add(task_id)
        self.checkpoint_created = datetime.now().isoformat()

    def mark_completed(
        self,
        task_id: str,
        success: bool,
        duration_seconds: float,
        retry_count: int = 0,
        error_message: Optional[str] = None,
    ) -> None:
        """Mark a task as completed.

        :param task_id: Task ID that completed
        :param success: Whether the task succeeded
        :param duration_seconds: Execution duration
        :param retry_count: Number of retries performed
        :param error_message: Error message if failed
        """
        self.in_progress_tasks.discard(task_id)
        self.pending_tasks.discard(task_id)

        result = TaskResult(
            task_id=task_id,
            success=success,
            duration_seconds=duration_seconds,
            retry_count=retry_count,
            error_message=error_message,
        )
        self.completed_tasks[task_id] = result

        if not success:
            self.failed_tasks.add(task_id)

        self.checkpoint_created = datetime.now().isoformat()

    def mark_skipped(self, task_id: str, reason: str = "predecessor_failed") -> None:
        """Mark a task as skipped.

        :param task_id: Task ID that was skipped
        :param reason: Reason for skipping
        """
        self.pending_tasks.discard(task_id)
        self.in_progress_tasks.discard(task_id)
        self.skipped_tasks.add(task_id)

        # Record as completed with success=False so skipped tasks show up in
        # completed_tasks and count toward progress (but not failed_tasks).
        result = TaskResult(
            task_id=task_id,
            success=False,
            duration_seconds=0.0,
            error_message=f"Skipped: {reason}",
        )
        self.completed_tasks[task_id] = result
        self.checkpoint_created = datetime.now().isoformat()

    def get_tasks_for_resume(
        self,
        task_safe_retry_map: Dict[str, bool],
    ) -> tuple:
        """Determine which tasks to execute on resume.

        :param task_safe_retry_map: Dict mapping task_id -> safe_retry flag
        :return: Tuple of (tasks_to_run, tasks_requiring_decision, error_message)
            - tasks_to_run: Set of task IDs to execute
            - tasks_requiring_decision: Set of in-progress non-safe-retry tasks
            - error_message: Error message if there are blocking issues, else None
        """
        tasks_to_run = set(self.pending_tasks)
        tasks_requiring_decision = set()

        # In-progress tasks may have partially executed; only tasks flagged
        # safe_retry can be re-run automatically. Unknown tasks default to
        # NOT safe, which is the conservative choice.
        for task_id in self.in_progress_tasks:
            safe_retry = task_safe_retry_map.get(task_id, False)
            if safe_retry:
                tasks_to_run.add(task_id)
                logger.info(f"Task '{task_id}' was in-progress with safe_retry=true, will retry")
            else:
                tasks_requiring_decision.add(task_id)
                logger.warning(
                    f"Task '{task_id}' was in-progress with safe_retry=false, "
                    f"requires --resume-from to specify handling"
                )

        error_message = None
        if tasks_requiring_decision:
            task_list = ", ".join(sorted(tasks_requiring_decision))
            error_message = (
                f"Cannot automatically resume: {len(tasks_requiring_decision)} task(s) were in-progress "
                f"with safe_retry=false: {task_list}. "
                f"Use --resume-from <task_id> to specify where to resume from."
            )

        return tasks_to_run, tasks_requiring_decision, error_message

    def get_resume_from_task(
        self,
        resume_from_task_id: str,
        all_task_ids: List[str],
    ) -> Set[str]:
        """Get tasks to run when resuming from a specific task.

        :param resume_from_task_id: Task ID to resume from
        :param all_task_ids: Ordered list of all task IDs
        :return: Set of task IDs to execute
        :raises ValueError: If resume_from_task_id is not found
        """
        # A single .index() call both locates the task and detects absence;
        # a separate membership pre-check would duplicate the O(n) lookup.
        try:
            resume_index = all_task_ids.index(resume_from_task_id)
        except ValueError:
            raise ValueError(
                f"Task '{resume_from_task_id}' not found in taskfile"
            ) from None

        # Return all tasks from resume point onwards
        tasks_to_run = set(all_task_ids[resume_index:])

        # Also include any pending tasks that might have been skipped
        tasks_to_run.update(self.pending_tasks)

        return tasks_to_run

    @property
    def is_complete(self) -> bool:
        """Check if all tasks are complete (no pending or in-progress)."""
        return len(self.pending_tasks) == 0 and len(self.in_progress_tasks) == 0

    @property
    def success_count(self) -> int:
        """Count of successfully completed tasks."""
        return sum(1 for r in self.completed_tasks.values() if r.success)

    @property
    def failure_count(self) -> int:
        """Count of failed tasks (skipped tasks are not counted here)."""
        return len(self.failed_tasks)

    @property
    def progress_percentage(self) -> float:
        """Percentage of tasks completed (0.0 when there are no tasks)."""
        if self.total_tasks == 0:
            return 0.0
        completed = len(self.completed_tasks)
        return (completed / self.total_tasks) * 100

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization.

        Sets are converted to lists and TaskResults to dicts; a derived
        ``summary`` section is included for human inspection of the file.
        """
        return {
            "version": self.version,
            "taskfile_path": self.taskfile_path,
            "workflow": self.workflow,
            "taskfile_hash": self.taskfile_hash,
            "run_started": self.run_started,
            "checkpoint_created": self.checkpoint_created,
            "total_tasks": self.total_tasks,
            "completed_tasks": {k: v.to_dict() for k, v in self.completed_tasks.items()},
            "in_progress_tasks": list(self.in_progress_tasks),
            "pending_tasks": list(self.pending_tasks),
            "failed_tasks": list(self.failed_tasks),
            "skipped_tasks": list(self.skipped_tasks),
            "summary": {
                "completed": len(self.completed_tasks),
                "in_progress": len(self.in_progress_tasks),
                "pending": len(self.pending_tasks),
                "failed": len(self.failed_tasks),
                "skipped": len(self.skipped_tasks),
                "success_count": self.success_count,
                "progress_percentage": round(self.progress_percentage, 1),
            },
        }

    @classmethod
    def from_dict(cls, data: dict) -> "Checkpoint":
        """Create Checkpoint from dictionary (inverse of :meth:`to_dict`).

        Missing optional keys default to empty collections / "1.0" so older
        or hand-edited checkpoint files still load.
        """
        completed_tasks = {}
        for task_id, result_data in data.get("completed_tasks", {}).items():
            completed_tasks[task_id] = TaskResult.from_dict(result_data)

        return cls(
            taskfile_path=data["taskfile_path"],
            workflow=data["workflow"],
            taskfile_hash=data["taskfile_hash"],
            run_started=data["run_started"],
            checkpoint_created=data["checkpoint_created"],
            completed_tasks=completed_tasks,
            in_progress_tasks=set(data.get("in_progress_tasks", [])),
            pending_tasks=set(data.get("pending_tasks", [])),
            failed_tasks=set(data.get("failed_tasks", [])),
            skipped_tasks=set(data.get("skipped_tasks", [])),
            total_tasks=data.get("total_tasks", 0),
            version=data.get("version", "1.0"),
        )

    def validate_against_taskfile(
        self,
        taskfile_path: str,
        strict: bool = True,
    ) -> tuple:
        """Validate this checkpoint matches the given taskfile.

        :param taskfile_path: Path to the current taskfile
        :param strict: If True, a hash mismatch makes the checkpoint invalid;
            if False, the mismatch is only reported as a warning
        :return: Tuple of (is_valid, warnings)
        """
        warnings = []
        is_valid = True

        # Compare content hashes - a modified taskfile may have different
        # task IDs/ordering, making the recorded state meaningless.
        current_hash = _compute_file_hash(taskfile_path)
        if current_hash != self.taskfile_hash:
            msg = (
                f"Taskfile has been modified since checkpoint was created. "
                f"Original hash: {self.taskfile_hash[:8]}..., "
                f"Current hash: {current_hash[:8]}..."
            )
            if strict:
                is_valid = False
                warnings.append(f"ERROR: {msg}")
            else:
                warnings.append(f"WARNING: {msg}")

        # A different path is only a warning: same content at a new location
        # is still resumable.
        checkpoint_path = Path(self.taskfile_path).resolve()
        current_path = Path(taskfile_path).resolve()
        if checkpoint_path != current_path:
            warnings.append(
                f"WARNING: Checkpoint was created for '{checkpoint_path}', "
                f"but resuming with '{current_path}'"
            )

        return is_valid, warnings
361
+
362
+
363
def _compute_file_hash(file_path: str) -> str:
    """Return the SHA-256 hex digest of the file at *file_path*.

    :param file_path: Path to the file
    :return: Hex digest of the hash
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        # Stream fixed-size chunks so large taskfiles are hashed without
        # being loaded into memory all at once.
        chunk = stream.read(_FILE_HASH_CHUNK_SIZE)
        while chunk:
            digest.update(chunk)
            chunk = stream.read(_FILE_HASH_CHUNK_SIZE)
    return digest.hexdigest()
374
+
375
+
376
def save_checkpoint(checkpoint: Checkpoint, file_path: str) -> None:
    """Save checkpoint to file atomically.

    Writes to a temporary file in the target directory, then swaps it into
    place with ``os.replace`` so the checkpoint file is never left in a
    partial/corrupt state, even if the process dies mid-write.

    :param checkpoint: Checkpoint to save
    :param file_path: Target file path
    :raises RuntimeError: If the checkpoint could not be written
    """
    from rushti.utils import ensure_shared_file, makedirs_shared

    file_path = Path(file_path)

    # Ensure directory exists (shared permissions for multi-user access)
    makedirs_shared(str(file_path.parent))

    # Create the temp file in the same directory as the target so the final
    # replace stays on one filesystem (cross-device renames would fail).
    fd, temp_path = tempfile.mkstemp(
        suffix=".tmp",
        prefix="checkpoint_",
        dir=file_path.parent,
    )

    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(checkpoint.to_dict(), f, indent=2)

        # os.replace is atomic on POSIX and also overwrites an existing
        # target on Windows (unlike os.rename). This removes the previous
        # unlink-then-rename window during which no checkpoint file existed.
        os.replace(temp_path, file_path)
        ensure_shared_file(str(file_path))
        logger.debug(f"Checkpoint saved to {file_path}")

    except Exception as e:
        # Clean up the temp file on error so stale *.tmp files don't pile up.
        if os.path.exists(temp_path):
            os.unlink(temp_path)
        raise RuntimeError(f"Failed to save checkpoint: {e}") from e
417
+
418
+
419
def load_checkpoint(file_path: str) -> Checkpoint:
    """Load checkpoint from file.

    :param file_path: Path to checkpoint file
    :return: Loaded Checkpoint instance
    :raises FileNotFoundError: If checkpoint file doesn't exist
    :raises ValueError: If checkpoint file is invalid
    """
    file_path = Path(file_path)

    if not file_path.exists():
        raise FileNotFoundError(f"Checkpoint file not found: {file_path}")

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Fail fast with a clear message if the document is not a checkpoint.
        # The loop variable is deliberately NOT named `field` -- that would
        # shadow dataclasses.field imported at module level.
        required_fields = ["taskfile_path", "workflow", "run_started"]
        for field_name in required_fields:
            if field_name not in data:
                raise ValueError(f"Checkpoint missing required field: {field_name}")

        checkpoint = Checkpoint.from_dict(data)
        logger.info(
            f"Loaded checkpoint from {file_path}: "
            f"{checkpoint.success_count}/{checkpoint.total_tasks} completed, "
            f"{len(checkpoint.pending_tasks)} pending, "
            f"{len(checkpoint.in_progress_tasks)} in-progress"
        )

        return checkpoint

    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid checkpoint file format: {e}") from e
454
+
455
+
456
def delete_checkpoint(file_path: str) -> bool:
    """Remove a checkpoint file if present.

    :param file_path: Path to checkpoint file
    :return: True if a file was deleted, False if it didn't exist
    """
    target = Path(file_path)

    # Nothing to do when no checkpoint was ever written (or it was
    # already cleaned up).
    if not target.exists():
        return False

    target.unlink()
    logger.info(f"Checkpoint deleted: {target}")
    return True
470
+
471
+
472
def get_checkpoint_path(
    checkpoint_dir: str,
    workflow: str,
) -> Path:
    """Build the canonical checkpoint file path for a workflow.

    :param checkpoint_dir: Base directory for checkpoints
    :param workflow: Workflow name
    :return: Path to the checkpoint file
    """
    # Replace every character that is unsafe in a filename with '_'
    # (only alphanumerics, '-' and '_' pass through unchanged).
    sanitized = []
    for char in workflow:
        if char.isalnum() or char in "-_":
            sanitized.append(char)
        else:
            sanitized.append("_")
    filename = "checkpoint_{}.json".format("".join(sanitized))
    return Path(checkpoint_dir) / filename
485
+
486
+
487
def find_checkpoint_for_taskfile(
    checkpoint_dir: str,
    taskfile_path: str,
) -> Optional[Path]:
    """Locate an existing checkpoint that belongs to *taskfile_path*.

    Scans ``checkpoint_*.json`` files in *checkpoint_dir* and returns the
    first one whose recorded taskfile path resolves to the same file.

    :param checkpoint_dir: Directory containing checkpoints
    :param taskfile_path: Path to the taskfile
    :return: Path to checkpoint file if found, None otherwise
    """
    base_dir = Path(checkpoint_dir)

    if not base_dir.exists():
        return None

    wanted = Path(taskfile_path).resolve()

    for candidate in base_dir.glob("checkpoint_*.json"):
        try:
            loaded = load_checkpoint(candidate)
        except (ValueError, FileNotFoundError):
            # Corrupt or vanished checkpoints are skipped, not fatal.
            continue
        if Path(loaded.taskfile_path).resolve() == wanted:
            return candidate

    return None
514
+
515
+
516
class CheckpointManager:
    """Coordinates checkpoint creation, periodic saving, and cleanup.

    Wraps a :class:`Checkpoint` and persists it to disk: unconditionally on
    every task completion, and at most once per ``checkpoint_interval``
    seconds for the cheaper state transitions (running/skipped).

    Usage:
        manager = CheckpointManager(
            checkpoint_dir="./checkpoints",
            taskfile_path="tasks.json",
            workflow="daily-etl",
            task_ids=["1", "2", "3"],
            checkpoint_interval=60,
        )

        # During execution
        manager.mark_running("1")
        manager.mark_completed("1", success=True, duration_seconds=10.5)

        # After successful completion
        manager.cleanup()
    """

    def __init__(
        self,
        checkpoint_dir: str,
        taskfile_path: str,
        workflow: str,
        task_ids: List[str],
        checkpoint_interval: int = 60,
        enabled: bool = True,
    ):
        """Initialize checkpoint manager.

        :param checkpoint_dir: Directory for checkpoint files
        :param taskfile_path: Path to the task file
        :param workflow: Workflow name
        :param task_ids: List of all task IDs
        :param checkpoint_interval: Seconds between automatic checkpoint saves
        :param enabled: Whether checkpointing is enabled
        """
        self.enabled = enabled
        self.checkpoint_dir = Path(checkpoint_dir)
        self.checkpoint_path = get_checkpoint_path(checkpoint_dir, workflow)
        self.checkpoint_interval = checkpoint_interval
        self._last_save_time = datetime.now()

        if not enabled:
            # Disabled managers keep no state; every public method no-ops.
            self.checkpoint = None
            return

        self.checkpoint = Checkpoint.create(
            taskfile_path=taskfile_path,
            workflow=workflow,
            task_ids=task_ids,
        )
        # Persist the initial (all-pending) state immediately.
        self._save()

    def mark_running(self, task_id: str) -> None:
        """Record that *task_id* started; save if the interval elapsed."""
        if self.enabled and self.checkpoint:
            self.checkpoint.mark_running(task_id)
            self._maybe_save()

    def mark_completed(
        self,
        task_id: str,
        success: bool,
        duration_seconds: float,
        retry_count: int = 0,
        error_message: Optional[str] = None,
    ) -> None:
        """Record a finished task and persist the checkpoint immediately."""
        if not self.enabled or not self.checkpoint:
            return

        self.checkpoint.mark_completed(
            task_id=task_id,
            success=success,
            duration_seconds=duration_seconds,
            retry_count=retry_count,
            error_message=error_message,
        )
        # Completions are always flushed to disk, bypassing the interval.
        self._save()

    def mark_skipped(self, task_id: str, reason: str = "predecessor_failed") -> None:
        """Record a skipped task; save if the interval elapsed."""
        if self.enabled and self.checkpoint:
            self.checkpoint.mark_skipped(task_id, reason)
            self._maybe_save()

    def _maybe_save(self) -> None:
        """Persist only when ``checkpoint_interval`` seconds have passed."""
        since_last = (datetime.now() - self._last_save_time).total_seconds()
        if since_last >= self.checkpoint_interval:
            self._save()

    def _save(self) -> None:
        """Persist the current checkpoint, logging (not raising) on failure."""
        if not self.enabled or not self.checkpoint:
            return

        try:
            save_checkpoint(self.checkpoint, self.checkpoint_path)
        except Exception as e:
            # A failed checkpoint write must never abort the run itself.
            logger.warning(f"Failed to save checkpoint: {e}")
        else:
            self._last_save_time = datetime.now()

    def force_save(self) -> None:
        """Write the checkpoint to disk right now, ignoring the interval."""
        self._save()

    def cleanup(self, success: bool = True) -> None:
        """Finalize checkpoint handling after execution.

        :param success: If True, delete the checkpoint; if False, flush the
            final state and keep the file so the run can be resumed
        """
        if not self.enabled:
            return

        if success:
            delete_checkpoint(self.checkpoint_path)
            return

        # Failed run: make sure the last state is on disk for --resume.
        self._save()
        logger.info(f"Checkpoint retained for resume: {self.checkpoint_path}")