rushti-2.0.0-py3-none-any.whl
This diff shows the content of a publicly available package version as released to a supported registry. It is provided for informational purposes only and reflects the package as it appears in its public registry.
- rushti/__init__.py +33 -0
- rushti/checkpoint.py +642 -0
- rushti/cli.py +1329 -0
- rushti/commands.py +2025 -0
- rushti/contention_analyzer.py +1012 -0
- rushti/dag.py +421 -0
- rushti/dashboard.py +1595 -0
- rushti/db_admin.py +774 -0
- rushti/exclusive.py +316 -0
- rushti/execution.py +713 -0
- rushti/logging.py +353 -0
- rushti/messages.py +74 -0
- rushti/optimization_report.py +1053 -0
- rushti/optimizer.py +423 -0
- rushti/parsing.py +413 -0
- rushti/settings.py +436 -0
- rushti/stats.py +846 -0
- rushti/task.py +151 -0
- rushti/taskfile.py +710 -0
- rushti/taskfile_ops.py +1222 -0
- rushti/templates/__init__.py +1 -0
- rushti/templates/visualization.html +1054 -0
- rushti/tm1_build.py +651 -0
- rushti/tm1_integration.py +520 -0
- rushti/tm1_objects.py +776 -0
- rushti/utils.py +166 -0
- rushti/visualization_template.py +1067 -0
- rushti-2.0.0.dist-info/METADATA +168 -0
- rushti-2.0.0.dist-info/RECORD +33 -0
- rushti-2.0.0.dist-info/WHEEL +5 -0
- rushti-2.0.0.dist-info/entry_points.txt +2 -0
- rushti-2.0.0.dist-info/licenses/LICENSE +21 -0
- rushti-2.0.0.dist-info/top_level.txt +1 -0
rushti/__init__.py
ADDED
@@ -0,0 +1,33 @@
"""
RushTI - Parallel TM1 TurboIntegrator Process Execution.

This package provides tools for executing TM1 TI processes in parallel
with dependency management, checkpoint/resume support, and execution logging.
"""

__version__ = "2.0.0"
__app_name__ = "RushTI"

# Core exports for programmatic use
from rushti.task import Task, OptimizedTask, ExecutionMode
from rushti.dag import DAG
from rushti.checkpoint import Checkpoint, CheckpointManager
from rushti.settings import Settings, load_settings
from rushti.taskfile import Taskfile, TaskDefinition, parse_json_taskfile

__all__ = [
    "__version__",
    "__app_name__",
    # Core classes
    "Task",
    "OptimizedTask",
    "ExecutionMode",
    "DAG",
    "Checkpoint",
    "CheckpointManager",
    "Settings",
    "load_settings",
    "Taskfile",
    "TaskDefinition",
    "parse_json_taskfile",
]
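The root module simply pins the version and re-exports the core classes. A minimal usage sketch (editor's illustration, assuming the wheel is installed; only names exported above are used):

    import rushti

    print(rushti.__version__)   # "2.0.0"
    print(rushti.__app_name__)  # "RushTI"

    # The re-exports make the package root the public entry point:
    from rushti import Checkpoint, CheckpointManager, DAG, Task
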
rushti/checkpoint.py
ADDED
@@ -0,0 +1,642 @@
"""Checkpoint and resume functionality for RushTI.

This module provides:
- Automatic checkpoint saving during task execution
- Checkpoint loading and validation for resume operations
- Integration with safe_retry for interrupted task handling

Checkpoint files are JSON documents that capture execution state:
- Completed tasks with their results
- In-progress tasks at the time of checkpoint
- Pending tasks yet to be executed
- Failed tasks

Usage:
    # During execution
    checkpoint = Checkpoint.create(taskfile_path, workflow, task_ids)
    checkpoint.mark_completed("task-1", success=True, duration_seconds=10.5)
    save_checkpoint(checkpoint, checkpoint_path)

    # On resume
    checkpoint = load_checkpoint(checkpoint_path)
    checkpoint.validate_against_taskfile(taskfile_path)
"""

import json
import logging
import os
import tempfile
from dataclasses import dataclass, field, asdict
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Set
import hashlib

logger = logging.getLogger(__name__)

# Bytes to read per iteration when hashing files
_FILE_HASH_CHUNK_SIZE = 8192

@dataclass
class TaskResult:
    """Result of a completed task execution."""

    task_id: str
    success: bool
    duration_seconds: float
    retry_count: int = 0
    error_message: Optional[str] = None
    completed_at: str = field(default_factory=lambda: datetime.now().isoformat())

    def to_dict(self) -> dict:
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict) -> "TaskResult":
        return cls(**data)

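# --- Editor's illustrative sketch; not part of rushti/checkpoint.py ---
# TaskResult is a plain dataclass, so persistence is a dict round-trip
# (the task ID and duration below are invented):
#
#     from rushti.checkpoint import TaskResult
#
#     result = TaskResult(task_id="load_sales", success=True, duration_seconds=12.3)
#     restored = TaskResult.from_dict(result.to_dict())
#     assert restored == result  # dataclass equality compares field-by-field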
@dataclass
class Checkpoint:
    """Execution state checkpoint for resume capability.

    Attributes:
        taskfile_path: Path to the original task file
        workflow: Workflow name (from metadata or filename)
        taskfile_hash: Hash of taskfile content for validation
        run_started: ISO timestamp when execution started
        checkpoint_created: ISO timestamp when this checkpoint was created
        completed_tasks: Dict of task_id -> TaskResult for completed tasks
        in_progress_tasks: Set of task IDs that were running when checkpoint was saved
        pending_tasks: Set of task IDs not yet executed
        failed_tasks: Set of task IDs that failed
        skipped_tasks: Set of task IDs that were skipped (e.g., predecessor failed)
        total_tasks: Total number of tasks in the taskfile
        version: Checkpoint format version for compatibility
    """

    taskfile_path: str
    workflow: str
    taskfile_hash: str
    run_started: str
    checkpoint_created: str
    completed_tasks: Dict[str, TaskResult] = field(default_factory=dict)
    in_progress_tasks: Set[str] = field(default_factory=set)
    pending_tasks: Set[str] = field(default_factory=set)
    failed_tasks: Set[str] = field(default_factory=set)
    skipped_tasks: Set[str] = field(default_factory=set)
    total_tasks: int = 0
    version: str = "1.0"

    @classmethod
    def create(
        cls,
        taskfile_path: str,
        workflow: str,
        task_ids: List[str],
    ) -> "Checkpoint":
        """Create a new checkpoint at the start of execution.

        :param taskfile_path: Path to the task file
        :param workflow: Workflow name
        :param task_ids: List of all task IDs in the taskfile
        :return: New Checkpoint instance
        """
        taskfile_hash = _compute_file_hash(taskfile_path)
        now = datetime.now().isoformat()

        return cls(
            taskfile_path=str(Path(taskfile_path).absolute()),
            workflow=workflow,
            taskfile_hash=taskfile_hash,
            run_started=now,
            checkpoint_created=now,
            pending_tasks=set(task_ids),
            total_tasks=len(task_ids),
        )
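    # --- Editor's illustrative sketch; not part of rushti/checkpoint.py ---
    # create() hashes the taskfile, so it needs a real file on disk. Assuming
    # the rushti package is installed (taskfile content and IDs are invented):
    #
    #     import tempfile
    #     from rushti.checkpoint import Checkpoint
    #
    #     with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as tf:
    #         tf.write('{"tasks": []}')
    #
    #     cp = Checkpoint.create(tf.name, workflow="daily-etl", task_ids=["t1", "t2"])
    #     assert cp.pending_tasks == {"t1", "t2"} and cp.total_tasks == 2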
    def mark_running(self, task_id: str) -> None:
        """Mark a task as currently running.

        :param task_id: Task ID that started running
        """
        self.pending_tasks.discard(task_id)
        self.in_progress_tasks.add(task_id)
        self.checkpoint_created = datetime.now().isoformat()

    def mark_completed(
        self,
        task_id: str,
        success: bool,
        duration_seconds: float,
        retry_count: int = 0,
        error_message: Optional[str] = None,
    ) -> None:
        """Mark a task as completed.

        :param task_id: Task ID that completed
        :param success: Whether the task succeeded
        :param duration_seconds: Execution duration
        :param retry_count: Number of retries performed
        :param error_message: Error message if failed
        """
        self.in_progress_tasks.discard(task_id)
        self.pending_tasks.discard(task_id)

        result = TaskResult(
            task_id=task_id,
            success=success,
            duration_seconds=duration_seconds,
            retry_count=retry_count,
            error_message=error_message,
        )
        self.completed_tasks[task_id] = result

        if not success:
            self.failed_tasks.add(task_id)

        self.checkpoint_created = datetime.now().isoformat()

    def mark_skipped(self, task_id: str, reason: str = "predecessor_failed") -> None:
        """Mark a task as skipped.

        :param task_id: Task ID that was skipped
        :param reason: Reason for skipping
        """
        self.pending_tasks.discard(task_id)
        self.in_progress_tasks.discard(task_id)
        self.skipped_tasks.add(task_id)

        # Record as completed with success=False
        result = TaskResult(
            task_id=task_id,
            success=False,
            duration_seconds=0.0,
            error_message=f"Skipped: {reason}",
        )
        self.completed_tasks[task_id] = result
        self.checkpoint_created = datetime.now().isoformat()
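    # --- Editor's illustrative sketch; not part of rushti/checkpoint.py ---
    # The mark_* methods move a task ID between the state sets. Continuing
    # the sketch above (cp starts with pending_tasks == {"t1", "t2"}):
    #
    #     cp.mark_running("t1")
    #     assert cp.in_progress_tasks == {"t1"} and cp.pending_tasks == {"t2"}
    #
    #     cp.mark_completed("t1", success=True, duration_seconds=10.5)
    #     assert cp.completed_tasks["t1"].success and not cp.in_progress_tasks
    #
    #     cp.mark_skipped("t2")  # recorded in completed_tasks with success=False
    #     assert "t2" in cp.skipped_tasks and not cp.failed_tasks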
    def get_tasks_for_resume(
        self,
        task_safe_retry_map: Dict[str, bool],
    ) -> tuple:
        """Determine which tasks to execute on resume.

        :param task_safe_retry_map: Dict mapping task_id -> safe_retry flag
        :return: Tuple of (tasks_to_run, tasks_requiring_decision, error_message)
            - tasks_to_run: Set of task IDs to execute
            - tasks_requiring_decision: Set of in-progress non-safe-retry tasks
            - error_message: Error message if there are blocking issues
        """
        tasks_to_run = set(self.pending_tasks)
        tasks_requiring_decision = set()

        # Handle in-progress tasks based on safe_retry flag
        for task_id in self.in_progress_tasks:
            safe_retry = task_safe_retry_map.get(task_id, False)
            if safe_retry:
                # Safe to retry - add to tasks to run
                tasks_to_run.add(task_id)
                logger.info(f"Task '{task_id}' was in-progress with safe_retry=true, will retry")
            else:
                # Not safe to retry - requires user decision
                tasks_requiring_decision.add(task_id)
                logger.warning(
                    f"Task '{task_id}' was in-progress with safe_retry=false, "
                    f"requires --resume-from to specify handling"
                )

        error_message = None
        if tasks_requiring_decision:
            task_list = ", ".join(sorted(tasks_requiring_decision))
            error_message = (
                f"Cannot automatically resume: {len(tasks_requiring_decision)} task(s) were in-progress "
                f"with safe_retry=false: {task_list}. "
                f"Use --resume-from <task_id> to specify where to resume from."
            )

        return tasks_to_run, tasks_requiring_decision, error_message

    def get_resume_from_task(
        self,
        resume_from_task_id: str,
        all_task_ids: List[str],
    ) -> Set[str]:
        """Get tasks to run when resuming from a specific task.

        :param resume_from_task_id: Task ID to resume from
        :param all_task_ids: Ordered list of all task IDs
        :return: Set of task IDs to execute
        :raises ValueError: If resume_from_task_id is not found
        """
        # Find the position of the resume-from task
        try:
            resume_index = all_task_ids.index(resume_from_task_id)
        except ValueError:
            raise ValueError(f"Task '{resume_from_task_id}' not found in taskfile")

        # Return all tasks from resume point onwards
        tasks_to_run = set(all_task_ids[resume_index:])

        # Also include any pending tasks that might have been skipped
        tasks_to_run.update(self.pending_tasks)

        return tasks_to_run
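    # --- Editor's illustrative sketch; not part of rushti/checkpoint.py ---
    # Whether an interrupted task is retried automatically depends on its
    # safe_retry flag. Given a loaded checkpoint cp with
    # in_progress_tasks == {"t1"} and pending_tasks == {"t2"}:
    #
    #     to_run, undecided, err = cp.get_tasks_for_resume({"t1": True})
    #     assert to_run == {"t1", "t2"} and not undecided and err is None
    #
    #     to_run, undecided, err = cp.get_tasks_for_resume({"t1": False})
    #     assert undecided == {"t1"} and err is not None  # caller must pass --resume-from
    #
    #     # Explicit resume point: everything from "t1" onward in taskfile order
    #     assert cp.get_resume_from_task("t1", ["t0", "t1", "t2"]) == {"t1", "t2"}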
    @property
    def is_complete(self) -> bool:
        """Check if all tasks are complete (no pending or in-progress)."""
        return len(self.pending_tasks) == 0 and len(self.in_progress_tasks) == 0

    @property
    def success_count(self) -> int:
        """Count of successfully completed tasks."""
        return sum(1 for r in self.completed_tasks.values() if r.success)

    @property
    def failure_count(self) -> int:
        """Count of failed tasks."""
        return len(self.failed_tasks)

    @property
    def progress_percentage(self) -> float:
        """Percentage of tasks completed."""
        if self.total_tasks == 0:
            return 0.0
        completed = len(self.completed_tasks)
        return (completed / self.total_tasks) * 100
    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization."""
        return {
            "version": self.version,
            "taskfile_path": self.taskfile_path,
            "workflow": self.workflow,
            "taskfile_hash": self.taskfile_hash,
            "run_started": self.run_started,
            "checkpoint_created": self.checkpoint_created,
            "total_tasks": self.total_tasks,
            "completed_tasks": {k: v.to_dict() for k, v in self.completed_tasks.items()},
            "in_progress_tasks": list(self.in_progress_tasks),
            "pending_tasks": list(self.pending_tasks),
            "failed_tasks": list(self.failed_tasks),
            "skipped_tasks": list(self.skipped_tasks),
            "summary": {
                "completed": len(self.completed_tasks),
                "in_progress": len(self.in_progress_tasks),
                "pending": len(self.pending_tasks),
                "failed": len(self.failed_tasks),
                "skipped": len(self.skipped_tasks),
                "success_count": self.success_count,
                "progress_percentage": round(self.progress_percentage, 1),
            },
        }

    @classmethod
    def from_dict(cls, data: dict) -> "Checkpoint":
        """Create Checkpoint from dictionary."""
        # Parse completed_tasks
        completed_tasks = {}
        for task_id, result_data in data.get("completed_tasks", {}).items():
            completed_tasks[task_id] = TaskResult.from_dict(result_data)

        return cls(
            taskfile_path=data["taskfile_path"],
            workflow=data["workflow"],
            taskfile_hash=data["taskfile_hash"],
            run_started=data["run_started"],
            checkpoint_created=data["checkpoint_created"],
            completed_tasks=completed_tasks,
            in_progress_tasks=set(data.get("in_progress_tasks", [])),
            pending_tasks=set(data.get("pending_tasks", [])),
            failed_tasks=set(data.get("failed_tasks", [])),
            skipped_tasks=set(data.get("skipped_tasks", [])),
            total_tasks=data.get("total_tasks", 0),
            version=data.get("version", "1.0"),
        )
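    # --- Editor's illustrative sketch; not part of rushti/checkpoint.py ---
    # to_dict() adds a derived "summary" block for human readers; from_dict()
    # ignores it, so the round trip through JSON is lossless:
    #
    #     import json
    #     from rushti.checkpoint import Checkpoint
    #
    #     data = json.loads(json.dumps(cp.to_dict()))
    #     restored = Checkpoint.from_dict(data)
    #     assert restored.pending_tasks == cp.pending_tasks
    #     assert restored.completed_tasks.keys() == cp.completed_tasks.keys()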
    def validate_against_taskfile(
        self,
        taskfile_path: str,
        strict: bool = True,
    ) -> tuple:
        """Validate this checkpoint matches the given taskfile.

        :param taskfile_path: Path to the current taskfile
        :param strict: If True, require exact hash match; if False, just warn
        :return: Tuple of (is_valid, warnings)
        """
        warnings = []
        is_valid = True

        # Check file hash
        current_hash = _compute_file_hash(taskfile_path)
        if current_hash != self.taskfile_hash:
            msg = (
                f"Taskfile has been modified since checkpoint was created. "
                f"Original hash: {self.taskfile_hash[:8]}..., "
                f"Current hash: {current_hash[:8]}..."
            )
            if strict:
                is_valid = False
                warnings.append(f"ERROR: {msg}")
            else:
                warnings.append(f"WARNING: {msg}")

        # Check path matches (normalized)
        checkpoint_path = Path(self.taskfile_path).resolve()
        current_path = Path(taskfile_path).resolve()
        if checkpoint_path != current_path:
            warnings.append(
                f"WARNING: Checkpoint was created for '{checkpoint_path}', "
                f"but resuming with '{current_path}'"
            )

        return is_valid, warnings

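# --- Editor's illustrative sketch; not part of rushti/checkpoint.py ---
# Resume validation: strict mode turns a taskfile-hash mismatch into a
# blocking error, lenient mode downgrades it to a warning ("tasks.json"
# is a placeholder path):
#
#     is_valid, warnings = cp.validate_against_taskfile("tasks.json", strict=False)
#     for w in warnings:
#         print(w)  # e.g. "WARNING: Taskfile has been modified since ..."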
def _compute_file_hash(file_path: str) -> str:
    """Compute SHA-256 hash of a file.

    :param file_path: Path to the file
    :return: Hex digest of the hash
    """
    sha256 = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(_FILE_HASH_CHUNK_SIZE), b""):
            sha256.update(chunk)
    return sha256.hexdigest()

def save_checkpoint(checkpoint: Checkpoint, file_path: str) -> None:
    """Save checkpoint to file atomically.

    Uses a write-to-temp-then-rename strategy to ensure the checkpoint
    file is never left in a partial/corrupt state.

    :param checkpoint: Checkpoint to save
    :param file_path: Target file path
    """
    from rushti.utils import ensure_shared_file, makedirs_shared

    file_path = Path(file_path)

    # Ensure directory exists (shared permissions for multi-user access)
    makedirs_shared(str(file_path.parent))

    # Write to temporary file first
    fd, temp_path = tempfile.mkstemp(
        suffix=".tmp",
        prefix="checkpoint_",
        dir=file_path.parent,
    )

    try:
        with os.fdopen(fd, "w") as f:
            json.dump(checkpoint.to_dict(), f, indent=2)

        # Atomic rename (on POSIX systems)
        # On Windows, need to remove target first if it exists
        if os.name == "nt" and file_path.exists():
            file_path.unlink()

        os.rename(temp_path, file_path)
        ensure_shared_file(str(file_path))
        logger.debug(f"Checkpoint saved to {file_path}")

    except Exception as e:
        # Clean up temp file on error
        if os.path.exists(temp_path):
            os.unlink(temp_path)
        raise RuntimeError(f"Failed to save checkpoint: {e}") from e

def load_checkpoint(file_path: str) -> Checkpoint:
    """Load checkpoint from file.

    :param file_path: Path to checkpoint file
    :return: Loaded Checkpoint instance
    :raises FileNotFoundError: If checkpoint file doesn't exist
    :raises ValueError: If checkpoint file is invalid
    """
    file_path = Path(file_path)

    if not file_path.exists():
        raise FileNotFoundError(f"Checkpoint file not found: {file_path}")

    try:
        with open(file_path, "r") as f:
            data = json.load(f)

        # Validate required fields
        required_fields = ["taskfile_path", "workflow", "run_started"]
        for field_name in required_fields:
            if field_name not in data:
                raise ValueError(f"Checkpoint missing required field: {field_name}")

        checkpoint = Checkpoint.from_dict(data)
        logger.info(
            f"Loaded checkpoint from {file_path}: "
            f"{checkpoint.success_count}/{checkpoint.total_tasks} completed, "
            f"{len(checkpoint.pending_tasks)} pending, "
            f"{len(checkpoint.in_progress_tasks)} in-progress"
        )

        return checkpoint

    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid checkpoint file format: {e}") from e

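# --- Editor's illustrative sketch; not part of rushti/checkpoint.py ---
# save_checkpoint() writes to a temp file in the target directory and then
# renames it into place, so a reader never sees a half-written checkpoint:
#
#     import tempfile
#     from pathlib import Path
#     from rushti.checkpoint import load_checkpoint, save_checkpoint
#
#     target = Path(tempfile.mkdtemp()) / "checkpoint_demo.json"
#     save_checkpoint(cp, str(target))
#     reloaded = load_checkpoint(str(target))
#     assert reloaded.workflow == cp.workflow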
def delete_checkpoint(file_path: str) -> bool:
    """Delete a checkpoint file.

    :param file_path: Path to checkpoint file
    :return: True if deleted, False if didn't exist
    """
    file_path = Path(file_path)

    if file_path.exists():
        file_path.unlink()
        logger.info(f"Checkpoint deleted: {file_path}")
        return True

    return False


def get_checkpoint_path(
    checkpoint_dir: str,
    workflow: str,
) -> Path:
    """Get the standard checkpoint file path for a taskfile.

    :param checkpoint_dir: Base directory for checkpoints
    :param workflow: Workflow name
    :return: Path to the checkpoint file
    """
    # Sanitize workflow name for use in filename
    safe_id = "".join(c if c.isalnum() or c in "-_" else "_" for c in workflow)
    return Path(checkpoint_dir) / f"checkpoint_{safe_id}.json"


def find_checkpoint_for_taskfile(
    checkpoint_dir: str,
    taskfile_path: str,
) -> Optional[Path]:
    """Find an existing checkpoint for a taskfile.

    :param checkpoint_dir: Directory containing checkpoints
    :param taskfile_path: Path to the taskfile
    :return: Path to checkpoint file if found, None otherwise
    """
    checkpoint_dir = Path(checkpoint_dir)

    if not checkpoint_dir.exists():
        return None

    # Look for checkpoint files
    taskfile_path_resolved = Path(taskfile_path).resolve()

    for checkpoint_file in checkpoint_dir.glob("checkpoint_*.json"):
        try:
            checkpoint = load_checkpoint(checkpoint_file)
            if Path(checkpoint.taskfile_path).resolve() == taskfile_path_resolved:
                return checkpoint_file
        except (ValueError, FileNotFoundError):
            continue

    return None

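# --- Editor's illustrative sketch; not part of rushti/checkpoint.py ---
# get_checkpoint_path() keeps alphanumerics, "-" and "_" in the workflow
# name and replaces every other character with "_":
#
#     from rushti.checkpoint import get_checkpoint_path
#
#     path = get_checkpoint_path("./checkpoints", "daily etl/v2.1")
#     assert path.name == "checkpoint_daily_etl_v2_1.json"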
class CheckpointManager:
    """Manages checkpoint creation, saving, and cleanup during execution.

    Usage:
        manager = CheckpointManager(
            checkpoint_dir="./checkpoints",
            taskfile_path="tasks.json",
            workflow="daily-etl",
            task_ids=["1", "2", "3"],
            checkpoint_interval=60,
        )

        # During execution
        manager.mark_running("1")
        manager.mark_completed("1", success=True, duration_seconds=10.5)

        # After successful completion
        manager.cleanup()
    """

    def __init__(
        self,
        checkpoint_dir: str,
        taskfile_path: str,
        workflow: str,
        task_ids: List[str],
        checkpoint_interval: int = 60,
        enabled: bool = True,
    ):
        """Initialize checkpoint manager.

        :param checkpoint_dir: Directory for checkpoint files
        :param taskfile_path: Path to the task file
        :param workflow: Workflow name
        :param task_ids: List of all task IDs
        :param checkpoint_interval: Seconds between automatic checkpoint saves
        :param enabled: Whether checkpointing is enabled
        """
        self.enabled = enabled
        self.checkpoint_dir = Path(checkpoint_dir)
        self.checkpoint_path = get_checkpoint_path(checkpoint_dir, workflow)
        self.checkpoint_interval = checkpoint_interval
        self._last_save_time = datetime.now()

        if enabled:
            self.checkpoint = Checkpoint.create(
                taskfile_path=taskfile_path,
                workflow=workflow,
                task_ids=task_ids,
            )
            # Create initial checkpoint
            self._save()
        else:
            self.checkpoint = None
    def mark_running(self, task_id: str) -> None:
        """Mark a task as running."""
        if not self.enabled or not self.checkpoint:
            return
        self.checkpoint.mark_running(task_id)
        self._maybe_save()

    def mark_completed(
        self,
        task_id: str,
        success: bool,
        duration_seconds: float,
        retry_count: int = 0,
        error_message: Optional[str] = None,
    ) -> None:
        """Mark a task as completed and save checkpoint."""
        if not self.enabled or not self.checkpoint:
            return

        self.checkpoint.mark_completed(
            task_id=task_id,
            success=success,
            duration_seconds=duration_seconds,
            retry_count=retry_count,
            error_message=error_message,
        )
        # Always save on task completion
        self._save()

    def mark_skipped(self, task_id: str, reason: str = "predecessor_failed") -> None:
        """Mark a task as skipped."""
        if not self.enabled or not self.checkpoint:
            return
        self.checkpoint.mark_skipped(task_id, reason)
        self._maybe_save()

    def _maybe_save(self) -> None:
        """Save checkpoint if interval has elapsed."""
        now = datetime.now()
        elapsed = (now - self._last_save_time).total_seconds()
        if elapsed >= self.checkpoint_interval:
            self._save()

    def _save(self) -> None:
        """Save the current checkpoint."""
        if not self.enabled or not self.checkpoint:
            return

        try:
            save_checkpoint(self.checkpoint, self.checkpoint_path)
            self._last_save_time = datetime.now()
        except Exception as e:
            logger.warning(f"Failed to save checkpoint: {e}")

    def force_save(self) -> None:
        """Force immediate checkpoint save."""
        self._save()

    def cleanup(self, success: bool = True) -> None:
        """Clean up checkpoint after execution.

        :param success: If True, delete checkpoint; if False, retain for resume
        """
        if not self.enabled:
            return

        if success:
            delete_checkpoint(self.checkpoint_path)
        else:
            # Ensure final state is saved
            self._save()
            logger.info(f"Checkpoint retained for resume: {self.checkpoint_path}")
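Taken together, the module suggests the following end-to-end flow. A minimal sketch, assuming the wheel is installed; the workflow name, task IDs, durations, and paths below are invented for illustration:

    import tempfile
    from pathlib import Path

    from rushti.checkpoint import CheckpointManager, find_checkpoint_for_taskfile

    # Stand-in taskfile; real taskfiles are parsed by rushti.taskfile.
    taskfile = Path(tempfile.mkdtemp()) / "tasks.json"
    taskfile.write_text('{"tasks": []}')

    checkpoint_dir = tempfile.mkdtemp()
    manager = CheckpointManager(
        checkpoint_dir=checkpoint_dir,
        taskfile_path=str(taskfile),
        workflow="daily-etl",
        task_ids=["extract", "transform", "load"],
        checkpoint_interval=60,  # periodic saves at most once a minute
    )

    manager.mark_running("extract")
    manager.mark_completed("extract", success=True, duration_seconds=10.5)

    manager.mark_running("transform")
    manager.mark_completed("transform", success=False, duration_seconds=3.2,
                           error_message="TI process failed")
    manager.mark_skipped("load")  # predecessor failed

    # A failed run keeps its checkpoint on disk so the next run can resume.
    manager.cleanup(success=False)
    assert find_checkpoint_for_taskfile(checkpoint_dir, str(taskfile)) is not None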