b10_transfer-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- b10_transfer/__init__.py +51 -0
- b10_transfer/archive.py +175 -0
- b10_transfer/async_torch_cache.py +62 -0
- b10_transfer/async_transfers.py +275 -0
- b10_transfer/cleanup.py +179 -0
- b10_transfer/constants.py +149 -0
- b10_transfer/core.py +160 -0
- b10_transfer/environment.py +134 -0
- b10_transfer/info.py +172 -0
- b10_transfer/space_monitor.py +299 -0
- b10_transfer/torch_cache.py +376 -0
- b10_transfer/utils.py +355 -0
- b10_transfer-0.0.1.dist-info/METADATA +219 -0
- b10_transfer-0.0.1.dist-info/RECORD +15 -0
- b10_transfer-0.0.1.dist-info/WHEEL +4 -0
b10_transfer/info.py
ADDED
@@ -0,0 +1,172 @@
import logging
from pathlib import Path
from typing import Dict, Any, Optional

from .environment import get_cache_filename, get_environment_key
from .archive import get_file_size_mb
from .constants import (
    TORCH_CACHE_DIR,
    B10FS_CACHE_DIR,
    CACHE_PREFIX,
    CACHE_LATEST_SUFFIX,
    CACHE_FILE_EXTENSION,
)
from .utils import safe_execute, _is_b10fs_enabled

logger = logging.getLogger(__name__)


@safe_execute("Failed to calculate local cache size", None)
def _calculate_local_cache_size(torch_dir: Path) -> Optional[float]:
    """Calculate the total size of the local torch cache directory in megabytes.

    Args:
        torch_dir: Path to the torch cache directory to measure.

    Returns:
        Optional[float]: Total size of all files in the directory in MB, or
            None if the calculation fails (handled by decorator).

    Raises:
        Exception: Any filesystem errors during directory traversal
            (caught by decorator, which returns None).
    """
    # FIXME(SR): The directory structure could change while rglob is iterating,
    # so this is not safe. But this is only for debugging anyway; we can
    # remove/revisit it later. Not critical imho.
    local_size = sum(f.stat().st_size for f in torch_dir.rglob("*") if f.is_file())
    return local_size / (1024 * 1024)

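# Note: per the docstrings in this module, safe_execute(message, fallback) is
# defined in .utils (not shown in this hunk); it logs the message and returns
# the fallback (None here) when the wrapped function raises, so callers must
# expect a None result on failure.
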
@safe_execute("Error reading cache file", None)
def _process_cache_file(cache_file: Path) -> Optional[Dict[str, Any]]:
    """Extract metadata information from a cache file.

    Args:
        cache_file: Path to the cache file to process.

    Returns:
        Optional[Dict[str, Any]]: Dictionary containing cache file metadata with keys:
            - filename: The cache file name
            - environment_key: Extracted environment identifier
            - size_mb: File size in megabytes
            - is_current_environment: Whether this matches the current environment
            - created_time: File modification timestamp (st_mtime)
        Returns None if processing fails (handled by decorator).

    Raises:
        Exception: Any errors reading file metadata (caught by decorator).
    """
    # Extract env key: cache_a1b2c3d4e5f6.latest.tar.gz
    env_key = cache_file.name.replace(CACHE_PREFIX, "").replace(
        f"{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}", ""
    )

    return {
        "filename": cache_file.name,
        "environment_key": env_key,
        "size_mb": get_file_size_mb(cache_file),
        "is_current_environment": env_key == get_environment_key(),
        "created_time": cache_file.stat().st_mtime,
    }

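# A minimal sketch of the parse above, assuming CACHE_PREFIX = "cache_",
# CACHE_LATEST_SUFFIX = ".latest", and CACHE_FILE_EXTENSION = ".tar.gz"
# (values implied by the example filename in the comment, defined in .constants):
#
#   >>> "cache_a1b2c3d4e5f6.latest.tar.gz".replace("cache_", "").replace(".latest.tar.gz", "")
#   'a1b2c3d4e5f6'
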
def get_cache_info() -> Dict[str, Any]:
    """Get comprehensive information about the current cache state.

    This function provides a snapshot of both local and b10fs cache status,
    including existence, sizes, and environment information. It safely handles
    cases where b10fs is unavailable or directories don't exist.

    Returns:
        Dict[str, Any]: Dictionary containing cache information with keys:
            - environment_key: Current environment identifier hash
            - local_cache_exists: Whether local torch cache has content
            - b10fs_enabled: Whether b10fs filesystem is available
            - b10fs_cache_exists: Whether cache exists on b10fs
            - local_cache_size_mb: Local cache size in MB (if exists)
            - b10fs_cache_size_mb: B10fs cache size in MB (if exists)

    Raises:
        No exceptions are raised; errors are handled gracefully with None values.
    """
    torch_dir = Path(TORCH_CACHE_DIR)
    b10fs_dir = Path(B10FS_CACHE_DIR)
    cache_filename = get_cache_filename()
    cache_file = (
        b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
    )

    info = {
        "environment_key": get_environment_key(),
        "local_cache_exists": torch_dir.exists() and any(torch_dir.iterdir()),
        "b10fs_enabled": _is_b10fs_enabled(),
        "b10fs_cache_exists": cache_file.exists() if _is_b10fs_enabled() else False,
    }

    # Add size info
    if info["local_cache_exists"]:
        info["local_cache_size_mb"] = _calculate_local_cache_size(torch_dir)

    if info["b10fs_cache_exists"] and _is_b10fs_enabled():
        info["b10fs_cache_size_mb"] = get_file_size_mb(cache_file)

    return info

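# A usage sketch for get_cache_info(); the values are illustrative, assuming
# b10fs is enabled and a cache archive exists for this environment:
#
#   >>> get_cache_info()
#   {'environment_key': 'a1b2c3d4e5f6',
#    'local_cache_exists': True,
#    'b10fs_enabled': True,
#    'b10fs_cache_exists': True,
#    'local_cache_size_mb': 412.3,
#    'b10fs_cache_size_mb': 398.7}
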
def list_available_caches() -> Dict[str, Any]:
    """List all available cache files with their metadata and environment info.

    This function scans the b10fs directory for all cache files and returns
    detailed information about each one, including which environment they
    belong to. Results are sorted by modification time, newest first.

    Returns:
        Dict[str, Any]: Dictionary containing cache listing with keys:
            - caches: List of cache file dictionaries (from _process_cache_file)
            - current_environment: Current environment identifier
            - total_caches: Total number of cache files found
            - current_cache_exists: Whether current environment has a cache
            - b10fs_enabled: Whether b10fs is available
            - error: Error message if b10fs is not enabled

    Raises:
        No exceptions are raised; individual file errors are handled gracefully
        and problematic files are skipped.
    """
    if not _is_b10fs_enabled():
        return {
            "caches": [],
            "current_environment": get_environment_key(),
            "b10fs_enabled": False,
            "error": "b10fs is not enabled",
        }

    b10fs_dir = Path(B10FS_CACHE_DIR)

    if not b10fs_dir.exists():
        return {
            "caches": [],
            "current_environment": get_environment_key(),
            "b10fs_enabled": True,
        }

    caches = []

    # Find all latest cache files
    for cache_file in b10fs_dir.glob(
        f"{CACHE_PREFIX}*{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
    ):
        cache_info = _process_cache_file(cache_file)
        if cache_info:
            caches.append(cache_info)

    # Sort by modification time (newest first)
    caches.sort(key=lambda x: x["created_time"], reverse=True)

    return {
        "caches": caches,
        "current_environment": get_environment_key(),
        "total_caches": len(caches),
        "current_cache_exists": any(c["is_current_environment"] for c in caches),
        "b10fs_enabled": True,
    }
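A short usage sketch for the listing helper above. It imports from the module path shown in this diff (whether the package's __init__.py also re-exports these names is not visible in this hunk), and uses .get() because the b10fs-enabled-but-empty branch omits the total_caches key:

from b10_transfer.info import list_available_caches

listing = list_available_caches()
if listing["b10fs_enabled"]:
    print(f"{listing.get('total_caches', 0)} cache(s) on b10fs")
    for cache in listing["caches"]:
        marker = "*" if cache["is_current_environment"] else " "
        print(f"{marker} {cache['filename']} ({cache['size_mb']:.1f} MB)")
else:
    print(listing["error"])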
b10_transfer/space_monitor.py
ADDED
@@ -0,0 +1,299 @@
"""Space monitoring utilities for b10-tcache.

This module provides disk space monitoring functionality to prevent cache operations
from exhausting available disk space and causing system instability.
"""

import time
import logging
import shutil
import threading
import multiprocessing
from pathlib import Path
from multiprocessing import Process, Queue
from typing import Optional
from functools import wraps

from .constants import WorkerStatus, SPACE_MONITOR_CHECK_INTERVAL_SECONDS

logger = logging.getLogger(__name__)


class CacheOperationInterrupted(Exception):
    """Raised when a cache operation is interrupted due to insufficient disk space."""

    pass


def worker_process(cancelled_message: str):
    """Decorator for worker process functions to handle the common try/except/result_queue pattern.

    This decorator wraps worker functions to:
    1. Check for cancellation before starting
    2. Handle exceptions and put the appropriate status in result_queue
    3. Put a success status on completion

    Args:
        cancelled_message: Message to send if the worker is cancelled before starting

    Usage:
        @worker_process("Operation was cancelled before starting")
        def my_worker(arg1, arg2):
            # Worker logic only; no need to handle try/except or
            # result_queue.put(). The decorated function is still invoked as
            # my_worker(arg1, arg2, result_queue, stop_event).
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args):
            # Extract result_queue and stop_event from the end of args
            *worker_args, result_queue, stop_event = args

            try:
                # Check for stop signal before starting
                if stop_event.is_set():
                    result_queue.put((WorkerStatus.CANCELLED.value, cancelled_message))
                    return

                # Call the actual worker function with just the worker args
                func(*worker_args)

                # If we get here, the function completed successfully
                result_queue.put((WorkerStatus.SUCCESS.value, None))

            except Exception as e:
                result_queue.put((WorkerStatus.ERROR.value, str(e)))

        return wrapper

    return decorator

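# A hypothetical decorated worker (the names are illustrative, not from this
# package), matching the calling convention described above:
#
#   @worker_process("Copy was cancelled before starting")
#   def _copy_worker(src: str, dest: str):
#       shutil.copy2(src, dest)
#
#   # run_monitored_process (below) appends (result_queue, stop_event) to the
#   # args tuple, and the decorator strips them off before calling the body.
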
def get_available_disk_space_mb(path: Path) -> float:
    """Get available disk space in megabytes for the given path.

    This function returns the available disk space for the filesystem
    containing the specified path. It's useful for checking if there's
    enough space before performing disk-intensive operations.

    Args:
        path: Path to check disk space for. The path's parent directory
            will be used if the path itself doesn't exist.

    Returns:
        float: Available disk space in megabytes, or 0.0 if unable to
            determine (e.g., path doesn't exist or permission denied).

    Raises:
        No exceptions are raised; OSError is caught and returns 0.0.
    """
    try:
        # Ensure we check an existing directory
        check_path = path
        while not check_path.exists() and check_path != check_path.parent:
            check_path = check_path.parent

        if not check_path.exists():
            return 0.0

        # Get disk usage stats
        _, _, free_bytes = shutil.disk_usage(check_path)
        return free_bytes / (1024 * 1024)
    except OSError:
        return 0.0


def check_sufficient_disk_space(
    path: Path, required_mb: float, operation_name: str = "operation"
) -> None:
    """Check if there's sufficient disk space for an operation.

    This function verifies that the filesystem has enough available space
    for the specified operation, raising an exception if insufficient space
    is available.

    Args:
        path: Path where the operation will write data.
        required_mb: Required disk space in megabytes.
        operation_name: Name of the operation for error messages.

    Raises:
        CacheValidationError: If insufficient disk space is available.
    """
    from .utils import CacheValidationError

    available_mb = get_available_disk_space_mb(path)
    if available_mb < required_mb:
        raise CacheValidationError(
            f"Insufficient disk space for {operation_name}: "
            f"required {required_mb:.1f}MB, available {available_mb:.1f}MB"
        )

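# Sketch: guard a write of roughly 500 MB before starting it (the path is
# illustrative):
#
#   check_sufficient_disk_space(
#       Path("/cache/tmp"), required_mb=500.0, operation_name="archive creation"
#   )
#   # Raises CacheValidationError if less than 500 MB is free; returns None otherwise.
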
class CacheSpaceMonitor:
    """Background monitor for disk space during cache operations.

    This class implements a daemon thread that continuously monitors available
    disk space and signals when space falls below required thresholds. It follows
    the SpaceMonitor pattern from node-warmer for graceful operation interruption.
    """

    def __init__(
        self, required_space_mb: float, path: Path, check_interval: float = 2.0
    ):
        """Initialize the space monitor.

        Args:
            required_space_mb: Minimum required disk space in megabytes.
            path: Path to monitor for disk space (will check the filesystem containing this path).
            check_interval: How often to check disk space in seconds. Defaults to 2.0.
        """
        self.required_space_mb = required_space_mb
        self.path = path
        self.check_interval = check_interval
        self.stop_operation = threading.Event()
        self.thread: Optional[threading.Thread] = None

    def start(self) -> None:
        """Start monitoring disk space in a background daemon thread."""
        if self.thread is not None:
            return  # Already started

        self.thread = threading.Thread(target=self._monitor, daemon=True)
        self.thread.start()
        logger.debug(
            f"Started space monitor for {self.path} (required: {self.required_space_mb:.1f}MB)"
        )

    def _monitor(self) -> None:
        """Continuously monitor disk space and signal when insufficient."""
        while not self.stop_operation.is_set():
            try:
                available_mb = get_available_disk_space_mb(self.path)
                logger.debug(
                    f"[SpaceMonitor] Available space: {available_mb:.1f}MB (required: {self.required_space_mb:.1f}MB)"
                )

                if available_mb < self.required_space_mb:
                    logger.error(
                        f"CRITICAL: Space ({available_mb:.1f}MB) below required {self.required_space_mb:.1f}MB. Signaling stop!"
                    )
                    self.stop_operation.set()
                    break

            except Exception as e:
                logger.warning(f"Space monitor error: {e}")

            time.sleep(self.check_interval)

    def should_stop(self) -> bool:
        """Check if operations should stop due to insufficient disk space.

        Returns:
            bool: True if insufficient disk space was detected, False otherwise.
        """
        return self.stop_operation.is_set()

    def stop(self) -> None:
        """Stop the background monitoring thread (the daemon thread exits on its next check)."""
        self.stop_operation.set()
        if self.thread is not None:
            logger.debug("Stopped space monitor")

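# Sketch of the monitor lifecycle (values illustrative):
#
#   monitor = CacheSpaceMonitor(required_space_mb=1024.0, path=Path("/cache"))
#   monitor.start()       # daemon thread, polls every 2 s by default
#   ...                   # do disk-heavy work, checking monitor.should_stop()
#   monitor.stop()        # signal the thread to exit
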
def cleanup_process(
    process: Process, operation_name: str, timeout: float = 5.0
) -> None:
    """Clean up a process with graceful termination and force kill fallback.

    This helper function implements the standard pattern for cleaning up
    multiprocessing.Process instances with proper timeout handling.

    Args:
        process: The process to clean up.
        operation_name: Name of the operation for logging.
        timeout: How long to wait for graceful termination before force kill.
    """
    if process.is_alive():
        process.terminate()
        process.join(timeout=timeout)
        if process.is_alive():
            logger.warning(f"Force killing {operation_name} process")
            process.kill()
            process.join()

def run_monitored_process(
    worker_func,
    args,
    space_monitor: CacheSpaceMonitor,
    operation_name: str,
    cleanup_func=None,
) -> None:
    """Run a worker process with space monitoring and automatic termination.

    This function starts a worker process and monitors it alongside the space monitor.
    If insufficient disk space is detected, the worker process is terminated and
    cleanup is performed.

    Args:
        worker_func: The worker function to run in a separate process.
        args: Arguments to pass to the worker function.
        space_monitor: CacheSpaceMonitor instance to check for space issues.
        operation_name: Name of the operation for logging.
        cleanup_func: Optional function to call for cleanup if the operation is interrupted.

    Raises:
        CacheOperationInterrupted: If the operation was interrupted due to insufficient disk space.
        Exception: If the worker process failed for other reasons.
    """
    result_queue = Queue()
    stop_event = multiprocessing.Event()

    # Add result_queue and stop_event to worker args
    worker_args = args + (result_queue, stop_event)

    # Start the worker process
    process = Process(target=worker_func, args=worker_args)
    process.start()

    try:
        # Monitor the process
        while process.is_alive():
            if space_monitor.should_stop():
                logger.warning(f"Low disk space detected, cancelling {operation_name}")
                stop_event.set()
                cleanup_process(process, operation_name)

                # Run cleanup if provided
                if cleanup_func:
                    cleanup_func()

                raise CacheOperationInterrupted(
                    f"{operation_name} was cancelled due to insufficient disk space"
                )

            time.sleep(SPACE_MONITOR_CHECK_INTERVAL_SECONDS)

        # Process finished, get the result
        process.join()

        if not result_queue.empty():
            status, error_msg = result_queue.get()
            if status == WorkerStatus.ERROR.value:
                logger.error(f"{operation_name} worker failed: {error_msg}")
                raise Exception(error_msg)
            elif status == WorkerStatus.CANCELLED.value:
                if cleanup_func:
                    cleanup_func()
                raise CacheOperationInterrupted(error_msg)
            # status == WorkerStatus.SUCCESS.value - continue normally

        logger.debug(f"{operation_name} completed successfully")

    except Exception as e:
        # Ensure process is cleaned up
        cleanup_process(process, operation_name)

        if not isinstance(e, CacheOperationInterrupted):
            logger.error(f"{operation_name} failed: {e}")
        raise
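Putting the pieces together: a minimal end-to-end sketch of the intended flow. The worker, paths, and sizes below are hypothetical, not from this package, and the sketch assumes the decorated worker lives at module top level so multiprocessing can pickle it by name; args must be a tuple, since the helper appends (result_queue, stop_event) to it.

from pathlib import Path
import shutil

from b10_transfer.space_monitor import (
    CacheOperationInterrupted,
    CacheSpaceMonitor,
    run_monitored_process,
    worker_process,
)

@worker_process("Copy was cancelled before starting")
def copy_worker(src: str, dest: str):
    # Body sees only the worker args; the decorator reports status for us.
    shutil.copy2(src, dest)

monitor = CacheSpaceMonitor(required_space_mb=500.0, path=Path("/cache"))
monitor.start()
try:
    run_monitored_process(
        copy_worker,
        ("/cache/model.tar.gz", "/b10fs/model.tar.gz"),
        monitor,
        "cache copy",
        cleanup_func=lambda: Path("/b10fs/model.tar.gz").unlink(missing_ok=True),
    )
except CacheOperationInterrupted as exc:
    print(f"Aborted: {exc}")
finally:
    monitor.stop()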