b10-transfer 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
b10_transfer/info.py ADDED
@@ -0,0 +1,172 @@
+ import logging
+ from pathlib import Path
+ from typing import Dict, Any, Optional
+
+ from .environment import get_cache_filename, get_environment_key
+ from .archive import get_file_size_mb
+ from .constants import (
+     TORCH_CACHE_DIR,
+     B10FS_CACHE_DIR,
+     CACHE_PREFIX,
+     CACHE_LATEST_SUFFIX,
+     CACHE_FILE_EXTENSION,
+ )
+ from .utils import safe_execute, _is_b10fs_enabled
+
+ logger = logging.getLogger(__name__)
+
+
+ @safe_execute("Failed to calculate local cache size", None)
+ def _calculate_local_cache_size(torch_dir: Path) -> Optional[float]:
+     """Calculate the total size of the local torch cache directory in megabytes.
+
+     Args:
+         torch_dir: Path to the torch cache directory to measure.
+
+     Returns:
+         Optional[float]: Total size of all files in the directory in MB, or
+             None if calculation fails (handled by decorator).
+
+     Raises:
+         Exception: Any filesystem errors during directory traversal
+             (caught by decorator, which returns None).
+     """
+     # FIXME(SR): The directory structure could change while rglob is iterating,
+     # so this is not safe. But this is for debugging anyway; we can remove or
+     # revisit it later. Not critical imho.
+     local_size = sum(f.stat().st_size for f in torch_dir.rglob("*") if f.is_file())
+     return local_size / (1024 * 1024)
+
+
+ @safe_execute("Error reading cache file", None)
+ def _process_cache_file(cache_file: Path) -> Optional[Dict[str, Any]]:
+     """Extract metadata information from a cache file.
+
+     Args:
+         cache_file: Path to the cache file to process.
+
+     Returns:
+         Optional[Dict[str, Any]]: Dictionary of cache file metadata with keys:
+             - filename: The cache file name
+             - environment_key: Extracted environment identifier
+             - size_mb: File size in megabytes
+             - is_current_environment: Whether this matches the current environment
+             - created_time: File modification timestamp
+         Returns None if processing fails (handled by decorator).
+
+     Raises:
+         Exception: Any errors reading file metadata (caught by decorator).
+     """
+     # Extract env key from a name like: cache_a1b2c3d4e5f6.latest.tar.gz
+     env_key = cache_file.name.replace(CACHE_PREFIX, "").replace(
+         f"{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}", ""
+     )
+
+     return {
+         "filename": cache_file.name,
+         "environment_key": env_key,
+         "size_mb": get_file_size_mb(cache_file),
+         "is_current_environment": env_key == get_environment_key(),
+         "created_time": cache_file.stat().st_mtime,
+     }
+
+
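For example, with constant values matching the filename in the comment above (these exact values are an assumption; the real ones live in `.constants`), the key extraction behaves like this:

```python
# Assumed values for illustration only; the real constants are defined in
# b10_transfer.constants and may differ.
CACHE_PREFIX = "cache_"
CACHE_LATEST_SUFFIX = ".latest"
CACHE_FILE_EXTENSION = ".tar.gz"

name = "cache_a1b2c3d4e5f6.latest.tar.gz"
env_key = name.replace(CACHE_PREFIX, "").replace(
    f"{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}", ""
)
print(env_key)  # a1b2c3d4e5f6
```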
+ def get_cache_info() -> Dict[str, Any]:
+     """Get comprehensive information about the current cache state.
+
+     This function provides a snapshot of both local and b10fs cache status,
+     including existence, sizes, and environment information. It safely handles
+     cases where b10fs is unavailable or directories don't exist.
+
+     Returns:
+         Dict[str, Any]: Dictionary containing cache information with keys:
+             - environment_key: Current environment identifier hash
+             - local_cache_exists: Whether the local torch cache has content
+             - b10fs_enabled: Whether the b10fs filesystem is available
+             - b10fs_cache_exists: Whether a cache exists on b10fs
+             - local_cache_size_mb: Local cache size in MB (if it exists)
+             - b10fs_cache_size_mb: b10fs cache size in MB (if it exists)
+
+     Raises:
+         No exceptions are raised; errors are handled gracefully with None values.
+     """
+     torch_dir = Path(TORCH_CACHE_DIR)
+     b10fs_dir = Path(B10FS_CACHE_DIR)
+     cache_filename = get_cache_filename()
+     cache_file = (
+         b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
+     )
+
+     info = {
+         "environment_key": get_environment_key(),
+         "local_cache_exists": torch_dir.exists() and any(torch_dir.iterdir()),
+         "b10fs_enabled": _is_b10fs_enabled(),
+         "b10fs_cache_exists": cache_file.exists() if _is_b10fs_enabled() else False,
+     }
+
+     # Add size info ("b10fs_cache_exists" is only True when b10fs is enabled)
+     if info["local_cache_exists"]:
+         info["local_cache_size_mb"] = _calculate_local_cache_size(torch_dir)
+
+     if info["b10fs_cache_exists"]:
+         info["b10fs_cache_size_mb"] = get_file_size_mb(cache_file)
+
+     return info
+
+
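A minimal usage sketch (assuming the module is importable as `b10_transfer.info`, per the file path above; any top-level re-exports are not shown in this diff):

```python
from b10_transfer.info import get_cache_info

info = get_cache_info()
print(f"environment: {info['environment_key']}")
print(f"b10fs enabled: {info['b10fs_enabled']}")
# Size keys are only present when the corresponding cache exists, and may be
# None if measurement failed inside the safe_execute-decorated helper.
if info.get("local_cache_size_mb") is not None:
    print(f"local cache: {info['local_cache_size_mb']:.1f}MB")
```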
+ def list_available_caches() -> Dict[str, Any]:
+     """List all available cache files with their metadata and environment info.
+
+     This function scans the b10fs directory for all cache files and returns
+     detailed information about each one, including which environment it
+     belongs to and when it was created. Results are sorted by creation time.
+
+     Returns:
+         Dict[str, Any]: Dictionary containing the cache listing with keys:
+             - caches: List of cache file dictionaries (from _process_cache_file)
+             - current_environment: Current environment identifier
+             - total_caches: Total number of cache files found
+             - current_cache_exists: Whether the current environment has a cache
+             - b10fs_enabled: Whether b10fs is available
+             - error: Error message if b10fs is not enabled
+
+     Raises:
+         No exceptions are raised; individual file errors are handled gracefully
+         and problematic files are skipped.
+     """
+     if not _is_b10fs_enabled():
+         return {
+             "caches": [],
+             "current_environment": get_environment_key(),
+             "total_caches": 0,
+             "current_cache_exists": False,
+             "b10fs_enabled": False,
+             "error": "b10fs is not enabled",
+         }
+
+     b10fs_dir = Path(B10FS_CACHE_DIR)
+
+     if not b10fs_dir.exists():
+         return {
+             "caches": [],
+             "current_environment": get_environment_key(),
+             "total_caches": 0,
+             "current_cache_exists": False,
+             "b10fs_enabled": True,
+         }
+
+     caches = []
+
+     # Find all "latest" cache files
+     for cache_file in b10fs_dir.glob(
+         f"{CACHE_PREFIX}*{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
+     ):
+         cache_info = _process_cache_file(cache_file)
+         if cache_info:
+             caches.append(cache_info)
+
+     # Sort by creation time (newest first)
+     caches.sort(key=lambda x: x["created_time"], reverse=True)
+
+     return {
+         "caches": caches,
+         "current_environment": get_environment_key(),
+         "total_caches": len(caches),
+         "current_cache_exists": any(c["is_current_environment"] for c in caches),
+         "b10fs_enabled": True,
+     }
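And a corresponding sketch for the listing side, under the same import assumption:

```python
from b10_transfer.info import list_available_caches

listing = list_available_caches()
if not listing["b10fs_enabled"]:
    print(listing["error"])
else:
    for cache in listing["caches"]:  # newest first
        marker = "*" if cache["is_current_environment"] else " "
        print(f"{marker} {cache['filename']} ({cache['size_mb']:.1f}MB)")
```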
@@ -0,0 +1,299 @@
+ """Space monitoring utilities for b10-tcache.
+
+ This module provides disk space monitoring functionality to prevent cache operations
+ from exhausting available disk space and causing system instability.
+ """
+
+ import time
+ import logging
+ import shutil
+ import threading
+ import multiprocessing
+ from pathlib import Path
+ from typing import Optional
+ from multiprocessing import Process, Queue
+ from functools import wraps
+
+ from .constants import WorkerStatus, SPACE_MONITOR_CHECK_INTERVAL_SECONDS
+
+ logger = logging.getLogger(__name__)
+
+
+ class CacheOperationInterrupted(Exception):
+     """Raised when a cache operation is interrupted due to insufficient disk space."""
+
+     pass
+
+
+ def worker_process(cancelled_message: str):
+     """Decorator for worker functions that handles the common try/except/result_queue pattern.
+
+     This decorator wraps worker functions to:
+     1. Check for cancellation before starting
+     2. Handle exceptions and put the appropriate status in result_queue
+     3. Put a success status on completion
+
+     Args:
+         cancelled_message: Message to send if the worker is cancelled before starting.
+
+     Usage:
+         @worker_process("Operation was cancelled before starting")
+         def my_worker(arg1, arg2, result_queue, stop_event):
+             # Worker logic goes here; there is no need to handle
+             # try/except or call result_queue.put() yourself.
+     """
+
+     def decorator(func):
+         @wraps(func)
+         def wrapper(*args):
+             # Extract result_queue and stop_event from the end of args
+             *worker_args, result_queue, stop_event = args
+
+             try:
+                 # Check for a stop signal before starting
+                 if stop_event.is_set():
+                     result_queue.put((WorkerStatus.CANCELLED.value, cancelled_message))
+                     return
+
+                 # Call the actual worker function with just the worker args
+                 func(*worker_args)
+
+                 # If we get here, the function completed successfully
+                 result_queue.put((WorkerStatus.SUCCESS.value, None))
+
+             except Exception as e:
+                 result_queue.put((WorkerStatus.ERROR.value, str(e)))
+
+         return wrapper
+
+     return decorator
+
+
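A hedged sketch of what a decorated worker might look like, mirroring the Usage note in the docstring (this example worker is not part of the package):

```python
import shutil
from pathlib import Path

@worker_process("Copy was cancelled before starting")
def copy_worker(src: Path, dst: Path):
    # The wrapper is invoked as copy_worker(src, dst, result_queue, stop_event);
    # the trailing queue and event are consumed by the decorator, so the body
    # only sees the real worker arguments.
    shutil.copy2(src, dst)
```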
+ def get_available_disk_space_mb(path: Path) -> float:
+     """Get available disk space in megabytes for the given path.
+
+     This function returns the available disk space for the filesystem
+     containing the specified path. It's useful for checking whether there's
+     enough space before performing disk-intensive operations.
+
+     Args:
+         path: Path to check disk space for. If the path itself doesn't exist,
+             its nearest existing ancestor directory is used.
+
+     Returns:
+         float: Available disk space in megabytes, or 0.0 if unable to
+             determine (e.g., path doesn't exist or permission denied).
+
+     Raises:
+         No exceptions are raised; OSError is caught and 0.0 is returned.
+     """
+     try:
+         # Walk up to an existing directory
+         check_path = path
+         while not check_path.exists() and check_path != check_path.parent:
+             check_path = check_path.parent
+
+         if not check_path.exists():
+             return 0.0
+
+         # Get disk usage stats
+         _, _, free_bytes = shutil.disk_usage(check_path)
+         return free_bytes / (1024 * 1024)
+     except OSError:
+         return 0.0
+
+
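For instance (the path is illustrative), the ancestor walk means the check works even for a destination that hasn't been created yet:

```python
from pathlib import Path

# /data/cache/incoming may not exist yet; the function walks up to the nearest
# existing ancestor (e.g. /data) before calling shutil.disk_usage.
free_mb = get_available_disk_space_mb(Path("/data/cache/incoming"))
print(f"{free_mb:.1f}MB available")
```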
+ def check_sufficient_disk_space(
+     path: Path, required_mb: float, operation_name: str = "operation"
+ ) -> None:
+     """Check if there's sufficient disk space for an operation.
+
+     This function verifies that the filesystem has enough available space
+     for the specified operation, raising an exception if insufficient space
+     is available.
+
+     Args:
+         path: Path where the operation will write data.
+         required_mb: Required disk space in megabytes.
+         operation_name: Name of the operation for error messages.
+
+     Raises:
+         CacheValidationError: If insufficient disk space is available.
+     """
+     from .utils import CacheValidationError
+
+     available_mb = get_available_disk_space_mb(path)
+     if available_mb < required_mb:
+         raise CacheValidationError(
+             f"Insufficient disk space for {operation_name}: "
+             f"required {required_mb:.1f}MB, available {available_mb:.1f}MB"
+         )
+
+
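Callers presumably use this as a pre-flight gate; a sketch (the 500MB figure is an arbitrary example, and CacheValidationError comes from the package's utils module, matching the deferred import above):

```python
from pathlib import Path

from b10_transfer.utils import CacheValidationError

try:
    check_sufficient_disk_space(
        Path("/data/cache"), required_mb=500.0, operation_name="cache save"
    )
except CacheValidationError as e:
    print(f"Skipping cache save: {e}")
```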
+ class CacheSpaceMonitor:
+     """Background monitor for disk space during cache operations.
+
+     This class implements a daemon thread that continuously monitors available
+     disk space and signals when space falls below the required threshold. It
+     follows the SpaceMonitor pattern from node-warmer for graceful operation
+     interruption.
+     """
+
+     def __init__(
+         self, required_space_mb: float, path: Path, check_interval: float = 2.0
+     ):
+         """Initialize the space monitor.
+
+         Args:
+             required_space_mb: Minimum required disk space in megabytes.
+             path: Path to monitor (the filesystem containing this path is checked).
+             check_interval: How often to check disk space in seconds. Defaults to 2.0.
+         """
+         self.required_space_mb = required_space_mb
+         self.path = path
+         self.check_interval = check_interval
+         self.stop_operation = threading.Event()
+         self.thread: Optional[threading.Thread] = None
+
+     def start(self) -> None:
+         """Start monitoring disk space in a background daemon thread."""
+         if self.thread is not None:
+             return  # Already started
+
+         self.thread = threading.Thread(target=self._monitor, daemon=True)
+         self.thread.start()
+         logger.debug(
+             f"Started space monitor for {self.path} (required: {self.required_space_mb:.1f}MB)"
+         )
+
+     def _monitor(self) -> None:
+         """Continuously monitor disk space and signal when it is insufficient."""
+         while not self.stop_operation.is_set():
+             try:
+                 available_mb = get_available_disk_space_mb(self.path)
+                 logger.debug(
+                     f"[SpaceMonitor] Available space: {available_mb:.1f}MB (required: {self.required_space_mb:.1f}MB)"
+                 )
+
+                 if available_mb < self.required_space_mb:
+                     logger.error(
+                         f"CRITICAL: Space ({available_mb:.1f}MB) below required {self.required_space_mb:.1f}MB. Signaling stop!"
+                     )
+                     self.stop_operation.set()
+                     break
+
+             except Exception as e:
+                 logger.warning(f"Space monitor error: {e}")
+
+             time.sleep(self.check_interval)
+
+     def should_stop(self) -> bool:
+         """Check if operations should stop due to insufficient disk space.
+
+         Returns:
+             bool: True if insufficient disk space was detected, False otherwise.
+         """
+         return self.stop_operation.is_set()
+
+     def stop(self) -> None:
+         """Stop the background monitoring thread."""
+         self.stop_operation.set()
+         if self.thread is not None:
+             logger.debug("Stopped space monitor")
+
+
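A minimal sketch of the intended lifecycle (the threshold, path, and per-item work are illustrative, not from the package):

```python
from pathlib import Path

monitor = CacheSpaceMonitor(required_space_mb=1024.0, path=Path("/data/cache"))
monitor.start()
try:
    for chunk in chunks_to_write:  # hypothetical work items
        if monitor.should_stop():
            raise CacheOperationInterrupted("Insufficient disk space during write")
        write_chunk(chunk)  # hypothetical helper
finally:
    monitor.stop()
```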
+ def cleanup_process(
+     process: Process, operation_name: str, timeout: float = 5.0
+ ) -> None:
+     """Clean up a process with graceful termination and a force-kill fallback.
+
+     This helper implements the standard pattern for cleaning up
+     multiprocessing.Process instances with proper timeout handling.
+
+     Args:
+         process: The process to clean up.
+         operation_name: Name of the operation for logging.
+         timeout: How long to wait for graceful termination before force killing.
+     """
+     if process.is_alive():
+         process.terminate()
+         process.join(timeout=timeout)
+         if process.is_alive():
+             logger.warning(f"Force killing {operation_name} process")
+             process.kill()
+             process.join()
+
+
+ def run_monitored_process(
+     worker_func,
+     args,
+     space_monitor: CacheSpaceMonitor,
+     operation_name: str,
+     cleanup_func=None,
+ ) -> None:
+     """Run a worker process with space monitoring and automatic termination.
+
+     This function starts a worker process and monitors it alongside the space
+     monitor. If insufficient disk space is detected, the worker process is
+     terminated and cleanup is performed.
+
+     Args:
+         worker_func: The worker function to run in a separate process.
+         args: Arguments to pass to the worker function.
+         space_monitor: CacheSpaceMonitor instance to check for space issues.
+         operation_name: Name of the operation for logging.
+         cleanup_func: Optional function to call for cleanup if the operation is interrupted.
+
+     Raises:
+         CacheOperationInterrupted: If the operation was interrupted due to insufficient disk space.
+         Exception: If the worker process failed for other reasons.
+     """
+     result_queue = Queue()
+     stop_event = multiprocessing.Event()
+
+     # Append result_queue and stop_event to the worker args
+     worker_args = args + (result_queue, stop_event)
+
+     # Start the worker process
+     process = Process(target=worker_func, args=worker_args)
+     process.start()
+
+     try:
+         # Monitor the process
+         while process.is_alive():
+             if space_monitor.should_stop():
+                 logger.warning(f"Low disk space detected, cancelling {operation_name}")
+                 stop_event.set()
+                 cleanup_process(process, operation_name)
+
+                 # Run cleanup if provided
+                 if cleanup_func:
+                     cleanup_func()
+
+                 raise CacheOperationInterrupted(
+                     f"{operation_name} was cancelled due to insufficient disk space"
+                 )
+
+             time.sleep(SPACE_MONITOR_CHECK_INTERVAL_SECONDS)
+
+         # Process finished, get the result
+         process.join()
+
+         if not result_queue.empty():
+             status, error_msg = result_queue.get()
+             if status == WorkerStatus.ERROR.value:
+                 logger.error(f"{operation_name} worker failed: {error_msg}")
+                 raise Exception(error_msg)
+             elif status == WorkerStatus.CANCELLED.value:
+                 if cleanup_func:
+                     cleanup_func()
+                 raise CacheOperationInterrupted(error_msg)
+             # status == WorkerStatus.SUCCESS.value -- continue normally
+
+         logger.debug(f"{operation_name} completed successfully")
+
+     except Exception as e:
+         # Ensure the process is cleaned up
+         cleanup_process(process, operation_name)
+
+         if not isinstance(e, CacheOperationInterrupted):
+             logger.error(f"{operation_name} failed: {e}")
+         raise
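Putting the pieces together, a hedged end-to-end sketch (paths, sizes, and the archive worker are illustrative; note that on platforms using the spawn start method rather than fork, the closure-based wrapper produced by worker_process may not pickle cleanly):

```python
import shutil
from pathlib import Path

@worker_process("Archiving was cancelled before starting")
def archive_worker(src_dir: Path, dest_base: Path):
    # Illustrative work: pack src_dir into dest_base.tar.gz.
    shutil.make_archive(str(dest_base), "gztar", root_dir=src_dir)

monitor = CacheSpaceMonitor(required_space_mb=2048.0, path=Path("/data/cache"))
monitor.start()
try:
    run_monitored_process(
        archive_worker,
        (Path("/data/cache/torch"), Path("/data/cache/torch_backup")),
        monitor,
        "archive creation",
        cleanup_func=lambda: Path("/data/cache/torch_backup.tar.gz").unlink(
            missing_ok=True
        ),
    )
except CacheOperationInterrupted as e:
    logger.warning(f"Interrupted: {e}")
finally:
    monitor.stop()
```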