b10-transfer 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- b10_transfer/__init__.py +4 -32
- b10_transfer/cleanup.py +1 -1
- b10_transfer/constants.py +2 -23
- b10_transfer/core.py +343 -118
- b10_transfer/space_monitor.py +1 -14
- b10_transfer-0.1.2.dist-info/METADATA +127 -0
- b10_transfer-0.1.2.dist-info/RECORD +12 -0
- b10_transfer/async_torch_cache.py +0 -62
- b10_transfer/async_transfers.py +0 -283
- b10_transfer/torch_cache.py +0 -388
- b10_transfer-0.1.0.dist-info/METADATA +0 -219
- b10_transfer-0.1.0.dist-info/RECORD +0 -15
- {b10_transfer-0.1.0.dist-info → b10_transfer-0.1.2.dist-info}/WHEEL +0 -0
b10_transfer/__init__.py
CHANGED
@@ -1,27 +1,13 @@
|
|
1
|
-
"""B10 Transfer - Lock-free PyTorch
|
1
|
+
"""B10 Transfer - Lock-free PyTorch file transfer for Baseten."""
|
2
2
|
|
3
|
-
from .core import
|
4
|
-
from .torch_cache import load_compile_cache, save_compile_cache, clear_local_cache
|
5
|
-
from .async_transfers import (
|
6
|
-
start_transfer_async,
|
7
|
-
get_transfer_status,
|
8
|
-
is_transfer_complete,
|
9
|
-
wait_for_completion,
|
10
|
-
cancel_transfer,
|
11
|
-
list_active_transfers,
|
12
|
-
TransferProgress,
|
13
|
-
)
|
14
|
-
from .async_torch_cache import (
|
15
|
-
load_compile_cache_async,
|
16
|
-
save_compile_cache_async,
|
17
|
-
)
|
3
|
+
from .core import load_compile_cache, save_compile_cache, clear_local_cache
|
18
4
|
from .utils import CacheError, CacheValidationError
|
19
5
|
from .space_monitor import CacheOperationInterrupted
|
20
6
|
from .info import get_cache_info, list_available_caches
|
21
|
-
from .constants import SaveStatus, LoadStatus
|
7
|
+
from .constants import SaveStatus, LoadStatus
|
22
8
|
|
23
9
|
# Version
|
24
|
-
__version__ = "0.1.
|
10
|
+
__version__ = "0.1.2"
|
25
11
|
|
26
12
|
__all__ = [
|
27
13
|
"CacheError",
|
@@ -29,23 +15,9 @@ __all__ = [
|
|
29
15
|
"CacheOperationInterrupted",
|
30
16
|
"SaveStatus",
|
31
17
|
"LoadStatus",
|
32
|
-
"TransferStatus",
|
33
|
-
"AsyncTransferStatus",
|
34
|
-
"transfer",
|
35
18
|
"load_compile_cache",
|
36
19
|
"save_compile_cache",
|
37
20
|
"clear_local_cache",
|
38
21
|
"get_cache_info",
|
39
22
|
"list_available_caches",
|
40
|
-
# Generic async operations
|
41
|
-
"start_transfer_async",
|
42
|
-
"get_transfer_status",
|
43
|
-
"is_transfer_complete",
|
44
|
-
"wait_for_completion",
|
45
|
-
"cancel_transfer",
|
46
|
-
"list_active_transfers",
|
47
|
-
"TransferProgress",
|
48
|
-
# Torch-specific async operations
|
49
|
-
"load_compile_cache_async",
|
50
|
-
"save_compile_cache_async",
|
51
23
|
]
|
b10_transfer/cleanup.py
CHANGED
b10_transfer/constants.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
"""Configuration constants for b10-
|
1
|
+
"""Configuration constants for b10-transfer.
|
2
2
|
|
3
3
|
This module defines configuration constants for the PyTorch compilation cache system.
|
4
4
|
Some values can be overridden by environment variables, but security caps are enforced
|
@@ -36,8 +36,7 @@ B10FS_CACHE_DIR = validate_path_security(
|
|
36
36
|
_b10fs_cache_dir, [_REQUIRED_TORCH_CACHE_DIR_PREFIX], "B10FS_CACHE_DIR"
|
37
37
|
)
|
38
38
|
|
39
|
-
# Validate LOCAL_WORK_DIR - allow /app, /tmp, and /cache paths
|
40
|
-
# This is like a "scratch" directory where you can do work (like compression/archival for example)
|
39
|
+
# Validate LOCAL_WORK_DIR - allow /app, /tmp, and /cache paths
|
41
40
|
_local_work_dir = os.getenv("LOCAL_WORK_DIR", "/app")
|
42
41
|
LOCAL_WORK_DIR = validate_path_security(
|
43
42
|
_local_work_dir, ["/app/", "/tmp/", "/cache/"], "LOCAL_WORK_DIR"
|
@@ -113,7 +112,6 @@ class WorkerStatus(Enum):
|
|
113
112
|
SUCCESS = auto()
|
114
113
|
ERROR = auto()
|
115
114
|
CANCELLED = auto()
|
116
|
-
FILE_NOT_FOUND = auto()
|
117
115
|
|
118
116
|
|
119
117
|
class LoadStatus(Enum):
|
@@ -131,22 +129,3 @@ class SaveStatus(Enum):
|
|
131
129
|
SUCCESS = auto()
|
132
130
|
ERROR = auto()
|
133
131
|
SKIPPED = auto()
|
134
|
-
|
135
|
-
|
136
|
-
class TransferStatus(Enum):
|
137
|
-
"""Status values for generic transfer operations."""
|
138
|
-
|
139
|
-
SUCCESS = auto()
|
140
|
-
ERROR = auto()
|
141
|
-
INTERRUPTED = auto()
|
142
|
-
DOES_NOT_EXIST = auto()
|
143
|
-
|
144
|
-
|
145
|
-
class AsyncTransferStatus(Enum):
|
146
|
-
NOT_STARTED = auto()
|
147
|
-
IN_PROGRESS = auto()
|
148
|
-
SUCCESS = auto()
|
149
|
-
ERROR = auto()
|
150
|
-
INTERRUPTED = auto()
|
151
|
-
CANCELLED = auto()
|
152
|
-
DOES_NOT_EXIST = auto()
|
b10_transfer/core.py
CHANGED
@@ -1,169 +1,394 @@
|
|
1
|
+
import os
|
1
2
|
import logging
|
3
|
+
import tempfile
|
4
|
+
import shutil
|
2
5
|
from pathlib import Path
|
3
6
|
|
7
|
+
import time
|
8
|
+
|
9
|
+
from .environment import get_cache_filename
|
4
10
|
from .cleanup import cooperative_cleanup_b10fs
|
5
11
|
from .utils import (
|
6
12
|
timed_fn,
|
13
|
+
critical_section_b10fs_file_lock,
|
7
14
|
safe_execute,
|
15
|
+
temp_file_cleanup,
|
8
16
|
cache_operation,
|
17
|
+
safe_unlink,
|
9
18
|
)
|
10
19
|
from .space_monitor import (
|
11
20
|
check_sufficient_disk_space,
|
12
21
|
CacheSpaceMonitor,
|
13
22
|
CacheOperationInterrupted,
|
14
|
-
CacheFileNotFoundError,
|
15
23
|
run_monitored_process,
|
24
|
+
worker_process,
|
16
25
|
)
|
17
26
|
from .constants import (
|
27
|
+
TORCH_CACHE_DIR,
|
18
28
|
B10FS_CACHE_DIR,
|
19
29
|
LOCAL_WORK_DIR,
|
30
|
+
MAX_CACHE_SIZE_MB,
|
20
31
|
REQUIRED_B10FS_SPACE_MB,
|
21
32
|
MIN_LOCAL_SPACE_MB,
|
22
|
-
|
33
|
+
CACHE_FILE_EXTENSION,
|
34
|
+
CACHE_LATEST_SUFFIX,
|
35
|
+
CACHE_INCOMPLETE_SUFFIX,
|
36
|
+
LoadStatus,
|
37
|
+
SaveStatus,
|
23
38
|
)
|
24
39
|
|
25
40
|
logger = logging.getLogger(__name__)
|
26
41
|
|
27
42
|
|
28
|
-
@timed_fn(logger=logger, name="
|
29
|
-
@safe_execute("
|
30
|
-
def
|
31
|
-
|
32
|
-
dest: Path,
|
33
|
-
callback: callable,
|
34
|
-
*callback_args,
|
35
|
-
monitor_local: bool = True,
|
36
|
-
monitor_b10fs: bool = True,
|
37
|
-
**callback_kwargs,
|
38
|
-
) -> TransferStatus:
|
39
|
-
"""Generic transfer function with space monitoring and atomic operations.
|
40
|
-
|
41
|
-
The actual transfer logic is provided via callback.
|
42
|
-
|
43
|
-
The function handles:
|
44
|
-
- Cooperative cleanup of stale shared resources
|
45
|
-
- Space monitoring during operations (optional for local and b10fs)
|
46
|
-
- Atomic operations using temp files and rename
|
47
|
-
- Automatic cleanup on interruption or failure
|
48
|
-
- Lock management for b10fs operations
|
43
|
+
@timed_fn(logger=logger, name="Loading compile cache")
|
44
|
+
@safe_execute("Load failed", False)
|
45
|
+
def load_compile_cache() -> LoadStatus:
|
46
|
+
"""Load PyTorch compilation cache from b10fs to local torch cache directory.
|
49
47
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
**callback_kwargs: Keyword arguments to pass to callback
|
48
|
+
This function implements a lock-free pattern to safely load cached PyTorch
|
49
|
+
compilation artifacts from the b10fs shared filesystem to the local torch
|
50
|
+
cache directory. It validates b10fs availability, checks for existing cache,
|
51
|
+
and extracts the archive if needed.
|
52
|
+
|
53
|
+
The function monitors local disk space during both the copy from b10fs and
|
54
|
+
extraction phases, interrupting operations if space falls below MIN_LOCAL_SPACE_MB.
|
58
55
|
|
59
56
|
Returns:
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
57
|
+
LoadStatus:
|
58
|
+
LoadStatus.SUCCESS if cache was successfully loaded
|
59
|
+
LoadStatus.SKIPPED if the local torch cache directory already exists and is non-empty
|
60
|
+
LoadStatus.ERROR if b10fs is unavailable, local disk space is insufficient, or loading failed.
|
61
|
+
LoadStatus.DOES_NOT_EXIST if no cache file was found.
|
64
62
|
|
65
63
|
Raises:
|
66
|
-
CacheValidationError: If b10fs is not enabled (caught and returns
|
64
|
+
CacheValidationError: If b10fs is not enabled (caught and returns LoadStatus.ERROR).
|
67
65
|
CacheOperationInterrupted: If operations interrupted due to insufficient
|
68
|
-
disk space (caught and returns
|
69
|
-
Exception: Any other errors during
|
66
|
+
local disk space (caught and returns LoadStatus.ERROR).
|
67
|
+
Exception: Any other errors during loading (caught and returns LoadStatus.ERROR).
|
70
68
|
"""
|
71
|
-
with cache_operation("
|
69
|
+
with cache_operation("Load"):
|
72
70
|
# Cooperative cleanup of stale shared resources
|
73
71
|
cooperative_cleanup_b10fs()
|
74
72
|
|
75
73
|
b10fs_dir = Path(B10FS_CACHE_DIR)
|
74
|
+
torch_dir = Path(TORCH_CACHE_DIR)
|
76
75
|
work_dir = Path(LOCAL_WORK_DIR)
|
77
76
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
):
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
# Determine primary space monitor (prioritize b10fs if both are monitored)
|
113
|
-
primary_monitor = None
|
114
|
-
if monitor_b10fs and b10fs_path:
|
115
|
-
primary_monitor = CacheSpaceMonitor(REQUIRED_B10FS_SPACE_MB, b10fs_path)
|
116
|
-
elif monitor_local and local_path:
|
117
|
-
primary_monitor = CacheSpaceMonitor(MIN_LOCAL_SPACE_MB, local_path)
|
118
|
-
|
119
|
-
if primary_monitor is None:
|
120
|
-
# No monitoring requested, execute callback directly
|
121
|
-
logger.info(f"Starting transfer (no monitoring): {source} -> {dest}")
|
122
|
-
try:
|
123
|
-
callback(source, dest, *callback_args, **callback_kwargs)
|
124
|
-
logger.info("Transfer complete")
|
125
|
-
return TransferStatus.SUCCESS
|
126
|
-
except (FileNotFoundError, CacheFileNotFoundError) as e:
|
127
|
-
logger.info(f"Transfer failed - file not found: {e}")
|
128
|
-
return TransferStatus.DOES_NOT_EXIST
|
129
|
-
|
130
|
-
# Start the primary space monitor
|
131
|
-
primary_monitor.start()
|
77
|
+
cache_filename = get_cache_filename()
|
78
|
+
cache_file = (
|
79
|
+
b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
|
80
|
+
)
|
81
|
+
logger.debug(f"Looking for cache file: {cache_file}")
|
82
|
+
|
83
|
+
if not cache_file.exists():
|
84
|
+
logger.info("No cache file found in b10fs")
|
85
|
+
return LoadStatus.DOES_NOT_EXIST
|
86
|
+
|
87
|
+
# Skip if already loaded
|
88
|
+
if torch_dir.exists() and any(torch_dir.iterdir()):
|
89
|
+
logger.info("Torch cache already loaded, skipping extraction")
|
90
|
+
return LoadStatus.SKIPPED
|
91
|
+
|
92
|
+
# Initial disk space check for local operations
|
93
|
+
check_sufficient_disk_space(
|
94
|
+
work_dir, MIN_LOCAL_SPACE_MB, "cache load operations"
|
95
|
+
)
|
96
|
+
logger.debug(
|
97
|
+
f"Initial space check passed: {MIN_LOCAL_SPACE_MB:.1f}MB required on local machine"
|
98
|
+
)
|
99
|
+
|
100
|
+
# Start background space monitoring for local disk
|
101
|
+
space_monitor = CacheSpaceMonitor(MIN_LOCAL_SPACE_MB, work_dir)
|
102
|
+
space_monitor.start()
|
103
|
+
|
104
|
+
# Create temp local copy
|
105
|
+
with tempfile.NamedTemporaryFile(
|
106
|
+
suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
|
107
|
+
) as f:
|
108
|
+
temp_path = Path(f.name)
|
109
|
+
logger.debug(f"Created temporary file for cache: {temp_path}")
|
132
110
|
|
133
111
|
try:
|
134
|
-
|
135
|
-
|
112
|
+
with temp_file_cleanup(temp_path):
|
113
|
+
# Phase 1: Copy from b10fs to local temp file in separate process
|
114
|
+
@critical_section_b10fs_file_lock("copy_out")
|
115
|
+
def _monitored_copy_from_b10fs():
|
116
|
+
logger.info(
|
117
|
+
f"Starting copy from b10fs: {cache_file} -> {temp_path}"
|
118
|
+
)
|
119
|
+
run_monitored_process(
|
120
|
+
_cache_copy_from_b10fs_worker,
|
121
|
+
(str(cache_file), str(temp_path)),
|
122
|
+
space_monitor,
|
123
|
+
"b10fs to local copy",
|
124
|
+
)
|
125
|
+
|
126
|
+
_monitored_copy_from_b10fs()
|
136
127
|
|
137
|
-
|
138
|
-
|
128
|
+
# Phase 2: Extract archive in separate process
|
129
|
+
logger.info(f"Starting extraction: {temp_path} -> {torch_dir}")
|
139
130
|
run_monitored_process(
|
140
|
-
|
141
|
-
(
|
142
|
-
|
143
|
-
"
|
131
|
+
_cache_extract_worker,
|
132
|
+
(str(temp_path), str(torch_dir)),
|
133
|
+
space_monitor,
|
134
|
+
"archive extraction",
|
135
|
+
cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
|
144
136
|
)
|
145
|
-
logger.info("Transfer complete (monitored)")
|
146
|
-
return TransferStatus.SUCCESS
|
147
137
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
138
|
+
logger.info("Cache load complete")
|
139
|
+
return LoadStatus.SUCCESS
|
140
|
+
|
141
|
+
except CacheOperationInterrupted as e:
|
142
|
+
logger.warning(f"Cache load interrupted: {e}")
|
143
|
+
return LoadStatus.ERROR
|
144
|
+
|
145
|
+
finally:
|
146
|
+
space_monitor.stop()
|
147
|
+
|
148
|
+
|
149
|
+
"""
|
150
|
+
FIXME(SRAY):
|
151
|
+
What about the case in @b10-transfer/ where a single pod finishes an inference request,
|
152
|
+
and then the client calls save_compile_cache. And while we are creating the local archive,
|
153
|
+
another inference call on the same pod is kicked off, which then modifies the torch cache.
|
154
|
+
How would this be handled? Maybe just accept that the cache will be recompiled/overwritten?
|
155
|
+
Otherwise you'd need application level coordination to ensure that the cache is not modified
|
156
|
+
while we are creating the archive, but this doesn't really seem like a good idea in terms of adoption.
|
157
|
+
|
158
|
+
FIXME(SR):
|
159
|
+
More things to consider:
|
160
|
+
- [possible] What if b10fs dies *during* an op? right now we check for b10fs availability in the beginning of the op... Add some constants instead of just False for load().
|
161
|
+
- [possible, and really bad if it happens] potential memory exhaustion during compression if the cache is super super large. very very edge case. higher compression levels also have high memory usage.
|
162
|
+
"""
|
163
|
+
|
164
|
+
|
165
|
+
@timed_fn(logger=logger, name="Saving compile cache")
|
166
|
+
@safe_execute("Save failed", False)
|
167
|
+
def save_compile_cache() -> SaveStatus:
|
168
|
+
"""Save local PyTorch compilation cache to b10fs using atomic journal pattern.
|
169
|
+
|
170
|
+
This function creates an archive of the local torch cache directory and
|
171
|
+
atomically saves it to b10fs using a journal pattern (write to temp file,
|
172
|
+
then rename). This ensures concurrent saves don't corrupt each other.
|
173
|
+
|
174
|
+
The function validates b10fs availability, checks if cache already exists
|
175
|
+
(early exit), performs initial space checks using pre-calculated requirements
|
176
|
+
for concurrent saves, starts background space monitoring, then runs compression
|
177
|
+
and copy operations in separate worker processes that can be terminated if disk
|
178
|
+
space becomes insufficient, finally performing an atomic rename to the final cache file.
|
179
|
+
|
180
|
+
Returns:
|
181
|
+
SaveStatus:
|
182
|
+
SaveStatus.SUCCESS if cache was successfully saved
|
183
|
+
SaveStatus.ERROR if b10fs is unavailable, insufficient disk space caused interruption,
|
184
|
+
or saving failed.
|
185
|
+
SaveStatus.SKIPPED if no cache exists to save or cache already exists in b10fs
|
186
|
+
|
187
|
+
Raises:
|
188
|
+
CacheValidationError: If b10fs is not enabled (caught and returns SaveStatus.ERROR).
|
189
|
+
CacheOperationInterrupted: If operations interrupted due to insufficient
|
190
|
+
disk space (caught and returns SaveStatus.ERROR).
|
191
|
+
ArchiveError: If archive creation fails (caught and returns SaveStatus.ERROR).
|
192
|
+
Exception: Any other errors during saving (caught and returns SaveStatus.ERROR).
|
193
|
+
"""
|
194
|
+
with cache_operation("Save"):
|
195
|
+
# Cooperative cleanup of stale shared resources
|
196
|
+
cooperative_cleanup_b10fs()
|
197
|
+
|
198
|
+
b10fs_dir = Path(B10FS_CACHE_DIR)
|
199
|
+
torch_dir = Path(TORCH_CACHE_DIR)
|
200
|
+
work_dir = Path(LOCAL_WORK_DIR)
|
201
|
+
|
202
|
+
# Check if anything to save
|
203
|
+
if not torch_dir.exists() or not any(torch_dir.iterdir()):
|
204
|
+
logger.info("No torch cache to save")
|
205
|
+
return SaveStatus.SKIPPED
|
206
|
+
|
207
|
+
cache_filename = get_cache_filename()
|
208
|
+
final_file = (
|
209
|
+
b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
|
210
|
+
)
|
211
|
+
|
212
|
+
# Check for existing cache first (early exit)
|
213
|
+
if final_file.exists():
|
214
|
+
logger.info("Cache already exists in b10fs, skipping save")
|
215
|
+
return SaveStatus.SKIPPED
|
216
|
+
|
217
|
+
# Initial disk space checks using calculated space requirements
|
218
|
+
check_sufficient_disk_space(
|
219
|
+
work_dir, MAX_CACHE_SIZE_MB, "local temp file creation"
|
220
|
+
)
|
221
|
+
check_sufficient_disk_space(
|
222
|
+
b10fs_dir, REQUIRED_B10FS_SPACE_MB, "cache save to b10fs"
|
223
|
+
)
|
224
|
+
logger.debug(
|
225
|
+
f"Initial space checks passed: {MAX_CACHE_SIZE_MB:.1f}MB local, {REQUIRED_B10FS_SPACE_MB:.1f}MB b10fs"
|
226
|
+
)
|
227
|
+
|
228
|
+
temp_file = (
|
229
|
+
b10fs_dir
|
230
|
+
/ f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
|
231
|
+
)
|
232
|
+
|
233
|
+
# Start background space monitoring
|
234
|
+
space_monitor = CacheSpaceMonitor(REQUIRED_B10FS_SPACE_MB, b10fs_dir)
|
235
|
+
space_monitor.start()
|
236
|
+
|
237
|
+
with tempfile.NamedTemporaryFile(
|
238
|
+
suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
|
239
|
+
) as f:
|
240
|
+
local_temp = Path(f.name)
|
241
|
+
logger.debug(f"Created local temp file for archive: {local_temp}")
|
242
|
+
|
243
|
+
try:
|
244
|
+
with temp_file_cleanup(local_temp):
|
245
|
+
# Phase 1: Compression in separate process
|
246
|
+
logger.info(f"Starting compression: {torch_dir} -> {local_temp}")
|
247
|
+
run_monitored_process(
|
248
|
+
_cache_compression_worker,
|
249
|
+
(str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
|
250
|
+
space_monitor,
|
251
|
+
"compression",
|
152
252
|
)
|
153
253
|
|
154
|
-
|
155
|
-
callback(source, dest, *callback_args, **callback_kwargs)
|
156
|
-
logger.info("Transfer complete (unmonitored)")
|
157
|
-
return TransferStatus.SUCCESS
|
254
|
+
b10fs_dir.mkdir(parents=True, exist_ok=True)
|
158
255
|
|
159
|
-
|
160
|
-
|
161
|
-
|
256
|
+
# Phase 2: Copy to b10fs in separate process
|
257
|
+
@critical_section_b10fs_file_lock("copy_in")
|
258
|
+
def _monitored_copy_to_b10fs():
|
259
|
+
logger.info(f"Starting copy to b10fs: {local_temp} -> {temp_file}")
|
260
|
+
run_monitored_process(
|
261
|
+
_cache_copy_worker,
|
262
|
+
(str(local_temp), str(temp_file)),
|
263
|
+
space_monitor,
|
264
|
+
"b10fs copy",
|
265
|
+
cleanup_func=lambda: safe_unlink(
|
266
|
+
temp_file, f"Failed to cleanup interrupted copy {temp_file}"
|
267
|
+
),
|
268
|
+
)
|
269
|
+
|
270
|
+
_monitored_copy_to_b10fs()
|
271
|
+
|
272
|
+
# Phase 3: Atomic rename (fast, don't interrupt)
|
273
|
+
logger.info(
|
274
|
+
f"Renaming temp file to final cache file: {temp_file} -> {final_file}"
|
275
|
+
)
|
276
|
+
temp_file.rename(final_file)
|
277
|
+
|
278
|
+
logger.info("Cache save complete")
|
279
|
+
return SaveStatus.SUCCESS
|
162
280
|
|
163
281
|
except CacheOperationInterrupted as e:
|
164
|
-
logger.warning(f"
|
165
|
-
return
|
282
|
+
logger.warning(f"Cache save interrupted: {e}")
|
283
|
+
return SaveStatus.ERROR
|
166
284
|
|
167
285
|
finally:
|
168
|
-
|
169
|
-
|
286
|
+
space_monitor.stop()
|
287
|
+
|
288
|
+
|
289
|
+
@safe_execute("Clear failed", False)
|
290
|
+
def clear_local_cache() -> bool:
|
291
|
+
"""Clear the local PyTorch compilation cache directory.
|
292
|
+
|
293
|
+
This function removes the entire local torch cache directory and all its
|
294
|
+
contents. This is useful for cleaning up disk space or forcing recompilation.
|
295
|
+
|
296
|
+
Returns:
|
297
|
+
bool: True if cache was successfully cleared or didn't exist, False if
|
298
|
+
clearing failed due to permissions or other filesystem errors.
|
299
|
+
|
300
|
+
Raises:
|
301
|
+
Exception: Any errors during directory removal (caught and returns False).
|
302
|
+
"""
|
303
|
+
torch_dir = Path(TORCH_CACHE_DIR)
|
304
|
+
if not torch_dir.exists():
|
305
|
+
return True
|
306
|
+
shutil.rmtree(torch_dir)
|
307
|
+
return True
|
308
|
+
|
309
|
+
|
310
|
+
@worker_process("Compression was cancelled before starting")
|
311
|
+
def _cache_compression_worker(
|
312
|
+
torch_dir_str: str, local_temp_str: str, max_size_mb: int
|
313
|
+
) -> None:
|
314
|
+
"""Worker process that handles cache compression.
|
315
|
+
|
316
|
+
This function runs in a separate process to compress the torch cache directory
|
317
|
+
into an archive. It can be terminated externally if disk space becomes insufficient.
|
318
|
+
|
319
|
+
Args:
|
320
|
+
torch_dir_str: String path to the torch cache directory to compress.
|
321
|
+
local_temp_str: String path where the compressed archive will be created.
|
322
|
+
max_size_mb: Maximum allowed archive size in megabytes.
|
323
|
+
"""
|
324
|
+
torch_dir = Path(torch_dir_str)
|
325
|
+
local_temp = Path(local_temp_str)
|
326
|
+
|
327
|
+
# Import here to avoid issues with multiprocessing
|
328
|
+
from .archive import create_archive
|
329
|
+
|
330
|
+
create_archive(torch_dir, local_temp, max_size_mb)
|
331
|
+
|
332
|
+
|
333
|
+
@worker_process("Copy was cancelled before starting")
|
334
|
+
def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
|
335
|
+
"""Worker process that handles file copy to b10fs.
|
336
|
+
|
337
|
+
This function runs in a separate process to copy the compressed cache file
|
338
|
+
to the b10fs filesystem. It can be terminated externally if disk space becomes insufficient.
|
339
|
+
|
340
|
+
Args:
|
341
|
+
source_path_str: String path to the source file to copy.
|
342
|
+
dest_path_str: String path where the file will be copied.
|
343
|
+
"""
|
344
|
+
source_path = Path(source_path_str)
|
345
|
+
dest_path = Path(dest_path_str)
|
346
|
+
|
347
|
+
shutil.copy2(source_path, dest_path)
|
348
|
+
|
349
|
+
|
350
|
+
@worker_process("Copy from b10fs was cancelled before starting")
|
351
|
+
def _cache_copy_from_b10fs_worker(source_path_str: str, dest_path_str: str) -> None:
|
352
|
+
"""Worker process that handles file copy from b10fs to local machine.
|
353
|
+
|
354
|
+
This function runs in a separate process to copy the cache file from b10fs
|
355
|
+
to the local filesystem. It can be terminated externally if local disk space becomes insufficient.
|
356
|
+
|
357
|
+
Args:
|
358
|
+
source_path_str: String path to the source file in b10fs to copy.
|
359
|
+
dest_path_str: String path where the file will be copied locally.
|
360
|
+
"""
|
361
|
+
source_path = Path(source_path_str)
|
362
|
+
dest_path = Path(dest_path_str)
|
363
|
+
|
364
|
+
shutil.copy2(source_path, dest_path)
|
365
|
+
|
366
|
+
|
367
|
+
def _cleanup_torch_dir(torch_dir: Path) -> None:
|
368
|
+
"""Helper function to safely cleanup torch directory during interrupted extraction."""
|
369
|
+
try:
|
370
|
+
if torch_dir.exists():
|
371
|
+
shutil.rmtree(torch_dir)
|
372
|
+
logger.debug(f"Cleaned up torch directory: {torch_dir}")
|
373
|
+
except Exception as e:
|
374
|
+
logger.error(f"Failed to cleanup torch directory {torch_dir}: {e}")
|
375
|
+
|
376
|
+
|
377
|
+
@worker_process("Extraction was cancelled before starting")
|
378
|
+
def _cache_extract_worker(archive_path_str: str, dest_dir_str: str) -> None:
|
379
|
+
"""Worker process that handles archive extraction.
|
380
|
+
|
381
|
+
This function runs in a separate process to extract the cache archive to
|
382
|
+
the torch cache directory. It can be terminated externally if local disk space becomes insufficient.
|
383
|
+
|
384
|
+
Args:
|
385
|
+
archive_path_str: String path to the archive file to extract.
|
386
|
+
dest_dir_str: String path to the directory where archive will be extracted.
|
387
|
+
"""
|
388
|
+
archive_path = Path(archive_path_str)
|
389
|
+
dest_dir = Path(dest_dir_str)
|
390
|
+
|
391
|
+
# Import here to avoid issues with multiprocessing
|
392
|
+
from .archive import extract_archive
|
393
|
+
|
394
|
+
extract_archive(archive_path, dest_dir)
|
b10_transfer/space_monitor.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
"""Space monitoring utilities for b10-
|
1
|
+
"""Space monitoring utilities for b10-transfer.
|
2
2
|
|
3
3
|
This module provides disk space monitoring functionality to prevent cache operations
|
4
4
|
from exhausting available disk space and causing system instability.
|
@@ -24,12 +24,6 @@ class CacheOperationInterrupted(Exception):
|
|
24
24
|
pass
|
25
25
|
|
26
26
|
|
27
|
-
class CacheFileNotFoundError(Exception):
|
28
|
-
"""Raised when a cache file is not found during transfer operations."""
|
29
|
-
|
30
|
-
pass
|
31
|
-
|
32
|
-
|
33
27
|
def worker_process(cancelled_message: str):
|
34
28
|
"""Decorator for worker process functions to handle common try/catch/result_queue pattern.
|
35
29
|
|
@@ -66,8 +60,6 @@ def worker_process(cancelled_message: str):
|
|
66
60
|
# If we get here, the function completed successfully
|
67
61
|
result_queue.put((WorkerStatus.SUCCESS.value, None))
|
68
62
|
|
69
|
-
except FileNotFoundError as e:
|
70
|
-
result_queue.put((WorkerStatus.FILE_NOT_FOUND.value, str(e)))
|
71
63
|
except Exception as e:
|
72
64
|
result_queue.put((WorkerStatus.ERROR.value, str(e)))
|
73
65
|
|
@@ -294,11 +286,6 @@ def run_monitored_process(
|
|
294
286
|
if cleanup_func:
|
295
287
|
cleanup_func()
|
296
288
|
raise CacheOperationInterrupted(error_msg)
|
297
|
-
elif status == WorkerStatus.FILE_NOT_FOUND.value:
|
298
|
-
logger.info(
|
299
|
-
f"{operation_name} worker failed - file not found: {error_msg}"
|
300
|
-
)
|
301
|
-
raise CacheFileNotFoundError(error_msg)
|
302
289
|
# status == WorkerStatus.SUCCESS.value - continue normally
|
303
290
|
|
304
291
|
logger.debug(f"{operation_name} completed successfully")
|