b10-transfer 0.1.4__tar.gz → 0.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {b10_transfer-0.1.4 → b10_transfer-0.1.6}/PKG-INFO +1 -1
- {b10_transfer-0.1.4 → b10_transfer-0.1.6}/pyproject.toml +1 -1
- {b10_transfer-0.1.4 → b10_transfer-0.1.6}/src/b10_transfer/__init__.py +6 -5
- b10_transfer-0.1.4/src/b10_transfer/core.py → b10_transfer-0.1.6/src/b10_transfer/cache.py +158 -228
- {b10_transfer-0.1.4 → b10_transfer-0.1.6}/src/b10_transfer/constants.py +4 -19
- b10_transfer-0.1.6/src/b10_transfer/core.py +131 -0
- {b10_transfer-0.1.4 → b10_transfer-0.1.6}/README.md +0 -0
- {b10_transfer-0.1.4 → b10_transfer-0.1.6}/src/b10_transfer/archive.py +0 -0
- {b10_transfer-0.1.4 → b10_transfer-0.1.6}/src/b10_transfer/cleanup.py +0 -0
- {b10_transfer-0.1.4 → b10_transfer-0.1.6}/src/b10_transfer/environment.py +0 -0
- {b10_transfer-0.1.4 → b10_transfer-0.1.6}/src/b10_transfer/info.py +0 -0
- {b10_transfer-0.1.4 → b10_transfer-0.1.6}/src/b10_transfer/space_monitor.py +0 -0
- {b10_transfer-0.1.4 → b10_transfer-0.1.6}/src/b10_transfer/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: b10-transfer
|
3
|
-
Version: 0.1.4
|
3
|
+
Version: 0.1.6
|
4
4
|
Summary: Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management
|
5
5
|
License: MIT
|
6
6
|
Keywords: pytorch,file-transfer,cache,machine-learning,inference
|
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|
4
4
|
|
5
5
|
[tool.poetry]
|
6
6
|
name = "b10-transfer"
|
7
|
-
version = "0.1.4"
|
7
|
+
version = "0.1.6"
|
8
8
|
description = "Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management"
|
9
9
|
authors = ["Shounak Ray <shounak.noreply@baseten.co>", "Fred Liu <fred.liu.noreply@baseten.co>"]
|
10
10
|
maintainers = ["Fred Liu <fred.liu.noreply@baseten.co>", "Shounak Ray <shounak.noreply@baseten.co>"]
|
@@ -1,23 +1,24 @@
|
|
1
1
|
"""B10 Transfer - Lock-free PyTorch file transfer for Baseten."""
|
2
2
|
|
3
|
-
from .core import load_compile_cache, save_compile_cache, clear_local_cache
|
3
|
+
from .cache import load_compile_cache, save_compile_cache, clear_local_cache
|
4
|
+
from .core import transfer
|
4
5
|
from .utils import CacheError, CacheValidationError
|
5
6
|
from .space_monitor import CacheOperationInterrupted
|
6
7
|
from .info import get_cache_info, list_available_caches
|
7
|
-
from .constants import
|
8
|
+
from .constants import OperationStatus
|
8
9
|
|
9
10
|
# Version
|
10
|
-
__version__ = "0.1.4"
|
11
|
+
__version__ = "0.1.6"
|
11
12
|
|
12
13
|
__all__ = [
|
13
14
|
"CacheError",
|
14
15
|
"CacheValidationError",
|
15
16
|
"CacheOperationInterrupted",
|
16
|
-
"
|
17
|
-
"LoadStatus",
|
17
|
+
"OperationStatus",
|
18
18
|
"load_compile_cache",
|
19
19
|
"save_compile_cache",
|
20
20
|
"clear_local_cache",
|
21
|
+
"transfer",
|
21
22
|
"get_cache_info",
|
22
23
|
"list_available_caches",
|
23
24
|
]
|
@@ -1,11 +1,13 @@
|
|
1
|
-
|
1
|
+
"""Cache operations for PyTorch compilation artifacts.
|
2
|
+
|
3
|
+
This module provides functions for loading and saving PyTorch compilation cache
|
4
|
+
to/from b10fs shared storage using atomic operations and space monitoring.
|
5
|
+
"""
|
6
|
+
|
2
7
|
import logging
|
3
8
|
import tempfile
|
4
|
-
import shutil
|
5
9
|
from pathlib import Path
|
6
10
|
|
7
|
-
import time
|
8
|
-
|
9
11
|
from .environment import get_cache_filename
|
10
12
|
from .cleanup import cooperative_cleanup_b10fs
|
11
13
|
from .utils import (
|
@@ -17,7 +19,6 @@ from .utils import (
|
|
17
19
|
safe_unlink,
|
18
20
|
)
|
19
21
|
from .space_monitor import (
|
20
|
-
check_sufficient_disk_space,
|
21
22
|
CacheSpaceMonitor,
|
22
23
|
CacheOperationInterrupted,
|
23
24
|
run_monitored_process,
|
@@ -33,17 +34,100 @@ from .constants import (
|
|
33
34
|
CACHE_FILE_EXTENSION,
|
34
35
|
CACHE_LATEST_SUFFIX,
|
35
36
|
CACHE_INCOMPLETE_SUFFIX,
|
36
|
-
|
37
|
-
SaveStatus,
|
38
|
-
TransferStatus,
|
37
|
+
OperationStatus,
|
39
38
|
)
|
39
|
+
from .core import transfer
|
40
40
|
|
41
41
|
logger = logging.getLogger(__name__)
|
42
42
|
|
43
43
|
|
44
|
+
"""
|
45
|
+
FIXME(SRAY):
|
46
|
+
What about the case in @b10-transfer/ where a single pod finishes an inference request,
|
47
|
+
and then the client calls save_compile_cache. And while we are creating the local archive,
|
48
|
+
another inference call on the same pod is kicked off, which then modifies the torch cache.
|
49
|
+
How would this be handled? Maybe just accept that the cache will be recompiled/overwritten?
|
50
|
+
Otherwise you'd need application level coordination to ensure that the cache is not modified
|
51
|
+
while we are creating the archive, but this doesn't really seem like a good idea in terms of adoption.
|
52
|
+
|
53
|
+
FIXME(SR):
|
54
|
+
More things to consider:
|
55
|
+
- [possible] What if b10fs dies *during* an op? right now we check for b10fs availability in the beginning of the op... Add some constants instead of just False for load().
|
56
|
+
- [possible, and really bad if it happens] potential memory exhaustion during compression if the cache is super super large. very very edge case. higher compression levels also have high memory usage.
|
57
|
+
"""
|
58
|
+
|
59
|
+
|
60
|
+
def _setup_cache_paths():
|
61
|
+
"""Common setup for cache operations - returns paths and performs cleanup."""
|
62
|
+
# Cooperative cleanup of stale shared resources
|
63
|
+
cooperative_cleanup_b10fs()
|
64
|
+
|
65
|
+
b10fs_dir = Path(B10FS_CACHE_DIR)
|
66
|
+
torch_dir = Path(TORCH_CACHE_DIR)
|
67
|
+
work_dir = Path(LOCAL_WORK_DIR)
|
68
|
+
|
69
|
+
return b10fs_dir, torch_dir, work_dir
|
70
|
+
|
71
|
+
|
72
|
+
def _get_cache_file_paths(cache_filename: str, b10fs_dir: Path):
|
73
|
+
"""Generate cache file paths for a given cache filename."""
|
74
|
+
final_file = (
|
75
|
+
b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
|
76
|
+
)
|
77
|
+
temp_file = (
|
78
|
+
b10fs_dir / f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
|
79
|
+
)
|
80
|
+
return final_file, temp_file
|
81
|
+
|
82
|
+
|
83
|
+
def _run_with_space_monitoring(
|
84
|
+
space_threshold_mb: float,
|
85
|
+
monitor_dir: Path,
|
86
|
+
operation_name: str,
|
87
|
+
worker_func,
|
88
|
+
worker_args: tuple,
|
89
|
+
cleanup_func=None,
|
90
|
+
):
|
91
|
+
"""Helper to run an operation with space monitoring."""
|
92
|
+
space_monitor = CacheSpaceMonitor(space_threshold_mb, monitor_dir)
|
93
|
+
space_monitor.start()
|
94
|
+
|
95
|
+
try:
|
96
|
+
logger.info(
|
97
|
+
f"Starting {operation_name}: {' -> '.join(str(arg) for arg in worker_args[:2])}"
|
98
|
+
)
|
99
|
+
run_monitored_process(
|
100
|
+
worker_func,
|
101
|
+
worker_args,
|
102
|
+
space_monitor,
|
103
|
+
operation_name,
|
104
|
+
cleanup_func=cleanup_func,
|
105
|
+
)
|
106
|
+
finally:
|
107
|
+
space_monitor.stop()
|
108
|
+
|
109
|
+
|
110
|
+
def _transfer_with_b10fs_lock(
|
111
|
+
source: str, dest: str, lock_type: str, cleanup_on_failure=True
|
112
|
+
):
|
113
|
+
"""Transfer a file with b10fs file locking and error handling."""
|
114
|
+
|
115
|
+
@critical_section_b10fs_file_lock(lock_type)
|
116
|
+
def _locked_transfer():
|
117
|
+
result = transfer(source, dest)
|
118
|
+
if result != OperationStatus.SUCCESS:
|
119
|
+
if cleanup_on_failure:
|
120
|
+
safe_unlink(
|
121
|
+
Path(dest), f"Failed to cleanup after failed transfer {dest}"
|
122
|
+
)
|
123
|
+
raise Exception(f"Failed to transfer {source} -> {dest}")
|
124
|
+
|
125
|
+
_locked_transfer()
|
126
|
+
|
127
|
+
|
44
128
|
@timed_fn(logger=logger, name="Loading compile cache")
|
45
129
|
@safe_execute("Load failed", False)
|
46
|
-
def load_compile_cache() -> LoadStatus:
|
130
|
+
def load_compile_cache() -> OperationStatus:
|
47
131
|
"""Load PyTorch compilation cache from b10fs to local torch cache directory.
|
48
132
|
|
49
133
|
This function implements a lock-free pattern to safely load cached PyTorch
|
@@ -55,40 +139,33 @@ def load_compile_cache() -> LoadStatus:
|
|
55
139
|
extraction phases, interrupting operations if space falls below MIN_LOCAL_SPACE_MB.
|
56
140
|
|
57
141
|
Returns:
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
142
|
+
OperationStatus:
|
143
|
+
OperationStatus.SUCCESS if cache was successfully loaded
|
144
|
+
OperationStatus.SKIPPED if already exists
|
145
|
+
OperationStatus.ERROR if b10fs is unavailable, local disk space is insufficient, or loading failed.
|
146
|
+
OperationStatus.DOES_NOT_EXIST if no cache file was found.
|
63
147
|
|
64
148
|
Raises:
|
65
|
-
CacheValidationError: If b10fs is not enabled (caught and returns
|
149
|
+
CacheValidationError: If b10fs is not enabled (caught and returns OperationStatus.ERROR).
|
66
150
|
CacheOperationInterrupted: If operations interrupted due to insufficient
|
67
|
-
local disk space (caught and returns
|
68
|
-
Exception: Any other errors during loading (caught and returns
|
151
|
+
local disk space (caught and returns OperationStatus.ERROR).
|
152
|
+
Exception: Any other errors during loading (caught and returns OperationStatus.ERROR).
|
69
153
|
"""
|
70
154
|
with cache_operation("Load"):
|
71
|
-
|
72
|
-
cooperative_cleanup_b10fs()
|
73
|
-
|
74
|
-
b10fs_dir = Path(B10FS_CACHE_DIR)
|
75
|
-
torch_dir = Path(TORCH_CACHE_DIR)
|
76
|
-
work_dir = Path(LOCAL_WORK_DIR)
|
155
|
+
b10fs_dir, torch_dir, work_dir = _setup_cache_paths()
|
77
156
|
|
78
157
|
cache_filename = get_cache_filename()
|
79
|
-
|
80
|
-
|
81
|
-
)
|
82
|
-
logger.debug(f"Looking for cache file: {cache_file}")
|
158
|
+
final_file, _ = _get_cache_file_paths(cache_filename, b10fs_dir)
|
159
|
+
logger.debug(f"Looking for cache file: {final_file}")
|
83
160
|
|
84
|
-
if not cache_file.exists():
|
161
|
+
if not final_file.exists():
|
85
162
|
logger.info("No cache file found in b10fs")
|
86
|
-
return LoadStatus.DOES_NOT_EXIST
|
163
|
+
return OperationStatus.DOES_NOT_EXIST
|
87
164
|
|
88
165
|
# Skip if already loaded
|
89
166
|
if torch_dir.exists() and any(torch_dir.iterdir()):
|
90
167
|
logger.info("Torch cache already loaded, skipping extraction")
|
91
|
-
return LoadStatus.SKIPPED
|
168
|
+
return OperationStatus.SKIPPED
|
92
169
|
|
93
170
|
# Create temp local copy
|
94
171
|
with tempfile.NamedTemporaryFile(
|
@@ -99,58 +176,35 @@ def load_compile_cache() -> LoadStatus:
|
|
99
176
|
|
100
177
|
try:
|
101
178
|
with temp_file_cleanup(temp_path):
|
102
|
-
# Phase 1: Copy from b10fs to local temp file
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
(str(temp_path), str(torch_dir)),
|
120
|
-
space_monitor,
|
121
|
-
"archive extraction",
|
122
|
-
cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
|
123
|
-
)
|
124
|
-
finally:
|
125
|
-
space_monitor.stop()
|
179
|
+
# Phase 1: Copy from b10fs to local temp file
|
180
|
+
_transfer_with_b10fs_lock(
|
181
|
+
str(final_file),
|
182
|
+
str(temp_path),
|
183
|
+
"copy_out",
|
184
|
+
cleanup_on_failure=False,
|
185
|
+
)
|
186
|
+
|
187
|
+
# Phase 2: Extract archive with space monitoring
|
188
|
+
_run_with_space_monitoring(
|
189
|
+
MIN_LOCAL_SPACE_MB,
|
190
|
+
work_dir,
|
191
|
+
"archive extraction",
|
192
|
+
_cache_extract_worker,
|
193
|
+
(str(temp_path), str(torch_dir)),
|
194
|
+
cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
|
195
|
+
)
|
126
196
|
|
127
197
|
logger.info("Cache load complete")
|
128
|
-
return LoadStatus.SUCCESS
|
198
|
+
return OperationStatus.SUCCESS
|
129
199
|
|
130
200
|
except CacheOperationInterrupted as e:
|
131
201
|
logger.warning(f"Cache load interrupted: {e}")
|
132
|
-
return LoadStatus.ERROR
|
133
|
-
|
134
|
-
|
135
|
-
"""
|
136
|
-
FIXME(SRAY):
|
137
|
-
What about the case in @b10-transfer/ where a single pod finishes an inference request,
|
138
|
-
and then the client calls save_compile_cache. And while we are creating the local archive,
|
139
|
-
another inference call on the same pod is kicked off, which then modifies the torch cache.
|
140
|
-
How would this be handled? Maybe just accept that the cache will be recompiled/overwritten?
|
141
|
-
Otherwise you'd need application level coordination to ensure that the cache is not modified
|
142
|
-
while we are creating the archive, but this doesn't really seem like a good idea in terms of adoption.
|
143
|
-
|
144
|
-
FIXME(SR):
|
145
|
-
More things to consider:
|
146
|
-
- [possible] What if b10fs dies *during* an op? right now we check for b10fs availability in the beginning of the op... Add some constants instead of just False for load().
|
147
|
-
- [possible, and really bad if it happens] potential memory exhaustion during compression if the cache is super super large. very very edge case. higher compression levels also have high memory usage.
|
148
|
-
"""
|
202
|
+
return OperationStatus.ERROR
|
149
203
|
|
150
204
|
|
151
205
|
@timed_fn(logger=logger, name="Saving compile cache")
|
152
206
|
@safe_execute("Save failed", False)
|
153
|
-
def save_compile_cache() -> SaveStatus:
|
207
|
+
def save_compile_cache() -> OperationStatus:
|
154
208
|
"""Save local PyTorch compilation cache to b10fs using atomic journal pattern.
|
155
209
|
|
156
210
|
This function creates an archive of the local torch cache directory and
|
@@ -164,46 +218,34 @@ def save_compile_cache() -> SaveStatus:
|
|
164
218
|
space becomes insufficient, finally performing an atomic rename to the final cache file.
|
165
219
|
|
166
220
|
Returns:
|
167
|
-
|
168
|
-
|
169
|
-
|
221
|
+
OperationStatus:
|
222
|
+
OperationStatus.SUCCESS if cache was successfully saved or already exists
|
223
|
+
OperationStatus.ERROR if b10fs is unavailable, insufficient disk space caused interruption,
|
170
224
|
no cache exists to save, or saving failed.
|
171
|
-
|
225
|
+
OperationStatus.SKIPPED if no cache exists to save or cache already exists in b10fs
|
172
226
|
|
173
227
|
Raises:
|
174
|
-
CacheValidationError: If b10fs is not enabled (caught and returns
|
228
|
+
CacheValidationError: If b10fs is not enabled (caught and returns OperationStatus.ERROR).
|
175
229
|
CacheOperationInterrupted: If operations interrupted due to insufficient
|
176
|
-
disk space (caught and returns
|
177
|
-
ArchiveError: If archive creation fails (caught and returns
|
178
|
-
Exception: Any other errors during saving (caught and returns
|
230
|
+
disk space (caught and returns OperationStatus.ERROR).
|
231
|
+
ArchiveError: If archive creation fails (caught and returns OperationStatus.ERROR).
|
232
|
+
Exception: Any other errors during saving (caught and returns OperationStatus.ERROR).
|
179
233
|
"""
|
180
234
|
with cache_operation("Save"):
|
181
|
-
|
182
|
-
cooperative_cleanup_b10fs()
|
183
|
-
|
184
|
-
b10fs_dir = Path(B10FS_CACHE_DIR)
|
185
|
-
torch_dir = Path(TORCH_CACHE_DIR)
|
186
|
-
work_dir = Path(LOCAL_WORK_DIR)
|
235
|
+
b10fs_dir, torch_dir, work_dir = _setup_cache_paths()
|
187
236
|
|
188
237
|
# Check if anything to save
|
189
238
|
if not torch_dir.exists() or not any(torch_dir.iterdir()):
|
190
239
|
logger.info("No torch cache to save")
|
191
|
-
return SaveStatus.SKIPPED
|
240
|
+
return OperationStatus.SKIPPED
|
192
241
|
|
193
242
|
cache_filename = get_cache_filename()
|
194
|
-
final_file = (
|
195
|
-
b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
|
196
|
-
)
|
243
|
+
final_file, temp_file = _get_cache_file_paths(cache_filename, b10fs_dir)
|
197
244
|
|
198
245
|
# Check for existing cache first (early exit)
|
199
246
|
if final_file.exists():
|
200
247
|
logger.info("Cache already exists in b10fs, skipping save")
|
201
|
-
return SaveStatus.SKIPPED
|
202
|
-
|
203
|
-
temp_file = (
|
204
|
-
b10fs_dir
|
205
|
-
/ f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
|
206
|
-
)
|
248
|
+
return OperationStatus.SKIPPED
|
207
249
|
|
208
250
|
with tempfile.NamedTemporaryFile(
|
209
251
|
suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
|
@@ -213,34 +255,19 @@ def save_compile_cache() -> SaveStatus:
|
|
213
255
|
|
214
256
|
try:
|
215
257
|
with temp_file_cleanup(local_temp):
|
216
|
-
# Phase 1: Compression
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
)
|
228
|
-
|
229
|
-
space_monitor.stop()
|
230
|
-
|
231
|
-
# Phase 2: Copy to b10fs using transfer()
|
232
|
-
@critical_section_b10fs_file_lock("copy_in")
|
233
|
-
def _monitored_copy_to_b10fs():
|
234
|
-
result = transfer(str(local_temp), str(temp_file))
|
235
|
-
if result != TransferStatus.SUCCESS:
|
236
|
-
# Clean up the temp file if transfer failed
|
237
|
-
safe_unlink(
|
238
|
-
temp_file,
|
239
|
-
f"Failed to cleanup after failed copy {temp_file}",
|
240
|
-
)
|
241
|
-
raise Exception("Failed to copy cache file to b10fs")
|
242
|
-
|
243
|
-
_monitored_copy_to_b10fs()
|
258
|
+
# Phase 1: Compression with space monitoring
|
259
|
+
_run_with_space_monitoring(
|
260
|
+
REQUIRED_B10FS_SPACE_MB,
|
261
|
+
b10fs_dir,
|
262
|
+
"compression",
|
263
|
+
_cache_compression_worker,
|
264
|
+
(str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
|
265
|
+
)
|
266
|
+
|
267
|
+
# Phase 2: Copy to b10fs with locking
|
268
|
+
_transfer_with_b10fs_lock(
|
269
|
+
str(local_temp), str(temp_file), "copy_in", cleanup_on_failure=True
|
270
|
+
)
|
244
271
|
|
245
272
|
# Phase 3: Atomic rename (fast, don't interrupt)
|
246
273
|
logger.info(
|
@@ -249,95 +276,11 @@ def save_compile_cache() -> SaveStatus:
|
|
249
276
|
temp_file.rename(final_file)
|
250
277
|
|
251
278
|
logger.info("Cache save complete")
|
252
|
-
return SaveStatus.SUCCESS
|
279
|
+
return OperationStatus.SUCCESS
|
253
280
|
|
254
281
|
except CacheOperationInterrupted as e:
|
255
282
|
logger.warning(f"Cache save interrupted: {e}")
|
256
|
-
return SaveStatus.ERROR
|
257
|
-
|
258
|
-
|
259
|
-
@timed_fn(logger=logger, name="Transferring file")
|
260
|
-
@safe_execute("Transfer failed", TransferStatus.ERROR)
|
261
|
-
def transfer(source: str, dest: str) -> TransferStatus:
|
262
|
-
"""Transfer a file from source to destination with space monitoring.
|
263
|
-
|
264
|
-
This function copies a file from source to destination using the same
|
265
|
-
monitored process approach as the cache operations. It monitors disk space
|
266
|
-
at the destination and can interrupt the transfer if space becomes insufficient.
|
267
|
-
|
268
|
-
Args:
|
269
|
-
source: Path to the source file to copy.
|
270
|
-
dest: Path to the destination where the file will be copied.
|
271
|
-
|
272
|
-
Returns:
|
273
|
-
TransferStatus:
|
274
|
-
TransferStatus.SUCCESS if transfer was successful
|
275
|
-
TransferStatus.ERROR if transfer failed due to insufficient disk space,
|
276
|
-
file not found, or other errors.
|
277
|
-
|
278
|
-
Raises:
|
279
|
-
CacheOperationInterrupted: If transfer interrupted due to insufficient
|
280
|
-
disk space (caught and returns TransferStatus.ERROR).
|
281
|
-
Exception: Any other errors during transfer (caught and returns TransferStatus.ERROR).
|
282
|
-
"""
|
283
|
-
source_path = Path(source)
|
284
|
-
dest_path = Path(dest)
|
285
|
-
|
286
|
-
# Validate source file exists
|
287
|
-
if not source_path.exists():
|
288
|
-
logger.error(f"Source file does not exist: {source}")
|
289
|
-
return TransferStatus.ERROR
|
290
|
-
|
291
|
-
# Create destination directory if it doesn't exist
|
292
|
-
dest_path.parent.mkdir(parents=True, exist_ok=True)
|
293
|
-
|
294
|
-
# Determine appropriate space threshold based on destination directory
|
295
|
-
dest_dir = dest_path.parent
|
296
|
-
if str(dest_dir).startswith(B10FS_CACHE_DIR):
|
297
|
-
# Transferring to b10fs - use b10fs space requirements
|
298
|
-
space_threshold_mb = REQUIRED_B10FS_SPACE_MB
|
299
|
-
logger.debug(
|
300
|
-
f"Transfer to b10fs detected, using {space_threshold_mb:.1f}MB threshold"
|
301
|
-
)
|
302
|
-
else:
|
303
|
-
# Transferring to local directory - use local space requirements
|
304
|
-
space_threshold_mb = MIN_LOCAL_SPACE_MB
|
305
|
-
logger.debug(
|
306
|
-
f"Transfer to local directory detected, using {space_threshold_mb:.1f}MB threshold"
|
307
|
-
)
|
308
|
-
|
309
|
-
# Initial disk space check
|
310
|
-
check_sufficient_disk_space(dest_dir, space_threshold_mb, "file transfer")
|
311
|
-
logger.debug(
|
312
|
-
f"Initial space check passed: {space_threshold_mb:.1f}MB required at destination"
|
313
|
-
)
|
314
|
-
|
315
|
-
# Start background space monitoring for destination directory
|
316
|
-
space_monitor = CacheSpaceMonitor(space_threshold_mb, dest_dir)
|
317
|
-
space_monitor.start()
|
318
|
-
|
319
|
-
try:
|
320
|
-
# Run monitored copy process
|
321
|
-
logger.info(f"Starting transfer: {source} -> {dest}")
|
322
|
-
run_monitored_process(
|
323
|
-
_cache_copy_worker,
|
324
|
-
(str(source_path), str(dest_path)),
|
325
|
-
space_monitor,
|
326
|
-
"file transfer",
|
327
|
-
cleanup_func=lambda: safe_unlink(
|
328
|
-
dest_path, f"Failed to cleanup interrupted transfer {dest_path}"
|
329
|
-
),
|
330
|
-
)
|
331
|
-
|
332
|
-
logger.info("File transfer complete")
|
333
|
-
return TransferStatus.SUCCESS
|
334
|
-
|
335
|
-
except CacheOperationInterrupted as e:
|
336
|
-
logger.warning(f"File transfer interrupted: {e}")
|
337
|
-
return TransferStatus.ERROR
|
338
|
-
|
339
|
-
finally:
|
340
|
-
space_monitor.stop()
|
283
|
+
return OperationStatus.ERROR
|
341
284
|
|
342
285
|
|
343
286
|
@safe_execute("Clear failed", False)
|
@@ -357,6 +300,8 @@ def clear_local_cache() -> bool:
|
|
357
300
|
torch_dir = Path(TORCH_CACHE_DIR)
|
358
301
|
if not torch_dir.exists():
|
359
302
|
return True
|
303
|
+
import shutil
|
304
|
+
|
360
305
|
shutil.rmtree(torch_dir)
|
361
306
|
return True
|
362
307
|
|
@@ -384,27 +329,12 @@ def _cache_compression_worker(
|
|
384
329
|
create_archive(torch_dir, local_temp, max_size_mb)
|
385
330
|
|
386
331
|
|
387
|
-
@worker_process("Copy was cancelled before starting")
|
388
|
-
def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
|
389
|
-
"""Worker process that handles file copy to b10fs.
|
390
|
-
|
391
|
-
This function runs in a separate process to copy the compressed cache file
|
392
|
-
to the b10fs filesystem. It can be terminated externally if disk space becomes insufficient.
|
393
|
-
|
394
|
-
Args:
|
395
|
-
source_path_str: String path to the source file to copy.
|
396
|
-
dest_path_str: String path where the file will be copied.
|
397
|
-
"""
|
398
|
-
source_path = Path(source_path_str)
|
399
|
-
dest_path = Path(dest_path_str)
|
400
|
-
|
401
|
-
shutil.copy2(source_path, dest_path)
|
402
|
-
|
403
|
-
|
404
332
|
def _cleanup_torch_dir(torch_dir: Path) -> None:
|
405
333
|
"""Helper function to safely cleanup torch directory during interrupted extraction."""
|
406
334
|
try:
|
407
335
|
if torch_dir.exists():
|
336
|
+
import shutil
|
337
|
+
|
408
338
|
shutil.rmtree(torch_dir)
|
409
339
|
logger.debug(f"Cleaned up torch directory: {torch_dir}")
|
410
340
|
except Exception as e:
|
@@ -114,25 +114,10 @@ class WorkerStatus(Enum):
|
|
114
114
|
CANCELLED = auto()
|
115
115
|
|
116
116
|
|
117
|
-
class LoadStatus(Enum):
|
118
|
-
"""Status values for cache loading operations."""
|
119
|
-
|
120
|
-
SUCCESS = auto()
|
121
|
-
ERROR = auto()
|
122
|
-
DOES_NOT_EXIST = auto()
|
123
|
-
SKIPPED = auto()
|
124
|
-
|
125
|
-
|
126
|
-
class SaveStatus(Enum):
|
127
|
-
"""Status values for cache saving operations."""
|
128
|
-
|
129
|
-
SUCCESS = auto()
|
130
|
-
ERROR = auto()
|
131
|
-
SKIPPED = auto()
|
132
|
-
|
133
|
-
|
134
|
-
class TransferStatus(Enum):
|
135
|
-
"""Status values for file transfer operations."""
|
117
|
+
class OperationStatus(Enum):
|
118
|
+
"""Status values for all b10-transfer operations (load, save, transfer)."""
|
136
119
|
|
137
120
|
SUCCESS = auto()
|
138
121
|
ERROR = auto()
|
122
|
+
DOES_NOT_EXIST = auto() # Used by load operations when cache file not found
|
123
|
+
SKIPPED = auto() # Used by load/save operations when operation not needed
|
@@ -0,0 +1,131 @@
|
|
1
|
+
"""Core file transfer operations for b10-transfer.
|
2
|
+
|
3
|
+
This module provides generic file transfer functionality with space monitoring
|
4
|
+
and error handling for b10fs operations.
|
5
|
+
"""
|
6
|
+
|
7
|
+
import logging
|
8
|
+
import shutil
|
9
|
+
from pathlib import Path
|
10
|
+
|
11
|
+
from .utils import (
|
12
|
+
timed_fn,
|
13
|
+
safe_execute,
|
14
|
+
safe_unlink,
|
15
|
+
)
|
16
|
+
from .space_monitor import (
|
17
|
+
check_sufficient_disk_space,
|
18
|
+
CacheSpaceMonitor,
|
19
|
+
CacheOperationInterrupted,
|
20
|
+
run_monitored_process,
|
21
|
+
worker_process,
|
22
|
+
)
|
23
|
+
from .constants import (
|
24
|
+
B10FS_CACHE_DIR,
|
25
|
+
REQUIRED_B10FS_SPACE_MB,
|
26
|
+
MIN_LOCAL_SPACE_MB,
|
27
|
+
OperationStatus,
|
28
|
+
)
|
29
|
+
|
30
|
+
logger = logging.getLogger(__name__)
|
31
|
+
|
32
|
+
|
33
|
+
@timed_fn(logger=logger, name="Transferring file")
|
34
|
+
@safe_execute("Transfer failed", OperationStatus.ERROR)
|
35
|
+
def transfer(source: str, dest: str) -> OperationStatus:
|
36
|
+
"""Transfer a file from source to destination with space monitoring.
|
37
|
+
|
38
|
+
This function copies a file from source to destination using the same
|
39
|
+
monitored process approach as the cache operations. It monitors disk space
|
40
|
+
at the destination and can interrupt the transfer if space becomes insufficient.
|
41
|
+
|
42
|
+
Args:
|
43
|
+
source: Path to the source file to copy.
|
44
|
+
dest: Path to the destination where the file will be copied.
|
45
|
+
|
46
|
+
Returns:
|
47
|
+
OperationStatus:
|
48
|
+
OperationStatus.SUCCESS if transfer was successful
|
49
|
+
OperationStatus.ERROR if transfer failed due to insufficient disk space,
|
50
|
+
file not found, or other errors.
|
51
|
+
|
52
|
+
Raises:
|
53
|
+
CacheOperationInterrupted: If transfer interrupted due to insufficient
|
54
|
+
disk space (caught and returns OperationStatus.ERROR).
|
55
|
+
Exception: Any other errors during transfer (caught and returns OperationStatus.ERROR).
|
56
|
+
"""
|
57
|
+
source_path = Path(source)
|
58
|
+
dest_path = Path(dest)
|
59
|
+
|
60
|
+
# Validate source file exists
|
61
|
+
if not source_path.exists():
|
62
|
+
logger.error(f"Source file does not exist: {source}")
|
63
|
+
return OperationStatus.ERROR
|
64
|
+
|
65
|
+
# Create destination directory if it doesn't exist
|
66
|
+
dest_path.parent.mkdir(parents=True, exist_ok=True)
|
67
|
+
|
68
|
+
# Determine appropriate space threshold based on destination directory
|
69
|
+
dest_dir = dest_path.parent
|
70
|
+
if str(dest_dir).startswith(B10FS_CACHE_DIR):
|
71
|
+
# Transferring to b10fs - use b10fs space requirements
|
72
|
+
space_threshold_mb = REQUIRED_B10FS_SPACE_MB
|
73
|
+
logger.debug(
|
74
|
+
f"Transfer to b10fs detected, using {space_threshold_mb:.1f}MB threshold"
|
75
|
+
)
|
76
|
+
else:
|
77
|
+
# Transferring to local directory - use local space requirements
|
78
|
+
space_threshold_mb = MIN_LOCAL_SPACE_MB
|
79
|
+
logger.debug(
|
80
|
+
f"Transfer to local directory detected, using {space_threshold_mb:.1f}MB threshold"
|
81
|
+
)
|
82
|
+
|
83
|
+
# Initial disk space check
|
84
|
+
check_sufficient_disk_space(dest_dir, space_threshold_mb, "file transfer")
|
85
|
+
logger.debug(
|
86
|
+
f"Initial space check passed: {space_threshold_mb:.1f}MB required at destination"
|
87
|
+
)
|
88
|
+
|
89
|
+
# Start background space monitoring for destination directory
|
90
|
+
space_monitor = CacheSpaceMonitor(space_threshold_mb, dest_dir)
|
91
|
+
space_monitor.start()
|
92
|
+
|
93
|
+
try:
|
94
|
+
# Run monitored copy process
|
95
|
+
logger.info(f"Starting transfer: {source} -> {dest}")
|
96
|
+
run_monitored_process(
|
97
|
+
_cache_copy_worker,
|
98
|
+
(str(source_path), str(dest_path)),
|
99
|
+
space_monitor,
|
100
|
+
"file transfer",
|
101
|
+
cleanup_func=lambda: safe_unlink(
|
102
|
+
dest_path, f"Failed to cleanup interrupted transfer {dest_path}"
|
103
|
+
),
|
104
|
+
)
|
105
|
+
|
106
|
+
logger.info("File transfer complete")
|
107
|
+
return OperationStatus.SUCCESS
|
108
|
+
|
109
|
+
except CacheOperationInterrupted as e:
|
110
|
+
logger.warning(f"File transfer interrupted: {e}")
|
111
|
+
return OperationStatus.ERROR
|
112
|
+
|
113
|
+
finally:
|
114
|
+
space_monitor.stop()
|
115
|
+
|
116
|
+
|
117
|
+
@worker_process("Copy was cancelled before starting")
|
118
|
+
def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
|
119
|
+
"""Worker process that handles file copy operations.
|
120
|
+
|
121
|
+
This function runs in a separate process to copy files between locations.
|
122
|
+
It can be terminated externally if disk space becomes insufficient.
|
123
|
+
|
124
|
+
Args:
|
125
|
+
source_path_str: String path to the source file to copy.
|
126
|
+
dest_path_str: String path where the file will be copied.
|
127
|
+
"""
|
128
|
+
source_path = Path(source_path_str)
|
129
|
+
dest_path = Path(dest_path_str)
|
130
|
+
|
131
|
+
shutil.copy2(source_path, dest_path)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|