b10-transfer 0.1.3__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {b10_transfer-0.1.3 → b10_transfer-0.1.5}/PKG-INFO +1 -1
- {b10_transfer-0.1.3 → b10_transfer-0.1.5}/pyproject.toml +1 -1
- {b10_transfer-0.1.3 → b10_transfer-0.1.5}/src/b10_transfer/__init__.py +5 -5
- {b10_transfer-0.1.3 → b10_transfer-0.1.5}/src/b10_transfer/constants.py +4 -19
- {b10_transfer-0.1.3 → b10_transfer-0.1.5}/src/b10_transfer/core.py +133 -156
- {b10_transfer-0.1.3 → b10_transfer-0.1.5}/README.md +0 -0
- {b10_transfer-0.1.3 → b10_transfer-0.1.5}/src/b10_transfer/archive.py +0 -0
- {b10_transfer-0.1.3 → b10_transfer-0.1.5}/src/b10_transfer/cleanup.py +0 -0
- {b10_transfer-0.1.3 → b10_transfer-0.1.5}/src/b10_transfer/environment.py +0 -0
- {b10_transfer-0.1.3 → b10_transfer-0.1.5}/src/b10_transfer/info.py +0 -0
- {b10_transfer-0.1.3 → b10_transfer-0.1.5}/src/b10_transfer/space_monitor.py +0 -0
- {b10_transfer-0.1.3 → b10_transfer-0.1.5}/src/b10_transfer/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: b10-transfer
|
3
|
-
Version: 0.1.3
|
3
|
+
Version: 0.1.5
|
4
4
|
Summary: Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management
|
5
5
|
License: MIT
|
6
6
|
Keywords: pytorch,file-transfer,cache,machine-learning,inference
|
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|
4
4
|
|
5
5
|
[tool.poetry]
|
6
6
|
name = "b10-transfer"
|
7
|
-
version = "0.1.3"
|
7
|
+
version = "0.1.5"
|
8
8
|
description = "Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management"
|
9
9
|
authors = ["Shounak Ray <shounak.noreply@baseten.co>", "Fred Liu <fred.liu.noreply@baseten.co>"]
|
10
10
|
maintainers = ["Fred Liu <fred.liu.noreply@baseten.co>", "Shounak Ray <shounak.noreply@baseten.co>"]
|
@@ -1,23 +1,23 @@
|
|
1
1
|
"""B10 Transfer - Lock-free PyTorch file transfer for Baseten."""
|
2
2
|
|
3
|
-
from .core import load_compile_cache, save_compile_cache, clear_local_cache
|
3
|
+
from .core import load_compile_cache, save_compile_cache, clear_local_cache, transfer
|
4
4
|
from .utils import CacheError, CacheValidationError
|
5
5
|
from .space_monitor import CacheOperationInterrupted
|
6
6
|
from .info import get_cache_info, list_available_caches
|
7
|
-
from .constants import
|
7
|
+
from .constants import OperationStatus
|
8
8
|
|
9
9
|
# Version
|
10
|
-
__version__ = "0.1.3"
|
10
|
+
__version__ = "0.1.5"
|
11
11
|
|
12
12
|
__all__ = [
|
13
13
|
"CacheError",
|
14
14
|
"CacheValidationError",
|
15
15
|
"CacheOperationInterrupted",
|
16
|
-
"
|
17
|
-
"LoadStatus",
|
16
|
+
"OperationStatus",
|
18
17
|
"load_compile_cache",
|
19
18
|
"save_compile_cache",
|
20
19
|
"clear_local_cache",
|
20
|
+
"transfer",
|
21
21
|
"get_cache_info",
|
22
22
|
"list_available_caches",
|
23
23
|
]
|
@@ -114,25 +114,10 @@ class WorkerStatus(Enum):
|
|
114
114
|
CANCELLED = auto()
|
115
115
|
|
116
116
|
|
117
|
-
class LoadStatus(Enum):
|
118
|
-
"""Status values for cache loading operations."""
|
119
|
-
|
120
|
-
SUCCESS = auto()
|
121
|
-
ERROR = auto()
|
122
|
-
DOES_NOT_EXIST = auto()
|
123
|
-
SKIPPED = auto()
|
124
|
-
|
125
|
-
|
126
|
-
class SaveStatus(Enum):
|
127
|
-
"""Status values for cache saving operations."""
|
128
|
-
|
129
|
-
SUCCESS = auto()
|
130
|
-
ERROR = auto()
|
131
|
-
SKIPPED = auto()
|
132
|
-
|
133
|
-
|
134
|
-
class TransferStatus(Enum):
|
135
|
-
"""Status values for file transfer operations."""
|
117
|
+
class OperationStatus(Enum):
|
118
|
+
"""Status values for all b10-transfer operations (load, save, transfer)."""
|
136
119
|
|
137
120
|
SUCCESS = auto()
|
138
121
|
ERROR = auto()
|
122
|
+
DOES_NOT_EXIST = auto() # Used by load operations when cache file not found
|
123
|
+
SKIPPED = auto() # Used by load/save operations when operation not needed
|
@@ -33,17 +33,83 @@ from .constants import (
|
|
33
33
|
CACHE_FILE_EXTENSION,
|
34
34
|
CACHE_LATEST_SUFFIX,
|
35
35
|
CACHE_INCOMPLETE_SUFFIX,
|
36
|
-
|
37
|
-
SaveStatus,
|
38
|
-
TransferStatus,
|
36
|
+
OperationStatus,
|
39
37
|
)
|
40
38
|
|
41
39
|
logger = logging.getLogger(__name__)
|
42
40
|
|
43
41
|
|
42
|
+
def _setup_cache_paths():
|
43
|
+
"""Common setup for cache operations - returns paths and performs cleanup."""
|
44
|
+
# Cooperative cleanup of stale shared resources
|
45
|
+
cooperative_cleanup_b10fs()
|
46
|
+
|
47
|
+
b10fs_dir = Path(B10FS_CACHE_DIR)
|
48
|
+
torch_dir = Path(TORCH_CACHE_DIR)
|
49
|
+
work_dir = Path(LOCAL_WORK_DIR)
|
50
|
+
|
51
|
+
return b10fs_dir, torch_dir, work_dir
|
52
|
+
|
53
|
+
|
54
|
+
def _get_cache_file_paths(cache_filename: str, b10fs_dir: Path):
|
55
|
+
"""Generate cache file paths for a given cache filename."""
|
56
|
+
final_file = (
|
57
|
+
b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
|
58
|
+
)
|
59
|
+
temp_file = (
|
60
|
+
b10fs_dir / f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
|
61
|
+
)
|
62
|
+
return final_file, temp_file
|
63
|
+
|
64
|
+
|
65
|
+
def _run_with_space_monitoring(
|
66
|
+
space_threshold_mb: float,
|
67
|
+
monitor_dir: Path,
|
68
|
+
operation_name: str,
|
69
|
+
worker_func,
|
70
|
+
worker_args: tuple,
|
71
|
+
cleanup_func=None,
|
72
|
+
):
|
73
|
+
"""Helper to run an operation with space monitoring."""
|
74
|
+
space_monitor = CacheSpaceMonitor(space_threshold_mb, monitor_dir)
|
75
|
+
space_monitor.start()
|
76
|
+
|
77
|
+
try:
|
78
|
+
logger.info(
|
79
|
+
f"Starting {operation_name}: {' -> '.join(str(arg) for arg in worker_args[:2])}"
|
80
|
+
)
|
81
|
+
run_monitored_process(
|
82
|
+
worker_func,
|
83
|
+
worker_args,
|
84
|
+
space_monitor,
|
85
|
+
operation_name,
|
86
|
+
cleanup_func=cleanup_func,
|
87
|
+
)
|
88
|
+
finally:
|
89
|
+
space_monitor.stop()
|
90
|
+
|
91
|
+
|
92
|
+
def _transfer_with_b10fs_lock(
|
93
|
+
source: str, dest: str, lock_type: str, cleanup_on_failure=True
|
94
|
+
):
|
95
|
+
"""Transfer a file with b10fs file locking and error handling."""
|
96
|
+
|
97
|
+
@critical_section_b10fs_file_lock(lock_type)
|
98
|
+
def _locked_transfer():
|
99
|
+
result = transfer(source, dest)
|
100
|
+
if result != OperationStatus.SUCCESS:
|
101
|
+
if cleanup_on_failure:
|
102
|
+
safe_unlink(
|
103
|
+
Path(dest), f"Failed to cleanup after failed transfer {dest}"
|
104
|
+
)
|
105
|
+
raise Exception(f"Failed to transfer {source} -> {dest}")
|
106
|
+
|
107
|
+
_locked_transfer()
|
108
|
+
|
109
|
+
|
44
110
|
@timed_fn(logger=logger, name="Loading compile cache")
|
45
111
|
@safe_execute("Load failed", False)
|
46
|
-
def load_compile_cache() -> LoadStatus:
|
112
|
+
def load_compile_cache() -> OperationStatus:
|
47
113
|
"""Load PyTorch compilation cache from b10fs to local torch cache directory.
|
48
114
|
|
49
115
|
This function implements a lock-free pattern to safely load cached PyTorch
|
@@ -55,52 +121,33 @@ def load_compile_cache() -> LoadStatus:
|
|
55
121
|
extraction phases, interrupting operations if space falls below MIN_LOCAL_SPACE_MB.
|
56
122
|
|
57
123
|
Returns:
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
124
|
+
OperationStatus:
|
125
|
+
OperationStatus.SUCCESS if cache was successfully loaded
|
126
|
+
OperationStatus.SKIPPED if already exists
|
127
|
+
OperationStatus.ERROR if b10fs is unavailable, local disk space is insufficient, or loading failed.
|
128
|
+
OperationStatus.DOES_NOT_EXIST if no cache file was found.
|
63
129
|
|
64
130
|
Raises:
|
65
|
-
CacheValidationError: If b10fs is not enabled (caught and returns
|
131
|
+
CacheValidationError: If b10fs is not enabled (caught and returns OperationStatus.ERROR).
|
66
132
|
CacheOperationInterrupted: If operations interrupted due to insufficient
|
67
|
-
local disk space (caught and returns
|
68
|
-
Exception: Any other errors during loading (caught and returns
|
133
|
+
local disk space (caught and returns OperationStatus.ERROR).
|
134
|
+
Exception: Any other errors during loading (caught and returns OperationStatus.ERROR).
|
69
135
|
"""
|
70
136
|
with cache_operation("Load"):
|
71
|
-
|
72
|
-
cooperative_cleanup_b10fs()
|
73
|
-
|
74
|
-
b10fs_dir = Path(B10FS_CACHE_DIR)
|
75
|
-
torch_dir = Path(TORCH_CACHE_DIR)
|
76
|
-
work_dir = Path(LOCAL_WORK_DIR)
|
137
|
+
b10fs_dir, torch_dir, work_dir = _setup_cache_paths()
|
77
138
|
|
78
139
|
cache_filename = get_cache_filename()
|
79
|
-
|
80
|
-
|
81
|
-
)
|
82
|
-
logger.debug(f"Looking for cache file: {cache_file}")
|
140
|
+
final_file, _ = _get_cache_file_paths(cache_filename, b10fs_dir)
|
141
|
+
logger.debug(f"Looking for cache file: {final_file}")
|
83
142
|
|
84
|
-
if not cache_file.exists():
|
143
|
+
if not final_file.exists():
|
85
144
|
logger.info("No cache file found in b10fs")
|
86
|
-
return LoadStatus.DOES_NOT_EXIST
|
145
|
+
return OperationStatus.DOES_NOT_EXIST
|
87
146
|
|
88
147
|
# Skip if already loaded
|
89
148
|
if torch_dir.exists() and any(torch_dir.iterdir()):
|
90
149
|
logger.info("Torch cache already loaded, skipping extraction")
|
91
|
-
return LoadStatus.SKIPPED
|
92
|
-
|
93
|
-
# Initial disk space check for local operations
|
94
|
-
check_sufficient_disk_space(
|
95
|
-
work_dir, MIN_LOCAL_SPACE_MB, "cache load operations"
|
96
|
-
)
|
97
|
-
logger.debug(
|
98
|
-
f"Initial space check passed: {MIN_LOCAL_SPACE_MB:.1f}MB required on local machine"
|
99
|
-
)
|
100
|
-
|
101
|
-
# Start background space monitoring for local disk
|
102
|
-
space_monitor = CacheSpaceMonitor(MIN_LOCAL_SPACE_MB, work_dir)
|
103
|
-
space_monitor.start()
|
150
|
+
return OperationStatus.SKIPPED
|
104
151
|
|
105
152
|
# Create temp local copy
|
106
153
|
with tempfile.NamedTemporaryFile(
|
@@ -111,40 +158,30 @@ def load_compile_cache() -> LoadStatus:
|
|
111
158
|
|
112
159
|
try:
|
113
160
|
with temp_file_cleanup(temp_path):
|
114
|
-
# Phase 1: Copy from b10fs to local temp file
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
_monitored_copy_from_b10fs()
|
128
|
-
|
129
|
-
# Phase 2: Extract archive in separate process
|
130
|
-
logger.info(f"Starting extraction: {temp_path} -> {torch_dir}")
|
131
|
-
run_monitored_process(
|
161
|
+
# Phase 1: Copy from b10fs to local temp file
|
162
|
+
_transfer_with_b10fs_lock(
|
163
|
+
str(final_file),
|
164
|
+
str(temp_path),
|
165
|
+
"copy_out",
|
166
|
+
cleanup_on_failure=False,
|
167
|
+
)
|
168
|
+
|
169
|
+
# Phase 2: Extract archive with space monitoring
|
170
|
+
_run_with_space_monitoring(
|
171
|
+
MIN_LOCAL_SPACE_MB,
|
172
|
+
work_dir,
|
173
|
+
"archive extraction",
|
132
174
|
_cache_extract_worker,
|
133
175
|
(str(temp_path), str(torch_dir)),
|
134
|
-
space_monitor,
|
135
|
-
"archive extraction",
|
136
176
|
cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
|
137
177
|
)
|
138
178
|
|
139
179
|
logger.info("Cache load complete")
|
140
|
-
return LoadStatus.SUCCESS
|
180
|
+
return OperationStatus.SUCCESS
|
141
181
|
|
142
182
|
except CacheOperationInterrupted as e:
|
143
183
|
logger.warning(f"Cache load interrupted: {e}")
|
144
|
-
return LoadStatus.ERROR
|
145
|
-
|
146
|
-
finally:
|
147
|
-
space_monitor.stop()
|
184
|
+
return OperationStatus.ERROR
|
148
185
|
|
149
186
|
|
150
187
|
"""
|
@@ -165,7 +202,7 @@ More things to consider:
|
|
165
202
|
|
166
203
|
@timed_fn(logger=logger, name="Saving compile cache")
|
167
204
|
@safe_execute("Save failed", False)
|
168
|
-
def save_compile_cache() -> SaveStatus:
|
205
|
+
def save_compile_cache() -> OperationStatus:
|
169
206
|
"""Save local PyTorch compilation cache to b10fs using atomic journal pattern.
|
170
207
|
|
171
208
|
This function creates an archive of the local torch cache directory and
|
@@ -179,61 +216,34 @@ def save_compile_cache() -> SaveStatus:
|
|
179
216
|
space becomes insufficient, finally performing an atomic rename to the final cache file.
|
180
217
|
|
181
218
|
Returns:
|
182
|
-
|
183
|
-
|
184
|
-
|
219
|
+
OperationStatus:
|
220
|
+
OperationStatus.SUCCESS if cache was successfully saved or already exists
|
221
|
+
OperationStatus.ERROR if b10fs is unavailable, insufficient disk space caused interruption,
|
185
222
|
no cache exists to save, or saving failed.
|
186
|
-
|
223
|
+
OperationStatus.SKIPPED if no cache exists to save or cache already exists in b10fs
|
187
224
|
|
188
225
|
Raises:
|
189
|
-
CacheValidationError: If b10fs is not enabled (caught and returns
|
226
|
+
CacheValidationError: If b10fs is not enabled (caught and returns OperationStatus.ERROR).
|
190
227
|
CacheOperationInterrupted: If operations interrupted due to insufficient
|
191
|
-
disk space (caught and returns
|
192
|
-
ArchiveError: If archive creation fails (caught and returns
|
193
|
-
Exception: Any other errors during saving (caught and returns
|
228
|
+
disk space (caught and returns OperationStatus.ERROR).
|
229
|
+
ArchiveError: If archive creation fails (caught and returns OperationStatus.ERROR).
|
230
|
+
Exception: Any other errors during saving (caught and returns OperationStatus.ERROR).
|
194
231
|
"""
|
195
232
|
with cache_operation("Save"):
|
196
|
-
|
197
|
-
cooperative_cleanup_b10fs()
|
198
|
-
|
199
|
-
b10fs_dir = Path(B10FS_CACHE_DIR)
|
200
|
-
torch_dir = Path(TORCH_CACHE_DIR)
|
201
|
-
work_dir = Path(LOCAL_WORK_DIR)
|
233
|
+
b10fs_dir, torch_dir, work_dir = _setup_cache_paths()
|
202
234
|
|
203
235
|
# Check if anything to save
|
204
236
|
if not torch_dir.exists() or not any(torch_dir.iterdir()):
|
205
237
|
logger.info("No torch cache to save")
|
206
|
-
return
|
238
|
+
return OperationStatus.SKIPPED
|
207
239
|
|
208
240
|
cache_filename = get_cache_filename()
|
209
|
-
final_file = (
|
210
|
-
b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
|
211
|
-
)
|
241
|
+
final_file, temp_file = _get_cache_file_paths(cache_filename, b10fs_dir)
|
212
242
|
|
213
243
|
# Check for existing cache first (early exit)
|
214
244
|
if final_file.exists():
|
215
245
|
logger.info("Cache already exists in b10fs, skipping save")
|
216
|
-
return
|
217
|
-
|
218
|
-
# Initial disk space checks using calculated space requirements
|
219
|
-
check_sufficient_disk_space(
|
220
|
-
work_dir, MAX_CACHE_SIZE_MB, "local temp file creation"
|
221
|
-
)
|
222
|
-
check_sufficient_disk_space(
|
223
|
-
b10fs_dir, REQUIRED_B10FS_SPACE_MB, "cache save to b10fs"
|
224
|
-
)
|
225
|
-
logger.debug(
|
226
|
-
f"Initial space checks passed: {MAX_CACHE_SIZE_MB:.1f}MB local, {REQUIRED_B10FS_SPACE_MB:.1f}MB b10fs"
|
227
|
-
)
|
228
|
-
|
229
|
-
temp_file = (
|
230
|
-
b10fs_dir
|
231
|
-
/ f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
|
232
|
-
)
|
233
|
-
|
234
|
-
# Start background space monitoring
|
235
|
-
space_monitor = CacheSpaceMonitor(REQUIRED_B10FS_SPACE_MB, b10fs_dir)
|
236
|
-
space_monitor.start()
|
246
|
+
return OperationStatus.SKIPPED
|
237
247
|
|
238
248
|
with tempfile.NamedTemporaryFile(
|
239
249
|
suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
|
@@ -243,32 +253,19 @@ def save_compile_cache() -> SaveStatus:
|
|
243
253
|
|
244
254
|
try:
|
245
255
|
with temp_file_cleanup(local_temp):
|
246
|
-
# Phase 1: Compression
|
247
|
-
|
248
|
-
|
256
|
+
# Phase 1: Compression with space monitoring
|
257
|
+
_run_with_space_monitoring(
|
258
|
+
REQUIRED_B10FS_SPACE_MB,
|
259
|
+
b10fs_dir,
|
260
|
+
"compression",
|
249
261
|
_cache_compression_worker,
|
250
262
|
(str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
|
251
|
-
space_monitor,
|
252
|
-
"compression",
|
253
263
|
)
|
254
264
|
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
def _monitored_copy_to_b10fs():
|
260
|
-
logger.info(f"Starting copy to b10fs: {local_temp} -> {temp_file}")
|
261
|
-
run_monitored_process(
|
262
|
-
_cache_copy_worker,
|
263
|
-
(str(local_temp), str(temp_file)),
|
264
|
-
space_monitor,
|
265
|
-
"b10fs copy",
|
266
|
-
cleanup_func=lambda: safe_unlink(
|
267
|
-
temp_file, f"Failed to cleanup interrupted copy {temp_file}"
|
268
|
-
),
|
269
|
-
)
|
270
|
-
|
271
|
-
_monitored_copy_to_b10fs()
|
265
|
+
# Phase 2: Copy to b10fs with locking
|
266
|
+
_transfer_with_b10fs_lock(
|
267
|
+
str(local_temp), str(temp_file), "copy_in", cleanup_on_failure=True
|
268
|
+
)
|
272
269
|
|
273
270
|
# Phase 3: Atomic rename (fast, don't interrupt)
|
274
271
|
logger.info(
|
@@ -277,19 +274,16 @@ def save_compile_cache() -> SaveStatus:
|
|
277
274
|
temp_file.rename(final_file)
|
278
275
|
|
279
276
|
logger.info("Cache save complete")
|
280
|
-
return SaveStatus.SUCCESS
|
277
|
+
return OperationStatus.SUCCESS
|
281
278
|
|
282
279
|
except CacheOperationInterrupted as e:
|
283
280
|
logger.warning(f"Cache save interrupted: {e}")
|
284
|
-
return SaveStatus.ERROR
|
285
|
-
|
286
|
-
finally:
|
287
|
-
space_monitor.stop()
|
281
|
+
return OperationStatus.ERROR
|
288
282
|
|
289
283
|
|
290
284
|
@timed_fn(logger=logger, name="Transferring file")
|
291
|
-
@safe_execute("Transfer failed",
|
292
|
-
def transfer(source: str, dest: str) -> TransferStatus:
|
285
|
+
@safe_execute("Transfer failed", OperationStatus.ERROR)
|
286
|
+
def transfer(source: str, dest: str) -> OperationStatus:
|
293
287
|
"""Transfer a file from source to destination with space monitoring.
|
294
288
|
|
295
289
|
This function copies a file from source to destination using the same
|
@@ -301,15 +295,15 @@ def transfer(source: str, dest: str) -> TransferStatus:
|
|
301
295
|
dest: Path to the destination where the file will be copied.
|
302
296
|
|
303
297
|
Returns:
|
304
|
-
|
305
|
-
|
306
|
-
|
298
|
+
OperationStatus:
|
299
|
+
OperationStatus.SUCCESS if transfer was successful
|
300
|
+
OperationStatus.ERROR if transfer failed due to insufficient disk space,
|
307
301
|
file not found, or other errors.
|
308
302
|
|
309
303
|
Raises:
|
310
304
|
CacheOperationInterrupted: If transfer interrupted due to insufficient
|
311
|
-
disk space (caught and returns
|
312
|
-
Exception: Any other errors during transfer (caught and returns
|
305
|
+
disk space (caught and returns OperationStatus.ERROR).
|
306
|
+
Exception: Any other errors during transfer (caught and returns OperationStatus.ERROR).
|
313
307
|
"""
|
314
308
|
source_path = Path(source)
|
315
309
|
dest_path = Path(dest)
|
@@ -317,7 +311,7 @@ def transfer(source: str, dest: str) -> TransferStatus:
|
|
317
311
|
# Validate source file exists
|
318
312
|
if not source_path.exists():
|
319
313
|
logger.error(f"Source file does not exist: {source}")
|
320
|
-
return TransferStatus.ERROR
|
314
|
+
return OperationStatus.ERROR
|
321
315
|
|
322
316
|
# Create destination directory if it doesn't exist
|
323
317
|
dest_path.parent.mkdir(parents=True, exist_ok=True)
|
@@ -361,11 +355,11 @@ def transfer(source: str, dest: str) -> TransferStatus:
|
|
361
355
|
)
|
362
356
|
|
363
357
|
logger.info("File transfer complete")
|
364
|
-
return TransferStatus.SUCCESS
|
358
|
+
return OperationStatus.SUCCESS
|
365
359
|
|
366
360
|
except CacheOperationInterrupted as e:
|
367
361
|
logger.warning(f"File transfer interrupted: {e}")
|
368
|
-
return TransferStatus.ERROR
|
362
|
+
return OperationStatus.ERROR
|
369
363
|
|
370
364
|
finally:
|
371
365
|
space_monitor.stop()
|
@@ -432,23 +426,6 @@ def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
|
|
432
426
|
shutil.copy2(source_path, dest_path)
|
433
427
|
|
434
428
|
|
435
|
-
@worker_process("Copy from b10fs was cancelled before starting")
|
436
|
-
def _cache_copy_from_b10fs_worker(source_path_str: str, dest_path_str: str) -> None:
|
437
|
-
"""Worker process that handles file copy from b10fs to local machine.
|
438
|
-
|
439
|
-
This function runs in a separate process to copy the cache file from b10fs
|
440
|
-
to the local filesystem. It can be terminated externally if local disk space becomes insufficient.
|
441
|
-
|
442
|
-
Args:
|
443
|
-
source_path_str: String path to the source file in b10fs to copy.
|
444
|
-
dest_path_str: String path where the file will be copied locally.
|
445
|
-
"""
|
446
|
-
source_path = Path(source_path_str)
|
447
|
-
dest_path = Path(dest_path_str)
|
448
|
-
|
449
|
-
shutil.copy2(source_path, dest_path)
|
450
|
-
|
451
|
-
|
452
429
|
def _cleanup_torch_dir(torch_dir: Path) -> None:
|
453
430
|
"""Helper function to safely cleanup torch directory during interrupted extraction."""
|
454
431
|
try:
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|