b10-transfer 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- b10_transfer/__init__.py +1 -1
- b10_transfer/constants.py +7 -0
- b10_transfer/core.py +125 -88
- {b10_transfer-0.1.2.dist-info → b10_transfer-0.1.4.dist-info}/METADATA +1 -1
- {b10_transfer-0.1.2.dist-info → b10_transfer-0.1.4.dist-info}/RECORD +6 -6
- {b10_transfer-0.1.2.dist-info → b10_transfer-0.1.4.dist-info}/WHEEL +0 -0
b10_transfer/__init__.py
CHANGED
b10_transfer/constants.py
CHANGED
b10_transfer/core.py
CHANGED
@@ -35,6 +35,7 @@ from .constants import (
|
|
35
35
|
CACHE_INCOMPLETE_SUFFIX,
|
36
36
|
LoadStatus,
|
37
37
|
SaveStatus,
|
38
|
+
TransferStatus,
|
38
39
|
)
|
39
40
|
|
40
41
|
logger = logging.getLogger(__name__)
|
@@ -89,18 +90,6 @@ def load_compile_cache() -> LoadStatus:
|
|
89
90
|
logger.info("Torch cache already loaded, skipping extraction")
|
90
91
|
return LoadStatus.SKIPPED
|
91
92
|
|
92
|
-
# Initial disk space check for local operations
|
93
|
-
check_sufficient_disk_space(
|
94
|
-
work_dir, MIN_LOCAL_SPACE_MB, "cache load operations"
|
95
|
-
)
|
96
|
-
logger.debug(
|
97
|
-
f"Initial space check passed: {MIN_LOCAL_SPACE_MB:.1f}MB required on local machine"
|
98
|
-
)
|
99
|
-
|
100
|
-
# Start background space monitoring for local disk
|
101
|
-
space_monitor = CacheSpaceMonitor(MIN_LOCAL_SPACE_MB, work_dir)
|
102
|
-
space_monitor.start()
|
103
|
-
|
104
93
|
# Create temp local copy
|
105
94
|
with tempfile.NamedTemporaryFile(
|
106
95
|
suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
|
@@ -110,30 +99,30 @@ def load_compile_cache() -> LoadStatus:
|
|
110
99
|
|
111
100
|
try:
|
112
101
|
with temp_file_cleanup(temp_path):
|
113
|
-
# Phase 1: Copy from b10fs to local temp file
|
102
|
+
# Phase 1: Copy from b10fs to local temp file using transfer()
|
114
103
|
@critical_section_b10fs_file_lock("copy_out")
|
115
104
|
def _monitored_copy_from_b10fs():
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
run_monitored_process(
|
120
|
-
_cache_copy_from_b10fs_worker,
|
121
|
-
(str(cache_file), str(temp_path)),
|
122
|
-
space_monitor,
|
123
|
-
"b10fs to local copy",
|
124
|
-
)
|
105
|
+
result = transfer(str(cache_file), str(temp_path))
|
106
|
+
if result != TransferStatus.SUCCESS:
|
107
|
+
raise Exception("Failed to copy cache file from b10fs")
|
125
108
|
|
126
109
|
_monitored_copy_from_b10fs()
|
127
110
|
|
128
|
-
# Phase 2: Extract archive in separate process
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
111
|
+
# Phase 2: Extract archive in separate process with space monitoring
|
112
|
+
space_monitor = CacheSpaceMonitor(MIN_LOCAL_SPACE_MB, work_dir)
|
113
|
+
space_monitor.start()
|
114
|
+
|
115
|
+
try:
|
116
|
+
logger.info(f"Starting extraction: {temp_path} -> {torch_dir}")
|
117
|
+
run_monitored_process(
|
118
|
+
_cache_extract_worker,
|
119
|
+
(str(temp_path), str(torch_dir)),
|
120
|
+
space_monitor,
|
121
|
+
"archive extraction",
|
122
|
+
cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
|
123
|
+
)
|
124
|
+
finally:
|
125
|
+
space_monitor.stop()
|
137
126
|
|
138
127
|
logger.info("Cache load complete")
|
139
128
|
return LoadStatus.SUCCESS
|
@@ -142,9 +131,6 @@ def load_compile_cache() -> LoadStatus:
|
|
142
131
|
logger.warning(f"Cache load interrupted: {e}")
|
143
132
|
return LoadStatus.ERROR
|
144
133
|
|
145
|
-
finally:
|
146
|
-
space_monitor.stop()
|
147
|
-
|
148
134
|
|
149
135
|
"""
|
150
136
|
FIXME(SRAY):
|
@@ -214,26 +200,11 @@ def save_compile_cache() -> SaveStatus:
|
|
214
200
|
logger.info("Cache already exists in b10fs, skipping save")
|
215
201
|
return SaveStatus.SKIPPED
|
216
202
|
|
217
|
-
# Initial disk space checks using calculated space requirements
|
218
|
-
check_sufficient_disk_space(
|
219
|
-
work_dir, MAX_CACHE_SIZE_MB, "local temp file creation"
|
220
|
-
)
|
221
|
-
check_sufficient_disk_space(
|
222
|
-
b10fs_dir, REQUIRED_B10FS_SPACE_MB, "cache save to b10fs"
|
223
|
-
)
|
224
|
-
logger.debug(
|
225
|
-
f"Initial space checks passed: {MAX_CACHE_SIZE_MB:.1f}MB local, {REQUIRED_B10FS_SPACE_MB:.1f}MB b10fs"
|
226
|
-
)
|
227
|
-
|
228
203
|
temp_file = (
|
229
204
|
b10fs_dir
|
230
205
|
/ f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
|
231
206
|
)
|
232
207
|
|
233
|
-
# Start background space monitoring
|
234
|
-
space_monitor = CacheSpaceMonitor(REQUIRED_B10FS_SPACE_MB, b10fs_dir)
|
235
|
-
space_monitor.start()
|
236
|
-
|
237
208
|
with tempfile.NamedTemporaryFile(
|
238
209
|
suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
|
239
210
|
) as f:
|
@@ -242,30 +213,32 @@ def save_compile_cache() -> SaveStatus:
|
|
242
213
|
|
243
214
|
try:
|
244
215
|
with temp_file_cleanup(local_temp):
|
245
|
-
# Phase 1: Compression in separate process
|
246
|
-
|
247
|
-
|
248
|
-
_cache_compression_worker,
|
249
|
-
(str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
|
250
|
-
space_monitor,
|
251
|
-
"compression",
|
252
|
-
)
|
253
|
-
|
254
|
-
b10fs_dir.mkdir(parents=True, exist_ok=True)
|
216
|
+
# Phase 1: Compression in separate process with space monitoring
|
217
|
+
space_monitor = CacheSpaceMonitor(REQUIRED_B10FS_SPACE_MB, b10fs_dir)
|
218
|
+
space_monitor.start()
|
255
219
|
|
256
|
-
|
257
|
-
|
258
|
-
def _monitored_copy_to_b10fs():
|
259
|
-
logger.info(f"Starting copy to b10fs: {local_temp} -> {temp_file}")
|
220
|
+
try:
|
221
|
+
logger.info(f"Starting compression: {torch_dir} -> {local_temp}")
|
260
222
|
run_monitored_process(
|
261
|
-
|
262
|
-
(str(
|
223
|
+
_cache_compression_worker,
|
224
|
+
(str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
|
263
225
|
space_monitor,
|
264
|
-
"
|
265
|
-
cleanup_func=lambda: safe_unlink(
|
266
|
-
temp_file, f"Failed to cleanup interrupted copy {temp_file}"
|
267
|
-
),
|
226
|
+
"compression",
|
268
227
|
)
|
228
|
+
finally:
|
229
|
+
space_monitor.stop()
|
230
|
+
|
231
|
+
# Phase 2: Copy to b10fs using transfer()
|
232
|
+
@critical_section_b10fs_file_lock("copy_in")
|
233
|
+
def _monitored_copy_to_b10fs():
|
234
|
+
result = transfer(str(local_temp), str(temp_file))
|
235
|
+
if result != TransferStatus.SUCCESS:
|
236
|
+
# Clean up the temp file if transfer failed
|
237
|
+
safe_unlink(
|
238
|
+
temp_file,
|
239
|
+
f"Failed to cleanup after failed copy {temp_file}",
|
240
|
+
)
|
241
|
+
raise Exception("Failed to copy cache file to b10fs")
|
269
242
|
|
270
243
|
_monitored_copy_to_b10fs()
|
271
244
|
|
@@ -282,8 +255,89 @@ def save_compile_cache() -> SaveStatus:
|
|
282
255
|
logger.warning(f"Cache save interrupted: {e}")
|
283
256
|
return SaveStatus.ERROR
|
284
257
|
|
285
|
-
|
286
|
-
|
258
|
+
|
259
|
+
@timed_fn(logger=logger, name="Transferring file")
|
260
|
+
@safe_execute("Transfer failed", TransferStatus.ERROR)
|
261
|
+
def transfer(source: str, dest: str) -> TransferStatus:
|
262
|
+
"""Transfer a file from source to destination with space monitoring.
|
263
|
+
|
264
|
+
This function copies a file from source to destination using the same
|
265
|
+
monitored process approach as the cache operations. It monitors disk space
|
266
|
+
at the destination and can interrupt the transfer if space becomes insufficient.
|
267
|
+
|
268
|
+
Args:
|
269
|
+
source: Path to the source file to copy.
|
270
|
+
dest: Path to the destination where the file will be copied.
|
271
|
+
|
272
|
+
Returns:
|
273
|
+
TransferStatus:
|
274
|
+
TransferStatus.SUCCESS if transfer was successful
|
275
|
+
TransferStatus.ERROR if transfer failed due to insufficient disk space,
|
276
|
+
file not found, or other errors.
|
277
|
+
|
278
|
+
Raises:
|
279
|
+
CacheOperationInterrupted: If transfer interrupted due to insufficient
|
280
|
+
disk space (caught and returns TransferStatus.ERROR).
|
281
|
+
Exception: Any other errors during transfer (caught and returns TransferStatus.ERROR).
|
282
|
+
"""
|
283
|
+
source_path = Path(source)
|
284
|
+
dest_path = Path(dest)
|
285
|
+
|
286
|
+
# Validate source file exists
|
287
|
+
if not source_path.exists():
|
288
|
+
logger.error(f"Source file does not exist: {source}")
|
289
|
+
return TransferStatus.ERROR
|
290
|
+
|
291
|
+
# Create destination directory if it doesn't exist
|
292
|
+
dest_path.parent.mkdir(parents=True, exist_ok=True)
|
293
|
+
|
294
|
+
# Determine appropriate space threshold based on destination directory
|
295
|
+
dest_dir = dest_path.parent
|
296
|
+
if str(dest_dir).startswith(B10FS_CACHE_DIR):
|
297
|
+
# Transferring to b10fs - use b10fs space requirements
|
298
|
+
space_threshold_mb = REQUIRED_B10FS_SPACE_MB
|
299
|
+
logger.debug(
|
300
|
+
f"Transfer to b10fs detected, using {space_threshold_mb:.1f}MB threshold"
|
301
|
+
)
|
302
|
+
else:
|
303
|
+
# Transferring to local directory - use local space requirements
|
304
|
+
space_threshold_mb = MIN_LOCAL_SPACE_MB
|
305
|
+
logger.debug(
|
306
|
+
f"Transfer to local directory detected, using {space_threshold_mb:.1f}MB threshold"
|
307
|
+
)
|
308
|
+
|
309
|
+
# Initial disk space check
|
310
|
+
check_sufficient_disk_space(dest_dir, space_threshold_mb, "file transfer")
|
311
|
+
logger.debug(
|
312
|
+
f"Initial space check passed: {space_threshold_mb:.1f}MB required at destination"
|
313
|
+
)
|
314
|
+
|
315
|
+
# Start background space monitoring for destination directory
|
316
|
+
space_monitor = CacheSpaceMonitor(space_threshold_mb, dest_dir)
|
317
|
+
space_monitor.start()
|
318
|
+
|
319
|
+
try:
|
320
|
+
# Run monitored copy process
|
321
|
+
logger.info(f"Starting transfer: {source} -> {dest}")
|
322
|
+
run_monitored_process(
|
323
|
+
_cache_copy_worker,
|
324
|
+
(str(source_path), str(dest_path)),
|
325
|
+
space_monitor,
|
326
|
+
"file transfer",
|
327
|
+
cleanup_func=lambda: safe_unlink(
|
328
|
+
dest_path, f"Failed to cleanup interrupted transfer {dest_path}"
|
329
|
+
),
|
330
|
+
)
|
331
|
+
|
332
|
+
logger.info("File transfer complete")
|
333
|
+
return TransferStatus.SUCCESS
|
334
|
+
|
335
|
+
except CacheOperationInterrupted as e:
|
336
|
+
logger.warning(f"File transfer interrupted: {e}")
|
337
|
+
return TransferStatus.ERROR
|
338
|
+
|
339
|
+
finally:
|
340
|
+
space_monitor.stop()
|
287
341
|
|
288
342
|
|
289
343
|
@safe_execute("Clear failed", False)
|
@@ -347,23 +401,6 @@ def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
|
|
347
401
|
shutil.copy2(source_path, dest_path)
|
348
402
|
|
349
403
|
|
350
|
-
@worker_process("Copy from b10fs was cancelled before starting")
|
351
|
-
def _cache_copy_from_b10fs_worker(source_path_str: str, dest_path_str: str) -> None:
|
352
|
-
"""Worker process that handles file copy from b10fs to local machine.
|
353
|
-
|
354
|
-
This function runs in a separate process to copy the cache file from b10fs
|
355
|
-
to the local filesystem. It can be terminated externally if local disk space becomes insufficient.
|
356
|
-
|
357
|
-
Args:
|
358
|
-
source_path_str: String path to the source file in b10fs to copy.
|
359
|
-
dest_path_str: String path where the file will be copied locally.
|
360
|
-
"""
|
361
|
-
source_path = Path(source_path_str)
|
362
|
-
dest_path = Path(dest_path_str)
|
363
|
-
|
364
|
-
shutil.copy2(source_path, dest_path)
|
365
|
-
|
366
|
-
|
367
404
|
def _cleanup_torch_dir(torch_dir: Path) -> None:
|
368
405
|
"""Helper function to safely cleanup torch directory during interrupted extraction."""
|
369
406
|
try:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: b10-transfer
|
3
|
-
Version: 0.1.2
|
3
|
+
Version: 0.1.4
|
4
4
|
Summary: Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management
|
5
5
|
License: MIT
|
6
6
|
Keywords: pytorch,file-transfer,cache,machine-learning,inference
|
@@ -1,12 +1,12 @@
|
|
1
|
-
b10_transfer/__init__.py,sha256=
|
1
|
+
b10_transfer/__init__.py,sha256=LKMroIusY1itfMVrJT07xLS1XVehwr54Wk5dhEl8MzY,641
|
2
2
|
b10_transfer/archive.py,sha256=GKb0mi0-YeM7ch4FLAoOLHXw0T6LkRerYad2N2y9TYM,6400
|
3
3
|
b10_transfer/cleanup.py,sha256=3RnqWNGMCcko5GQdq1Gr9VPpGzAF5J6x7xjIH9SNZ78,6226
|
4
|
-
b10_transfer/constants.py,sha256=
|
5
|
-
b10_transfer/core.py,sha256=
|
4
|
+
b10_transfer/constants.py,sha256=qCViKTyfHTLpiFVF2SwsbHp2IMz3kg3syxJfgRAq2dc,4446
|
5
|
+
b10_transfer/core.py,sha256=XWLuwjHXuhh-6abZMAl2yuLB7R2deyUc6gGPn6-Yfkc,17006
|
6
6
|
b10_transfer/environment.py,sha256=aC0biEMQrtHk0ke_3epdcq1X9J5fPmPpBVt0fH7XF2Y,5625
|
7
7
|
b10_transfer/info.py,sha256=I3iOuImZ5r6DMJTDeBtVvzlSn6IuyPJbLJYUO_OF0ks,6299
|
8
8
|
b10_transfer/space_monitor.py,sha256=C_CKDH43bNsWdq60WStSZ3c_nQkWvScQmqU_SYHesew,10531
|
9
9
|
b10_transfer/utils.py,sha256=Stee0DFK-8MRRYNIocqaK64cJvfs4jPW3Mpx7zkWV6Y,11932
|
10
|
-
b10_transfer-0.1.
|
11
|
-
b10_transfer-0.1.
|
12
|
-
b10_transfer-0.1.
|
10
|
+
b10_transfer-0.1.4.dist-info/METADATA,sha256=69s3ACBUFzGB7J97eVt4aCGSXrIpld1oV0Wj8Z0HLZ8,4108
|
11
|
+
b10_transfer-0.1.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
12
|
+
b10_transfer-0.1.4.dist-info/RECORD,,
|
File without changes
|