b10-transfer 0.1.3__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {b10_transfer-0.1.3 → b10_transfer-0.1.4}/PKG-INFO +1 -1
- {b10_transfer-0.1.3 → b10_transfer-0.1.4}/pyproject.toml +1 -1
- {b10_transfer-0.1.3 → b10_transfer-0.1.4}/src/b10_transfer/__init__.py +1 -1
- {b10_transfer-0.1.3 → b10_transfer-0.1.4}/src/b10_transfer/core.py +41 -89
- {b10_transfer-0.1.3 → b10_transfer-0.1.4}/README.md +0 -0
- {b10_transfer-0.1.3 → b10_transfer-0.1.4}/src/b10_transfer/archive.py +0 -0
- {b10_transfer-0.1.3 → b10_transfer-0.1.4}/src/b10_transfer/cleanup.py +0 -0
- {b10_transfer-0.1.3 → b10_transfer-0.1.4}/src/b10_transfer/constants.py +0 -0
- {b10_transfer-0.1.3 → b10_transfer-0.1.4}/src/b10_transfer/environment.py +0 -0
- {b10_transfer-0.1.3 → b10_transfer-0.1.4}/src/b10_transfer/info.py +0 -0
- {b10_transfer-0.1.3 → b10_transfer-0.1.4}/src/b10_transfer/space_monitor.py +0 -0
- {b10_transfer-0.1.3 → b10_transfer-0.1.4}/src/b10_transfer/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: b10-transfer
|
3
|
-
Version: 0.1.3
|
3
|
+
Version: 0.1.4
|
4
4
|
Summary: Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management
|
5
5
|
License: MIT
|
6
6
|
Keywords: pytorch,file-transfer,cache,machine-learning,inference
|
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|
4
4
|
|
5
5
|
[tool.poetry]
|
6
6
|
name = "b10-transfer"
|
7
|
-
version = "0.1.3"
|
7
|
+
version = "0.1.4"
|
8
8
|
description = "Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management"
|
9
9
|
authors = ["Shounak Ray <shounak.noreply@baseten.co>", "Fred Liu <fred.liu.noreply@baseten.co>"]
|
10
10
|
maintainers = ["Fred Liu <fred.liu.noreply@baseten.co>", "Shounak Ray <shounak.noreply@baseten.co>"]
|
@@ -90,18 +90,6 @@ def load_compile_cache() -> LoadStatus:
|
|
90
90
|
logger.info("Torch cache already loaded, skipping extraction")
|
91
91
|
return LoadStatus.SKIPPED
|
92
92
|
|
93
|
-
# Initial disk space check for local operations
|
94
|
-
check_sufficient_disk_space(
|
95
|
-
work_dir, MIN_LOCAL_SPACE_MB, "cache load operations"
|
96
|
-
)
|
97
|
-
logger.debug(
|
98
|
-
f"Initial space check passed: {MIN_LOCAL_SPACE_MB:.1f}MB required on local machine"
|
99
|
-
)
|
100
|
-
|
101
|
-
# Start background space monitoring for local disk
|
102
|
-
space_monitor = CacheSpaceMonitor(MIN_LOCAL_SPACE_MB, work_dir)
|
103
|
-
space_monitor.start()
|
104
|
-
|
105
93
|
# Create temp local copy
|
106
94
|
with tempfile.NamedTemporaryFile(
|
107
95
|
suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
|
@@ -111,30 +99,30 @@ def load_compile_cache() -> LoadStatus:
|
|
111
99
|
|
112
100
|
try:
|
113
101
|
with temp_file_cleanup(temp_path):
|
114
|
-
# Phase 1: Copy from b10fs to local temp file
|
102
|
+
# Phase 1: Copy from b10fs to local temp file using transfer()
|
115
103
|
@critical_section_b10fs_file_lock("copy_out")
|
116
104
|
def _monitored_copy_from_b10fs():
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
run_monitored_process(
|
121
|
-
_cache_copy_from_b10fs_worker,
|
122
|
-
(str(cache_file), str(temp_path)),
|
123
|
-
space_monitor,
|
124
|
-
"b10fs to local copy",
|
125
|
-
)
|
105
|
+
result = transfer(str(cache_file), str(temp_path))
|
106
|
+
if result != TransferStatus.SUCCESS:
|
107
|
+
raise Exception("Failed to copy cache file from b10fs")
|
126
108
|
|
127
109
|
_monitored_copy_from_b10fs()
|
128
110
|
|
129
|
-
# Phase 2: Extract archive in separate process
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
111
|
+
# Phase 2: Extract archive in separate process with space monitoring
|
112
|
+
space_monitor = CacheSpaceMonitor(MIN_LOCAL_SPACE_MB, work_dir)
|
113
|
+
space_monitor.start()
|
114
|
+
|
115
|
+
try:
|
116
|
+
logger.info(f"Starting extraction: {temp_path} -> {torch_dir}")
|
117
|
+
run_monitored_process(
|
118
|
+
_cache_extract_worker,
|
119
|
+
(str(temp_path), str(torch_dir)),
|
120
|
+
space_monitor,
|
121
|
+
"archive extraction",
|
122
|
+
cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
|
123
|
+
)
|
124
|
+
finally:
|
125
|
+
space_monitor.stop()
|
138
126
|
|
139
127
|
logger.info("Cache load complete")
|
140
128
|
return LoadStatus.SUCCESS
|
@@ -143,9 +131,6 @@ def load_compile_cache() -> LoadStatus:
|
|
143
131
|
logger.warning(f"Cache load interrupted: {e}")
|
144
132
|
return LoadStatus.ERROR
|
145
133
|
|
146
|
-
finally:
|
147
|
-
space_monitor.stop()
|
148
|
-
|
149
134
|
|
150
135
|
"""
|
151
136
|
FIXME(SRAY):
|
@@ -215,26 +200,11 @@ def save_compile_cache() -> SaveStatus:
|
|
215
200
|
logger.info("Cache already exists in b10fs, skipping save")
|
216
201
|
return SaveStatus.SKIPPED
|
217
202
|
|
218
|
-
# Initial disk space checks using calculated space requirements
|
219
|
-
check_sufficient_disk_space(
|
220
|
-
work_dir, MAX_CACHE_SIZE_MB, "local temp file creation"
|
221
|
-
)
|
222
|
-
check_sufficient_disk_space(
|
223
|
-
b10fs_dir, REQUIRED_B10FS_SPACE_MB, "cache save to b10fs"
|
224
|
-
)
|
225
|
-
logger.debug(
|
226
|
-
f"Initial space checks passed: {MAX_CACHE_SIZE_MB:.1f}MB local, {REQUIRED_B10FS_SPACE_MB:.1f}MB b10fs"
|
227
|
-
)
|
228
|
-
|
229
203
|
temp_file = (
|
230
204
|
b10fs_dir
|
231
205
|
/ f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
|
232
206
|
)
|
233
207
|
|
234
|
-
# Start background space monitoring
|
235
|
-
space_monitor = CacheSpaceMonitor(REQUIRED_B10FS_SPACE_MB, b10fs_dir)
|
236
|
-
space_monitor.start()
|
237
|
-
|
238
208
|
with tempfile.NamedTemporaryFile(
|
239
209
|
suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
|
240
210
|
) as f:
|
@@ -243,30 +213,32 @@ def save_compile_cache() -> SaveStatus:
|
|
243
213
|
|
244
214
|
try:
|
245
215
|
with temp_file_cleanup(local_temp):
|
246
|
-
# Phase 1: Compression in separate process
|
247
|
-
|
248
|
-
|
249
|
-
_cache_compression_worker,
|
250
|
-
(str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
|
251
|
-
space_monitor,
|
252
|
-
"compression",
|
253
|
-
)
|
254
|
-
|
255
|
-
b10fs_dir.mkdir(parents=True, exist_ok=True)
|
216
|
+
# Phase 1: Compression in separate process with space monitoring
|
217
|
+
space_monitor = CacheSpaceMonitor(REQUIRED_B10FS_SPACE_MB, b10fs_dir)
|
218
|
+
space_monitor.start()
|
256
219
|
|
257
|
-
|
258
|
-
|
259
|
-
def _monitored_copy_to_b10fs():
|
260
|
-
logger.info(f"Starting copy to b10fs: {local_temp} -> {temp_file}")
|
220
|
+
try:
|
221
|
+
logger.info(f"Starting compression: {torch_dir} -> {local_temp}")
|
261
222
|
run_monitored_process(
|
262
|
-
|
263
|
-
(str(
|
223
|
+
_cache_compression_worker,
|
224
|
+
(str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
|
264
225
|
space_monitor,
|
265
|
-
"
|
266
|
-
cleanup_func=lambda: safe_unlink(
|
267
|
-
temp_file, f"Failed to cleanup interrupted copy {temp_file}"
|
268
|
-
),
|
226
|
+
"compression",
|
269
227
|
)
|
228
|
+
finally:
|
229
|
+
space_monitor.stop()
|
230
|
+
|
231
|
+
# Phase 2: Copy to b10fs using transfer()
|
232
|
+
@critical_section_b10fs_file_lock("copy_in")
|
233
|
+
def _monitored_copy_to_b10fs():
|
234
|
+
result = transfer(str(local_temp), str(temp_file))
|
235
|
+
if result != TransferStatus.SUCCESS:
|
236
|
+
# Clean up the temp file if transfer failed
|
237
|
+
safe_unlink(
|
238
|
+
temp_file,
|
239
|
+
f"Failed to cleanup after failed copy {temp_file}",
|
240
|
+
)
|
241
|
+
raise Exception("Failed to copy cache file to b10fs")
|
270
242
|
|
271
243
|
_monitored_copy_to_b10fs()
|
272
244
|
|
@@ -283,9 +255,6 @@ def save_compile_cache() -> SaveStatus:
|
|
283
255
|
logger.warning(f"Cache save interrupted: {e}")
|
284
256
|
return SaveStatus.ERROR
|
285
257
|
|
286
|
-
finally:
|
287
|
-
space_monitor.stop()
|
288
|
-
|
289
258
|
|
290
259
|
@timed_fn(logger=logger, name="Transferring file")
|
291
260
|
@safe_execute("Transfer failed", TransferStatus.ERROR)
|
@@ -432,23 +401,6 @@ def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
|
|
432
401
|
shutil.copy2(source_path, dest_path)
|
433
402
|
|
434
403
|
|
435
|
-
@worker_process("Copy from b10fs was cancelled before starting")
|
436
|
-
def _cache_copy_from_b10fs_worker(source_path_str: str, dest_path_str: str) -> None:
|
437
|
-
"""Worker process that handles file copy from b10fs to local machine.
|
438
|
-
|
439
|
-
This function runs in a separate process to copy the cache file from b10fs
|
440
|
-
to the local filesystem. It can be terminated externally if local disk space becomes insufficient.
|
441
|
-
|
442
|
-
Args:
|
443
|
-
source_path_str: String path to the source file in b10fs to copy.
|
444
|
-
dest_path_str: String path where the file will be copied locally.
|
445
|
-
"""
|
446
|
-
source_path = Path(source_path_str)
|
447
|
-
dest_path = Path(dest_path_str)
|
448
|
-
|
449
|
-
shutil.copy2(source_path, dest_path)
|
450
|
-
|
451
|
-
|
452
404
|
def _cleanup_torch_dir(torch_dir: Path) -> None:
|
453
405
|
"""Helper function to safely cleanup torch directory during interrupted extraction."""
|
454
406
|
try:
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|