b10-transfer 0.1.5.tar.gz → 0.1.7.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {b10_transfer-0.1.5 → b10_transfer-0.1.7}/PKG-INFO +1 -1
- {b10_transfer-0.1.5 → b10_transfer-0.1.7}/pyproject.toml +1 -1
- {b10_transfer-0.1.5 → b10_transfer-0.1.7}/src/b10_transfer/__init__.py +3 -2
- b10_transfer-0.1.5/src/b10_transfer/core.py → b10_transfer-0.1.7/src/b10_transfer/cache.py +27 -122
- b10_transfer-0.1.7/src/b10_transfer/core.py +131 -0
- {b10_transfer-0.1.5 → b10_transfer-0.1.7}/src/b10_transfer/environment.py +3 -4
- {b10_transfer-0.1.5 → b10_transfer-0.1.7}/README.md +0 -0
- {b10_transfer-0.1.5 → b10_transfer-0.1.7}/src/b10_transfer/archive.py +0 -0
- {b10_transfer-0.1.5 → b10_transfer-0.1.7}/src/b10_transfer/cleanup.py +0 -0
- {b10_transfer-0.1.5 → b10_transfer-0.1.7}/src/b10_transfer/constants.py +0 -0
- {b10_transfer-0.1.5 → b10_transfer-0.1.7}/src/b10_transfer/info.py +0 -0
- {b10_transfer-0.1.5 → b10_transfer-0.1.7}/src/b10_transfer/space_monitor.py +0 -0
- {b10_transfer-0.1.5 → b10_transfer-0.1.7}/src/b10_transfer/utils.py +0 -0
```diff
--- b10_transfer-0.1.5/PKG-INFO
+++ b10_transfer-0.1.7/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: b10-transfer
-Version: 0.1.5
+Version: 0.1.7
 Summary: Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management
 License: MIT
 Keywords: pytorch,file-transfer,cache,machine-learning,inference
```
```diff
--- b10_transfer-0.1.5/pyproject.toml
+++ b10_transfer-0.1.7/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "b10-transfer"
-version = "0.1.5"
+version = "0.1.7"
 description = "Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management"
 authors = ["Shounak Ray <shounak.noreply@baseten.co>", "Fred Liu <fred.liu.noreply@baseten.co>"]
 maintainers = ["Fred Liu <fred.liu.noreply@baseten.co>", "Shounak Ray <shounak.noreply@baseten.co>"]
```
```diff
--- b10_transfer-0.1.5/src/b10_transfer/__init__.py
+++ b10_transfer-0.1.7/src/b10_transfer/__init__.py
@@ -1,13 +1,14 @@
 """B10 Transfer - Lock-free PyTorch file transfer for Baseten."""
 
-from .core import load_compile_cache, save_compile_cache, clear_local_cache
+from .cache import load_compile_cache, save_compile_cache, clear_local_cache
+from .core import transfer
 from .utils import CacheError, CacheValidationError
 from .space_monitor import CacheOperationInterrupted
 from .info import get_cache_info, list_available_caches
 from .constants import OperationStatus
 
 # Version
-__version__ = "0.1.5"
+__version__ = "0.1.7"
 
 __all__ = [
     "CacheError",
```
```diff
--- b10_transfer-0.1.5/src/b10_transfer/core.py
+++ b10_transfer-0.1.7/src/b10_transfer/cache.py
@@ -1,11 +1,13 @@
-
+"""Cache operations for PyTorch compilation artifacts.
+
+This module provides functions for loading and saving PyTorch compilation cache
+to/from b10fs shared storage using atomic operations and space monitoring.
+"""
+
 import logging
 import tempfile
-import shutil
 from pathlib import Path
 
-import time
-
 from .environment import get_cache_filename
 from .cleanup import cooperative_cleanup_b10fs
 from .utils import (
@@ -17,7 +19,6 @@ from .utils import (
     safe_unlink,
 )
 from .space_monitor import (
-    check_sufficient_disk_space,
     CacheSpaceMonitor,
     CacheOperationInterrupted,
     run_monitored_process,
@@ -35,10 +36,27 @@ from .constants import (
     CACHE_INCOMPLETE_SUFFIX,
     OperationStatus,
 )
+from .core import transfer
 
 logger = logging.getLogger(__name__)
 
 
+"""
+FIXME(SRAY):
+What about the case in @b10-transfer/ where a single pod finishes an inference request,
+and then the client calls save_compile_cache. And while we are creating the local archive,
+another inference call on the same pod is kicked off, which then modifies the torch cache.
+How would this be handled? Maybe just accept that the cache will be recompiled/overwritten?
+Otherwise you'd need application level coordination to ensure that the cache is not modified
+while we are creating the archive, but this doesn't really seem like a good idea in terms of adoption.
+
+FIXME(SR):
+More things to consider:
+- [possible] What if b10fs dies *during* an op? right now we check for b10fs availability in the beginning of the op... Add some constants instead of just False for load().
+- [possible, and really bad if it happens] potential memory exhaustion during compression if the cache is super super large. very very edge case. higher compression levels also have high memory usage.
+"""
+
+
 def _setup_cache_paths():
     """Common setup for cache operations - returns paths and performs cleanup."""
     # Cooperative cleanup of stale shared resources
@@ -184,22 +202,6 @@ def load_compile_cache() -> OperationStatus:
         return OperationStatus.ERROR
 
 
-"""
-FIXME(SRAY):
-What about the case in @b10-transfer/ where a single pod finishes an inference request,
-and then the client calls save_compile_cache. And while we are creating the local archive,
-another inference call on the same pod is kicked off, which then modifies the torch cache.
-How would this be handled? Maybe just accept that the cache will be recompiled/overwritten?
-Otherwise you'd need application level coordination to ensure that the cache is not modified
-while we are creating the archive, but this doesn't really seem like a good idea in terms of adoption.
-
-FIXME(SR):
-More things to consider:
-- [possible] What if b10fs dies *during* an op? right now we check for b10fs availability in the beginning of the op... Add some constants instead of just False for load().
-- [possible, and really bad if it happens] potential memory exhaustion during compression if the cache is super super large. very very edge case. higher compression levels also have high memory usage.
-"""
-
-
 @timed_fn(logger=logger, name="Saving compile cache")
 @safe_execute("Save failed", False)
 def save_compile_cache() -> OperationStatus:
@@ -281,90 +283,6 @@ def save_compile_cache() -> OperationStatus:
         return OperationStatus.ERROR
 
 
-@timed_fn(logger=logger, name="Transferring file")
-@safe_execute("Transfer failed", OperationStatus.ERROR)
-def transfer(source: str, dest: str) -> OperationStatus:
-    """Transfer a file from source to destination with space monitoring.
-
-    This function copies a file from source to destination using the same
-    monitored process approach as the cache operations. It monitors disk space
-    at the destination and can interrupt the transfer if space becomes insufficient.
-
-    Args:
-        source: Path to the source file to copy.
-        dest: Path to the destination where the file will be copied.
-
-    Returns:
-        OperationStatus:
-            OperationStatus.SUCCESS if transfer was successful
-            OperationStatus.ERROR if transfer failed due to insufficient disk space,
-            file not found, or other errors.
-
-    Raises:
-        CacheOperationInterrupted: If transfer interrupted due to insufficient
-            disk space (caught and returns OperationStatus.ERROR).
-        Exception: Any other errors during transfer (caught and returns OperationStatus.ERROR).
-    """
-    source_path = Path(source)
-    dest_path = Path(dest)
-
-    # Validate source file exists
-    if not source_path.exists():
-        logger.error(f"Source file does not exist: {source}")
-        return OperationStatus.ERROR
-
-    # Create destination directory if it doesn't exist
-    dest_path.parent.mkdir(parents=True, exist_ok=True)
-
-    # Determine appropriate space threshold based on destination directory
-    dest_dir = dest_path.parent
-    if str(dest_dir).startswith(B10FS_CACHE_DIR):
-        # Transferring to b10fs - use b10fs space requirements
-        space_threshold_mb = REQUIRED_B10FS_SPACE_MB
-        logger.debug(
-            f"Transfer to b10fs detected, using {space_threshold_mb:.1f}MB threshold"
-        )
-    else:
-        # Transferring to local directory - use local space requirements
-        space_threshold_mb = MIN_LOCAL_SPACE_MB
-        logger.debug(
-            f"Transfer to local directory detected, using {space_threshold_mb:.1f}MB threshold"
-        )
-
-    # Initial disk space check
-    check_sufficient_disk_space(dest_dir, space_threshold_mb, "file transfer")
-    logger.debug(
-        f"Initial space check passed: {space_threshold_mb:.1f}MB required at destination"
-    )
-
-    # Start background space monitoring for destination directory
-    space_monitor = CacheSpaceMonitor(space_threshold_mb, dest_dir)
-    space_monitor.start()
-
-    try:
-        # Run monitored copy process
-        logger.info(f"Starting transfer: {source} -> {dest}")
-        run_monitored_process(
-            _cache_copy_worker,
-            (str(source_path), str(dest_path)),
-            space_monitor,
-            "file transfer",
-            cleanup_func=lambda: safe_unlink(
-                dest_path, f"Failed to cleanup interrupted transfer {dest_path}"
-            ),
-        )
-
-        logger.info("File transfer complete")
-        return OperationStatus.SUCCESS
-
-    except CacheOperationInterrupted as e:
-        logger.warning(f"File transfer interrupted: {e}")
-        return OperationStatus.ERROR
-
-    finally:
-        space_monitor.stop()
-
-
 @safe_execute("Clear failed", False)
 def clear_local_cache() -> bool:
     """Clear the local PyTorch compilation cache directory.
@@ -382,6 +300,8 @@ def clear_local_cache() -> bool:
     torch_dir = Path(TORCH_CACHE_DIR)
     if not torch_dir.exists():
         return True
+    import shutil
+
     shutil.rmtree(torch_dir)
     return True
 
@@ -409,27 +329,12 @@ def _cache_compression_worker(
     create_archive(torch_dir, local_temp, max_size_mb)
 
 
-@worker_process("Copy was cancelled before starting")
-def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
-    """Worker process that handles file copy to b10fs.
-
-    This function runs in a separate process to copy the compressed cache file
-    to the b10fs filesystem. It can be terminated externally if disk space becomes insufficient.
-
-    Args:
-        source_path_str: String path to the source file to copy.
-        dest_path_str: String path where the file will be copied.
-    """
-    source_path = Path(source_path_str)
-    dest_path = Path(dest_path_str)
-
-    shutil.copy2(source_path, dest_path)
-
-
 def _cleanup_torch_dir(torch_dir: Path) -> None:
     """Helper function to safely cleanup torch directory during interrupted extraction."""
     try:
         if torch_dir.exists():
+            import shutil
+
             shutil.rmtree(torch_dir)
             logger.debug(f"Cleaned up torch directory: {torch_dir}")
     except Exception as e:
```
```diff
--- /dev/null
+++ b10_transfer-0.1.7/src/b10_transfer/core.py
@@ -0,0 +1,131 @@
+"""Core file transfer operations for b10-transfer.
+
+This module provides generic file transfer functionality with space monitoring
+and error handling for b10fs operations.
+"""
+
+import logging
+import shutil
+from pathlib import Path
+
+from .utils import (
+    timed_fn,
+    safe_execute,
+    safe_unlink,
+)
+from .space_monitor import (
+    check_sufficient_disk_space,
+    CacheSpaceMonitor,
+    CacheOperationInterrupted,
+    run_monitored_process,
+    worker_process,
+)
+from .constants import (
+    B10FS_CACHE_DIR,
+    REQUIRED_B10FS_SPACE_MB,
+    MIN_LOCAL_SPACE_MB,
+    OperationStatus,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@timed_fn(logger=logger, name="Transferring file")
+@safe_execute("Transfer failed", OperationStatus.ERROR)
+def transfer(source: str, dest: str) -> OperationStatus:
+    """Transfer a file from source to destination with space monitoring.
+
+    This function copies a file from source to destination using the same
+    monitored process approach as the cache operations. It monitors disk space
+    at the destination and can interrupt the transfer if space becomes insufficient.
+
+    Args:
+        source: Path to the source file to copy.
+        dest: Path to the destination where the file will be copied.
+
+    Returns:
+        OperationStatus:
+            OperationStatus.SUCCESS if transfer was successful
+            OperationStatus.ERROR if transfer failed due to insufficient disk space,
+            file not found, or other errors.
+
+    Raises:
+        CacheOperationInterrupted: If transfer interrupted due to insufficient
+            disk space (caught and returns OperationStatus.ERROR).
+        Exception: Any other errors during transfer (caught and returns OperationStatus.ERROR).
+    """
+    source_path = Path(source)
+    dest_path = Path(dest)
+
+    # Validate source file exists
+    if not source_path.exists():
+        logger.error(f"Source file does not exist: {source}")
+        return OperationStatus.ERROR
+
+    # Create destination directory if it doesn't exist
+    dest_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # Determine appropriate space threshold based on destination directory
+    dest_dir = dest_path.parent
+    if str(dest_dir).startswith(B10FS_CACHE_DIR):
+        # Transferring to b10fs - use b10fs space requirements
+        space_threshold_mb = REQUIRED_B10FS_SPACE_MB
+        logger.debug(
+            f"Transfer to b10fs detected, using {space_threshold_mb:.1f}MB threshold"
+        )
+    else:
+        # Transferring to local directory - use local space requirements
+        space_threshold_mb = MIN_LOCAL_SPACE_MB
+        logger.debug(
+            f"Transfer to local directory detected, using {space_threshold_mb:.1f}MB threshold"
+        )
+
+    # Initial disk space check
+    check_sufficient_disk_space(dest_dir, space_threshold_mb, "file transfer")
+    logger.debug(
+        f"Initial space check passed: {space_threshold_mb:.1f}MB required at destination"
+    )
+
+    # Start background space monitoring for destination directory
+    space_monitor = CacheSpaceMonitor(space_threshold_mb, dest_dir)
+    space_monitor.start()
+
+    try:
+        # Run monitored copy process
+        logger.info(f"Starting transfer: {source} -> {dest}")
+        run_monitored_process(
+            _cache_copy_worker,
+            (str(source_path), str(dest_path)),
+            space_monitor,
+            "file transfer",
+            cleanup_func=lambda: safe_unlink(
+                dest_path, f"Failed to cleanup interrupted transfer {dest_path}"
+            ),
+        )
+
+        logger.info("File transfer complete")
+        return OperationStatus.SUCCESS
+
+    except CacheOperationInterrupted as e:
+        logger.warning(f"File transfer interrupted: {e}")
+        return OperationStatus.ERROR
+
+    finally:
+        space_monitor.stop()
+
+
+@worker_process("Copy was cancelled before starting")
+def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
+    """Worker process that handles file copy operations.
+
+    This function runs in a separate process to copy files between locations.
+    It can be terminated externally if disk space becomes insufficient.
+
+    Args:
+        source_path_str: String path to the source file to copy.
+        dest_path_str: String path where the file will be copied.
+    """
+    source_path = Path(source_path_str)
+    dest_path = Path(dest_path_str)
+
+    shutil.copy2(source_path, dest_path)
```
```diff
--- b10_transfer-0.1.5/src/b10_transfer/environment.py
+++ b10_transfer-0.1.7/src/b10_transfer/environment.py
@@ -28,15 +28,14 @@ def get_cache_filename() -> str:
     """Get the cache filename prefix for the current environment.
 
     This function generates a cache filename prefix that includes the
-    environment key
+    environment key to ensure cache files are environment-specific
     and unique per machine.
 
     Returns:
-        str: Cache filename prefix in format "cache_{environment_key}
+        str: Cache filename prefix in format "cache_{environment_key}".
     """
     env_key = get_environment_key()
-
-    return f"cache_{env_key}.{hostname}"
+    return f"cache_{env_key}"
 
 
 def get_environment_key() -> str:
```
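The `environment.py` change drops the hostname suffix from cache filenames, which suggests pods sharing an environment key now resolve to the same cache file in b10fs rather than one file per machine. A quick illustration (the key and hostname values are invented):

```python
# Illustration of the filename change; "a1b2c3" and "pod-7f9d" are invented.
env_key = "a1b2c3"
hostname = "pod-7f9d"

name_015 = f"cache_{env_key}.{hostname}"  # 0.1.5: "cache_a1b2c3.pod-7f9d", unique per machine
name_017 = f"cache_{env_key}"             # 0.1.7: "cache_a1b2c3", shared per environment
```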