b10-transfer 0.0.1__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- b10_transfer/__init__.py +1 -1
- b10_transfer/async_transfers.py +8 -0
- b10_transfer/constants.py +3 -0
- b10_transfer/core.py +12 -3
- b10_transfer/space_monitor.py +13 -0
- b10_transfer/torch_cache.py +15 -3
- {b10_transfer-0.0.1.dist-info → b10_transfer-0.1.0.dist-info}/METADATA +1 -1
- b10_transfer-0.1.0.dist-info/RECORD +15 -0
- b10_transfer-0.0.1.dist-info/RECORD +0 -15
- {b10_transfer-0.0.1.dist-info → b10_transfer-0.1.0.dist-info}/WHEEL +0 -0
b10_transfer/__init__.py
CHANGED
b10_transfer/async_transfers.py
CHANGED
@@ -39,6 +39,7 @@ class TransferProgress:
|
|
39
39
|
AsyncTransferStatus.ERROR,
|
40
40
|
AsyncTransferStatus.INTERRUPTED,
|
41
41
|
AsyncTransferStatus.CANCELLED,
|
42
|
+
AsyncTransferStatus.DOES_NOT_EXIST,
|
42
43
|
]:
|
43
44
|
self.completed_at = datetime.now()
|
44
45
|
|
@@ -140,6 +141,12 @@ class AsyncTransferManager:
|
|
140
141
|
"Transfer interrupted due to insufficient disk space",
|
141
142
|
)
|
142
143
|
logger.warning(f"Transfer interrupted: {operation_id}")
|
144
|
+
elif result == TransferStatus.DOES_NOT_EXIST:
|
145
|
+
progress.update_status(
|
146
|
+
AsyncTransferStatus.DOES_NOT_EXIST,
|
147
|
+
"Cache file not found",
|
148
|
+
)
|
149
|
+
logger.info(f"Transfer failed - file not found: {operation_id}")
|
143
150
|
else:
|
144
151
|
progress.update_status(
|
145
152
|
AsyncTransferStatus.ERROR, "Transfer operation failed"
|
@@ -166,6 +173,7 @@ class AsyncTransferManager:
|
|
166
173
|
AsyncTransferStatus.ERROR,
|
167
174
|
AsyncTransferStatus.INTERRUPTED,
|
168
175
|
AsyncTransferStatus.CANCELLED,
|
176
|
+
AsyncTransferStatus.DOES_NOT_EXIST,
|
169
177
|
]
|
170
178
|
|
171
179
|
def wait_for_completion(
|
b10_transfer/constants.py
CHANGED
@@ -113,6 +113,7 @@ class WorkerStatus(Enum):
|
|
113
113
|
SUCCESS = auto()
|
114
114
|
ERROR = auto()
|
115
115
|
CANCELLED = auto()
|
116
|
+
FILE_NOT_FOUND = auto()
|
116
117
|
|
117
118
|
|
118
119
|
class LoadStatus(Enum):
|
@@ -138,6 +139,7 @@ class TransferStatus(Enum):
|
|
138
139
|
SUCCESS = auto()
|
139
140
|
ERROR = auto()
|
140
141
|
INTERRUPTED = auto()
|
142
|
+
DOES_NOT_EXIST = auto()
|
141
143
|
|
142
144
|
|
143
145
|
class AsyncTransferStatus(Enum):
|
@@ -147,3 +149,4 @@ class AsyncTransferStatus(Enum):
|
|
147
149
|
ERROR = auto()
|
148
150
|
INTERRUPTED = auto()
|
149
151
|
CANCELLED = auto()
|
152
|
+
DOES_NOT_EXIST = auto()
|
b10_transfer/core.py
CHANGED
@@ -11,6 +11,7 @@ from .space_monitor import (
|
|
11
11
|
check_sufficient_disk_space,
|
12
12
|
CacheSpaceMonitor,
|
13
13
|
CacheOperationInterrupted,
|
14
|
+
CacheFileNotFoundError,
|
14
15
|
run_monitored_process,
|
15
16
|
)
|
16
17
|
from .constants import (
|
@@ -118,9 +119,13 @@ def transfer(
|
|
118
119
|
if primary_monitor is None:
|
119
120
|
# No monitoring requested, execute callback directly
|
120
121
|
logger.info(f"Starting transfer (no monitoring): {source} -> {dest}")
|
121
|
-
|
122
|
-
|
123
|
-
|
122
|
+
try:
|
123
|
+
callback(source, dest, *callback_args, **callback_kwargs)
|
124
|
+
logger.info("Transfer complete")
|
125
|
+
return TransferStatus.SUCCESS
|
126
|
+
except (FileNotFoundError, CacheFileNotFoundError) as e:
|
127
|
+
logger.info(f"Transfer failed - file not found: {e}")
|
128
|
+
return TransferStatus.DOES_NOT_EXIST
|
124
129
|
|
125
130
|
# Start the primary space monitor
|
126
131
|
primary_monitor.start()
|
@@ -151,6 +156,10 @@ def transfer(
|
|
151
156
|
logger.info("Transfer complete (unmonitored)")
|
152
157
|
return TransferStatus.SUCCESS
|
153
158
|
|
159
|
+
except (FileNotFoundError, CacheFileNotFoundError) as e:
|
160
|
+
logger.info(f"Transfer failed - file not found: {e}")
|
161
|
+
return TransferStatus.DOES_NOT_EXIST
|
162
|
+
|
154
163
|
except CacheOperationInterrupted as e:
|
155
164
|
logger.warning(f"Transfer interrupted: {e}")
|
156
165
|
return TransferStatus.INTERRUPTED
|
b10_transfer/space_monitor.py
CHANGED
@@ -24,6 +24,12 @@ class CacheOperationInterrupted(Exception):
|
|
24
24
|
pass
|
25
25
|
|
26
26
|
|
27
|
+
class CacheFileNotFoundError(Exception):
|
28
|
+
"""Raised when a cache file is not found during transfer operations."""
|
29
|
+
|
30
|
+
pass
|
31
|
+
|
32
|
+
|
27
33
|
def worker_process(cancelled_message: str):
|
28
34
|
"""Decorator for worker process functions to handle common try/catch/result_queue pattern.
|
29
35
|
|
@@ -60,6 +66,8 @@ def worker_process(cancelled_message: str):
|
|
60
66
|
# If we get here, the function completed successfully
|
61
67
|
result_queue.put((WorkerStatus.SUCCESS.value, None))
|
62
68
|
|
69
|
+
except FileNotFoundError as e:
|
70
|
+
result_queue.put((WorkerStatus.FILE_NOT_FOUND.value, str(e)))
|
63
71
|
except Exception as e:
|
64
72
|
result_queue.put((WorkerStatus.ERROR.value, str(e)))
|
65
73
|
|
@@ -286,6 +294,11 @@ def run_monitored_process(
|
|
286
294
|
if cleanup_func:
|
287
295
|
cleanup_func()
|
288
296
|
raise CacheOperationInterrupted(error_msg)
|
297
|
+
elif status == WorkerStatus.FILE_NOT_FOUND.value:
|
298
|
+
logger.info(
|
299
|
+
f"{operation_name} worker failed - file not found: {error_msg}"
|
300
|
+
)
|
301
|
+
raise CacheFileNotFoundError(error_msg)
|
289
302
|
# status == WorkerStatus.SUCCESS.value - continue normally
|
290
303
|
|
291
304
|
logger.debug(f"{operation_name} completed successfully")
|
b10_transfer/torch_cache.py
CHANGED
@@ -40,7 +40,7 @@ logger = logging.getLogger(__name__)
|
|
40
40
|
|
41
41
|
|
42
42
|
def torch_cache_save_callback(
|
43
|
-
source_dir: Path, dest_file: Path, max_size_mb: int
|
43
|
+
source_dir: Path, dest_file: Path, max_size_mb: int = None, *args, **kwargs
|
44
44
|
) -> None:
|
45
45
|
"""Callback function for saving torch cache: compress then copy to b10fs.
|
46
46
|
|
@@ -51,8 +51,14 @@ def torch_cache_save_callback(
|
|
51
51
|
Args:
|
52
52
|
source_dir: Path to the torch cache directory to compress
|
53
53
|
dest_file: Path to the final cache file in b10fs
|
54
|
-
max_size_mb: Maximum allowed archive size in megabytes
|
54
|
+
max_size_mb: Maximum allowed archive size in megabytes (can be passed as kwarg)
|
55
|
+
*args: Additional arguments passed by the transfer system (ignored)
|
56
|
+
**kwargs: Additional keyword arguments passed by the transfer system (may contain max_size_mb)
|
55
57
|
"""
|
58
|
+
# Handle max_size_mb from kwargs if not provided as positional argument
|
59
|
+
if max_size_mb is None:
|
60
|
+
max_size_mb = kwargs.get("max_size_mb", MAX_CACHE_SIZE_MB)
|
61
|
+
|
56
62
|
work_dir = Path(LOCAL_WORK_DIR)
|
57
63
|
|
58
64
|
# Create temporary archive in local work directory
|
@@ -99,7 +105,9 @@ def torch_cache_save_callback(
|
|
99
105
|
raise
|
100
106
|
|
101
107
|
|
102
|
-
def torch_cache_load_callback(source_file: Path, dest_dir: Path) -> None:
|
108
|
+
def torch_cache_load_callback(
|
109
|
+
source_file: Path, dest_dir: Path, *args, **kwargs
|
110
|
+
) -> None:
|
103
111
|
"""Callback function for loading torch cache: copy from b10fs then extract.
|
104
112
|
|
105
113
|
This function handles the torch-specific load logic:
|
@@ -109,6 +117,8 @@ def torch_cache_load_callback(source_file: Path, dest_dir: Path) -> None:
|
|
109
117
|
Args:
|
110
118
|
source_file: Path to the cache file in b10fs
|
111
119
|
dest_dir: Path to the torch cache directory where files will be extracted
|
120
|
+
*args: Additional arguments passed by the transfer system (ignored)
|
121
|
+
**kwargs: Additional keyword arguments passed by the transfer system (ignored)
|
112
122
|
"""
|
113
123
|
work_dir = Path(LOCAL_WORK_DIR)
|
114
124
|
|
@@ -126,6 +136,8 @@ def torch_cache_load_callback(source_file: Path, dest_dir: Path) -> None:
|
|
126
136
|
@critical_section_b10fs_file_lock("copy_out")
|
127
137
|
def _copy_from_b10fs():
|
128
138
|
logger.info(f"Copying from b10fs: {source_file} -> {temp_archive}")
|
139
|
+
if not source_file.exists():
|
140
|
+
raise FileNotFoundError(f"Cache file not found: {source_file}")
|
129
141
|
shutil.copy2(source_file, temp_archive)
|
130
142
|
|
131
143
|
_copy_from_b10fs()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: b10-transfer
|
3
|
-
Version: 0.0.1
|
3
|
+
Version: 0.1.0
|
4
4
|
Summary: Distributed PyTorch compilation cache for Baseten - Environment-aware, lock-free compilation cache management
|
5
5
|
License: MIT
|
6
6
|
Keywords: pytorch,torch.compile,cache,machine-learning,inference
|
@@ -0,0 +1,15 @@
|
|
1
|
+
b10_transfer/__init__.py,sha256=Z_p771iwuROcCSNWKjUZ9j-V7ICmbtwr_qet5FCsnkQ,1400
|
2
|
+
b10_transfer/archive.py,sha256=GKb0mi0-YeM7ch4FLAoOLHXw0T6LkRerYad2N2y9TYM,6400
|
3
|
+
b10_transfer/async_torch_cache.py,sha256=4hMjVR44SLlGes25e_cjgMTywFfIYjH0TnUmg9o-iyI,1903
|
4
|
+
b10_transfer/async_transfers.py,sha256=luqdIStT_j4YduImY67HvX5WDurqV9Q5RjEyMI7bh1k,9476
|
5
|
+
b10_transfer/cleanup.py,sha256=xjKStmBjaarZPxhPTT1-Ds_pvUR7kdJw5Kp19BLvzzY,6224
|
6
|
+
b10_transfer/constants.py,sha256=R2JE_634Ri_9rf8adwiAzcfiej5weAGP1x1ccSZLX8k,4829
|
7
|
+
b10_transfer/core.py,sha256=d-aaQwKYqKIafBYBNahNcnOpwcanOSrWLwdzXpjVLBs,6350
|
8
|
+
b10_transfer/environment.py,sha256=aC0biEMQrtHk0ke_3epdcq1X9J5fPmPpBVt0fH7XF2Y,5625
|
9
|
+
b10_transfer/info.py,sha256=I3iOuImZ5r6DMJTDeBtVvzlSn6IuyPJbLJYUO_OF0ks,6299
|
10
|
+
b10_transfer/space_monitor.py,sha256=G_3wLSJa7HTCihSpLoow2oKo2cARJ2PtvY1XOQZl3-s,11028
|
11
|
+
b10_transfer/torch_cache.py,sha256=e41mDdnP_h61WNwB7TG5c4a7ecw0-K63ytJiKsX0keY,14907
|
12
|
+
b10_transfer/utils.py,sha256=Stee0DFK-8MRRYNIocqaK64cJvfs4jPW3Mpx7zkWV6Y,11932
|
13
|
+
b10_transfer-0.1.0.dist-info/METADATA,sha256=wc0a--Bgr-7filvyS4uUAic9fO1JJbKqc5iNp36A-iU,7502
|
14
|
+
b10_transfer-0.1.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
15
|
+
b10_transfer-0.1.0.dist-info/RECORD,,
|
@@ -1,15 +0,0 @@
|
|
1
|
-
b10_transfer/__init__.py,sha256=o1ej-OtAOsfrJbvh5C3PnqxW2qfcO7l8rllVD-07lXE,1400
|
2
|
-
b10_transfer/archive.py,sha256=GKb0mi0-YeM7ch4FLAoOLHXw0T6LkRerYad2N2y9TYM,6400
|
3
|
-
b10_transfer/async_torch_cache.py,sha256=4hMjVR44SLlGes25e_cjgMTywFfIYjH0TnUmg9o-iyI,1903
|
4
|
-
b10_transfer/async_transfers.py,sha256=AAML562qYzF9NyX9AdfiJ0OcQw6vXr985IZWXZSot9Q,9083
|
5
|
-
b10_transfer/cleanup.py,sha256=xjKStmBjaarZPxhPTT1-Ds_pvUR7kdJw5Kp19BLvzzY,6224
|
6
|
-
b10_transfer/constants.py,sha256=KjSUO6heScDJXQwFlHdeNV4KBBqKz7CKeJzo44-9qMM,4745
|
7
|
-
b10_transfer/core.py,sha256=BOnA6FXkZRm74_CtQBMudpx3q7HTEGEORUV26fb6cvQ,5920
|
8
|
-
b10_transfer/environment.py,sha256=aC0biEMQrtHk0ke_3epdcq1X9J5fPmPpBVt0fH7XF2Y,5625
|
9
|
-
b10_transfer/info.py,sha256=I3iOuImZ5r6DMJTDeBtVvzlSn6IuyPJbLJYUO_OF0ks,6299
|
10
|
-
b10_transfer/space_monitor.py,sha256=5pwW643KAHI3mtT61hYf29953UD9LekzWFF1K-QeYbw,10529
|
11
|
-
b10_transfer/torch_cache.py,sha256=Oe_OeUPGAlmK9wY-L9w4aPaXOoMnL_kD596hew6ETcw,14192
|
12
|
-
b10_transfer/utils.py,sha256=Stee0DFK-8MRRYNIocqaK64cJvfs4jPW3Mpx7zkWV6Y,11932
|
13
|
-
b10_transfer-0.0.1.dist-info/METADATA,sha256=hESeWyidAEbtWkIgepBn1Cxlo9--jIj9vcLxM4zP7lY,7502
|
14
|
-
b10_transfer-0.0.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
15
|
-
b10_transfer-0.0.1.dist-info/RECORD,,
|
File without changes
|