b10-transfer 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- b10_transfer/__init__.py +6 -5
- b10_transfer/cache.py +361 -0
- b10_transfer/constants.py +4 -19
- b10_transfer/core.py +20 -320
- {b10_transfer-0.1.4.dist-info → b10_transfer-0.1.6.dist-info}/METADATA +1 -1
- b10_transfer-0.1.6.dist-info/RECORD +13 -0
- b10_transfer-0.1.4.dist-info/RECORD +0 -12
- {b10_transfer-0.1.4.dist-info → b10_transfer-0.1.6.dist-info}/WHEEL +0 -0
b10_transfer/__init__.py
CHANGED

```diff
@@ -1,23 +1,24 @@
 """B10 Transfer - Lock-free PyTorch file transfer for Baseten."""
 
-from .core import load_compile_cache, save_compile_cache, clear_local_cache
+from .cache import load_compile_cache, save_compile_cache, clear_local_cache
+from .core import transfer
 from .utils import CacheError, CacheValidationError
 from .space_monitor import CacheOperationInterrupted
 from .info import get_cache_info, list_available_caches
-from .constants import
+from .constants import OperationStatus
 
 # Version
-__version__ = "0.1.4"
+__version__ = "0.1.6"
 
 __all__ = [
     "CacheError",
     "CacheValidationError",
     "CacheOperationInterrupted",
-    "
-    "LoadStatus",
+    "OperationStatus",
     "load_compile_cache",
     "save_compile_cache",
     "clear_local_cache",
+    "transfer",
    "get_cache_info",
    "list_available_caches",
]
```
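In 0.1.6 the cache lifecycle functions move from `core` to the new `cache` module, the generic `transfer` becomes public, and the per-operation status enums collapse into a single `OperationStatus`. A minimal consumer sketch of the re-exported API shown above (hypothetical caller code; it assumes a Baseten pod where b10fs is mounted and configured):

```python
# Hedged usage sketch against the 0.1.6 package-level exports.
import b10_transfer as b10

status = b10.load_compile_cache()
if status == b10.OperationStatus.DOES_NOT_EXIST:
    # No shared cache yet: compile as usual, then publish the artifacts.
    # ... run torch.compile-backed inference here ...
    b10.save_compile_cache()
```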
b10_transfer/cache.py
ADDED

```diff
@@ -0,0 +1,361 @@
+"""Cache operations for PyTorch compilation artifacts.
+
+This module provides functions for loading and saving PyTorch compilation cache
+to/from b10fs shared storage using atomic operations and space monitoring.
+"""
+
+import logging
+import tempfile
+from pathlib import Path
+
+from .environment import get_cache_filename
+from .cleanup import cooperative_cleanup_b10fs
+from .utils import (
+    timed_fn,
+    critical_section_b10fs_file_lock,
+    safe_execute,
+    temp_file_cleanup,
+    cache_operation,
+    safe_unlink,
+)
+from .space_monitor import (
+    CacheSpaceMonitor,
+    CacheOperationInterrupted,
+    run_monitored_process,
+    worker_process,
+)
+from .constants import (
+    TORCH_CACHE_DIR,
+    B10FS_CACHE_DIR,
+    LOCAL_WORK_DIR,
+    MAX_CACHE_SIZE_MB,
+    REQUIRED_B10FS_SPACE_MB,
+    MIN_LOCAL_SPACE_MB,
+    CACHE_FILE_EXTENSION,
+    CACHE_LATEST_SUFFIX,
+    CACHE_INCOMPLETE_SUFFIX,
+    OperationStatus,
+)
+from .core import transfer
+
+logger = logging.getLogger(__name__)
+
+
+"""
+FIXME(SRAY):
+What about the case in @b10-transfer/ where a single pod finishes an inference request,
+and then the client calls save_compile_cache. And while we are creating the local archive,
+another inference call on the same pod is kicked off, which then modifies the torch cache.
+How would this be handled? Maybe just accept that the cache will be recompiled/overwritten?
+Otherwise you'd need application level coordination to ensure that the cache is not modified
+while we are creating the archive, but this doesn't really seem like a good idea in terms of adoption.
+
+FIXME(SR):
+More things to consider:
+- [possible] What if b10fs dies *during* an op? right now we check for b10fs availability in the beginning of the op... Add some constants instead of just False for load().
+- [possible, and really bad if it happens] potential memory exhaustion during compression if the cache is super super large. very very edge case. higher compression levels also have high memory usage.
+"""
+
+
+def _setup_cache_paths():
+    """Common setup for cache operations - returns paths and performs cleanup."""
+    # Cooperative cleanup of stale shared resources
+    cooperative_cleanup_b10fs()
+
+    b10fs_dir = Path(B10FS_CACHE_DIR)
+    torch_dir = Path(TORCH_CACHE_DIR)
+    work_dir = Path(LOCAL_WORK_DIR)
+
+    return b10fs_dir, torch_dir, work_dir
+
+
+def _get_cache_file_paths(cache_filename: str, b10fs_dir: Path):
+    """Generate cache file paths for a given cache filename."""
+    final_file = (
+        b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
+    )
+    temp_file = (
+        b10fs_dir / f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
+    )
+    return final_file, temp_file
+
+
+def _run_with_space_monitoring(
+    space_threshold_mb: float,
+    monitor_dir: Path,
+    operation_name: str,
+    worker_func,
+    worker_args: tuple,
+    cleanup_func=None,
+):
+    """Helper to run an operation with space monitoring."""
+    space_monitor = CacheSpaceMonitor(space_threshold_mb, monitor_dir)
+    space_monitor.start()
+
+    try:
+        logger.info(
+            f"Starting {operation_name}: {' -> '.join(str(arg) for arg in worker_args[:2])}"
+        )
+        run_monitored_process(
+            worker_func,
+            worker_args,
+            space_monitor,
+            operation_name,
+            cleanup_func=cleanup_func,
+        )
+    finally:
+        space_monitor.stop()
+
+
+def _transfer_with_b10fs_lock(
+    source: str, dest: str, lock_type: str, cleanup_on_failure=True
+):
+    """Transfer a file with b10fs file locking and error handling."""
+
+    @critical_section_b10fs_file_lock(lock_type)
+    def _locked_transfer():
+        result = transfer(source, dest)
+        if result != OperationStatus.SUCCESS:
+            if cleanup_on_failure:
+                safe_unlink(
+                    Path(dest), f"Failed to cleanup after failed transfer {dest}"
+                )
+            raise Exception(f"Failed to transfer {source} -> {dest}")
+
+    _locked_transfer()
+
+
+@timed_fn(logger=logger, name="Loading compile cache")
+@safe_execute("Load failed", False)
+def load_compile_cache() -> OperationStatus:
+    """Load PyTorch compilation cache from b10fs to local torch cache directory.
+
+    This function implements a lock-free pattern to safely load cached PyTorch
+    compilation artifacts from the b10fs shared filesystem to the local torch
+    cache directory. It validates b10fs availability, checks for existing cache,
+    and extracts the archive if needed.
+
+    The function monitors local disk space during both the copy from b10fs and
+    extraction phases, interrupting operations if space falls below MIN_LOCAL_SPACE_MB.
+
+    Returns:
+        OperationStatus:
+            OperationStatus.SUCCESS if cache was successfully loaded
+            OperationStatus.SKIPPED if already exists
+            OperationStatus.ERROR if b10fs is unavailable, local disk space is insufficient, or loading failed.
+            OperationStatus.DOES_NOT_EXIST if no cache file was found.
+
+    Raises:
+        CacheValidationError: If b10fs is not enabled (caught and returns OperationStatus.ERROR).
+        CacheOperationInterrupted: If operations interrupted due to insufficient
+            local disk space (caught and returns OperationStatus.ERROR).
+        Exception: Any other errors during loading (caught and returns OperationStatus.ERROR).
+    """
+    with cache_operation("Load"):
+        b10fs_dir, torch_dir, work_dir = _setup_cache_paths()
+
+        cache_filename = get_cache_filename()
+        final_file, _ = _get_cache_file_paths(cache_filename, b10fs_dir)
+        logger.debug(f"Looking for cache file: {final_file}")
+
+        if not final_file.exists():
+            logger.info("No cache file found in b10fs")
+            return OperationStatus.DOES_NOT_EXIST
+
+        # Skip if already loaded
+        if torch_dir.exists() and any(torch_dir.iterdir()):
+            logger.info("Torch cache already loaded, skipping extraction")
+            return OperationStatus.SKIPPED
+
+        # Create temp local copy
+        with tempfile.NamedTemporaryFile(
+            suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
+        ) as f:
+            temp_path = Path(f.name)
+        logger.debug(f"Created temporary file for cache: {temp_path}")
+
+        try:
+            with temp_file_cleanup(temp_path):
+                # Phase 1: Copy from b10fs to local temp file
+                _transfer_with_b10fs_lock(
+                    str(final_file),
+                    str(temp_path),
+                    "copy_out",
+                    cleanup_on_failure=False,
+                )
+
+                # Phase 2: Extract archive with space monitoring
+                _run_with_space_monitoring(
+                    MIN_LOCAL_SPACE_MB,
+                    work_dir,
+                    "archive extraction",
+                    _cache_extract_worker,
+                    (str(temp_path), str(torch_dir)),
+                    cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
+                )
+
+            logger.info("Cache load complete")
+            return OperationStatus.SUCCESS
+
+        except CacheOperationInterrupted as e:
+            logger.warning(f"Cache load interrupted: {e}")
+            return OperationStatus.ERROR
+
+
+@timed_fn(logger=logger, name="Saving compile cache")
+@safe_execute("Save failed", False)
+def save_compile_cache() -> OperationStatus:
+    """Save local PyTorch compilation cache to b10fs using atomic journal pattern.
+
+    This function creates an archive of the local torch cache directory and
+    atomically saves it to b10fs using a journal pattern (write to temp file,
+    then rename). This ensures concurrent saves don't corrupt each other.
+
+    The function validates b10fs availability, checks if cache already exists
+    (early exit), performs initial space checks using pre-calculated requirements
+    for concurrent saves, starts background space monitoring, then runs compression
+    and copy operations in separate worker processes that can be terminated if disk
+    space becomes insufficient, finally performing an atomic rename to the final cache file.
+
+    Returns:
+        OperationStatus:
+            OperationStatus.SUCCESS if cache was successfully saved or already exists
+            OperationStatus.ERROR if b10fs is unavailable, insufficient disk space caused interruption,
+                no cache exists to save, or saving failed.
+            OperationStatus.SKIPPED if no cache exists to save or cache already exists in b10fs
+
+    Raises:
+        CacheValidationError: If b10fs is not enabled (caught and returns OperationStatus.ERROR).
+        CacheOperationInterrupted: If operations interrupted due to insufficient
+            disk space (caught and returns OperationStatus.ERROR).
+        ArchiveError: If archive creation fails (caught and returns OperationStatus.ERROR).
+        Exception: Any other errors during saving (caught and returns OperationStatus.ERROR).
+    """
+    with cache_operation("Save"):
+        b10fs_dir, torch_dir, work_dir = _setup_cache_paths()
+
+        # Check if anything to save
+        if not torch_dir.exists() or not any(torch_dir.iterdir()):
+            logger.info("No torch cache to save")
+            return OperationStatus.SKIPPED
+
+        cache_filename = get_cache_filename()
+        final_file, temp_file = _get_cache_file_paths(cache_filename, b10fs_dir)
+
+        # Check for existing cache first (early exit)
+        if final_file.exists():
+            logger.info("Cache already exists in b10fs, skipping save")
+            return OperationStatus.SKIPPED
+
+        with tempfile.NamedTemporaryFile(
+            suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
+        ) as f:
+            local_temp = Path(f.name)
+        logger.debug(f"Created local temp file for archive: {local_temp}")
+
+        try:
+            with temp_file_cleanup(local_temp):
+                # Phase 1: Compression with space monitoring
+                _run_with_space_monitoring(
+                    REQUIRED_B10FS_SPACE_MB,
+                    b10fs_dir,
+                    "compression",
+                    _cache_compression_worker,
+                    (str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
+                )
+
+                # Phase 2: Copy to b10fs with locking
+                _transfer_with_b10fs_lock(
+                    str(local_temp), str(temp_file), "copy_in", cleanup_on_failure=True
+                )
+
+                # Phase 3: Atomic rename (fast, don't interrupt)
+                logger.info(
+                    f"Renaming temp file to final cache file: {temp_file} -> {final_file}"
+                )
+                temp_file.rename(final_file)
+
+            logger.info("Cache save complete")
+            return OperationStatus.SUCCESS
+
+        except CacheOperationInterrupted as e:
+            logger.warning(f"Cache save interrupted: {e}")
+            return OperationStatus.ERROR
+
+
+@safe_execute("Clear failed", False)
+def clear_local_cache() -> bool:
+    """Clear the local PyTorch compilation cache directory.
+
+    This function removes the entire local torch cache directory and all its
+    contents. This is useful for cleaning up disk space or forcing recompilation.
+
+    Returns:
+        bool: True if cache was successfully cleared or didn't exist, False if
+            clearing failed due to permissions or other filesystem errors.
+
+    Raises:
+        Exception: Any errors during directory removal (caught and returns False).
+    """
+    torch_dir = Path(TORCH_CACHE_DIR)
+    if not torch_dir.exists():
+        return True
+    import shutil
+
+    shutil.rmtree(torch_dir)
+    return True
+
+
+@worker_process("Compression was cancelled before starting")
+def _cache_compression_worker(
+    torch_dir_str: str, local_temp_str: str, max_size_mb: int
+) -> None:
+    """Worker process that handles cache compression.
+
+    This function runs in a separate process to compress the torch cache directory
+    into an archive. It can be terminated externally if disk space becomes insufficient.
+
+    Args:
+        torch_dir_str: String path to the torch cache directory to compress.
+        local_temp_str: String path where the compressed archive will be created.
+        max_size_mb: Maximum allowed archive size in megabytes.
+    """
+    torch_dir = Path(torch_dir_str)
+    local_temp = Path(local_temp_str)
+
+    # Import here to avoid issues with multiprocessing
+    from .archive import create_archive
+
+    create_archive(torch_dir, local_temp, max_size_mb)
+
+
+def _cleanup_torch_dir(torch_dir: Path) -> None:
+    """Helper function to safely cleanup torch directory during interrupted extraction."""
+    try:
+        if torch_dir.exists():
+            import shutil
+
+            shutil.rmtree(torch_dir)
+            logger.debug(f"Cleaned up torch directory: {torch_dir}")
+    except Exception as e:
+        logger.error(f"Failed to cleanup torch directory {torch_dir}: {e}")
+
+
+@worker_process("Extraction was cancelled before starting")
+def _cache_extract_worker(archive_path_str: str, dest_dir_str: str) -> None:
+    """Worker process that handles archive extraction.
+
+    This function runs in a separate process to extract the cache archive to
+    the torch cache directory. It can be terminated externally if local disk space becomes insufficient.
+
+    Args:
+        archive_path_str: String path to the archive file to extract.
+        dest_dir_str: String path to the directory where archive will be extracted.
+    """
+    archive_path = Path(archive_path_str)
+    dest_dir = Path(dest_dir_str)
+
+    # Import here to avoid issues with multiprocessing
+    from .archive import extract_archive
+
+    extract_archive(archive_path, dest_dir)
```
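The save path above relies on the journal pattern named in its docstring: the archive is written to b10fs under the `CACHE_INCOMPLETE_SUFFIX` name and only then renamed to the `CACHE_LATEST_SUFFIX` name (the literal suffix values are not shown in this diff). Because a rename within a single filesystem is atomic on POSIX, readers polling for the final name never observe a half-written cache file. A self-contained sketch of the same idea, independent of this package (all names here are illustrative):

```python
import os
import tempfile
from pathlib import Path


def atomic_publish(data: bytes, final_path: Path) -> None:
    """Write `data` so readers never observe a partially written file."""
    # Stage the bytes in a temp file on the SAME filesystem as the
    # destination: rename() is only atomic within a single filesystem.
    fd, tmp_name = tempfile.mkstemp(dir=final_path.parent, suffix=".incomplete")
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(data)
            f.flush()
            os.fsync(f.fileno())  # make the bytes durable before publishing
        os.rename(tmp_name, final_path)  # the atomic "journal commit"
    except BaseException:
        # On any failure, drop the partial temp file; final_path is untouched.
        Path(tmp_name).unlink(missing_ok=True)
        raise
```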
b10_transfer/constants.py
CHANGED

```diff
@@ -114,25 +114,10 @@ class WorkerStatus(Enum):
     CANCELLED = auto()
 
 
-class LoadStatus(Enum):
-    """Status values for cache loading operations."""
-
-    SUCCESS = auto()
-    ERROR = auto()
-    DOES_NOT_EXIST = auto()
-    SKIPPED = auto()
-
-
-class SaveStatus(Enum):
-    """Status values for cache saving operations."""
-
-    SUCCESS = auto()
-    ERROR = auto()
-    SKIPPED = auto()
-
-
-class TransferStatus(Enum):
-    """Status values for file transfer operations."""
+class OperationStatus(Enum):
+    """Status values for all b10-transfer operations (load, save, transfer)."""
 
     SUCCESS = auto()
     ERROR = auto()
+    DOES_NOT_EXIST = auto()  # Used by load operations when cache file not found
+    SKIPPED = auto()  # Used by load/save operations when operation not needed
```
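Collapsing `LoadStatus`, `SaveStatus`, and `TransferStatus` into one enum means callers branch on a single type regardless of which operation ran; the two extra members are simply never produced by operations that don't need them. An illustrative caller-side dispatch (not part of the package):

```python
# Caller-side sketch; assumes the package-level re-exports from 0.1.6.
from b10_transfer import OperationStatus, load_compile_cache

status = load_compile_cache()
if status is OperationStatus.SUCCESS:
    print("compile cache restored from b10fs")
elif status is OperationStatus.SKIPPED:
    print("local torch cache already populated")
elif status is OperationStatus.DOES_NOT_EXIST:
    print("no cache in b10fs yet; compiling from scratch")
else:
    # OperationStatus.ERROR, or the safe_execute fallback (False) if an
    # unexpected exception was swallowed by the decorator.
    print("load failed; continuing without a warm cache")
```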
b10_transfer/core.py
CHANGED

```diff
@@ -1,19 +1,16 @@
-
+"""Core file transfer operations for b10-transfer.
+
+This module provides generic file transfer functionality with space monitoring
+and error handling for b10fs operations.
+"""
+
 import logging
-import tempfile
 import shutil
 from pathlib import Path
 
-import time
-
-from .environment import get_cache_filename
-from .cleanup import cooperative_cleanup_b10fs
 from .utils import (
     timed_fn,
-    critical_section_b10fs_file_lock,
     safe_execute,
-    temp_file_cleanup,
-    cache_operation,
     safe_unlink,
 )
 from .space_monitor import (
@@ -24,241 +21,18 @@ from .space_monitor import (
     worker_process,
 )
 from .constants import (
-    TORCH_CACHE_DIR,
     B10FS_CACHE_DIR,
-    LOCAL_WORK_DIR,
-    MAX_CACHE_SIZE_MB,
     REQUIRED_B10FS_SPACE_MB,
     MIN_LOCAL_SPACE_MB,
-    CACHE_FILE_EXTENSION,
-    CACHE_LATEST_SUFFIX,
-    CACHE_INCOMPLETE_SUFFIX,
-    LoadStatus,
-    SaveStatus,
-    TransferStatus,
+    OperationStatus,
 )
 
 logger = logging.getLogger(__name__)
 
 
-@timed_fn(logger=logger, name="Loading compile cache")
-@safe_execute("Load failed", False)
-def load_compile_cache() -> LoadStatus:
-    """Load PyTorch compilation cache from b10fs to local torch cache directory.
-
-    This function implements a lock-free pattern to safely load cached PyTorch
-    compilation artifacts from the b10fs shared filesystem to the local torch
-    cache directory. It validates b10fs availability, checks for existing cache,
-    and extracts the archive if needed.
-
-    The function monitors local disk space during both the copy from b10fs and
-    extraction phases, interrupting operations if space falls below MIN_LOCAL_SPACE_MB.
-
-    Returns:
-        LoadStatus:
-            LoadStatus.SUCCESS if cache was successfully loaded
-            LoadStatus.SKIPPED if already exists
-            LoadStatus.ERROR if b10fs is unavailable, local disk space is insufficient, or loading failed.
-            LoadStatus.DOES_NOT_EXIST if no cache file was found.
-
-    Raises:
-        CacheValidationError: If b10fs is not enabled (caught and returns LoadStatus.ERROR).
-        CacheOperationInterrupted: If operations interrupted due to insufficient
-            local disk space (caught and returns LoadStatus.ERROR).
-        Exception: Any other errors during loading (caught and returns LoadStatus.ERROR).
-    """
-    with cache_operation("Load"):
-        # Cooperative cleanup of stale shared resources
-        cooperative_cleanup_b10fs()
-
-        b10fs_dir = Path(B10FS_CACHE_DIR)
-        torch_dir = Path(TORCH_CACHE_DIR)
-        work_dir = Path(LOCAL_WORK_DIR)
-
-        cache_filename = get_cache_filename()
-        cache_file = (
-            b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
-        )
-        logger.debug(f"Looking for cache file: {cache_file}")
-
-        if not cache_file.exists():
-            logger.info("No cache file found in b10fs")
-            return LoadStatus.DOES_NOT_EXIST
-
-        # Skip if already loaded
-        if torch_dir.exists() and any(torch_dir.iterdir()):
-            logger.info("Torch cache already loaded, skipping extraction")
-            return LoadStatus.SKIPPED
-
-        # Create temp local copy
-        with tempfile.NamedTemporaryFile(
-            suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
-        ) as f:
-            temp_path = Path(f.name)
-        logger.debug(f"Created temporary file for cache: {temp_path}")
-
-        try:
-            with temp_file_cleanup(temp_path):
-                # Phase 1: Copy from b10fs to local temp file using transfer()
-                @critical_section_b10fs_file_lock("copy_out")
-                def _monitored_copy_from_b10fs():
-                    result = transfer(str(cache_file), str(temp_path))
-                    if result != TransferStatus.SUCCESS:
-                        raise Exception("Failed to copy cache file from b10fs")
-
-                _monitored_copy_from_b10fs()
-
-                # Phase 2: Extract archive in separate process with space monitoring
-                space_monitor = CacheSpaceMonitor(MIN_LOCAL_SPACE_MB, work_dir)
-                space_monitor.start()
-
-                try:
-                    logger.info(f"Starting extraction: {temp_path} -> {torch_dir}")
-                    run_monitored_process(
-                        _cache_extract_worker,
-                        (str(temp_path), str(torch_dir)),
-                        space_monitor,
-                        "archive extraction",
-                        cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
-                    )
-                finally:
-                    space_monitor.stop()
-
-            logger.info("Cache load complete")
-            return LoadStatus.SUCCESS
-
-        except CacheOperationInterrupted as e:
-            logger.warning(f"Cache load interrupted: {e}")
-            return LoadStatus.ERROR
-
-
-"""
-FIXME(SRAY):
-What about the case in @b10-transfer/ where a single pod finishes an inference request,
-and then the client calls save_compile_cache. And while we are creating the local archive,
-another inference call on the same pod is kicked off, which then modifies the torch cache.
-How would this be handled? Maybe just accept that the cache will be recompiled/overwritten?
-Otherwise you'd need application level coordination to ensure that the cache is not modified
-while we are creating the archive, but this doesn't really seem like a good idea in terms of adoption.
-
-FIXME(SR):
-More things to consider:
-- [possible] What if b10fs dies *during* an op? right now we check for b10fs availability in the beginning of the op... Add some constants instead of just False for load().
-- [possible, and really bad if it happens] potential memory exhaustion during compression if the cache is super super large. very very edge case. higher compression levels also have high memory usage.
-"""
-
-
-@timed_fn(logger=logger, name="Saving compile cache")
-@safe_execute("Save failed", False)
-def save_compile_cache() -> SaveStatus:
-    """Save local PyTorch compilation cache to b10fs using atomic journal pattern.
-
-    This function creates an archive of the local torch cache directory and
-    atomically saves it to b10fs using a journal pattern (write to temp file,
-    then rename). This ensures concurrent saves don't corrupt each other.
-
-    The function validates b10fs availability, checks if cache already exists
-    (early exit), performs initial space checks using pre-calculated requirements
-    for concurrent saves, starts background space monitoring, then runs compression
-    and copy operations in separate worker processes that can be terminated if disk
-    space becomes insufficient, finally performing an atomic rename to the final cache file.
-
-    Returns:
-        SaveStatus:
-            SaveStatus.SUCCESS if cache was successfully saved or already exists
-            SaveStatus.ERROR if b10fs is unavailable, insufficient disk space caused interruption,
-                no cache exists to save, or saving failed.
-            SaveStatus.SKIPPED if no cache exists to save or cache already exists in b10fs
-
-    Raises:
-        CacheValidationError: If b10fs is not enabled (caught and returns SaveStatus.ERROR).
-        CacheOperationInterrupted: If operations interrupted due to insufficient
-            disk space (caught and returns SaveStatus.ERROR).
-        ArchiveError: If archive creation fails (caught and returns SaveStatus.ERROR).
-        Exception: Any other errors during saving (caught and returns SaveStatus.ERROR).
-    """
-    with cache_operation("Save"):
-        # Cooperative cleanup of stale shared resources
-        cooperative_cleanup_b10fs()
-
-        b10fs_dir = Path(B10FS_CACHE_DIR)
-        torch_dir = Path(TORCH_CACHE_DIR)
-        work_dir = Path(LOCAL_WORK_DIR)
-
-        # Check if anything to save
-        if not torch_dir.exists() or not any(torch_dir.iterdir()):
-            logger.info("No torch cache to save")
-            return SaveStatus.SKIPPED
-
-        cache_filename = get_cache_filename()
-        final_file = (
-            b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
-        )
-
-        # Check for existing cache first (early exit)
-        if final_file.exists():
-            logger.info("Cache already exists in b10fs, skipping save")
-            return SaveStatus.SKIPPED
-
-        temp_file = (
-            b10fs_dir
-            / f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
-        )
-
-        with tempfile.NamedTemporaryFile(
-            suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
-        ) as f:
-            local_temp = Path(f.name)
-        logger.debug(f"Created local temp file for archive: {local_temp}")
-
-        try:
-            with temp_file_cleanup(local_temp):
-                # Phase 1: Compression in separate process with space monitoring
-                space_monitor = CacheSpaceMonitor(REQUIRED_B10FS_SPACE_MB, b10fs_dir)
-                space_monitor.start()
-
-                try:
-                    logger.info(f"Starting compression: {torch_dir} -> {local_temp}")
-                    run_monitored_process(
-                        _cache_compression_worker,
-                        (str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
-                        space_monitor,
-                        "compression",
-                    )
-                finally:
-                    space_monitor.stop()
-
-                # Phase 2: Copy to b10fs using transfer()
-                @critical_section_b10fs_file_lock("copy_in")
-                def _monitored_copy_to_b10fs():
-                    result = transfer(str(local_temp), str(temp_file))
-                    if result != TransferStatus.SUCCESS:
-                        # Clean up the temp file if transfer failed
-                        safe_unlink(
-                            temp_file,
-                            f"Failed to cleanup after failed copy {temp_file}",
-                        )
-                        raise Exception("Failed to copy cache file to b10fs")
-
-                _monitored_copy_to_b10fs()
-
-                # Phase 3: Atomic rename (fast, don't interrupt)
-                logger.info(
-                    f"Renaming temp file to final cache file: {temp_file} -> {final_file}"
-                )
-                temp_file.rename(final_file)
-
-            logger.info("Cache save complete")
-            return SaveStatus.SUCCESS
-
-        except CacheOperationInterrupted as e:
-            logger.warning(f"Cache save interrupted: {e}")
-            return SaveStatus.ERROR
-
-
 @timed_fn(logger=logger, name="Transferring file")
-@safe_execute("Transfer failed", TransferStatus.ERROR)
-def transfer(source: str, dest: str) -> TransferStatus:
+@safe_execute("Transfer failed", OperationStatus.ERROR)
+def transfer(source: str, dest: str) -> OperationStatus:
     """Transfer a file from source to destination with space monitoring.
 
     This function copies a file from source to destination using the same
@@ -270,15 +44,15 @@ def transfer(source: str, dest: str) -> TransferStatus:
         dest: Path to the destination where the file will be copied.
 
     Returns:
-        TransferStatus:
-            TransferStatus.SUCCESS if transfer was successful
-            TransferStatus.ERROR if transfer failed due to insufficient disk space,
+        OperationStatus:
+            OperationStatus.SUCCESS if transfer was successful
+            OperationStatus.ERROR if transfer failed due to insufficient disk space,
                 file not found, or other errors.
 
     Raises:
         CacheOperationInterrupted: If transfer interrupted due to insufficient
-            disk space (caught and returns TransferStatus.ERROR).
-        Exception: Any other errors during transfer (caught and returns TransferStatus.ERROR).
+            disk space (caught and returns OperationStatus.ERROR).
+        Exception: Any other errors during transfer (caught and returns OperationStatus.ERROR).
     """
     source_path = Path(source)
     dest_path = Path(dest)
@@ -286,7 +60,7 @@ def transfer(source: str, dest: str) -> TransferStatus:
     # Validate source file exists
     if not source_path.exists():
         logger.error(f"Source file does not exist: {source}")
-        return TransferStatus.ERROR
+        return OperationStatus.ERROR
 
     # Create destination directory if it doesn't exist
     dest_path.parent.mkdir(parents=True, exist_ok=True)
@@ -330,66 +104,22 @@ def transfer(source: str, dest: str) -> TransferStatus:
         )
 
         logger.info("File transfer complete")
-        return TransferStatus.SUCCESS
+        return OperationStatus.SUCCESS
 
     except CacheOperationInterrupted as e:
         logger.warning(f"File transfer interrupted: {e}")
-        return TransferStatus.ERROR
 
     finally:
        space_monitor.stop()
 
 
-@safe_execute("Clear failed", False)
-def clear_local_cache() -> bool:
-    """Clear the local PyTorch compilation cache directory.
-
-    This function removes the entire local torch cache directory and all its
-    contents. This is useful for cleaning up disk space or forcing recompilation.
-
-    Returns:
-        bool: True if cache was successfully cleared or didn't exist, False if
-            clearing failed due to permissions or other filesystem errors.
-
-    Raises:
-        Exception: Any errors during directory removal (caught and returns False).
-    """
-    torch_dir = Path(TORCH_CACHE_DIR)
-    if not torch_dir.exists():
-        return True
-    shutil.rmtree(torch_dir)
-    return True
-
-
-@worker_process("Compression was cancelled before starting")
-def _cache_compression_worker(
-    torch_dir_str: str, local_temp_str: str, max_size_mb: int
-) -> None:
-    """Worker process that handles cache compression.
-
-    This function runs in a separate process to compress the torch cache directory
-    into an archive. It can be terminated externally if disk space becomes insufficient.
-
-    Args:
-        torch_dir_str: String path to the torch cache directory to compress.
-        local_temp_str: String path where the compressed archive will be created.
-        max_size_mb: Maximum allowed archive size in megabytes.
-    """
-    torch_dir = Path(torch_dir_str)
-    local_temp = Path(local_temp_str)
-
-    # Import here to avoid issues with multiprocessing
-    from .archive import create_archive
-
-    create_archive(torch_dir, local_temp, max_size_mb)
-
-
 @worker_process("Copy was cancelled before starting")
 def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
-    """Worker process that handles file copy
+    """Worker process that handles file copy operations.
 
-    This function runs in a separate process to copy
-
+    This function runs in a separate process to copy files between locations.
+    It can be terminated externally if disk space becomes insufficient.
 
     Args:
         source_path_str: String path to the source file to copy.
@@ -399,33 +129,3 @@ def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
     dest_path = Path(dest_path_str)
 
     shutil.copy2(source_path, dest_path)
-
-
-def _cleanup_torch_dir(torch_dir: Path) -> None:
-    """Helper function to safely cleanup torch directory during interrupted extraction."""
-    try:
-        if torch_dir.exists():
-            shutil.rmtree(torch_dir)
-            logger.debug(f"Cleaned up torch directory: {torch_dir}")
-    except Exception as e:
-        logger.error(f"Failed to cleanup torch directory {torch_dir}: {e}")
-
-
-@worker_process("Extraction was cancelled before starting")
-def _cache_extract_worker(archive_path_str: str, dest_dir_str: str) -> None:
-    """Worker process that handles archive extraction.
-
-    This function runs in a separate process to extract the cache archive to
-    the torch cache directory. It can be terminated externally if local disk space becomes insufficient.
-
-    Args:
-        archive_path_str: String path to the archive file to extract.
-        dest_dir_str: String path to the directory where archive will be extracted.
-    """
-    archive_path = Path(archive_path_str)
-    dest_dir = Path(dest_dir_str)
-
-    # Import here to avoid issues with multiprocessing
-    from .archive import extract_archive
-
-    extract_archive(archive_path, dest_dir)
```
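After the refactor, this module keeps only the generic `transfer()` plus its copy worker; the cache-specific logic now lives in `cache.py` and calls back into `transfer()` under a b10fs file lock. A hedged usage sketch of the remaining public function (the paths are made up):

```python
# transfer() validates the source, creates the destination directory, and
# runs the copy in a worker process that is aborted if free disk space
# drops below the configured threshold while it runs.
from b10_transfer import transfer, OperationStatus

result = transfer("/tmp/model.archive", "/cache/b10fs/model.archive")
if result is not OperationStatus.SUCCESS:
    raise RuntimeError("transfer failed or was interrupted")
```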
{b10_transfer-0.1.4.dist-info → b10_transfer-0.1.6.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: b10-transfer
-Version: 0.1.4
+Version: 0.1.6
 Summary: Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management
 License: MIT
 Keywords: pytorch,file-transfer,cache,machine-learning,inference
```
|
@@ -0,0 +1,13 @@
|
|
1
|
+
b10_transfer/__init__.py,sha256=1oxaP7np1iu1GbODcGmujd4K4T1bBZjOiVq5e1GW9JM,665
|
2
|
+
b10_transfer/archive.py,sha256=GKb0mi0-YeM7ch4FLAoOLHXw0T6LkRerYad2N2y9TYM,6400
|
3
|
+
b10_transfer/cache.py,sha256=B5fNCJkMIpUBwZuKMoQVbn0NeEuIrcAtYMk0gXkkOAM,13768
|
4
|
+
b10_transfer/cleanup.py,sha256=3RnqWNGMCcko5GQdq1Gr9VPpGzAF5J6x7xjIH9SNZ78,6226
|
5
|
+
b10_transfer/constants.py,sha256=iuLShDW6hInhyz2YTQ8CzBanqW4chCkQOAzPZkCtOoA,4322
|
6
|
+
b10_transfer/core.py,sha256=vsOcH0ve2GP-YBgHU58WgCEbx0h7dXn2R5sJErnQt8k,4437
|
7
|
+
b10_transfer/environment.py,sha256=aC0biEMQrtHk0ke_3epdcq1X9J5fPmPpBVt0fH7XF2Y,5625
|
8
|
+
b10_transfer/info.py,sha256=I3iOuImZ5r6DMJTDeBtVvzlSn6IuyPJbLJYUO_OF0ks,6299
|
9
|
+
b10_transfer/space_monitor.py,sha256=C_CKDH43bNsWdq60WStSZ3c_nQkWvScQmqU_SYHesew,10531
|
10
|
+
b10_transfer/utils.py,sha256=Stee0DFK-8MRRYNIocqaK64cJvfs4jPW3Mpx7zkWV6Y,11932
|
11
|
+
b10_transfer-0.1.6.dist-info/METADATA,sha256=vwu77uY1CnrqTEdLYChlk91M6odOmstA_4a8AzeaH5M,4108
|
12
|
+
b10_transfer-0.1.6.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
13
|
+
b10_transfer-0.1.6.dist-info/RECORD,,
|
b10_transfer-0.1.4.dist-info/RECORD
REMOVED

```diff
@@ -1,12 +0,0 @@
-b10_transfer/__init__.py,sha256=LKMroIusY1itfMVrJT07xLS1XVehwr54Wk5dhEl8MzY,641
-b10_transfer/archive.py,sha256=GKb0mi0-YeM7ch4FLAoOLHXw0T6LkRerYad2N2y9TYM,6400
-b10_transfer/cleanup.py,sha256=3RnqWNGMCcko5GQdq1Gr9VPpGzAF5J6x7xjIH9SNZ78,6226
-b10_transfer/constants.py,sha256=qCViKTyfHTLpiFVF2SwsbHp2IMz3kg3syxJfgRAq2dc,4446
-b10_transfer/core.py,sha256=XWLuwjHXuhh-6abZMAl2yuLB7R2deyUc6gGPn6-Yfkc,17006
-b10_transfer/environment.py,sha256=aC0biEMQrtHk0ke_3epdcq1X9J5fPmPpBVt0fH7XF2Y,5625
-b10_transfer/info.py,sha256=I3iOuImZ5r6DMJTDeBtVvzlSn6IuyPJbLJYUO_OF0ks,6299
-b10_transfer/space_monitor.py,sha256=C_CKDH43bNsWdq60WStSZ3c_nQkWvScQmqU_SYHesew,10531
-b10_transfer/utils.py,sha256=Stee0DFK-8MRRYNIocqaK64cJvfs4jPW3Mpx7zkWV6Y,11932
-b10_transfer-0.1.4.dist-info/METADATA,sha256=69s3ACBUFzGB7J97eVt4aCGSXrIpld1oV0Wj8Z0HLZ8,4108
-b10_transfer-0.1.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-b10_transfer-0.1.4.dist-info/RECORD,,
```
{b10_transfer-0.1.4.dist-info → b10_transfer-0.1.6.dist-info}/WHEEL
File without changes