b10-transfer 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
b10_transfer/__init__.py CHANGED
@@ -1,23 +1,24 @@
  """B10 Transfer - Lock-free PyTorch file transfer for Baseten."""

- from .core import load_compile_cache, save_compile_cache, clear_local_cache
+ from .cache import load_compile_cache, save_compile_cache, clear_local_cache
+ from .core import transfer
  from .utils import CacheError, CacheValidationError
  from .space_monitor import CacheOperationInterrupted
  from .info import get_cache_info, list_available_caches
- from .constants import SaveStatus, LoadStatus
+ from .constants import OperationStatus

  # Version
- __version__ = "0.1.4"
+ __version__ = "0.1.6"

  __all__ = [
      "CacheError",
      "CacheValidationError",
      "CacheOperationInterrupted",
-     "SaveStatus",
-     "LoadStatus",
+     "OperationStatus",
      "load_compile_cache",
      "save_compile_cache",
      "clear_local_cache",
+     "transfer",
      "get_cache_info",
      "list_available_caches",
  ]
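Net effect of the __init__.py changes: the per-operation SaveStatus/LoadStatus enums are replaced by a single OperationStatus, and transfer() joins the public API. A minimal sketch of how a caller might consume the 0.1.6 surface, assuming b10fs is mounted; warm_up_model() is a placeholder owned by the caller, not part of the package:

    from b10_transfer import (
        OperationStatus,
        load_compile_cache,
        save_compile_cache,
    )

    def warm_up_model() -> None:
        """Placeholder: run a compiled forward pass so torch writes its cache."""

    status = load_compile_cache()
    if status == OperationStatus.DOES_NOT_EXIST:
        warm_up_model()  # nothing published yet: compile, then share the result
        save_compile_cache()
    elif status == OperationStatus.ERROR:
        pass  # b10fs unavailable or disk space too low; fall back to cold compilation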
b10_transfer/cache.py ADDED
@@ -0,0 +1,361 @@
+ """Cache operations for PyTorch compilation artifacts.
+
+ This module provides functions for loading and saving PyTorch compilation cache
+ to/from b10fs shared storage using atomic operations and space monitoring.
+ """
+
+ import logging
+ import tempfile
+ from pathlib import Path
+
+ from .environment import get_cache_filename
+ from .cleanup import cooperative_cleanup_b10fs
+ from .utils import (
+     timed_fn,
+     critical_section_b10fs_file_lock,
+     safe_execute,
+     temp_file_cleanup,
+     cache_operation,
+     safe_unlink,
+ )
+ from .space_monitor import (
+     CacheSpaceMonitor,
+     CacheOperationInterrupted,
+     run_monitored_process,
+     worker_process,
+ )
+ from .constants import (
+     TORCH_CACHE_DIR,
+     B10FS_CACHE_DIR,
+     LOCAL_WORK_DIR,
+     MAX_CACHE_SIZE_MB,
+     REQUIRED_B10FS_SPACE_MB,
+     MIN_LOCAL_SPACE_MB,
+     CACHE_FILE_EXTENSION,
+     CACHE_LATEST_SUFFIX,
+     CACHE_INCOMPLETE_SUFFIX,
+     OperationStatus,
+ )
+ from .core import transfer
+
+ logger = logging.getLogger(__name__)
+
+
+ """
+ FIXME(SRAY):
+ Consider the case in @b10-transfer/ where a single pod finishes an inference request
+ and the client then calls save_compile_cache. While we are creating the local archive,
+ another inference call on the same pod could kick off and modify the torch cache.
+ How should this be handled? Maybe just accept that the cache will be recompiled/overwritten?
+ Otherwise you'd need application-level coordination to ensure the cache is not modified
+ while the archive is being created, which doesn't seem like a good idea in terms of adoption.
+
+ FIXME(SR):
+ More things to consider:
+ - [possible] What if b10fs dies *during* an op? Right now we only check b10fs availability at the start of the op. Add some constants instead of just False for load().
+ - [possible, and really bad if it happens] Potential memory exhaustion during compression if the cache is extremely large. A very rare edge case; higher compression levels also have high memory usage.
+ """
+
+
+ def _setup_cache_paths():
+     """Common setup for cache operations - returns paths and performs cleanup."""
+     # Cooperative cleanup of stale shared resources
+     cooperative_cleanup_b10fs()
+
+     b10fs_dir = Path(B10FS_CACHE_DIR)
+     torch_dir = Path(TORCH_CACHE_DIR)
+     work_dir = Path(LOCAL_WORK_DIR)
+
+     return b10fs_dir, torch_dir, work_dir
+
+
+ def _get_cache_file_paths(cache_filename: str, b10fs_dir: Path):
+     """Generate cache file paths for a given cache filename."""
+     final_file = (
+         b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
+     )
+     temp_file = (
+         b10fs_dir / f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
+     )
+     return final_file, temp_file
+
+
+ def _run_with_space_monitoring(
+     space_threshold_mb: float,
+     monitor_dir: Path,
+     operation_name: str,
+     worker_func,
+     worker_args: tuple,
+     cleanup_func=None,
+ ):
+     """Helper to run an operation with space monitoring."""
+     space_monitor = CacheSpaceMonitor(space_threshold_mb, monitor_dir)
+     space_monitor.start()
+
+     try:
+         logger.info(
+             f"Starting {operation_name}: {' -> '.join(str(arg) for arg in worker_args[:2])}"
+         )
+         run_monitored_process(
+             worker_func,
+             worker_args,
+             space_monitor,
+             operation_name,
+             cleanup_func=cleanup_func,
+         )
+     finally:
+         space_monitor.stop()
+
+
+ def _transfer_with_b10fs_lock(
+     source: str, dest: str, lock_type: str, cleanup_on_failure=True
+ ):
+     """Transfer a file with b10fs file locking and error handling."""
+
+     @critical_section_b10fs_file_lock(lock_type)
+     def _locked_transfer():
+         result = transfer(source, dest)
+         if result != OperationStatus.SUCCESS:
+             if cleanup_on_failure:
+                 safe_unlink(
+                     Path(dest), f"Failed to cleanup after failed transfer {dest}"
+                 )
+             raise Exception(f"Failed to transfer {source} -> {dest}")
+
+     _locked_transfer()
+
+
+ @timed_fn(logger=logger, name="Loading compile cache")
+ @safe_execute("Load failed", False)
+ def load_compile_cache() -> OperationStatus:
+     """Load PyTorch compilation cache from b10fs to the local torch cache directory.
+
+     This function implements a lock-free pattern to safely load cached PyTorch
+     compilation artifacts from the b10fs shared filesystem to the local torch
+     cache directory. It validates b10fs availability, checks for an existing cache,
+     and extracts the archive if needed.
+
+     The function monitors local disk space during both the copy from b10fs and
+     the extraction phase, interrupting operations if space falls below MIN_LOCAL_SPACE_MB.
+
+     Returns:
+         OperationStatus:
+             OperationStatus.SUCCESS if the cache was successfully loaded.
+             OperationStatus.SKIPPED if the local torch cache is already populated.
+             OperationStatus.ERROR if b10fs is unavailable, local disk space is insufficient, or loading failed.
+             OperationStatus.DOES_NOT_EXIST if no cache file was found.
+
+     Raises:
+         CacheValidationError: If b10fs is not enabled (caught and returns OperationStatus.ERROR).
+         CacheOperationInterrupted: If operations are interrupted due to insufficient
+             local disk space (caught and returns OperationStatus.ERROR).
+         Exception: Any other errors during loading (caught and returns OperationStatus.ERROR).
+     """
+     with cache_operation("Load"):
+         b10fs_dir, torch_dir, work_dir = _setup_cache_paths()
+
+         cache_filename = get_cache_filename()
+         final_file, _ = _get_cache_file_paths(cache_filename, b10fs_dir)
+         logger.debug(f"Looking for cache file: {final_file}")
+
+         if not final_file.exists():
+             logger.info("No cache file found in b10fs")
+             return OperationStatus.DOES_NOT_EXIST
+
+         # Skip if already loaded
+         if torch_dir.exists() and any(torch_dir.iterdir()):
+             logger.info("Torch cache already loaded, skipping extraction")
+             return OperationStatus.SKIPPED
+
+         # Create temp local copy
+         with tempfile.NamedTemporaryFile(
+             suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
+         ) as f:
+             temp_path = Path(f.name)
+             logger.debug(f"Created temporary file for cache: {temp_path}")
+
+         try:
+             with temp_file_cleanup(temp_path):
+                 # Phase 1: Copy from b10fs to local temp file
+                 _transfer_with_b10fs_lock(
+                     str(final_file),
+                     str(temp_path),
+                     "copy_out",
+                     cleanup_on_failure=False,
+                 )
+
+                 # Phase 2: Extract archive with space monitoring
+                 _run_with_space_monitoring(
+                     MIN_LOCAL_SPACE_MB,
+                     work_dir,
+                     "archive extraction",
+                     _cache_extract_worker,
+                     (str(temp_path), str(torch_dir)),
+                     cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
+                 )
+
+             logger.info("Cache load complete")
+             return OperationStatus.SUCCESS
+
+         except CacheOperationInterrupted as e:
+             logger.warning(f"Cache load interrupted: {e}")
+             return OperationStatus.ERROR
+
+
+ @timed_fn(logger=logger, name="Saving compile cache")
+ @safe_execute("Save failed", False)
+ def save_compile_cache() -> OperationStatus:
+     """Save the local PyTorch compilation cache to b10fs using an atomic journal pattern.
+
+     This function creates an archive of the local torch cache directory and
+     atomically saves it to b10fs using a journal pattern (write to a temp file,
+     then rename). This ensures concurrent saves don't corrupt each other.
+
+     The function validates b10fs availability, checks whether the cache already
+     exists (early exit), performs initial space checks using pre-calculated
+     requirements for concurrent saves, starts background space monitoring, then
+     runs compression and copy operations in separate worker processes that can
+     be terminated if disk space becomes insufficient, finally performing an
+     atomic rename to the final cache file.
+
+     Returns:
+         OperationStatus:
+             OperationStatus.SUCCESS if the cache was successfully saved.
+             OperationStatus.SKIPPED if there is no cache to save or the cache already exists in b10fs.
+             OperationStatus.ERROR if b10fs is unavailable, insufficient disk space caused an interruption, or saving failed.
+
+     Raises:
+         CacheValidationError: If b10fs is not enabled (caught and returns OperationStatus.ERROR).
+         CacheOperationInterrupted: If operations are interrupted due to insufficient
+             disk space (caught and returns OperationStatus.ERROR).
+         ArchiveError: If archive creation fails (caught and returns OperationStatus.ERROR).
+         Exception: Any other errors during saving (caught and returns OperationStatus.ERROR).
+     """
+     with cache_operation("Save"):
+         b10fs_dir, torch_dir, work_dir = _setup_cache_paths()
+
+         # Check if anything to save
+         if not torch_dir.exists() or not any(torch_dir.iterdir()):
+             logger.info("No torch cache to save")
+             return OperationStatus.SKIPPED
+
+         cache_filename = get_cache_filename()
+         final_file, temp_file = _get_cache_file_paths(cache_filename, b10fs_dir)
+
+         # Check for existing cache first (early exit)
+         if final_file.exists():
+             logger.info("Cache already exists in b10fs, skipping save")
+             return OperationStatus.SKIPPED
+
+         with tempfile.NamedTemporaryFile(
+             suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
+         ) as f:
+             local_temp = Path(f.name)
+             logger.debug(f"Created local temp file for archive: {local_temp}")
+
+         try:
+             with temp_file_cleanup(local_temp):
+                 # Phase 1: Compression with space monitoring
+                 _run_with_space_monitoring(
+                     REQUIRED_B10FS_SPACE_MB,
+                     b10fs_dir,
+                     "compression",
+                     _cache_compression_worker,
+                     (str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
+                 )
+
+                 # Phase 2: Copy to b10fs with locking
+                 _transfer_with_b10fs_lock(
+                     str(local_temp), str(temp_file), "copy_in", cleanup_on_failure=True
+                 )
+
+                 # Phase 3: Atomic rename (fast, don't interrupt)
+                 logger.info(
+                     f"Renaming temp file to final cache file: {temp_file} -> {final_file}"
+                 )
+                 temp_file.rename(final_file)
+
+             logger.info("Cache save complete")
+             return OperationStatus.SUCCESS
+
+         except CacheOperationInterrupted as e:
+             logger.warning(f"Cache save interrupted: {e}")
+             return OperationStatus.ERROR
+
+
+ @safe_execute("Clear failed", False)
+ def clear_local_cache() -> bool:
+     """Clear the local PyTorch compilation cache directory.
+
+     This function removes the entire local torch cache directory and all its
+     contents. This is useful for cleaning up disk space or forcing recompilation.
+
+     Returns:
+         bool: True if the cache was successfully cleared or didn't exist, False if
+             clearing failed due to permissions or other filesystem errors.
+
+     Raises:
+         Exception: Any errors during directory removal (caught and returns False).
+     """
+     torch_dir = Path(TORCH_CACHE_DIR)
+     if not torch_dir.exists():
+         return True
+     import shutil
+
+     shutil.rmtree(torch_dir)
+     return True
+
+
+ @worker_process("Compression was cancelled before starting")
+ def _cache_compression_worker(
+     torch_dir_str: str, local_temp_str: str, max_size_mb: int
+ ) -> None:
+     """Worker process that handles cache compression.
+
+     This function runs in a separate process to compress the torch cache directory
+     into an archive. It can be terminated externally if disk space becomes insufficient.
+
+     Args:
+         torch_dir_str: String path to the torch cache directory to compress.
+         local_temp_str: String path where the compressed archive will be created.
+         max_size_mb: Maximum allowed archive size in megabytes.
+     """
+     torch_dir = Path(torch_dir_str)
+     local_temp = Path(local_temp_str)
+
+     # Import here to avoid issues with multiprocessing
+     from .archive import create_archive
+
+     create_archive(torch_dir, local_temp, max_size_mb)
+
+
+ def _cleanup_torch_dir(torch_dir: Path) -> None:
+     """Helper function to safely clean up the torch directory during interrupted extraction."""
+     try:
+         if torch_dir.exists():
+             import shutil
+
+             shutil.rmtree(torch_dir)
+             logger.debug(f"Cleaned up torch directory: {torch_dir}")
+     except Exception as e:
+         logger.error(f"Failed to cleanup torch directory {torch_dir}: {e}")
+
+
+ @worker_process("Extraction was cancelled before starting")
+ def _cache_extract_worker(archive_path_str: str, dest_dir_str: str) -> None:
+     """Worker process that handles archive extraction.
+
+     This function runs in a separate process to extract the cache archive to
+     the torch cache directory. It can be terminated externally if local disk space becomes insufficient.
+
+     Args:
+         archive_path_str: String path to the archive file to extract.
+         dest_dir_str: String path to the directory where the archive will be extracted.
+     """
+     archive_path = Path(archive_path_str)
+     dest_dir = Path(dest_dir_str)
+
+     # Import here to avoid issues with multiprocessing
+     from .archive import extract_archive
+
+     extract_archive(archive_path, dest_dir)
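The save path above relies on the journal pattern described in the save_compile_cache docstring: the slow copy lands under an incomplete suffix, and only an atomic rename publishes the final name. A reduced sketch of that idea, with illustrative suffixes standing in for the package's CACHE_INCOMPLETE_SUFFIX/CACHE_LATEST_SUFFIX constants (whose values are not shown in this diff):

    import shutil
    from pathlib import Path

    def atomic_publish(local_archive: Path, shared_dir: Path, stem: str) -> None:
        final = shared_dir / f"{stem}.latest.tar"           # stands in for CACHE_LATEST_SUFFIX
        incomplete = shared_dir / f"{stem}.incomplete.tar"  # stands in for CACHE_INCOMPLETE_SUFFIX
        if final.exists():
            return  # another pod already published this cache; skip
        shutil.copy2(local_archive, incomplete)  # slow write happens under the temp name
        incomplete.rename(final)  # atomic on one filesystem: readers see all or nothing

Readers only ever look for the "latest" name, so a crash mid-copy leaves an incomplete file behind (presumably for cooperative_cleanup_b10fs() to reap) rather than a corrupt cache.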
b10_transfer/constants.py CHANGED
@@ -114,25 +114,10 @@ class WorkerStatus(Enum):
      CANCELLED = auto()


- class LoadStatus(Enum):
-     """Status values for cache loading operations."""
-
-     SUCCESS = auto()
-     ERROR = auto()
-     DOES_NOT_EXIST = auto()
-     SKIPPED = auto()
-
-
- class SaveStatus(Enum):
-     """Status values for cache saving operations."""
-
-     SUCCESS = auto()
-     ERROR = auto()
-     SKIPPED = auto()
-
-
- class TransferStatus(Enum):
-     """Status values for file transfer operations."""
+ class OperationStatus(Enum):
+     """Status values for all b10-transfer operations (load, save, transfer)."""

      SUCCESS = auto()
      ERROR = auto()
+     DOES_NOT_EXIST = auto()  # Used by load operations when cache file not found
+     SKIPPED = auto()  # Used by load/save operations when operation not needed
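With the three enums collapsed into one, a caller can branch on every operation's result with a single type. A sketch (Python 3.10+ for match; the case bodies are placeholders):

    from b10_transfer import OperationStatus, load_compile_cache

    match load_compile_cache():
        case OperationStatus.SUCCESS | OperationStatus.SKIPPED:
            pass  # cache is in place, freshly extracted or already present
        case OperationStatus.DOES_NOT_EXIST:
            pass  # nothing published yet for this environment
        case OperationStatus.ERROR:
            pass  # b10fs unavailable or local disk space ran out
        case _:
            pass  # safe_execute's fallback value (False) if an unexpected error escaped

The trade-off is visible in the comments on the new members: DOES_NOT_EXIST and SKIPPED are only meaningful for some operations, a distinction the old per-operation enums encoded in the type system.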
b10_transfer/core.py CHANGED
@@ -1,19 +1,16 @@
- import os
+ """Core file transfer operations for b10-transfer.
+
+ This module provides generic file transfer functionality with space monitoring
+ and error handling for b10fs operations.
+ """
+
  import logging
- import tempfile
  import shutil
  from pathlib import Path

- import time
-
- from .environment import get_cache_filename
- from .cleanup import cooperative_cleanup_b10fs
  from .utils import (
      timed_fn,
-     critical_section_b10fs_file_lock,
      safe_execute,
-     temp_file_cleanup,
-     cache_operation,
      safe_unlink,
  )
  from .space_monitor import (
@@ -24,241 +21,18 @@ from .space_monitor import (
      worker_process,
  )
  from .constants import (
-     TORCH_CACHE_DIR,
      B10FS_CACHE_DIR,
-     LOCAL_WORK_DIR,
-     MAX_CACHE_SIZE_MB,
      REQUIRED_B10FS_SPACE_MB,
      MIN_LOCAL_SPACE_MB,
-     CACHE_FILE_EXTENSION,
-     CACHE_LATEST_SUFFIX,
-     CACHE_INCOMPLETE_SUFFIX,
-     LoadStatus,
-     SaveStatus,
-     TransferStatus,
+     OperationStatus,
  )

  logger = logging.getLogger(__name__)


- @timed_fn(logger=logger, name="Loading compile cache")
- @safe_execute("Load failed", False)
- def load_compile_cache() -> LoadStatus:
-     """Load PyTorch compilation cache from b10fs to the local torch cache directory.
-
-     This function implements a lock-free pattern to safely load cached PyTorch
-     compilation artifacts from the b10fs shared filesystem to the local torch
-     cache directory. It validates b10fs availability, checks for an existing cache,
-     and extracts the archive if needed.
-
-     The function monitors local disk space during both the copy from b10fs and
-     the extraction phase, interrupting operations if space falls below MIN_LOCAL_SPACE_MB.
-
-     Returns:
-         LoadStatus:
-             LoadStatus.SUCCESS if the cache was successfully loaded.
-             LoadStatus.SKIPPED if the local torch cache is already populated.
-             LoadStatus.ERROR if b10fs is unavailable, local disk space is insufficient, or loading failed.
-             LoadStatus.DOES_NOT_EXIST if no cache file was found.
-
-     Raises:
-         CacheValidationError: If b10fs is not enabled (caught and returns LoadStatus.ERROR).
-         CacheOperationInterrupted: If operations are interrupted due to insufficient
-             local disk space (caught and returns LoadStatus.ERROR).
-         Exception: Any other errors during loading (caught and returns LoadStatus.ERROR).
-     """
-     with cache_operation("Load"):
-         # Cooperative cleanup of stale shared resources
-         cooperative_cleanup_b10fs()
-
-         b10fs_dir = Path(B10FS_CACHE_DIR)
-         torch_dir = Path(TORCH_CACHE_DIR)
-         work_dir = Path(LOCAL_WORK_DIR)
-
-         cache_filename = get_cache_filename()
-         cache_file = (
-             b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
-         )
-         logger.debug(f"Looking for cache file: {cache_file}")
-
-         if not cache_file.exists():
-             logger.info("No cache file found in b10fs")
-             return LoadStatus.DOES_NOT_EXIST
-
-         # Skip if already loaded
-         if torch_dir.exists() and any(torch_dir.iterdir()):
-             logger.info("Torch cache already loaded, skipping extraction")
-             return LoadStatus.SKIPPED
-
-         # Create temp local copy
-         with tempfile.NamedTemporaryFile(
-             suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
-         ) as f:
-             temp_path = Path(f.name)
-             logger.debug(f"Created temporary file for cache: {temp_path}")
-
-         try:
-             with temp_file_cleanup(temp_path):
-                 # Phase 1: Copy from b10fs to local temp file using transfer()
-                 @critical_section_b10fs_file_lock("copy_out")
-                 def _monitored_copy_from_b10fs():
-                     result = transfer(str(cache_file), str(temp_path))
-                     if result != TransferStatus.SUCCESS:
-                         raise Exception("Failed to copy cache file from b10fs")
-
-                 _monitored_copy_from_b10fs()
-
-                 # Phase 2: Extract archive in separate process with space monitoring
-                 space_monitor = CacheSpaceMonitor(MIN_LOCAL_SPACE_MB, work_dir)
-                 space_monitor.start()
-
-                 try:
-                     logger.info(f"Starting extraction: {temp_path} -> {torch_dir}")
-                     run_monitored_process(
-                         _cache_extract_worker,
-                         (str(temp_path), str(torch_dir)),
-                         space_monitor,
-                         "archive extraction",
-                         cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
-                     )
-                 finally:
-                     space_monitor.stop()
-
-             logger.info("Cache load complete")
-             return LoadStatus.SUCCESS
-
-         except CacheOperationInterrupted as e:
-             logger.warning(f"Cache load interrupted: {e}")
-             return LoadStatus.ERROR
-
-
- """
- FIXME(SRAY):
- Consider the case in @b10-transfer/ where a single pod finishes an inference request
- and the client then calls save_compile_cache. While we are creating the local archive,
- another inference call on the same pod could kick off and modify the torch cache.
- How should this be handled? Maybe just accept that the cache will be recompiled/overwritten?
- Otherwise you'd need application-level coordination to ensure the cache is not modified
- while the archive is being created, which doesn't seem like a good idea in terms of adoption.
-
- FIXME(SR):
- More things to consider:
- - [possible] What if b10fs dies *during* an op? Right now we only check b10fs availability at the start of the op. Add some constants instead of just False for load().
- - [possible, and really bad if it happens] Potential memory exhaustion during compression if the cache is extremely large. A very rare edge case; higher compression levels also have high memory usage.
- """
-
-
- @timed_fn(logger=logger, name="Saving compile cache")
- @safe_execute("Save failed", False)
- def save_compile_cache() -> SaveStatus:
-     """Save the local PyTorch compilation cache to b10fs using an atomic journal pattern.
-
-     This function creates an archive of the local torch cache directory and
-     atomically saves it to b10fs using a journal pattern (write to a temp file,
-     then rename). This ensures concurrent saves don't corrupt each other.
-
-     The function validates b10fs availability, checks whether the cache already
-     exists (early exit), performs initial space checks using pre-calculated
-     requirements for concurrent saves, starts background space monitoring, then
-     runs compression and copy operations in separate worker processes that can
-     be terminated if disk space becomes insufficient, finally performing an
-     atomic rename to the final cache file.
-
-     Returns:
-         SaveStatus:
-             SaveStatus.SUCCESS if the cache was successfully saved.
-             SaveStatus.SKIPPED if there is no cache to save or the cache already exists in b10fs.
-             SaveStatus.ERROR if b10fs is unavailable, insufficient disk space caused an interruption, or saving failed.
-
-     Raises:
-         CacheValidationError: If b10fs is not enabled (caught and returns SaveStatus.ERROR).
-         CacheOperationInterrupted: If operations are interrupted due to insufficient
-             disk space (caught and returns SaveStatus.ERROR).
-         ArchiveError: If archive creation fails (caught and returns SaveStatus.ERROR).
-         Exception: Any other errors during saving (caught and returns SaveStatus.ERROR).
-     """
-     with cache_operation("Save"):
-         # Cooperative cleanup of stale shared resources
-         cooperative_cleanup_b10fs()
-
-         b10fs_dir = Path(B10FS_CACHE_DIR)
-         torch_dir = Path(TORCH_CACHE_DIR)
-         work_dir = Path(LOCAL_WORK_DIR)
-
-         # Check if anything to save
-         if not torch_dir.exists() or not any(torch_dir.iterdir()):
-             logger.info("No torch cache to save")
-             return SaveStatus.SKIPPED
-
-         cache_filename = get_cache_filename()
-         final_file = (
-             b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
-         )
-
-         # Check for existing cache first (early exit)
-         if final_file.exists():
-             logger.info("Cache already exists in b10fs, skipping save")
-             return SaveStatus.SKIPPED
-
-         temp_file = (
-             b10fs_dir
-             / f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
-         )
-
-         with tempfile.NamedTemporaryFile(
-             suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
-         ) as f:
-             local_temp = Path(f.name)
-             logger.debug(f"Created local temp file for archive: {local_temp}")
-
-         try:
-             with temp_file_cleanup(local_temp):
-                 # Phase 1: Compression in separate process with space monitoring
-                 space_monitor = CacheSpaceMonitor(REQUIRED_B10FS_SPACE_MB, b10fs_dir)
-                 space_monitor.start()
-
-                 try:
-                     logger.info(f"Starting compression: {torch_dir} -> {local_temp}")
-                     run_monitored_process(
-                         _cache_compression_worker,
-                         (str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
-                         space_monitor,
-                         "compression",
-                     )
-                 finally:
-                     space_monitor.stop()
-
-                 # Phase 2: Copy to b10fs using transfer()
-                 @critical_section_b10fs_file_lock("copy_in")
-                 def _monitored_copy_to_b10fs():
-                     result = transfer(str(local_temp), str(temp_file))
-                     if result != TransferStatus.SUCCESS:
-                         # Clean up the temp file if transfer failed
-                         safe_unlink(
-                             temp_file,
-                             f"Failed to cleanup after failed copy {temp_file}",
-                         )
-                         raise Exception("Failed to copy cache file to b10fs")
-
-                 _monitored_copy_to_b10fs()
-
-                 # Phase 3: Atomic rename (fast, don't interrupt)
-                 logger.info(
-                     f"Renaming temp file to final cache file: {temp_file} -> {final_file}"
-                 )
-                 temp_file.rename(final_file)
-
-             logger.info("Cache save complete")
-             return SaveStatus.SUCCESS
-
-         except CacheOperationInterrupted as e:
-             logger.warning(f"Cache save interrupted: {e}")
-             return SaveStatus.ERROR
-
-
  @timed_fn(logger=logger, name="Transferring file")
- @safe_execute("Transfer failed", TransferStatus.ERROR)
- def transfer(source: str, dest: str) -> TransferStatus:
+ @safe_execute("Transfer failed", OperationStatus.ERROR)
+ def transfer(source: str, dest: str) -> OperationStatus:
      """Transfer a file from source to destination with space monitoring.

      This function copies a file from source to destination using the same
@@ -270,15 +44,15 @@ def transfer(source: str, dest: str) -> TransferStatus:
          dest: Path to the destination where the file will be copied.

      Returns:
-         TransferStatus:
-             TransferStatus.SUCCESS if transfer was successful
-             TransferStatus.ERROR if transfer failed due to insufficient disk space,
+         OperationStatus:
+             OperationStatus.SUCCESS if transfer was successful
+             OperationStatus.ERROR if transfer failed due to insufficient disk space,
              file not found, or other errors.

      Raises:
          CacheOperationInterrupted: If transfer interrupted due to insufficient
-             disk space (caught and returns TransferStatus.ERROR).
-         Exception: Any other errors during transfer (caught and returns TransferStatus.ERROR).
+             disk space (caught and returns OperationStatus.ERROR).
+         Exception: Any other errors during transfer (caught and returns OperationStatus.ERROR).
      """
      source_path = Path(source)
      dest_path = Path(dest)
@@ -286,7 +60,7 @@ def transfer(source: str, dest: str) -> TransferStatus:
      # Validate source file exists
      if not source_path.exists():
          logger.error(f"Source file does not exist: {source}")
-         return TransferStatus.ERROR
+         return OperationStatus.ERROR

      # Create destination directory if it doesn't exist
      dest_path.parent.mkdir(parents=True, exist_ok=True)
@@ -330,66 +104,22 @@ def transfer(source: str, dest: str) -> TransferStatus:
          )

          logger.info("File transfer complete")
-         return TransferStatus.SUCCESS
+         return OperationStatus.SUCCESS

      except CacheOperationInterrupted as e:
          logger.warning(f"File transfer interrupted: {e}")
-         return TransferStatus.ERROR
+         return OperationStatus.ERROR

      finally:
          space_monitor.stop()


- @safe_execute("Clear failed", False)
- def clear_local_cache() -> bool:
-     """Clear the local PyTorch compilation cache directory.
-
-     This function removes the entire local torch cache directory and all its
-     contents. This is useful for cleaning up disk space or forcing recompilation.
-
-     Returns:
-         bool: True if cache was successfully cleared or didn't exist, False if
-             clearing failed due to permissions or other filesystem errors.
-
-     Raises:
-         Exception: Any errors during directory removal (caught and returns False).
-     """
-     torch_dir = Path(TORCH_CACHE_DIR)
-     if not torch_dir.exists():
-         return True
-     shutil.rmtree(torch_dir)
-     return True
-
-
- @worker_process("Compression was cancelled before starting")
- def _cache_compression_worker(
-     torch_dir_str: str, local_temp_str: str, max_size_mb: int
- ) -> None:
-     """Worker process that handles cache compression.
-
-     This function runs in a separate process to compress the torch cache directory
-     into an archive. It can be terminated externally if disk space becomes insufficient.
-
-     Args:
-         torch_dir_str: String path to the torch cache directory to compress.
-         local_temp_str: String path where the compressed archive will be created.
-         max_size_mb: Maximum allowed archive size in megabytes.
-     """
-     torch_dir = Path(torch_dir_str)
-     local_temp = Path(local_temp_str)
-
-     # Import here to avoid issues with multiprocessing
-     from .archive import create_archive
-
-     create_archive(torch_dir, local_temp, max_size_mb)
-
-
  @worker_process("Copy was cancelled before starting")
  def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
-     """Worker process that handles file copy to b10fs.
+     """Worker process that handles file copy operations.

-     This function runs in a separate process to copy the compressed cache file
-     to the b10fs filesystem. It can be terminated externally if disk space becomes insufficient.
+     This function runs in a separate process to copy files between locations.
+     It can be terminated externally if disk space becomes insufficient.

      Args:
          source_path_str: String path to the source file to copy.
@@ -399,33 +129,3 @@ def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
      dest_path = Path(dest_path_str)

      shutil.copy2(source_path, dest_path)
-
-
- def _cleanup_torch_dir(torch_dir: Path) -> None:
-     """Helper function to safely cleanup torch directory during interrupted extraction."""
-     try:
-         if torch_dir.exists():
-             shutil.rmtree(torch_dir)
-             logger.debug(f"Cleaned up torch directory: {torch_dir}")
-     except Exception as e:
-         logger.error(f"Failed to cleanup torch directory {torch_dir}: {e}")
-
-
- @worker_process("Extraction was cancelled before starting")
- def _cache_extract_worker(archive_path_str: str, dest_dir_str: str) -> None:
-     """Worker process that handles archive extraction.
-
-     This function runs in a separate process to extract the cache archive to
-     the torch cache directory. It can be terminated externally if local disk space becomes insufficient.
-
-     Args:
-         archive_path_str: String path to the archive file to extract.
-         dest_dir_str: String path to the directory where archive will be extracted.
-     """
-     archive_path = Path(archive_path_str)
-     dest_dir = Path(dest_dir_str)
-
-     # Import here to avoid issues with multiprocessing
-     from .archive import extract_archive
-
-     extract_archive(archive_path, dest_dir)
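What remains of core.py after this refactor is the generic, space-monitored transfer() plus its copy worker. Based on its docstring, a direct use might look like the sketch below; the paths are placeholders, since actual b10fs mount points vary by deployment:

    from b10_transfer import OperationStatus, transfer

    result = transfer("/tmp/artifacts.tar", "/b10fs/shared/artifacts.tar")
    if result is not OperationStatus.SUCCESS:
        # A missing source, a failed copy worker, or disk space falling below the
        # monitored threshold all surface as OperationStatus.ERROR.
        raise RuntimeError("artifact transfer failed")

transfer() creates the destination's parent directories and runs the copy in a worker process it can terminate, which is why the new cache layer can wrap it with b10fs file locks without extra plumbing.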
b10_transfer-0.1.4.dist-info/METADATA → b10_transfer-0.1.6.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: b10-transfer
- Version: 0.1.4
+ Version: 0.1.6
  Summary: Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management
  License: MIT
  Keywords: pytorch,file-transfer,cache,machine-learning,inference
b10_transfer-0.1.6.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+ b10_transfer/__init__.py,sha256=1oxaP7np1iu1GbODcGmujd4K4T1bBZjOiVq5e1GW9JM,665
+ b10_transfer/archive.py,sha256=GKb0mi0-YeM7ch4FLAoOLHXw0T6LkRerYad2N2y9TYM,6400
+ b10_transfer/cache.py,sha256=B5fNCJkMIpUBwZuKMoQVbn0NeEuIrcAtYMk0gXkkOAM,13768
+ b10_transfer/cleanup.py,sha256=3RnqWNGMCcko5GQdq1Gr9VPpGzAF5J6x7xjIH9SNZ78,6226
+ b10_transfer/constants.py,sha256=iuLShDW6hInhyz2YTQ8CzBanqW4chCkQOAzPZkCtOoA,4322
+ b10_transfer/core.py,sha256=vsOcH0ve2GP-YBgHU58WgCEbx0h7dXn2R5sJErnQt8k,4437
+ b10_transfer/environment.py,sha256=aC0biEMQrtHk0ke_3epdcq1X9J5fPmPpBVt0fH7XF2Y,5625
+ b10_transfer/info.py,sha256=I3iOuImZ5r6DMJTDeBtVvzlSn6IuyPJbLJYUO_OF0ks,6299
+ b10_transfer/space_monitor.py,sha256=C_CKDH43bNsWdq60WStSZ3c_nQkWvScQmqU_SYHesew,10531
+ b10_transfer/utils.py,sha256=Stee0DFK-8MRRYNIocqaK64cJvfs4jPW3Mpx7zkWV6Y,11932
+ b10_transfer-0.1.6.dist-info/METADATA,sha256=vwu77uY1CnrqTEdLYChlk91M6odOmstA_4a8AzeaH5M,4108
+ b10_transfer-0.1.6.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+ b10_transfer-0.1.6.dist-info/RECORD,,
b10_transfer-0.1.4.dist-info/RECORD DELETED
@@ -1,12 +0,0 @@
- b10_transfer/__init__.py,sha256=LKMroIusY1itfMVrJT07xLS1XVehwr54Wk5dhEl8MzY,641
- b10_transfer/archive.py,sha256=GKb0mi0-YeM7ch4FLAoOLHXw0T6LkRerYad2N2y9TYM,6400
- b10_transfer/cleanup.py,sha256=3RnqWNGMCcko5GQdq1Gr9VPpGzAF5J6x7xjIH9SNZ78,6226
- b10_transfer/constants.py,sha256=qCViKTyfHTLpiFVF2SwsbHp2IMz3kg3syxJfgRAq2dc,4446
- b10_transfer/core.py,sha256=XWLuwjHXuhh-6abZMAl2yuLB7R2deyUc6gGPn6-Yfkc,17006
- b10_transfer/environment.py,sha256=aC0biEMQrtHk0ke_3epdcq1X9J5fPmPpBVt0fH7XF2Y,5625
- b10_transfer/info.py,sha256=I3iOuImZ5r6DMJTDeBtVvzlSn6IuyPJbLJYUO_OF0ks,6299
- b10_transfer/space_monitor.py,sha256=C_CKDH43bNsWdq60WStSZ3c_nQkWvScQmqU_SYHesew,10531
- b10_transfer/utils.py,sha256=Stee0DFK-8MRRYNIocqaK64cJvfs4jPW3Mpx7zkWV6Y,11932
- b10_transfer-0.1.4.dist-info/METADATA,sha256=69s3ACBUFzGB7J97eVt4aCGSXrIpld1oV0Wj8Z0HLZ8,4108
- b10_transfer-0.1.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
- b10_transfer-0.1.4.dist-info/RECORD,,