b10_transfer-0.1.4-py3-none-any.whl → b10_transfer-0.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
b10_transfer/__init__.py CHANGED
@@ -1,23 +1,23 @@
1
1
  """B10 Transfer - Lock-free PyTorch file transfer for Baseten."""
2
2
 
3
- from .core import load_compile_cache, save_compile_cache, clear_local_cache
3
+ from .core import load_compile_cache, save_compile_cache, clear_local_cache, transfer
4
4
  from .utils import CacheError, CacheValidationError
5
5
  from .space_monitor import CacheOperationInterrupted
6
6
  from .info import get_cache_info, list_available_caches
7
- from .constants import SaveStatus, LoadStatus
7
+ from .constants import OperationStatus
8
8
 
9
9
  # Version
10
- __version__ = "0.1.4"
10
+ __version__ = "0.1.5"
11
11
 
12
12
  __all__ = [
13
13
  "CacheError",
14
14
  "CacheValidationError",
15
15
  "CacheOperationInterrupted",
16
- "SaveStatus",
17
- "LoadStatus",
16
+ "OperationStatus",
18
17
  "load_compile_cache",
19
18
  "save_compile_cache",
20
19
  "clear_local_cache",
20
+ "transfer",
21
21
  "get_cache_info",
22
22
  "list_available_caches",
23
23
  ]
b10_transfer/constants.py CHANGED
@@ -114,25 +114,10 @@ class WorkerStatus(Enum):
114
114
  CANCELLED = auto()
115
115
 
116
116
 
117
- class LoadStatus(Enum):
118
- """Status values for cache loading operations."""
119
-
120
- SUCCESS = auto()
121
- ERROR = auto()
122
- DOES_NOT_EXIST = auto()
123
- SKIPPED = auto()
124
-
125
-
126
- class SaveStatus(Enum):
127
- """Status values for cache saving operations."""
128
-
129
- SUCCESS = auto()
130
- ERROR = auto()
131
- SKIPPED = auto()
132
-
133
-
134
- class TransferStatus(Enum):
135
- """Status values for file transfer operations."""
117
+ class OperationStatus(Enum):
118
+ """Status values for all b10-transfer operations (load, save, transfer)."""
136
119
 
137
120
  SUCCESS = auto()
138
121
  ERROR = auto()
122
+ DOES_NOT_EXIST = auto() # Used by load operations when cache file not found
123
+ SKIPPED = auto() # Used by load/save operations when operation not needed
b10_transfer/core.py CHANGED
@@ -33,17 +33,83 @@ from .constants import (
33
33
  CACHE_FILE_EXTENSION,
34
34
  CACHE_LATEST_SUFFIX,
35
35
  CACHE_INCOMPLETE_SUFFIX,
36
- LoadStatus,
37
- SaveStatus,
38
- TransferStatus,
36
+ OperationStatus,
39
37
  )
40
38
 
41
39
  logger = logging.getLogger(__name__)
42
40
 
43
41
 
42
+ def _setup_cache_paths():
43
+ """Common setup for cache operations - returns paths and performs cleanup."""
44
+ # Cooperative cleanup of stale shared resources
45
+ cooperative_cleanup_b10fs()
46
+
47
+ b10fs_dir = Path(B10FS_CACHE_DIR)
48
+ torch_dir = Path(TORCH_CACHE_DIR)
49
+ work_dir = Path(LOCAL_WORK_DIR)
50
+
51
+ return b10fs_dir, torch_dir, work_dir
52
+
53
+
54
+ def _get_cache_file_paths(cache_filename: str, b10fs_dir: Path):
55
+ """Generate cache file paths for a given cache filename."""
56
+ final_file = (
57
+ b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
58
+ )
59
+ temp_file = (
60
+ b10fs_dir / f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
61
+ )
62
+ return final_file, temp_file
63
+
64
+
65
+ def _run_with_space_monitoring(
66
+ space_threshold_mb: float,
67
+ monitor_dir: Path,
68
+ operation_name: str,
69
+ worker_func,
70
+ worker_args: tuple,
71
+ cleanup_func=None,
72
+ ):
73
+ """Helper to run an operation with space monitoring."""
74
+ space_monitor = CacheSpaceMonitor(space_threshold_mb, monitor_dir)
75
+ space_monitor.start()
76
+
77
+ try:
78
+ logger.info(
79
+ f"Starting {operation_name}: {' -> '.join(str(arg) for arg in worker_args[:2])}"
80
+ )
81
+ run_monitored_process(
82
+ worker_func,
83
+ worker_args,
84
+ space_monitor,
85
+ operation_name,
86
+ cleanup_func=cleanup_func,
87
+ )
88
+ finally:
89
+ space_monitor.stop()
90
+
91
+
92
+ def _transfer_with_b10fs_lock(
93
+ source: str, dest: str, lock_type: str, cleanup_on_failure=True
94
+ ):
95
+ """Transfer a file with b10fs file locking and error handling."""
96
+
97
+ @critical_section_b10fs_file_lock(lock_type)
98
+ def _locked_transfer():
99
+ result = transfer(source, dest)
100
+ if result != OperationStatus.SUCCESS:
101
+ if cleanup_on_failure:
102
+ safe_unlink(
103
+ Path(dest), f"Failed to cleanup after failed transfer {dest}"
104
+ )
105
+ raise Exception(f"Failed to transfer {source} -> {dest}")
106
+
107
+ _locked_transfer()
108
+
109
+
44
110
  @timed_fn(logger=logger, name="Loading compile cache")
45
111
  @safe_execute("Load failed", False)
46
- def load_compile_cache() -> LoadStatus:
112
+ def load_compile_cache() -> OperationStatus:
47
113
  """Load PyTorch compilation cache from b10fs to local torch cache directory.
48
114
 
49
115
  This function implements a lock-free pattern to safely load cached PyTorch
@@ -55,40 +121,33 @@ def load_compile_cache() -> LoadStatus:
55
121
  extraction phases, interrupting operations if space falls below MIN_LOCAL_SPACE_MB.
56
122
 
57
123
  Returns:
58
- LoadStatus:
59
- LoadStatus.SUCCESS if cache was successfully loaded
60
- LoadStatus.SKIPPED if already exists
61
- LoadStatus.ERROR if b10fs is unavailable, local disk space is insufficient, or loading failed.
62
- LoadStatus.DOES_NOT_EXIST if no cache file was found.
124
+ OperationStatus:
125
+ OperationStatus.SUCCESS if cache was successfully loaded
126
+ OperationStatus.SKIPPED if already exists
127
+ OperationStatus.ERROR if b10fs is unavailable, local disk space is insufficient, or loading failed.
128
+ OperationStatus.DOES_NOT_EXIST if no cache file was found.
63
129
 
64
130
  Raises:
65
- CacheValidationError: If b10fs is not enabled (caught and returns LoadStatus.ERROR).
131
+ CacheValidationError: If b10fs is not enabled (caught and returns OperationStatus.ERROR).
66
132
  CacheOperationInterrupted: If operations interrupted due to insufficient
67
- local disk space (caught and returns LoadStatus.ERROR).
68
- Exception: Any other errors during loading (caught and returns LoadStatus.ERROR).
133
+ local disk space (caught and returns OperationStatus.ERROR).
134
+ Exception: Any other errors during loading (caught and returns OperationStatus.ERROR).
69
135
  """
70
136
  with cache_operation("Load"):
71
- # Cooperative cleanup of stale shared resources
72
- cooperative_cleanup_b10fs()
73
-
74
- b10fs_dir = Path(B10FS_CACHE_DIR)
75
- torch_dir = Path(TORCH_CACHE_DIR)
76
- work_dir = Path(LOCAL_WORK_DIR)
137
+ b10fs_dir, torch_dir, work_dir = _setup_cache_paths()
77
138
 
78
139
  cache_filename = get_cache_filename()
79
- cache_file = (
80
- b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
81
- )
82
- logger.debug(f"Looking for cache file: {cache_file}")
140
+ final_file, _ = _get_cache_file_paths(cache_filename, b10fs_dir)
141
+ logger.debug(f"Looking for cache file: {final_file}")
83
142
 
84
- if not cache_file.exists():
143
+ if not final_file.exists():
85
144
  logger.info("No cache file found in b10fs")
86
- return LoadStatus.DOES_NOT_EXIST
145
+ return OperationStatus.DOES_NOT_EXIST
87
146
 
88
147
  # Skip if already loaded
89
148
  if torch_dir.exists() and any(torch_dir.iterdir()):
90
149
  logger.info("Torch cache already loaded, skipping extraction")
91
- return LoadStatus.SKIPPED
150
+ return OperationStatus.SKIPPED
92
151
 
93
152
  # Create temp local copy
94
153
  with tempfile.NamedTemporaryFile(
@@ -99,37 +158,30 @@ def load_compile_cache() -> LoadStatus:
99
158
 
100
159
  try:
101
160
  with temp_file_cleanup(temp_path):
102
- # Phase 1: Copy from b10fs to local temp file using transfer()
103
- @critical_section_b10fs_file_lock("copy_out")
104
- def _monitored_copy_from_b10fs():
105
- result = transfer(str(cache_file), str(temp_path))
106
- if result != TransferStatus.SUCCESS:
107
- raise Exception("Failed to copy cache file from b10fs")
108
-
109
- _monitored_copy_from_b10fs()
110
-
111
- # Phase 2: Extract archive in separate process with space monitoring
112
- space_monitor = CacheSpaceMonitor(MIN_LOCAL_SPACE_MB, work_dir)
113
- space_monitor.start()
114
-
115
- try:
116
- logger.info(f"Starting extraction: {temp_path} -> {torch_dir}")
117
- run_monitored_process(
118
- _cache_extract_worker,
119
- (str(temp_path), str(torch_dir)),
120
- space_monitor,
121
- "archive extraction",
122
- cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
123
- )
124
- finally:
125
- space_monitor.stop()
161
+ # Phase 1: Copy from b10fs to local temp file
162
+ _transfer_with_b10fs_lock(
163
+ str(final_file),
164
+ str(temp_path),
165
+ "copy_out",
166
+ cleanup_on_failure=False,
167
+ )
168
+
169
+ # Phase 2: Extract archive with space monitoring
170
+ _run_with_space_monitoring(
171
+ MIN_LOCAL_SPACE_MB,
172
+ work_dir,
173
+ "archive extraction",
174
+ _cache_extract_worker,
175
+ (str(temp_path), str(torch_dir)),
176
+ cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
177
+ )
126
178
 
127
179
  logger.info("Cache load complete")
128
- return LoadStatus.SUCCESS
180
+ return OperationStatus.SUCCESS
129
181
 
130
182
  except CacheOperationInterrupted as e:
131
183
  logger.warning(f"Cache load interrupted: {e}")
132
- return LoadStatus.ERROR
184
+ return OperationStatus.ERROR
133
185
 
134
186
 
135
187
  """
@@ -150,7 +202,7 @@ More things to consider:
150
202
 
151
203
  @timed_fn(logger=logger, name="Saving compile cache")
152
204
  @safe_execute("Save failed", False)
153
- def save_compile_cache() -> SaveStatus:
205
+ def save_compile_cache() -> OperationStatus:
154
206
  """Save local PyTorch compilation cache to b10fs using atomic journal pattern.
155
207
 
156
208
  This function creates an archive of the local torch cache directory and
@@ -164,46 +216,34 @@ def save_compile_cache() -> SaveStatus:
164
216
  space becomes insufficient, finally performing an atomic rename to the final cache file.
165
217
 
166
218
  Returns:
167
- SaveStatus:
168
- SaveStatus.SUCCESS if cache was successfully saved or already exists
169
- SaveStatus.ERROR if b10fs is unavailable, insufficient disk space caused interruption,
219
+ OperationStatus:
220
+ OperationStatus.SUCCESS if cache was successfully saved or already exists
221
+ OperationStatus.ERROR if b10fs is unavailable, insufficient disk space caused interruption,
170
222
  no cache exists to save, or saving failed.
171
- SaveStatus.SKIPPED if no cache exists to save or cache already exists in b10fs
223
+ OperationStatus.SKIPPED if no cache exists to save or cache already exists in b10fs
172
224
 
173
225
  Raises:
174
- CacheValidationError: If b10fs is not enabled (caught and returns SaveStatus.ERROR).
226
+ CacheValidationError: If b10fs is not enabled (caught and returns OperationStatus.ERROR).
175
227
  CacheOperationInterrupted: If operations interrupted due to insufficient
176
- disk space (caught and returns SaveStatus.ERROR).
177
- ArchiveError: If archive creation fails (caught and returns SaveStatus.ERROR).
178
- Exception: Any other errors during saving (caught and returns SaveStatus.ERROR).
228
+ disk space (caught and returns OperationStatus.ERROR).
229
+ ArchiveError: If archive creation fails (caught and returns OperationStatus.ERROR).
230
+ Exception: Any other errors during saving (caught and returns OperationStatus.ERROR).
179
231
  """
180
232
  with cache_operation("Save"):
181
- # Cooperative cleanup of stale shared resources
182
- cooperative_cleanup_b10fs()
183
-
184
- b10fs_dir = Path(B10FS_CACHE_DIR)
185
- torch_dir = Path(TORCH_CACHE_DIR)
186
- work_dir = Path(LOCAL_WORK_DIR)
233
+ b10fs_dir, torch_dir, work_dir = _setup_cache_paths()
187
234
 
188
235
  # Check if anything to save
189
236
  if not torch_dir.exists() or not any(torch_dir.iterdir()):
190
237
  logger.info("No torch cache to save")
191
- return SaveStatus.SKIPPED
238
+ return OperationStatus.SKIPPED
192
239
 
193
240
  cache_filename = get_cache_filename()
194
- final_file = (
195
- b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
196
- )
241
+ final_file, temp_file = _get_cache_file_paths(cache_filename, b10fs_dir)
197
242
 
198
243
  # Check for existing cache first (early exit)
199
244
  if final_file.exists():
200
245
  logger.info("Cache already exists in b10fs, skipping save")
201
- return SaveStatus.SKIPPED
202
-
203
- temp_file = (
204
- b10fs_dir
205
- / f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
206
- )
246
+ return OperationStatus.SKIPPED
207
247
 
208
248
  with tempfile.NamedTemporaryFile(
209
249
  suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
@@ -213,34 +253,19 @@ def save_compile_cache() -> SaveStatus:
213
253
 
214
254
  try:
215
255
  with temp_file_cleanup(local_temp):
216
- # Phase 1: Compression in separate process with space monitoring
217
- space_monitor = CacheSpaceMonitor(REQUIRED_B10FS_SPACE_MB, b10fs_dir)
218
- space_monitor.start()
219
-
220
- try:
221
- logger.info(f"Starting compression: {torch_dir} -> {local_temp}")
222
- run_monitored_process(
223
- _cache_compression_worker,
224
- (str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
225
- space_monitor,
226
- "compression",
227
- )
228
- finally:
229
- space_monitor.stop()
230
-
231
- # Phase 2: Copy to b10fs using transfer()
232
- @critical_section_b10fs_file_lock("copy_in")
233
- def _monitored_copy_to_b10fs():
234
- result = transfer(str(local_temp), str(temp_file))
235
- if result != TransferStatus.SUCCESS:
236
- # Clean up the temp file if transfer failed
237
- safe_unlink(
238
- temp_file,
239
- f"Failed to cleanup after failed copy {temp_file}",
240
- )
241
- raise Exception("Failed to copy cache file to b10fs")
242
-
243
- _monitored_copy_to_b10fs()
256
+ # Phase 1: Compression with space monitoring
257
+ _run_with_space_monitoring(
258
+ REQUIRED_B10FS_SPACE_MB,
259
+ b10fs_dir,
260
+ "compression",
261
+ _cache_compression_worker,
262
+ (str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
263
+ )
264
+
265
+ # Phase 2: Copy to b10fs with locking
266
+ _transfer_with_b10fs_lock(
267
+ str(local_temp), str(temp_file), "copy_in", cleanup_on_failure=True
268
+ )
244
269
 
245
270
  # Phase 3: Atomic rename (fast, don't interrupt)
246
271
  logger.info(
@@ -249,16 +274,16 @@ def save_compile_cache() -> SaveStatus:
249
274
  temp_file.rename(final_file)
250
275
 
251
276
  logger.info("Cache save complete")
252
- return SaveStatus.SUCCESS
277
+ return OperationStatus.SUCCESS
253
278
 
254
279
  except CacheOperationInterrupted as e:
255
280
  logger.warning(f"Cache save interrupted: {e}")
256
- return SaveStatus.ERROR
281
+ return OperationStatus.ERROR
257
282
 
258
283
 
259
284
  @timed_fn(logger=logger, name="Transferring file")
260
- @safe_execute("Transfer failed", TransferStatus.ERROR)
261
- def transfer(source: str, dest: str) -> TransferStatus:
285
+ @safe_execute("Transfer failed", OperationStatus.ERROR)
286
+ def transfer(source: str, dest: str) -> OperationStatus:
262
287
  """Transfer a file from source to destination with space monitoring.
263
288
 
264
289
  This function copies a file from source to destination using the same
@@ -270,15 +295,15 @@ def transfer(source: str, dest: str) -> TransferStatus:
270
295
  dest: Path to the destination where the file will be copied.
271
296
 
272
297
  Returns:
273
- TransferStatus:
274
- TransferStatus.SUCCESS if transfer was successful
275
- TransferStatus.ERROR if transfer failed due to insufficient disk space,
298
+ OperationStatus:
299
+ OperationStatus.SUCCESS if transfer was successful
300
+ OperationStatus.ERROR if transfer failed due to insufficient disk space,
276
301
  file not found, or other errors.
277
302
 
278
303
  Raises:
279
304
  CacheOperationInterrupted: If transfer interrupted due to insufficient
280
- disk space (caught and returns TransferStatus.ERROR).
281
- Exception: Any other errors during transfer (caught and returns TransferStatus.ERROR).
305
+ disk space (caught and returns OperationStatus.ERROR).
306
+ Exception: Any other errors during transfer (caught and returns OperationStatus.ERROR).
282
307
  """
283
308
  source_path = Path(source)
284
309
  dest_path = Path(dest)
@@ -286,7 +311,7 @@ def transfer(source: str, dest: str) -> TransferStatus:
286
311
  # Validate source file exists
287
312
  if not source_path.exists():
288
313
  logger.error(f"Source file does not exist: {source}")
289
- return TransferStatus.ERROR
314
+ return OperationStatus.ERROR
290
315
 
291
316
  # Create destination directory if it doesn't exist
292
317
  dest_path.parent.mkdir(parents=True, exist_ok=True)
@@ -330,11 +355,11 @@ def transfer(source: str, dest: str) -> TransferStatus:
330
355
  )
331
356
 
332
357
  logger.info("File transfer complete")
333
- return TransferStatus.SUCCESS
358
+ return OperationStatus.SUCCESS
334
359
 
335
360
  except CacheOperationInterrupted as e:
336
361
  logger.warning(f"File transfer interrupted: {e}")
337
- return TransferStatus.ERROR
362
+ return OperationStatus.ERROR
338
363
 
339
364
  finally:
340
365
  space_monitor.stop()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: b10-transfer
3
- Version: 0.1.4
3
+ Version: 0.1.5
4
4
  Summary: Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management
5
5
  License: MIT
6
6
  Keywords: pytorch,file-transfer,cache,machine-learning,inference
@@ -1,12 +1,12 @@
1
- b10_transfer/__init__.py,sha256=LKMroIusY1itfMVrJT07xLS1XVehwr54Wk5dhEl8MzY,641
1
+ b10_transfer/__init__.py,sha256=S7FJhF-BMPbXBqjQbXIj_Dl0r0-kSQD91l-wSD7COTI,647
2
2
  b10_transfer/archive.py,sha256=GKb0mi0-YeM7ch4FLAoOLHXw0T6LkRerYad2N2y9TYM,6400
3
3
  b10_transfer/cleanup.py,sha256=3RnqWNGMCcko5GQdq1Gr9VPpGzAF5J6x7xjIH9SNZ78,6226
4
- b10_transfer/constants.py,sha256=qCViKTyfHTLpiFVF2SwsbHp2IMz3kg3syxJfgRAq2dc,4446
5
- b10_transfer/core.py,sha256=XWLuwjHXuhh-6abZMAl2yuLB7R2deyUc6gGPn6-Yfkc,17006
4
+ b10_transfer/constants.py,sha256=iuLShDW6hInhyz2YTQ8CzBanqW4chCkQOAzPZkCtOoA,4322
5
+ b10_transfer/core.py,sha256=ScD-O9ot-ciVhX121fCchDu8_vM7izSFMxPtenU5RS0,17381
6
6
  b10_transfer/environment.py,sha256=aC0biEMQrtHk0ke_3epdcq1X9J5fPmPpBVt0fH7XF2Y,5625
7
7
  b10_transfer/info.py,sha256=I3iOuImZ5r6DMJTDeBtVvzlSn6IuyPJbLJYUO_OF0ks,6299
8
8
  b10_transfer/space_monitor.py,sha256=C_CKDH43bNsWdq60WStSZ3c_nQkWvScQmqU_SYHesew,10531
9
9
  b10_transfer/utils.py,sha256=Stee0DFK-8MRRYNIocqaK64cJvfs4jPW3Mpx7zkWV6Y,11932
10
- b10_transfer-0.1.4.dist-info/METADATA,sha256=69s3ACBUFzGB7J97eVt4aCGSXrIpld1oV0Wj8Z0HLZ8,4108
11
- b10_transfer-0.1.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
12
- b10_transfer-0.1.4.dist-info/RECORD,,
10
+ b10_transfer-0.1.5.dist-info/METADATA,sha256=lPyE9VvN1srmMZxn7tCOPEEkPHRZE8JsdAVIOeW2SPs,4108
11
+ b10_transfer-0.1.5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
12
+ b10_transfer-0.1.5.dist-info/RECORD,,