b10-transfer 0.1.2__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: b10-transfer
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management
5
5
  License: MIT
6
6
  Keywords: pytorch,file-transfer,cache,machine-learning,inference
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "b10-transfer"
7
- version = "0.1.2"
7
+ version = "0.1.4"
8
8
  description = "Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management"
9
9
  authors = ["Shounak Ray <shounak.noreply@baseten.co>", "Fred Liu <fred.liu.noreply@baseten.co>"]
10
10
  maintainers = ["Fred Liu <fred.liu.noreply@baseten.co>", "Shounak Ray <shounak.noreply@baseten.co>"]
@@ -7,7 +7,7 @@ from .info import get_cache_info, list_available_caches
7
7
  from .constants import SaveStatus, LoadStatus
8
8
 
9
9
  # Version
10
- __version__ = "0.1.2"
10
+ __version__ = "0.1.4"
11
11
 
12
12
  __all__ = [
13
13
  "CacheError",
@@ -129,3 +129,10 @@ class SaveStatus(Enum):
129
129
  SUCCESS = auto()
130
130
  ERROR = auto()
131
131
  SKIPPED = auto()
132
+
133
+
134
+ class TransferStatus(Enum):
135
+ """Status values for file transfer operations."""
136
+
137
+ SUCCESS = auto()
138
+ ERROR = auto()
@@ -35,6 +35,7 @@ from .constants import (
35
35
  CACHE_INCOMPLETE_SUFFIX,
36
36
  LoadStatus,
37
37
  SaveStatus,
38
+ TransferStatus,
38
39
  )
39
40
 
40
41
  logger = logging.getLogger(__name__)
@@ -89,18 +90,6 @@ def load_compile_cache() -> LoadStatus:
89
90
  logger.info("Torch cache already loaded, skipping extraction")
90
91
  return LoadStatus.SKIPPED
91
92
 
92
- # Initial disk space check for local operations
93
- check_sufficient_disk_space(
94
- work_dir, MIN_LOCAL_SPACE_MB, "cache load operations"
95
- )
96
- logger.debug(
97
- f"Initial space check passed: {MIN_LOCAL_SPACE_MB:.1f}MB required on local machine"
98
- )
99
-
100
- # Start background space monitoring for local disk
101
- space_monitor = CacheSpaceMonitor(MIN_LOCAL_SPACE_MB, work_dir)
102
- space_monitor.start()
103
-
104
93
  # Create temp local copy
105
94
  with tempfile.NamedTemporaryFile(
106
95
  suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
@@ -110,30 +99,30 @@ def load_compile_cache() -> LoadStatus:
110
99
 
111
100
  try:
112
101
  with temp_file_cleanup(temp_path):
113
- # Phase 1: Copy from b10fs to local temp file in separate process
102
+ # Phase 1: Copy from b10fs to local temp file using transfer()
114
103
  @critical_section_b10fs_file_lock("copy_out")
115
104
  def _monitored_copy_from_b10fs():
116
- logger.info(
117
- f"Starting copy from b10fs: {cache_file} -> {temp_path}"
118
- )
119
- run_monitored_process(
120
- _cache_copy_from_b10fs_worker,
121
- (str(cache_file), str(temp_path)),
122
- space_monitor,
123
- "b10fs to local copy",
124
- )
105
+ result = transfer(str(cache_file), str(temp_path))
106
+ if result != TransferStatus.SUCCESS:
107
+ raise Exception("Failed to copy cache file from b10fs")
125
108
 
126
109
  _monitored_copy_from_b10fs()
127
110
 
128
- # Phase 2: Extract archive in separate process
129
- logger.info(f"Starting extraction: {temp_path} -> {torch_dir}")
130
- run_monitored_process(
131
- _cache_extract_worker,
132
- (str(temp_path), str(torch_dir)),
133
- space_monitor,
134
- "archive extraction",
135
- cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
136
- )
111
+ # Phase 2: Extract archive in separate process with space monitoring
112
+ space_monitor = CacheSpaceMonitor(MIN_LOCAL_SPACE_MB, work_dir)
113
+ space_monitor.start()
114
+
115
+ try:
116
+ logger.info(f"Starting extraction: {temp_path} -> {torch_dir}")
117
+ run_monitored_process(
118
+ _cache_extract_worker,
119
+ (str(temp_path), str(torch_dir)),
120
+ space_monitor,
121
+ "archive extraction",
122
+ cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
123
+ )
124
+ finally:
125
+ space_monitor.stop()
137
126
 
138
127
  logger.info("Cache load complete")
139
128
  return LoadStatus.SUCCESS
@@ -142,9 +131,6 @@ def load_compile_cache() -> LoadStatus:
142
131
  logger.warning(f"Cache load interrupted: {e}")
143
132
  return LoadStatus.ERROR
144
133
 
145
- finally:
146
- space_monitor.stop()
147
-
148
134
 
149
135
  """
150
136
  FIXME(SRAY):
@@ -214,26 +200,11 @@ def save_compile_cache() -> SaveStatus:
214
200
  logger.info("Cache already exists in b10fs, skipping save")
215
201
  return SaveStatus.SKIPPED
216
202
 
217
- # Initial disk space checks using calculated space requirements
218
- check_sufficient_disk_space(
219
- work_dir, MAX_CACHE_SIZE_MB, "local temp file creation"
220
- )
221
- check_sufficient_disk_space(
222
- b10fs_dir, REQUIRED_B10FS_SPACE_MB, "cache save to b10fs"
223
- )
224
- logger.debug(
225
- f"Initial space checks passed: {MAX_CACHE_SIZE_MB:.1f}MB local, {REQUIRED_B10FS_SPACE_MB:.1f}MB b10fs"
226
- )
227
-
228
203
  temp_file = (
229
204
  b10fs_dir
230
205
  / f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
231
206
  )
232
207
 
233
- # Start background space monitoring
234
- space_monitor = CacheSpaceMonitor(REQUIRED_B10FS_SPACE_MB, b10fs_dir)
235
- space_monitor.start()
236
-
237
208
  with tempfile.NamedTemporaryFile(
238
209
  suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
239
210
  ) as f:
@@ -242,30 +213,32 @@ def save_compile_cache() -> SaveStatus:
242
213
 
243
214
  try:
244
215
  with temp_file_cleanup(local_temp):
245
- # Phase 1: Compression in separate process
246
- logger.info(f"Starting compression: {torch_dir} -> {local_temp}")
247
- run_monitored_process(
248
- _cache_compression_worker,
249
- (str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
250
- space_monitor,
251
- "compression",
252
- )
253
-
254
- b10fs_dir.mkdir(parents=True, exist_ok=True)
216
+ # Phase 1: Compression in separate process with space monitoring
217
+ space_monitor = CacheSpaceMonitor(REQUIRED_B10FS_SPACE_MB, b10fs_dir)
218
+ space_monitor.start()
255
219
 
256
- # Phase 2: Copy to b10fs in separate process
257
- @critical_section_b10fs_file_lock("copy_in")
258
- def _monitored_copy_to_b10fs():
259
- logger.info(f"Starting copy to b10fs: {local_temp} -> {temp_file}")
220
+ try:
221
+ logger.info(f"Starting compression: {torch_dir} -> {local_temp}")
260
222
  run_monitored_process(
261
- _cache_copy_worker,
262
- (str(local_temp), str(temp_file)),
223
+ _cache_compression_worker,
224
+ (str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
263
225
  space_monitor,
264
- "b10fs copy",
265
- cleanup_func=lambda: safe_unlink(
266
- temp_file, f"Failed to cleanup interrupted copy {temp_file}"
267
- ),
226
+ "compression",
268
227
  )
228
+ finally:
229
+ space_monitor.stop()
230
+
231
+ # Phase 2: Copy to b10fs using transfer()
232
+ @critical_section_b10fs_file_lock("copy_in")
233
+ def _monitored_copy_to_b10fs():
234
+ result = transfer(str(local_temp), str(temp_file))
235
+ if result != TransferStatus.SUCCESS:
236
+ # Clean up the temp file if transfer failed
237
+ safe_unlink(
238
+ temp_file,
239
+ f"Failed to cleanup after failed copy {temp_file}",
240
+ )
241
+ raise Exception("Failed to copy cache file to b10fs")
269
242
 
270
243
  _monitored_copy_to_b10fs()
271
244
 
@@ -282,8 +255,89 @@ def save_compile_cache() -> SaveStatus:
282
255
  logger.warning(f"Cache save interrupted: {e}")
283
256
  return SaveStatus.ERROR
284
257
 
285
- finally:
286
- space_monitor.stop()
258
+
259
+ @timed_fn(logger=logger, name="Transferring file")
260
+ @safe_execute("Transfer failed", TransferStatus.ERROR)
261
+ def transfer(source: str, dest: str) -> TransferStatus:
262
+ """Transfer a file from source to destination with space monitoring.
263
+
264
+ This function copies a file from source to destination using the same
265
+ monitored process approach as the cache operations. It monitors disk space
266
+ at the destination and can interrupt the transfer if space becomes insufficient.
267
+
268
+ Args:
269
+ source: Path to the source file to copy.
270
+ dest: Path to the destination where the file will be copied.
271
+
272
+ Returns:
273
+ TransferStatus:
274
+ TransferStatus.SUCCESS if transfer was successful
275
+ TransferStatus.ERROR if transfer failed due to insufficient disk space,
276
+ file not found, or other errors.
277
+
278
+ Raises:
279
+ CacheOperationInterrupted: If transfer interrupted due to insufficient
280
+ disk space (caught and returns TransferStatus.ERROR).
281
+ Exception: Any other errors during transfer (caught and returns TransferStatus.ERROR).
282
+ """
283
+ source_path = Path(source)
284
+ dest_path = Path(dest)
285
+
286
+ # Validate source file exists
287
+ if not source_path.exists():
288
+ logger.error(f"Source file does not exist: {source}")
289
+ return TransferStatus.ERROR
290
+
291
+ # Create destination directory if it doesn't exist
292
+ dest_path.parent.mkdir(parents=True, exist_ok=True)
293
+
294
+ # Determine appropriate space threshold based on destination directory
295
+ dest_dir = dest_path.parent
296
+ if str(dest_dir).startswith(B10FS_CACHE_DIR):
297
+ # Transferring to b10fs - use b10fs space requirements
298
+ space_threshold_mb = REQUIRED_B10FS_SPACE_MB
299
+ logger.debug(
300
+ f"Transfer to b10fs detected, using {space_threshold_mb:.1f}MB threshold"
301
+ )
302
+ else:
303
+ # Transferring to local directory - use local space requirements
304
+ space_threshold_mb = MIN_LOCAL_SPACE_MB
305
+ logger.debug(
306
+ f"Transfer to local directory detected, using {space_threshold_mb:.1f}MB threshold"
307
+ )
308
+
309
+ # Initial disk space check
310
+ check_sufficient_disk_space(dest_dir, space_threshold_mb, "file transfer")
311
+ logger.debug(
312
+ f"Initial space check passed: {space_threshold_mb:.1f}MB required at destination"
313
+ )
314
+
315
+ # Start background space monitoring for destination directory
316
+ space_monitor = CacheSpaceMonitor(space_threshold_mb, dest_dir)
317
+ space_monitor.start()
318
+
319
+ try:
320
+ # Run monitored copy process
321
+ logger.info(f"Starting transfer: {source} -> {dest}")
322
+ run_monitored_process(
323
+ _cache_copy_worker,
324
+ (str(source_path), str(dest_path)),
325
+ space_monitor,
326
+ "file transfer",
327
+ cleanup_func=lambda: safe_unlink(
328
+ dest_path, f"Failed to cleanup interrupted transfer {dest_path}"
329
+ ),
330
+ )
331
+
332
+ logger.info("File transfer complete")
333
+ return TransferStatus.SUCCESS
334
+
335
+ except CacheOperationInterrupted as e:
336
+ logger.warning(f"File transfer interrupted: {e}")
337
+ return TransferStatus.ERROR
338
+
339
+ finally:
340
+ space_monitor.stop()
287
341
 
288
342
 
289
343
  @safe_execute("Clear failed", False)
@@ -347,23 +401,6 @@ def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
347
401
  shutil.copy2(source_path, dest_path)
348
402
 
349
403
 
350
- @worker_process("Copy from b10fs was cancelled before starting")
351
- def _cache_copy_from_b10fs_worker(source_path_str: str, dest_path_str: str) -> None:
352
- """Worker process that handles file copy from b10fs to local machine.
353
-
354
- This function runs in a separate process to copy the cache file from b10fs
355
- to the local filesystem. It can be terminated externally if local disk space becomes insufficient.
356
-
357
- Args:
358
- source_path_str: String path to the source file in b10fs to copy.
359
- dest_path_str: String path where the file will be copied locally.
360
- """
361
- source_path = Path(source_path_str)
362
- dest_path = Path(dest_path_str)
363
-
364
- shutil.copy2(source_path, dest_path)
365
-
366
-
367
404
  def _cleanup_torch_dir(torch_dir: Path) -> None:
368
405
  """Helper function to safely cleanup torch directory during interrupted extraction."""
369
406
  try:
File without changes