b10-transfer 0.1.5__tar.gz → 0.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: b10-transfer
- Version: 0.1.5
+ Version: 0.1.7
  Summary: Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management
  License: MIT
  Keywords: pytorch,file-transfer,cache,machine-learning,inference
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
  [tool.poetry]
  name = "b10-transfer"
- version = "0.1.5"
+ version = "0.1.7"
  description = "Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management"
  authors = ["Shounak Ray <shounak.noreply@baseten.co>", "Fred Liu <fred.liu.noreply@baseten.co>"]
  maintainers = ["Fred Liu <fred.liu.noreply@baseten.co>", "Shounak Ray <shounak.noreply@baseten.co>"]
@@ -1,13 +1,14 @@
  """B10 Transfer - Lock-free PyTorch file transfer for Baseten."""
 
- from .core import load_compile_cache, save_compile_cache, clear_local_cache, transfer
+ from .cache import load_compile_cache, save_compile_cache, clear_local_cache
+ from .core import transfer
  from .utils import CacheError, CacheValidationError
  from .space_monitor import CacheOperationInterrupted
  from .info import get_cache_info, list_available_caches
  from .constants import OperationStatus
 
  # Version
- __version__ = "0.1.5"
+ __version__ = "0.1.7"
 
  __all__ = [
      "CacheError",
@@ -1,11 +1,13 @@
- import os
+ """Cache operations for PyTorch compilation artifacts.
+
+ This module provides functions for loading and saving PyTorch compilation cache
+ to/from b10fs shared storage using atomic operations and space monitoring.
+ """
+
  import logging
  import tempfile
- import shutil
  from pathlib import Path
 
- import time
-
  from .environment import get_cache_filename
  from .cleanup import cooperative_cleanup_b10fs
  from .utils import (
@@ -17,7 +19,6 @@ from .utils import (
      safe_unlink,
  )
  from .space_monitor import (
-     check_sufficient_disk_space,
      CacheSpaceMonitor,
      CacheOperationInterrupted,
      run_monitored_process,
@@ -35,10 +36,27 @@ from .constants import (
      CACHE_INCOMPLETE_SUFFIX,
      OperationStatus,
  )
+ from .core import transfer
 
  logger = logging.getLogger(__name__)
 
 
+ """
+ FIXME(SRAY):
+ What about the case in @b10-transfer/ where a single pod finishes an inference request,
+ and then the client calls save_compile_cache. And while we are creating the local archive,
+ another inference call on the same pod is kicked off, which then modifies the torch cache.
+ How would this be handled? Maybe just accept that the cache will be recompiled/overwritten?
+ Otherwise you'd need application level coordination to ensure that the cache is not modified
+ while we are creating the archive, but this doesn't really seem like a good idea in terms of adoption.
+
+ FIXME(SR):
+ More things to consider:
+ - [possible] What if b10fs dies *during* an op? right now we check for b10fs availability in the beginning of the op... Add some constants instead of just False for load().
+ - [possible, and really bad if it happens] potential memory exhaustion during compression if the cache is super super large. very very edge case. higher compression levels also have high memory usage.
+ """
+
+
  def _setup_cache_paths():
      """Common setup for cache operations - returns paths and performs cleanup."""
      # Cooperative cleanup of stale shared resources
@@ -184,22 +202,6 @@ def load_compile_cache() -> OperationStatus:
          return OperationStatus.ERROR
 
 
- """
- FIXME(SRAY):
- What about the case in @b10-transfer/ where a single pod finishes an inference request,
- and then the client calls save_compile_cache. And while we are creating the local archive,
- another inference call on the same pod is kicked off, which then modifies the torch cache.
- How would this be handled? Maybe just accept that the cache will be recompiled/overwritten?
- Otherwise you'd need application level coordination to ensure that the cache is not modified
- while we are creating the archive, but this doesn't really seem like a good idea in terms of adoption.
-
- FIXME(SR):
- More things to consider:
- - [possible] What if b10fs dies *during* an op? right now we check for b10fs availability in the beginning of the op... Add some constants instead of just False for load().
- - [possible, and really bad if it happens] potential memory exhaustion during compression if the cache is super super large. very very edge case. higher compression levels also have high memory usage.
- """
-
-
  @timed_fn(logger=logger, name="Saving compile cache")
  @safe_execute("Save failed", False)
  def save_compile_cache() -> OperationStatus:
@@ -281,90 +283,6 @@ def save_compile_cache() -> OperationStatus:
          return OperationStatus.ERROR
 
 
- @timed_fn(logger=logger, name="Transferring file")
- @safe_execute("Transfer failed", OperationStatus.ERROR)
- def transfer(source: str, dest: str) -> OperationStatus:
-     """Transfer a file from source to destination with space monitoring.
-
-     This function copies a file from source to destination using the same
-     monitored process approach as the cache operations. It monitors disk space
-     at the destination and can interrupt the transfer if space becomes insufficient.
-
-     Args:
-         source: Path to the source file to copy.
-         dest: Path to the destination where the file will be copied.
-
-     Returns:
-         OperationStatus:
-             OperationStatus.SUCCESS if transfer was successful
-             OperationStatus.ERROR if transfer failed due to insufficient disk space,
-             file not found, or other errors.
-
-     Raises:
-         CacheOperationInterrupted: If transfer interrupted due to insufficient
-             disk space (caught and returns OperationStatus.ERROR).
-         Exception: Any other errors during transfer (caught and returns OperationStatus.ERROR).
-     """
-     source_path = Path(source)
-     dest_path = Path(dest)
-
-     # Validate source file exists
-     if not source_path.exists():
-         logger.error(f"Source file does not exist: {source}")
-         return OperationStatus.ERROR
-
-     # Create destination directory if it doesn't exist
-     dest_path.parent.mkdir(parents=True, exist_ok=True)
-
-     # Determine appropriate space threshold based on destination directory
-     dest_dir = dest_path.parent
-     if str(dest_dir).startswith(B10FS_CACHE_DIR):
-         # Transferring to b10fs - use b10fs space requirements
-         space_threshold_mb = REQUIRED_B10FS_SPACE_MB
-         logger.debug(
-             f"Transfer to b10fs detected, using {space_threshold_mb:.1f}MB threshold"
-         )
-     else:
-         # Transferring to local directory - use local space requirements
-         space_threshold_mb = MIN_LOCAL_SPACE_MB
-         logger.debug(
-             f"Transfer to local directory detected, using {space_threshold_mb:.1f}MB threshold"
-         )
-
-     # Initial disk space check
-     check_sufficient_disk_space(dest_dir, space_threshold_mb, "file transfer")
-     logger.debug(
-         f"Initial space check passed: {space_threshold_mb:.1f}MB required at destination"
-     )
-
-     # Start background space monitoring for destination directory
-     space_monitor = CacheSpaceMonitor(space_threshold_mb, dest_dir)
-     space_monitor.start()
-
-     try:
-         # Run monitored copy process
-         logger.info(f"Starting transfer: {source} -> {dest}")
-         run_monitored_process(
-             _cache_copy_worker,
-             (str(source_path), str(dest_path)),
-             space_monitor,
-             "file transfer",
-             cleanup_func=lambda: safe_unlink(
-                 dest_path, f"Failed to cleanup interrupted transfer {dest_path}"
-             ),
-         )
-
-         logger.info("File transfer complete")
-         return OperationStatus.SUCCESS
-
-     except CacheOperationInterrupted as e:
-         logger.warning(f"File transfer interrupted: {e}")
-         return OperationStatus.ERROR
-
-     finally:
-         space_monitor.stop()
-
-
  @safe_execute("Clear failed", False)
  def clear_local_cache() -> bool:
      """Clear the local PyTorch compilation cache directory.
@@ -382,6 +300,8 @@ def clear_local_cache() -> bool:
      torch_dir = Path(TORCH_CACHE_DIR)
      if not torch_dir.exists():
          return True
+     import shutil
+
      shutil.rmtree(torch_dir)
      return True
 
@@ -409,27 +329,12 @@ def _cache_compression_worker(
      create_archive(torch_dir, local_temp, max_size_mb)
 
 
- @worker_process("Copy was cancelled before starting")
- def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
-     """Worker process that handles file copy to b10fs.
-
-     This function runs in a separate process to copy the compressed cache file
-     to the b10fs filesystem. It can be terminated externally if disk space becomes insufficient.
-
-     Args:
-         source_path_str: String path to the source file to copy.
-         dest_path_str: String path where the file will be copied.
-     """
-     source_path = Path(source_path_str)
-     dest_path = Path(dest_path_str)
-
-     shutil.copy2(source_path, dest_path)
-
-
  def _cleanup_torch_dir(torch_dir: Path) -> None:
      """Helper function to safely cleanup torch directory during interrupted extraction."""
      try:
          if torch_dir.exists():
+             import shutil
+
              shutil.rmtree(torch_dir)
              logger.debug(f"Cleaned up torch directory: {torch_dir}")
      except Exception as e:
@@ -0,0 +1,131 @@
+ """Core file transfer operations for b10-transfer.
+
+ This module provides generic file transfer functionality with space monitoring
+ and error handling for b10fs operations.
+ """
+
+ import logging
+ import shutil
+ from pathlib import Path
+
+ from .utils import (
+     timed_fn,
+     safe_execute,
+     safe_unlink,
+ )
+ from .space_monitor import (
+     check_sufficient_disk_space,
+     CacheSpaceMonitor,
+     CacheOperationInterrupted,
+     run_monitored_process,
+     worker_process,
+ )
+ from .constants import (
+     B10FS_CACHE_DIR,
+     REQUIRED_B10FS_SPACE_MB,
+     MIN_LOCAL_SPACE_MB,
+     OperationStatus,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ @timed_fn(logger=logger, name="Transferring file")
+ @safe_execute("Transfer failed", OperationStatus.ERROR)
+ def transfer(source: str, dest: str) -> OperationStatus:
+     """Transfer a file from source to destination with space monitoring.
+
+     This function copies a file from source to destination using the same
+     monitored process approach as the cache operations. It monitors disk space
+     at the destination and can interrupt the transfer if space becomes insufficient.
+
+     Args:
+         source: Path to the source file to copy.
+         dest: Path to the destination where the file will be copied.
+
+     Returns:
+         OperationStatus:
+             OperationStatus.SUCCESS if transfer was successful
+             OperationStatus.ERROR if transfer failed due to insufficient disk space,
+             file not found, or other errors.
+
+     Raises:
+         CacheOperationInterrupted: If transfer interrupted due to insufficient
+             disk space (caught and returns OperationStatus.ERROR).
+         Exception: Any other errors during transfer (caught and returns OperationStatus.ERROR).
+     """
+     source_path = Path(source)
+     dest_path = Path(dest)
+
+     # Validate source file exists
+     if not source_path.exists():
+         logger.error(f"Source file does not exist: {source}")
+         return OperationStatus.ERROR
+
+     # Create destination directory if it doesn't exist
+     dest_path.parent.mkdir(parents=True, exist_ok=True)
+
+     # Determine appropriate space threshold based on destination directory
+     dest_dir = dest_path.parent
+     if str(dest_dir).startswith(B10FS_CACHE_DIR):
+         # Transferring to b10fs - use b10fs space requirements
+         space_threshold_mb = REQUIRED_B10FS_SPACE_MB
+         logger.debug(
+             f"Transfer to b10fs detected, using {space_threshold_mb:.1f}MB threshold"
+         )
+     else:
+         # Transferring to local directory - use local space requirements
+         space_threshold_mb = MIN_LOCAL_SPACE_MB
+         logger.debug(
+             f"Transfer to local directory detected, using {space_threshold_mb:.1f}MB threshold"
+         )
+
+     # Initial disk space check
+     check_sufficient_disk_space(dest_dir, space_threshold_mb, "file transfer")
+     logger.debug(
+         f"Initial space check passed: {space_threshold_mb:.1f}MB required at destination"
+     )
+
+     # Start background space monitoring for destination directory
+     space_monitor = CacheSpaceMonitor(space_threshold_mb, dest_dir)
+     space_monitor.start()
+
+     try:
+         # Run monitored copy process
+         logger.info(f"Starting transfer: {source} -> {dest}")
+         run_monitored_process(
+             _cache_copy_worker,
+             (str(source_path), str(dest_path)),
+             space_monitor,
+             "file transfer",
+             cleanup_func=lambda: safe_unlink(
+                 dest_path, f"Failed to cleanup interrupted transfer {dest_path}"
+             ),
+         )
+
+         logger.info("File transfer complete")
+         return OperationStatus.SUCCESS
+
+     except CacheOperationInterrupted as e:
+         logger.warning(f"File transfer interrupted: {e}")
+         return OperationStatus.ERROR
+
+     finally:
+         space_monitor.stop()
+
+
+ @worker_process("Copy was cancelled before starting")
+ def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
+     """Worker process that handles file copy operations.
+
+     This function runs in a separate process to copy files between locations.
+     It can be terminated externally if disk space becomes insufficient.
+
+     Args:
+         source_path_str: String path to the source file to copy.
+         dest_path_str: String path where the file will be copied.
+     """
+     source_path = Path(source_path_str)
+     dest_path = Path(dest_path_str)
+
+     shutil.copy2(source_path, dest_path)
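A hedged usage sketch of the relocated transfer(): per the docstring above, it runs the copy in a monitored worker process, unlinks the partial destination file if free space drops below the threshold mid-copy, and reports the outcome as an OperationStatus rather than raising. The file paths here are hypothetical:

    from b10_transfer import transfer, OperationStatus

    status = transfer("/tmp/weights.tar", "/cache/weights.tar")  # hypothetical paths
    if status is not OperationStatus.SUCCESS:
        print("transfer failed or was interrupted; see logs for details")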
@@ -28,15 +28,14 @@ def get_cache_filename() -> str:
      """Get the cache filename prefix for the current environment.
 
      This function generates a cache filename prefix that includes the
-     environment key and hostname to ensure cache files are environment-specific
+     environment key to ensure cache files are environment-specific
      and unique per machine.
 
      Returns:
-         str: Cache filename prefix in format "cache_{environment_key}.{hostname}".
+         str: Cache filename prefix in format "cache_{environment_key}".
      """
      env_key = get_environment_key()
-     hostname = os.uname().nodename or os.getenv("HOSTNAME", UNKNOWN_HOSTNAME)
-     return f"cache_{env_key}.{hostname}"
+     return f"cache_{env_key}"
 
 
  def get_environment_key() -> str:
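The practical effect of this hunk: cache filenames no longer embed the hostname, so pods that share an environment key now resolve to a single shared cache file in b10fs instead of one file per machine. A before/after naming sketch, with the env_key value and hostname invented for illustration:

    env_key = "a1b2c3"  # hypothetical value of get_environment_key()
    old_name = f"cache_{env_key}.pod-7f9d"  # 0.1.5: one file per hostname
    new_name = f"cache_{env_key}"           # 0.1.7: shared across pods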