b10-transfer 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
b10_transfer/__init__.py CHANGED
@@ -7,7 +7,7 @@ from .info import get_cache_info, list_available_caches
7
7
  from .constants import SaveStatus, LoadStatus
8
8
 
9
9
  # Version
10
- __version__ = "0.1.3"
10
+ __version__ = "0.1.4"
11
11
 
12
12
  __all__ = [
13
13
  "CacheError",
b10_transfer/core.py CHANGED
@@ -90,18 +90,6 @@ def load_compile_cache() -> LoadStatus:
90
90
  logger.info("Torch cache already loaded, skipping extraction")
91
91
  return LoadStatus.SKIPPED
92
92
 
93
- # Initial disk space check for local operations
94
- check_sufficient_disk_space(
95
- work_dir, MIN_LOCAL_SPACE_MB, "cache load operations"
96
- )
97
- logger.debug(
98
- f"Initial space check passed: {MIN_LOCAL_SPACE_MB:.1f}MB required on local machine"
99
- )
100
-
101
- # Start background space monitoring for local disk
102
- space_monitor = CacheSpaceMonitor(MIN_LOCAL_SPACE_MB, work_dir)
103
- space_monitor.start()
104
-
105
93
  # Create temp local copy
106
94
  with tempfile.NamedTemporaryFile(
107
95
  suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
@@ -111,30 +99,30 @@ def load_compile_cache() -> LoadStatus:
111
99
 
112
100
  try:
113
101
  with temp_file_cleanup(temp_path):
114
- # Phase 1: Copy from b10fs to local temp file in separate process
102
+ # Phase 1: Copy from b10fs to local temp file using transfer()
115
103
  @critical_section_b10fs_file_lock("copy_out")
116
104
  def _monitored_copy_from_b10fs():
117
- logger.info(
118
- f"Starting copy from b10fs: {cache_file} -> {temp_path}"
119
- )
120
- run_monitored_process(
121
- _cache_copy_from_b10fs_worker,
122
- (str(cache_file), str(temp_path)),
123
- space_monitor,
124
- "b10fs to local copy",
125
- )
105
+ result = transfer(str(cache_file), str(temp_path))
106
+ if result != TransferStatus.SUCCESS:
107
+ raise Exception("Failed to copy cache file from b10fs")
126
108
 
127
109
  _monitored_copy_from_b10fs()
128
110
 
129
- # Phase 2: Extract archive in separate process
130
- logger.info(f"Starting extraction: {temp_path} -> {torch_dir}")
131
- run_monitored_process(
132
- _cache_extract_worker,
133
- (str(temp_path), str(torch_dir)),
134
- space_monitor,
135
- "archive extraction",
136
- cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
137
- )
111
+ # Phase 2: Extract archive in separate process with space monitoring
112
+ space_monitor = CacheSpaceMonitor(MIN_LOCAL_SPACE_MB, work_dir)
113
+ space_monitor.start()
114
+
115
+ try:
116
+ logger.info(f"Starting extraction: {temp_path} -> {torch_dir}")
117
+ run_monitored_process(
118
+ _cache_extract_worker,
119
+ (str(temp_path), str(torch_dir)),
120
+ space_monitor,
121
+ "archive extraction",
122
+ cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
123
+ )
124
+ finally:
125
+ space_monitor.stop()
138
126
 
139
127
  logger.info("Cache load complete")
140
128
  return LoadStatus.SUCCESS
@@ -143,9 +131,6 @@ def load_compile_cache() -> LoadStatus:
143
131
  logger.warning(f"Cache load interrupted: {e}")
144
132
  return LoadStatus.ERROR
145
133
 
146
- finally:
147
- space_monitor.stop()
148
-
149
134
 
150
135
  """
151
136
  FIXME(SRAY):
@@ -215,26 +200,11 @@ def save_compile_cache() -> SaveStatus:
215
200
  logger.info("Cache already exists in b10fs, skipping save")
216
201
  return SaveStatus.SKIPPED
217
202
 
218
- # Initial disk space checks using calculated space requirements
219
- check_sufficient_disk_space(
220
- work_dir, MAX_CACHE_SIZE_MB, "local temp file creation"
221
- )
222
- check_sufficient_disk_space(
223
- b10fs_dir, REQUIRED_B10FS_SPACE_MB, "cache save to b10fs"
224
- )
225
- logger.debug(
226
- f"Initial space checks passed: {MAX_CACHE_SIZE_MB:.1f}MB local, {REQUIRED_B10FS_SPACE_MB:.1f}MB b10fs"
227
- )
228
-
229
203
  temp_file = (
230
204
  b10fs_dir
231
205
  / f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
232
206
  )
233
207
 
234
- # Start background space monitoring
235
- space_monitor = CacheSpaceMonitor(REQUIRED_B10FS_SPACE_MB, b10fs_dir)
236
- space_monitor.start()
237
-
238
208
  with tempfile.NamedTemporaryFile(
239
209
  suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
240
210
  ) as f:
@@ -243,30 +213,32 @@ def save_compile_cache() -> SaveStatus:
243
213
 
244
214
  try:
245
215
  with temp_file_cleanup(local_temp):
246
- # Phase 1: Compression in separate process
247
- logger.info(f"Starting compression: {torch_dir} -> {local_temp}")
248
- run_monitored_process(
249
- _cache_compression_worker,
250
- (str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
251
- space_monitor,
252
- "compression",
253
- )
254
-
255
- b10fs_dir.mkdir(parents=True, exist_ok=True)
216
+ # Phase 1: Compression in separate process with space monitoring
217
+ space_monitor = CacheSpaceMonitor(REQUIRED_B10FS_SPACE_MB, b10fs_dir)
218
+ space_monitor.start()
256
219
 
257
- # Phase 2: Copy to b10fs in separate process
258
- @critical_section_b10fs_file_lock("copy_in")
259
- def _monitored_copy_to_b10fs():
260
- logger.info(f"Starting copy to b10fs: {local_temp} -> {temp_file}")
220
+ try:
221
+ logger.info(f"Starting compression: {torch_dir} -> {local_temp}")
261
222
  run_monitored_process(
262
- _cache_copy_worker,
263
- (str(local_temp), str(temp_file)),
223
+ _cache_compression_worker,
224
+ (str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
264
225
  space_monitor,
265
- "b10fs copy",
266
- cleanup_func=lambda: safe_unlink(
267
- temp_file, f"Failed to cleanup interrupted copy {temp_file}"
268
- ),
226
+ "compression",
269
227
  )
228
+ finally:
229
+ space_monitor.stop()
230
+
231
+ # Phase 2: Copy to b10fs using transfer()
232
+ @critical_section_b10fs_file_lock("copy_in")
233
+ def _monitored_copy_to_b10fs():
234
+ result = transfer(str(local_temp), str(temp_file))
235
+ if result != TransferStatus.SUCCESS:
236
+ # Clean up the temp file if transfer failed
237
+ safe_unlink(
238
+ temp_file,
239
+ f"Failed to cleanup after failed copy {temp_file}",
240
+ )
241
+ raise Exception("Failed to copy cache file to b10fs")
270
242
 
271
243
  _monitored_copy_to_b10fs()
272
244
 
@@ -283,9 +255,6 @@ def save_compile_cache() -> SaveStatus:
283
255
  logger.warning(f"Cache save interrupted: {e}")
284
256
  return SaveStatus.ERROR
285
257
 
286
- finally:
287
- space_monitor.stop()
288
-
289
258
 
290
259
  @timed_fn(logger=logger, name="Transferring file")
291
260
  @safe_execute("Transfer failed", TransferStatus.ERROR)
@@ -432,23 +401,6 @@ def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
432
401
  shutil.copy2(source_path, dest_path)
433
402
 
434
403
 
435
- @worker_process("Copy from b10fs was cancelled before starting")
436
- def _cache_copy_from_b10fs_worker(source_path_str: str, dest_path_str: str) -> None:
437
- """Worker process that handles file copy from b10fs to local machine.
438
-
439
- This function runs in a separate process to copy the cache file from b10fs
440
- to the local filesystem. It can be terminated externally if local disk space becomes insufficient.
441
-
442
- Args:
443
- source_path_str: String path to the source file in b10fs to copy.
444
- dest_path_str: String path where the file will be copied locally.
445
- """
446
- source_path = Path(source_path_str)
447
- dest_path = Path(dest_path_str)
448
-
449
- shutil.copy2(source_path, dest_path)
450
-
451
-
452
404
  def _cleanup_torch_dir(torch_dir: Path) -> None:
453
405
  """Helper function to safely cleanup torch directory during interrupted extraction."""
454
406
  try:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: b10-transfer
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Summary: Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management
5
5
  License: MIT
6
6
  Keywords: pytorch,file-transfer,cache,machine-learning,inference
@@ -1,12 +1,12 @@
1
- b10_transfer/__init__.py,sha256=qaqigpp3gmzR4-Cr9aH1ilaYQgp9UB_DzzpsmwX_D-o,641
1
+ b10_transfer/__init__.py,sha256=LKMroIusY1itfMVrJT07xLS1XVehwr54Wk5dhEl8MzY,641
2
2
  b10_transfer/archive.py,sha256=GKb0mi0-YeM7ch4FLAoOLHXw0T6LkRerYad2N2y9TYM,6400
3
3
  b10_transfer/cleanup.py,sha256=3RnqWNGMCcko5GQdq1Gr9VPpGzAF5J6x7xjIH9SNZ78,6226
4
4
  b10_transfer/constants.py,sha256=qCViKTyfHTLpiFVF2SwsbHp2IMz3kg3syxJfgRAq2dc,4446
5
- b10_transfer/core.py,sha256=UVjzcqCqDMqpXqiXzdC6c8nJ_2tM35zlDl-Jp1Gvn20,18657
5
+ b10_transfer/core.py,sha256=XWLuwjHXuhh-6abZMAl2yuLB7R2deyUc6gGPn6-Yfkc,17006
6
6
  b10_transfer/environment.py,sha256=aC0biEMQrtHk0ke_3epdcq1X9J5fPmPpBVt0fH7XF2Y,5625
7
7
  b10_transfer/info.py,sha256=I3iOuImZ5r6DMJTDeBtVvzlSn6IuyPJbLJYUO_OF0ks,6299
8
8
  b10_transfer/space_monitor.py,sha256=C_CKDH43bNsWdq60WStSZ3c_nQkWvScQmqU_SYHesew,10531
9
9
  b10_transfer/utils.py,sha256=Stee0DFK-8MRRYNIocqaK64cJvfs4jPW3Mpx7zkWV6Y,11932
10
- b10_transfer-0.1.3.dist-info/METADATA,sha256=dNTL7He36x0j9nZpbf5Rhaa4FYHeuNaim2zyXCKq_lE,4108
11
- b10_transfer-0.1.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
12
- b10_transfer-0.1.3.dist-info/RECORD,,
10
+ b10_transfer-0.1.4.dist-info/METADATA,sha256=69s3ACBUFzGB7J97eVt4aCGSXrIpld1oV0Wj8Z0HLZ8,4108
11
+ b10_transfer-0.1.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
12
+ b10_transfer-0.1.4.dist-info/RECORD,,