b10-transfer 0.1.4__tar.gz → 0.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: b10-transfer
- Version: 0.1.4
+ Version: 0.1.6
  Summary: Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management
  License: MIT
  Keywords: pytorch,file-transfer,cache,machine-learning,inference
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
  [tool.poetry]
  name = "b10-transfer"
- version = "0.1.4"
+ version = "0.1.6"
  description = "Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management"
  authors = ["Shounak Ray <shounak.noreply@baseten.co>", "Fred Liu <fred.liu.noreply@baseten.co>"]
  maintainers = ["Fred Liu <fred.liu.noreply@baseten.co>", "Shounak Ray <shounak.noreply@baseten.co>"]
@@ -1,23 +1,24 @@
  """B10 Transfer - Lock-free PyTorch file transfer for Baseten."""
 
- from .core import load_compile_cache, save_compile_cache, clear_local_cache
+ from .cache import load_compile_cache, save_compile_cache, clear_local_cache
+ from .core import transfer
  from .utils import CacheError, CacheValidationError
  from .space_monitor import CacheOperationInterrupted
  from .info import get_cache_info, list_available_caches
- from .constants import SaveStatus, LoadStatus
+ from .constants import OperationStatus
 
  # Version
- __version__ = "0.1.4"
+ __version__ = "0.1.6"
 
  __all__ = [
      "CacheError",
      "CacheValidationError",
      "CacheOperationInterrupted",
-     "SaveStatus",
-     "LoadStatus",
+     "OperationStatus",
      "load_compile_cache",
      "save_compile_cache",
      "clear_local_cache",
+     "transfer",
      "get_cache_info",
      "list_available_caches",
  ]
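
The hunk above is evidently the package's public `__init__`: 0.1.6 replaces the separate `LoadStatus`/`SaveStatus` enums with a single `OperationStatus` and exports the new `transfer()` helper. A minimal migration sketch for caller code, assuming the import package is named `b10_transfer` (the PyPI name with the dash replaced by an underscore; the diff itself only shows relative imports):

    # 0.1.4 callers matched on two enums:
    #     from b10_transfer import LoadStatus, SaveStatus
    # 0.1.6 callers match on one:
    from b10_transfer import load_compile_cache, save_compile_cache, OperationStatus

    status = load_compile_cache()
    if status == OperationStatus.DOES_NOT_EXIST:
        pass  # nothing cached yet; torch will compile from scratch
    elif status == OperationStatus.SUCCESS:
        pass  # compile cache restored from b10fs

    if save_compile_cache() == OperationStatus.ERROR:
        pass  # b10fs unavailable, or disk space ran out mid-save
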
@@ -1,11 +1,13 @@
- import os
+ """Cache operations for PyTorch compilation artifacts.
+
+ This module provides functions for loading and saving PyTorch compilation cache
+ to/from b10fs shared storage using atomic operations and space monitoring.
+ """
+
  import logging
  import tempfile
- import shutil
  from pathlib import Path
 
- import time
-
  from .environment import get_cache_filename
  from .cleanup import cooperative_cleanup_b10fs
  from .utils import (
@@ -17,7 +19,6 @@ from .utils import (
      safe_unlink,
  )
  from .space_monitor import (
-     check_sufficient_disk_space,
      CacheSpaceMonitor,
      CacheOperationInterrupted,
      run_monitored_process,
@@ -33,17 +34,100 @@ from .constants import (
      CACHE_FILE_EXTENSION,
      CACHE_LATEST_SUFFIX,
      CACHE_INCOMPLETE_SUFFIX,
-     LoadStatus,
-     SaveStatus,
-     TransferStatus,
+     OperationStatus,
  )
+ from .core import transfer
 
  logger = logging.getLogger(__name__)
 
 
+ """
+ FIXME(SRAY):
+ What about the case in @b10-transfer/ where a single pod finishes an inference request,
+ and then the client calls save_compile_cache. And while we are creating the local archive,
+ another inference call on the same pod is kicked off, which then modifies the torch cache.
+ How would this be handled? Maybe just accept that the cache will be recompiled/overwritten?
+ Otherwise you'd need application level coordination to ensure that the cache is not modified
+ while we are creating the archive, but this doesn't really seem like a good idea in terms of adoption.
+
+ FIXME(SR):
+ More things to consider:
+ - [possible] What if b10fs dies *during* an op? right now we check for b10fs availability in the beginning of the op... Add some constants instead of just False for load().
+ - [possible, and really bad if it happens] potential memory exhaustion during compression if the cache is super super large. very very edge case. higher compression levels also have high memory usage.
+ """
+
+
+ def _setup_cache_paths():
+     """Common setup for cache operations - returns paths and performs cleanup."""
+     # Cooperative cleanup of stale shared resources
+     cooperative_cleanup_b10fs()
+
+     b10fs_dir = Path(B10FS_CACHE_DIR)
+     torch_dir = Path(TORCH_CACHE_DIR)
+     work_dir = Path(LOCAL_WORK_DIR)
+
+     return b10fs_dir, torch_dir, work_dir
+
+
+ def _get_cache_file_paths(cache_filename: str, b10fs_dir: Path):
+     """Generate cache file paths for a given cache filename."""
+     final_file = (
+         b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
+     )
+     temp_file = (
+         b10fs_dir / f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
+     )
+     return final_file, temp_file
+
+
+ def _run_with_space_monitoring(
+     space_threshold_mb: float,
+     monitor_dir: Path,
+     operation_name: str,
+     worker_func,
+     worker_args: tuple,
+     cleanup_func=None,
+ ):
+     """Helper to run an operation with space monitoring."""
+     space_monitor = CacheSpaceMonitor(space_threshold_mb, monitor_dir)
+     space_monitor.start()
+
+     try:
+         logger.info(
+             f"Starting {operation_name}: {' -> '.join(str(arg) for arg in worker_args[:2])}"
+         )
+         run_monitored_process(
+             worker_func,
+             worker_args,
+             space_monitor,
+             operation_name,
+             cleanup_func=cleanup_func,
+         )
+     finally:
+         space_monitor.stop()
+
+
+ def _transfer_with_b10fs_lock(
+     source: str, dest: str, lock_type: str, cleanup_on_failure=True
+ ):
+     """Transfer a file with b10fs file locking and error handling."""
+
+     @critical_section_b10fs_file_lock(lock_type)
+     def _locked_transfer():
+         result = transfer(source, dest)
+         if result != OperationStatus.SUCCESS:
+             if cleanup_on_failure:
+                 safe_unlink(
+                     Path(dest), f"Failed to cleanup after failed transfer {dest}"
+                 )
+             raise Exception(f"Failed to transfer {source} -> {dest}")
+
+     _locked_transfer()
+
+
  @timed_fn(logger=logger, name="Loading compile cache")
  @safe_execute("Load failed", False)
- def load_compile_cache() -> LoadStatus:
+ def load_compile_cache() -> OperationStatus:
      """Load PyTorch compilation cache from b10fs to local torch cache directory.
 
      This function implements a lock-free pattern to safely load cached PyTorch
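
The four new helpers above factor out boilerplate that 0.1.4 repeated inline in both the load and save paths: path setup, journal filename construction, the monitor start/try/finally/stop dance, and the locked transfer-or-cleanup step. For readers unfamiliar with the pattern, here is a self-contained sketch of the monitor-and-interrupt idea; it is illustrative only, with hypothetical names (run_with_monitor, MIN_MB), not the b10-transfer implementation:

    import shutil
    import threading
    import time
    from multiprocessing import Process

    MIN_MB = 1024  # hypothetical free-space floor

    def run_with_monitor(target, args, watch_dir="/tmp"):
        """Run target in a child process; terminate it if free space dips too low."""
        stop = threading.Event()
        proc = Process(target=target, args=args)

        def watch():
            while not stop.is_set():
                free_mb = shutil.disk_usage(watch_dir).free / 2**20
                if free_mb < MIN_MB and proc.is_alive():
                    proc.terminate()  # interrupt the long-running copy/compress
                    return
                time.sleep(1)

        threading.Thread(target=watch, daemon=True).start()
        try:
            proc.start()
            proc.join()
        finally:
            stop.set()  # always shut the watcher down, mirroring the finally: above
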
@@ -55,40 +139,33 @@ def load_compile_cache() -> LoadStatus:
      extraction phases, interrupting operations if space falls below MIN_LOCAL_SPACE_MB.
 
      Returns:
-         LoadStatus:
-             LoadStatus.SUCCESS if cache was successfully loaded
-             LoadStatus.SKIPPED if already exists
-             LoadStatus.ERROR if b10fs is unavailable, local disk space is insufficient, or loading failed.
-             LoadStatus.DOES_NOT_EXIST if no cache file was found.
+         OperationStatus:
+             OperationStatus.SUCCESS if cache was successfully loaded
+             OperationStatus.SKIPPED if already exists
+             OperationStatus.ERROR if b10fs is unavailable, local disk space is insufficient, or loading failed.
+             OperationStatus.DOES_NOT_EXIST if no cache file was found.
 
      Raises:
-         CacheValidationError: If b10fs is not enabled (caught and returns LoadStatus.ERROR).
+         CacheValidationError: If b10fs is not enabled (caught and returns OperationStatus.ERROR).
          CacheOperationInterrupted: If operations interrupted due to insufficient
-             local disk space (caught and returns LoadStatus.ERROR).
-         Exception: Any other errors during loading (caught and returns LoadStatus.ERROR).
+             local disk space (caught and returns OperationStatus.ERROR).
+         Exception: Any other errors during loading (caught and returns OperationStatus.ERROR).
      """
      with cache_operation("Load"):
-         # Cooperative cleanup of stale shared resources
-         cooperative_cleanup_b10fs()
-
-         b10fs_dir = Path(B10FS_CACHE_DIR)
-         torch_dir = Path(TORCH_CACHE_DIR)
-         work_dir = Path(LOCAL_WORK_DIR)
+         b10fs_dir, torch_dir, work_dir = _setup_cache_paths()
 
          cache_filename = get_cache_filename()
-         cache_file = (
-             b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
-         )
-         logger.debug(f"Looking for cache file: {cache_file}")
+         final_file, _ = _get_cache_file_paths(cache_filename, b10fs_dir)
+         logger.debug(f"Looking for cache file: {final_file}")
 
-         if not cache_file.exists():
+         if not final_file.exists():
              logger.info("No cache file found in b10fs")
-             return LoadStatus.DOES_NOT_EXIST
+             return OperationStatus.DOES_NOT_EXIST
 
          # Skip if already loaded
          if torch_dir.exists() and any(torch_dir.iterdir()):
              logger.info("Torch cache already loaded, skipping extraction")
-             return LoadStatus.SKIPPED
+             return OperationStatus.SKIPPED
 
          # Create temp local copy
          with tempfile.NamedTemporaryFile(
@@ -99,58 +176,35 @@ def load_compile_cache() -> LoadStatus:
 
          try:
              with temp_file_cleanup(temp_path):
-                 # Phase 1: Copy from b10fs to local temp file using transfer()
-                 @critical_section_b10fs_file_lock("copy_out")
-                 def _monitored_copy_from_b10fs():
-                     result = transfer(str(cache_file), str(temp_path))
-                     if result != TransferStatus.SUCCESS:
-                         raise Exception("Failed to copy cache file from b10fs")
-
-                 _monitored_copy_from_b10fs()
-
-                 # Phase 2: Extract archive in separate process with space monitoring
-                 space_monitor = CacheSpaceMonitor(MIN_LOCAL_SPACE_MB, work_dir)
-                 space_monitor.start()
-
-                 try:
-                     logger.info(f"Starting extraction: {temp_path} -> {torch_dir}")
-                     run_monitored_process(
-                         _cache_extract_worker,
-                         (str(temp_path), str(torch_dir)),
-                         space_monitor,
-                         "archive extraction",
-                         cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
-                     )
-                 finally:
-                     space_monitor.stop()
+                 # Phase 1: Copy from b10fs to local temp file
+                 _transfer_with_b10fs_lock(
+                     str(final_file),
+                     str(temp_path),
+                     "copy_out",
+                     cleanup_on_failure=False,
+                 )
+
+                 # Phase 2: Extract archive with space monitoring
+                 _run_with_space_monitoring(
+                     MIN_LOCAL_SPACE_MB,
+                     work_dir,
+                     "archive extraction",
+                     _cache_extract_worker,
+                     (str(temp_path), str(torch_dir)),
+                     cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
+                 )
 
                  logger.info("Cache load complete")
-                 return LoadStatus.SUCCESS
+                 return OperationStatus.SUCCESS
 
          except CacheOperationInterrupted as e:
              logger.warning(f"Cache load interrupted: {e}")
-             return LoadStatus.ERROR
-
-
- """
- FIXME(SRAY):
- What about the case in @b10-transfer/ where a single pod finishes an inference request,
- and then the client calls save_compile_cache. And while we are creating the local archive,
- another inference call on the same pod is kicked off, which then modifies the torch cache.
- How would this be handled? Maybe just accept that the cache will be recompiled/overwritten?
- Otherwise you'd need application level coordination to ensure that the cache is not modified
- while we are creating the archive, but this doesn't really seem like a good idea in terms of adoption.
-
- FIXME(SR):
- More things to consider:
- - [possible] What if b10fs dies *during* an op? right now we check for b10fs availability in the beginning of the op... Add some constants instead of just False for load().
- - [possible, and really bad if it happens] potential memory exhaustion during compression if the cache is super super large. very very edge case. higher compression levels also have high memory usage.
- """
+             return OperationStatus.ERROR
 
 
  @timed_fn(logger=logger, name="Saving compile cache")
  @safe_execute("Save failed", False)
- def save_compile_cache() -> SaveStatus:
+ def save_compile_cache() -> OperationStatus:
      """Save local PyTorch compilation cache to b10fs using atomic journal pattern.
 
      This function creates an archive of the local torch cache directory and
@@ -164,46 +218,34 @@ def save_compile_cache() -> SaveStatus:
      space becomes insufficient, finally performing an atomic rename to the final cache file.
 
      Returns:
-         SaveStatus:
-             SaveStatus.SUCCESS if cache was successfully saved or already exists
-             SaveStatus.ERROR if b10fs is unavailable, insufficient disk space caused interruption,
+         OperationStatus:
+             OperationStatus.SUCCESS if cache was successfully saved or already exists
+             OperationStatus.ERROR if b10fs is unavailable, insufficient disk space caused interruption,
                  no cache exists to save, or saving failed.
-             SaveStatus.SKIPPED if no cache exists to save or cache already exists in b10fs
+             OperationStatus.SKIPPED if no cache exists to save or cache already exists in b10fs
 
      Raises:
-         CacheValidationError: If b10fs is not enabled (caught and returns SaveStatus.ERROR).
+         CacheValidationError: If b10fs is not enabled (caught and returns OperationStatus.ERROR).
          CacheOperationInterrupted: If operations interrupted due to insufficient
-             disk space (caught and returns SaveStatus.ERROR).
-         ArchiveError: If archive creation fails (caught and returns SaveStatus.ERROR).
-         Exception: Any other errors during saving (caught and returns SaveStatus.ERROR).
+             disk space (caught and returns OperationStatus.ERROR).
+         ArchiveError: If archive creation fails (caught and returns OperationStatus.ERROR).
+         Exception: Any other errors during saving (caught and returns OperationStatus.ERROR).
      """
      with cache_operation("Save"):
-         # Cooperative cleanup of stale shared resources
-         cooperative_cleanup_b10fs()
-
-         b10fs_dir = Path(B10FS_CACHE_DIR)
-         torch_dir = Path(TORCH_CACHE_DIR)
-         work_dir = Path(LOCAL_WORK_DIR)
+         b10fs_dir, torch_dir, work_dir = _setup_cache_paths()
 
          # Check if anything to save
          if not torch_dir.exists() or not any(torch_dir.iterdir()):
              logger.info("No torch cache to save")
-             return SaveStatus.SKIPPED
+             return OperationStatus.SKIPPED
 
          cache_filename = get_cache_filename()
-         final_file = (
-             b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
-         )
+         final_file, temp_file = _get_cache_file_paths(cache_filename, b10fs_dir)
 
          # Check for existing cache first (early exit)
          if final_file.exists():
              logger.info("Cache already exists in b10fs, skipping save")
-             return SaveStatus.SKIPPED
-
-         temp_file = (
-             b10fs_dir
-             / f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
-         )
+             return OperationStatus.SKIPPED
 
          with tempfile.NamedTemporaryFile(
              suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
@@ -213,34 +255,19 @@ def save_compile_cache() -> SaveStatus:
 
          try:
              with temp_file_cleanup(local_temp):
-                 # Phase 1: Compression in separate process with space monitoring
-                 space_monitor = CacheSpaceMonitor(REQUIRED_B10FS_SPACE_MB, b10fs_dir)
-                 space_monitor.start()
-
-                 try:
-                     logger.info(f"Starting compression: {torch_dir} -> {local_temp}")
-                     run_monitored_process(
-                         _cache_compression_worker,
-                         (str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
-                         space_monitor,
-                         "compression",
-                     )
-                 finally:
-                     space_monitor.stop()
-
-                 # Phase 2: Copy to b10fs using transfer()
-                 @critical_section_b10fs_file_lock("copy_in")
-                 def _monitored_copy_to_b10fs():
-                     result = transfer(str(local_temp), str(temp_file))
-                     if result != TransferStatus.SUCCESS:
-                         # Clean up the temp file if transfer failed
-                         safe_unlink(
-                             temp_file,
-                             f"Failed to cleanup after failed copy {temp_file}",
-                         )
-                         raise Exception("Failed to copy cache file to b10fs")
-
-                 _monitored_copy_to_b10fs()
+                 # Phase 1: Compression with space monitoring
+                 _run_with_space_monitoring(
+                     REQUIRED_B10FS_SPACE_MB,
+                     b10fs_dir,
+                     "compression",
+                     _cache_compression_worker,
+                     (str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
+                 )
+
+                 # Phase 2: Copy to b10fs with locking
+                 _transfer_with_b10fs_lock(
+                     str(local_temp), str(temp_file), "copy_in", cleanup_on_failure=True
+                 )
 
                  # Phase 3: Atomic rename (fast, don't interrupt)
                  logger.info(
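
Phase 3 above is the "journal" half of the save path's atomic pattern: the archive lands in b10fs under the incomplete suffix and only becomes visible to readers via a rename, so load_compile_cache never observes a partially written file. A generic sketch of the same idea, with illustrative filenames rather than the package's real suffix constants (the atomicity of the rename also depends on the b10fs mount behaving like a POSIX filesystem):

    import os

    def journal_write(data: bytes, final_path: str) -> None:
        temp_path = final_path + ".incomplete"
        with open(temp_path, "wb") as f:
            f.write(data)
            f.flush()
            os.fsync(f.fileno())  # ensure bytes hit disk before publishing
        # Atomic publish: readers see the old file or the new one, never a partial write.
        os.replace(temp_path, final_path)
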
@@ -249,95 +276,11 @@ def save_compile_cache() -> SaveStatus:
              )
              temp_file.rename(final_file)
 
              logger.info("Cache save complete")
-             return SaveStatus.SUCCESS
+             return OperationStatus.SUCCESS
 
          except CacheOperationInterrupted as e:
              logger.warning(f"Cache save interrupted: {e}")
-             return SaveStatus.ERROR
-
-
- @timed_fn(logger=logger, name="Transferring file")
- @safe_execute("Transfer failed", TransferStatus.ERROR)
- def transfer(source: str, dest: str) -> TransferStatus:
-     """Transfer a file from source to destination with space monitoring.
-
-     This function copies a file from source to destination using the same
-     monitored process approach as the cache operations. It monitors disk space
-     at the destination and can interrupt the transfer if space becomes insufficient.
-
-     Args:
-         source: Path to the source file to copy.
-         dest: Path to the destination where the file will be copied.
-
-     Returns:
-         TransferStatus:
-             TransferStatus.SUCCESS if transfer was successful
-             TransferStatus.ERROR if transfer failed due to insufficient disk space,
-                 file not found, or other errors.
-
-     Raises:
-         CacheOperationInterrupted: If transfer interrupted due to insufficient
-             disk space (caught and returns TransferStatus.ERROR).
-         Exception: Any other errors during transfer (caught and returns TransferStatus.ERROR).
-     """
-     source_path = Path(source)
-     dest_path = Path(dest)
-
-     # Validate source file exists
-     if not source_path.exists():
-         logger.error(f"Source file does not exist: {source}")
-         return TransferStatus.ERROR
-
-     # Create destination directory if it doesn't exist
-     dest_path.parent.mkdir(parents=True, exist_ok=True)
-
-     # Determine appropriate space threshold based on destination directory
-     dest_dir = dest_path.parent
-     if str(dest_dir).startswith(B10FS_CACHE_DIR):
-         # Transferring to b10fs - use b10fs space requirements
-         space_threshold_mb = REQUIRED_B10FS_SPACE_MB
-         logger.debug(
-             f"Transfer to b10fs detected, using {space_threshold_mb:.1f}MB threshold"
-         )
-     else:
-         # Transferring to local directory - use local space requirements
-         space_threshold_mb = MIN_LOCAL_SPACE_MB
-         logger.debug(
-             f"Transfer to local directory detected, using {space_threshold_mb:.1f}MB threshold"
-         )
-
-     # Initial disk space check
-     check_sufficient_disk_space(dest_dir, space_threshold_mb, "file transfer")
-     logger.debug(
-         f"Initial space check passed: {space_threshold_mb:.1f}MB required at destination"
-     )
-
-     # Start background space monitoring for destination directory
-     space_monitor = CacheSpaceMonitor(space_threshold_mb, dest_dir)
-     space_monitor.start()
-
-     try:
-         # Run monitored copy process
-         logger.info(f"Starting transfer: {source} -> {dest}")
-         run_monitored_process(
-             _cache_copy_worker,
-             (str(source_path), str(dest_path)),
-             space_monitor,
-             "file transfer",
-             cleanup_func=lambda: safe_unlink(
-                 dest_path, f"Failed to cleanup interrupted transfer {dest_path}"
-             ),
-         )
-
-         logger.info("File transfer complete")
-         return TransferStatus.SUCCESS
-
-     except CacheOperationInterrupted as e:
-         logger.warning(f"File transfer interrupted: {e}")
-         return TransferStatus.ERROR
-
-     finally:
-         space_monitor.stop()
+             return OperationStatus.ERROR
 
 
  @safe_execute("Clear failed", False)
@@ -357,6 +300,8 @@ def clear_local_cache() -> bool:
      torch_dir = Path(TORCH_CACHE_DIR)
      if not torch_dir.exists():
          return True
+     import shutil
+
      shutil.rmtree(torch_dir)
      return True
 
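
Note that 0.1.6 also moves `import shutil` from module scope into the function bodies that actually need it (here in clear_local_cache, and in _cleanup_torch_dir below), presumably to keep module import cheap. The pattern in isolation, as a minimal sketch with a hypothetical clear_dir helper:

    from pathlib import Path

    def clear_dir(path: Path) -> bool:
        """Illustrative sketch of the deferred-import pattern used above."""
        if not path.exists():
            return True  # common early exit: shutil is never imported
        import shutil  # deferred: only paid for when a removal actually runs
        shutil.rmtree(path)
        return True
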
@@ -384,27 +329,12 @@ def _cache_compression_worker(
      create_archive(torch_dir, local_temp, max_size_mb)
 
 
- @worker_process("Copy was cancelled before starting")
- def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
-     """Worker process that handles file copy to b10fs.
-
-     This function runs in a separate process to copy the compressed cache file
-     to the b10fs filesystem. It can be terminated externally if disk space becomes insufficient.
-
-     Args:
-         source_path_str: String path to the source file to copy.
-         dest_path_str: String path where the file will be copied.
-     """
-     source_path = Path(source_path_str)
-     dest_path = Path(dest_path_str)
-
-     shutil.copy2(source_path, dest_path)
-
-
  def _cleanup_torch_dir(torch_dir: Path) -> None:
      """Helper function to safely cleanup torch directory during interrupted extraction."""
      try:
          if torch_dir.exists():
+             import shutil
+
              shutil.rmtree(torch_dir)
              logger.debug(f"Cleaned up torch directory: {torch_dir}")
      except Exception as e:
@@ -114,25 +114,10 @@ class WorkerStatus(Enum):
      CANCELLED = auto()
 
 
- class LoadStatus(Enum):
-     """Status values for cache loading operations."""
-
-     SUCCESS = auto()
-     ERROR = auto()
-     DOES_NOT_EXIST = auto()
-     SKIPPED = auto()
-
-
- class SaveStatus(Enum):
-     """Status values for cache saving operations."""
-
-     SUCCESS = auto()
-     ERROR = auto()
-     SKIPPED = auto()
-
-
- class TransferStatus(Enum):
-     """Status values for file transfer operations."""
+ class OperationStatus(Enum):
+     """Status values for all b10-transfer operations (load, save, transfer)."""
 
      SUCCESS = auto()
      ERROR = auto()
+     DOES_NOT_EXIST = auto()  # Used by load operations when cache file not found
+     SKIPPED = auto()  # Used by load/save operations when operation not needed
@@ -0,0 +1,131 @@
+ """Core file transfer operations for b10-transfer.
+
+ This module provides generic file transfer functionality with space monitoring
+ and error handling for b10fs operations.
+ """
+
+ import logging
+ import shutil
+ from pathlib import Path
+
+ from .utils import (
+     timed_fn,
+     safe_execute,
+     safe_unlink,
+ )
+ from .space_monitor import (
+     check_sufficient_disk_space,
+     CacheSpaceMonitor,
+     CacheOperationInterrupted,
+     run_monitored_process,
+     worker_process,
+ )
+ from .constants import (
+     B10FS_CACHE_DIR,
+     REQUIRED_B10FS_SPACE_MB,
+     MIN_LOCAL_SPACE_MB,
+     OperationStatus,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ @timed_fn(logger=logger, name="Transferring file")
+ @safe_execute("Transfer failed", OperationStatus.ERROR)
+ def transfer(source: str, dest: str) -> OperationStatus:
+     """Transfer a file from source to destination with space monitoring.
+
+     This function copies a file from source to destination using the same
+     monitored process approach as the cache operations. It monitors disk space
+     at the destination and can interrupt the transfer if space becomes insufficient.
+
+     Args:
+         source: Path to the source file to copy.
+         dest: Path to the destination where the file will be copied.
+
+     Returns:
+         OperationStatus:
+             OperationStatus.SUCCESS if transfer was successful
+             OperationStatus.ERROR if transfer failed due to insufficient disk space,
+                 file not found, or other errors.
+
+     Raises:
+         CacheOperationInterrupted: If transfer interrupted due to insufficient
+             disk space (caught and returns OperationStatus.ERROR).
+         Exception: Any other errors during transfer (caught and returns OperationStatus.ERROR).
+     """
+     source_path = Path(source)
+     dest_path = Path(dest)
+
+     # Validate source file exists
+     if not source_path.exists():
+         logger.error(f"Source file does not exist: {source}")
+         return OperationStatus.ERROR
+
+     # Create destination directory if it doesn't exist
+     dest_path.parent.mkdir(parents=True, exist_ok=True)
+
+     # Determine appropriate space threshold based on destination directory
+     dest_dir = dest_path.parent
+     if str(dest_dir).startswith(B10FS_CACHE_DIR):
+         # Transferring to b10fs - use b10fs space requirements
+         space_threshold_mb = REQUIRED_B10FS_SPACE_MB
+         logger.debug(
+             f"Transfer to b10fs detected, using {space_threshold_mb:.1f}MB threshold"
+         )
+     else:
+         # Transferring to local directory - use local space requirements
+         space_threshold_mb = MIN_LOCAL_SPACE_MB
+         logger.debug(
+             f"Transfer to local directory detected, using {space_threshold_mb:.1f}MB threshold"
+         )
+
+     # Initial disk space check
+     check_sufficient_disk_space(dest_dir, space_threshold_mb, "file transfer")
+     logger.debug(
+         f"Initial space check passed: {space_threshold_mb:.1f}MB required at destination"
+     )
+
+     # Start background space monitoring for destination directory
+     space_monitor = CacheSpaceMonitor(space_threshold_mb, dest_dir)
+     space_monitor.start()
+
+     try:
+         # Run monitored copy process
+         logger.info(f"Starting transfer: {source} -> {dest}")
+         run_monitored_process(
+             _cache_copy_worker,
+             (str(source_path), str(dest_path)),
+             space_monitor,
+             "file transfer",
+             cleanup_func=lambda: safe_unlink(
+                 dest_path, f"Failed to cleanup interrupted transfer {dest_path}"
+             ),
+         )
+
+         logger.info("File transfer complete")
+         return OperationStatus.SUCCESS
+
+     except CacheOperationInterrupted as e:
+         logger.warning(f"File transfer interrupted: {e}")
+         return OperationStatus.ERROR
+
+     finally:
+         space_monitor.stop()
+
+
+ @worker_process("Copy was cancelled before starting")
+ def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
+     """Worker process that handles file copy operations.
+
+     This function runs in a separate process to copy files between locations.
+     It can be terminated externally if disk space becomes insufficient.
+
+     Args:
+         source_path_str: String path to the source file to copy.
+         dest_path_str: String path where the file will be copied.
+     """
+     source_path = Path(source_path_str)
+     dest_path = Path(dest_path_str)
+
+     shutil.copy2(source_path, dest_path)
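
The new core module above makes transfer() a first-class public API: it picks its free-space threshold from the destination (REQUIRED_B10FS_SPACE_MB when the path is under B10FS_CACHE_DIR, MIN_LOCAL_SPACE_MB otherwise), runs the copy in a monitored child process, and cleans up the partial file on interruption. A hedged usage sketch, with illustrative paths and again assuming the import package is named b10_transfer:

    from b10_transfer import transfer, OperationStatus

    status = transfer("/tmp/weights.bin", "/models/weights.bin")
    if status is OperationStatus.SUCCESS:
        print("copy finished under space monitoring")
    else:
        print("copy failed: source missing, or interrupted on low disk space")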