b10-transfer 0.1.1-py3-none-any.whl → 0.1.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
b10_transfer/core.py CHANGED
@@ -1,169 +1,479 @@
+import os
 import logging
+import tempfile
+import shutil
 from pathlib import Path

+import time
+
+from .environment import get_cache_filename
 from .cleanup import cooperative_cleanup_b10fs
 from .utils import (
     timed_fn,
+    critical_section_b10fs_file_lock,
     safe_execute,
+    temp_file_cleanup,
     cache_operation,
+    safe_unlink,
 )
 from .space_monitor import (
     check_sufficient_disk_space,
     CacheSpaceMonitor,
     CacheOperationInterrupted,
-    CacheFileNotFoundError,
     run_monitored_process,
+    worker_process,
 )
 from .constants import (
+    TORCH_CACHE_DIR,
     B10FS_CACHE_DIR,
     LOCAL_WORK_DIR,
+    MAX_CACHE_SIZE_MB,
     REQUIRED_B10FS_SPACE_MB,
     MIN_LOCAL_SPACE_MB,
+    CACHE_FILE_EXTENSION,
+    CACHE_LATEST_SUFFIX,
+    CACHE_INCOMPLETE_SUFFIX,
+    LoadStatus,
+    SaveStatus,
     TransferStatus,
 )

 logger = logging.getLogger(__name__)


- @timed_fn(logger=logger, name="Generic transfer operation")
29
- @safe_execute("Transfer failed", TransferStatus.ERROR)
30
- def transfer(
31
- source: Path,
32
- dest: Path,
33
- callback: callable,
34
- *callback_args,
35
- monitor_local: bool = True,
36
- monitor_b10fs: bool = True,
37
- **callback_kwargs,
38
- ) -> TransferStatus:
39
- """Generic transfer function with space monitoring and atomic operations.
40
-
41
- The actual transfer logic is provided via callback.
42
-
43
- The function handles:
44
- - Cooperative cleanup of stale shared resources
45
- - Space monitoring during operations (optional for local and b10fs)
46
- - Atomic operations using temp files and rename
47
- - Automatic cleanup on interruption or failure
48
- - Lock management for b10fs operations
44
+ @timed_fn(logger=logger, name="Loading compile cache")
45
+ @safe_execute("Load failed", False)
46
+ def load_compile_cache() -> LoadStatus:
47
+ """Load PyTorch compilation cache from b10fs to local torch cache directory.
49
48
 
50
- Args:
51
- source: Source path for the transfer operation
52
- dest: Destination path for the transfer operation
53
- callback: Function to perform the actual transfer work
54
- *callback_args: Positional arguments to pass to callback
55
- monitor_local: Whether to monitor local disk space (default: True)
56
- monitor_b10fs: Whether to monitor b10fs disk space (default: True)
57
- **callback_kwargs: Keyword arguments to pass to callback
49
+ This function implements a lock-free pattern to safely load cached PyTorch
50
+ compilation artifacts from the b10fs shared filesystem to the local torch
51
+ cache directory. It validates b10fs availability, checks for existing cache,
52
+ and extracts the archive if needed.
53
+
54
+ The function monitors local disk space during both the copy from b10fs and
55
+ extraction phases, interrupting operations if space falls below MIN_LOCAL_SPACE_MB.
58
56
 
59
57
  Returns:
60
- TransferStatus:
61
- TransferStatus.SUCCESS if transfer completed successfully
62
- TransferStatus.ERROR if transfer failed
63
- TransferStatus.INTERRUPTED if transfer was interrupted due to insufficient disk space
58
+ LoadStatus:
59
+ LoadStatus.SUCCESS if cache was successfully loaded
60
+ LoadStatus.SKIPPED if already exists
61
+ LoadStatus.ERROR if b10fs is unavailable, local disk space is insufficient, or loading failed.
62
+ LoadStatus.DOES_NOT_EXIST if no cache file was found.
64
63
 
65
64
  Raises:
66
- CacheValidationError: If b10fs is not enabled (caught and returns TransferStatus.ERROR).
65
+ CacheValidationError: If b10fs is not enabled (caught and returns LoadStatus.ERROR).
67
66
  CacheOperationInterrupted: If operations interrupted due to insufficient
68
- disk space (caught and returns TransferStatus.INTERRUPTED).
69
- Exception: Any other errors during transfer (caught and returns TransferStatus.ERROR).
67
+ local disk space (caught and returns LoadStatus.ERROR).
68
+ Exception: Any other errors during loading (caught and returns LoadStatus.ERROR).
70
69
  """
71
- with cache_operation("Transfer"):
70
+ with cache_operation("Load"):
72
71
  # Cooperative cleanup of stale shared resources
73
72
  cooperative_cleanup_b10fs()
74
73
 
75
74
  b10fs_dir = Path(B10FS_CACHE_DIR)
75
+ torch_dir = Path(TORCH_CACHE_DIR)
76
76
  work_dir = Path(LOCAL_WORK_DIR)
77
77
 
78
- # Determine which paths to monitor based on source/dest
79
- local_path = None
80
- b10fs_path = None
81
-
82
- if str(source).startswith(str(b10fs_dir)) or str(dest).startswith(
83
- str(b10fs_dir)
84
- ):
85
- b10fs_path = b10fs_dir
86
-
87
- if (
88
- str(source).startswith(str(work_dir))
89
- or str(dest).startswith(str(work_dir))
90
- or not str(source).startswith(str(b10fs_dir))
91
- or not str(dest).startswith(str(b10fs_dir))
92
- ):
93
- local_path = work_dir
94
-
95
- # Initial disk space checks
96
- if monitor_local and local_path:
97
- check_sufficient_disk_space(
98
- local_path, MIN_LOCAL_SPACE_MB, "local transfer operations"
99
- )
100
- logger.debug(
101
- f"Initial local space check passed: {MIN_LOCAL_SPACE_MB:.1f}MB required"
102
- )
103
-
104
- if monitor_b10fs and b10fs_path:
105
- check_sufficient_disk_space(
106
- b10fs_path, REQUIRED_B10FS_SPACE_MB, "b10fs transfer operations"
107
- )
108
- logger.debug(
109
- f"Initial b10fs space check passed: {REQUIRED_B10FS_SPACE_MB:.1f}MB required"
110
- )
111
-
112
- # Determine primary space monitor (prioritize b10fs if both are monitored)
113
- primary_monitor = None
114
- if monitor_b10fs and b10fs_path:
115
- primary_monitor = CacheSpaceMonitor(REQUIRED_B10FS_SPACE_MB, b10fs_path)
116
- elif monitor_local and local_path:
117
- primary_monitor = CacheSpaceMonitor(MIN_LOCAL_SPACE_MB, local_path)
118
-
119
- if primary_monitor is None:
120
- # No monitoring requested, execute callback directly
121
- logger.info(f"Starting transfer (no monitoring): {source} -> {dest}")
122
- try:
123
- callback(source, dest, *callback_args, **callback_kwargs)
124
- logger.info("Transfer complete")
125
- return TransferStatus.SUCCESS
126
- except (FileNotFoundError, CacheFileNotFoundError) as e:
127
- logger.info(f"Transfer failed - file not found: {e}")
128
- return TransferStatus.DOES_NOT_EXIST
129
-
130
- # Start the primary space monitor
131
- primary_monitor.start()
78
+ cache_filename = get_cache_filename()
79
+ cache_file = (
80
+ b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
81
+ )
82
+ logger.debug(f"Looking for cache file: {cache_file}")
83
+
84
+ if not cache_file.exists():
85
+ logger.info("No cache file found in b10fs")
86
+ return LoadStatus.DOES_NOT_EXIST
87
+
88
+ # Skip if already loaded
89
+ if torch_dir.exists() and any(torch_dir.iterdir()):
90
+ logger.info("Torch cache already loaded, skipping extraction")
91
+ return LoadStatus.SKIPPED
92
+
93
+ # Initial disk space check for local operations
94
+ check_sufficient_disk_space(
95
+ work_dir, MIN_LOCAL_SPACE_MB, "cache load operations"
96
+ )
97
+ logger.debug(
98
+ f"Initial space check passed: {MIN_LOCAL_SPACE_MB:.1f}MB required on local machine"
99
+ )
100
+
101
+ # Start background space monitoring for local disk
102
+ space_monitor = CacheSpaceMonitor(MIN_LOCAL_SPACE_MB, work_dir)
103
+ space_monitor.start()
104
+
105
+ # Create temp local copy
106
+ with tempfile.NamedTemporaryFile(
107
+ suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
108
+ ) as f:
109
+ temp_path = Path(f.name)
110
+ logger.debug(f"Created temporary file for cache: {temp_path}")
132
111
 
133
112
  try:
134
- # Execute the callback using monitored process for continuous space monitoring
135
- logger.info(f"Starting monitored transfer: {source} -> {dest}")
113
+ with temp_file_cleanup(temp_path):
114
+ # Phase 1: Copy from b10fs to local temp file in separate process
115
+ @critical_section_b10fs_file_lock("copy_out")
116
+ def _monitored_copy_from_b10fs():
117
+ logger.info(
118
+ f"Starting copy from b10fs: {cache_file} -> {temp_path}"
119
+ )
120
+ run_monitored_process(
121
+ _cache_copy_from_b10fs_worker,
122
+ (str(cache_file), str(temp_path)),
123
+ space_monitor,
124
+ "b10fs to local copy",
125
+ )
136
126
 
137
- # Try direct callback with run_monitored_process first
138
- try:
127
+ _monitored_copy_from_b10fs()
128
+
129
+ # Phase 2: Extract archive in separate process
130
+ logger.info(f"Starting extraction: {temp_path} -> {torch_dir}")
139
131
  run_monitored_process(
140
- callback,
141
- (source, dest, *callback_args),
142
- primary_monitor,
143
- "transfer callback",
132
+ _cache_extract_worker,
133
+ (str(temp_path), str(torch_dir)),
134
+ space_monitor,
135
+ "archive extraction",
136
+ cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
144
137
  )
145
- logger.info("Transfer complete (monitored)")
146
- return TransferStatus.SUCCESS
147
138
 
148
- except (TypeError, AttributeError, ImportError, OSError) as e:
149
- # Callback not pickleable or other serialization issue
150
- logger.warning(
151
- f"Callback not suitable for process isolation, running without monitoring: {e}"
139
+ logger.info("Cache load complete")
140
+ return LoadStatus.SUCCESS
141
+
142
+ except CacheOperationInterrupted as e:
143
+ logger.warning(f"Cache load interrupted: {e}")
144
+ return LoadStatus.ERROR
145
+
146
+ finally:
147
+ space_monitor.stop()
148
+
149
+
150
+ """
151
+ FIXME(SRAY):
152
+ What about the case in @b10-transfer/ where a single pod finishes an inference request,
153
+ and then the client calls save_compile_cache. And while we are creating the local archive,
154
+ another inference call on the same pod is kicked off, which then modifies the torch cache.
155
+ How would this be handled? Maybe just accept that the cache will be recompiled/overwritten?
156
+ Otherwise you'd need application level coordination to ensure that the cache is not modified
157
+ while we are creating the archive, but this doesn't really seem like a good idea in terms of adoption.
158
+
159
+ FIXME(SR):
160
+ More things to consider:
161
+ - [possible] What if b10fs dies *during* an op? right now we check for b10fs availability in the beginning of the op... Add some constants instead of just False for load().
162
+ - [possible, and really bad if it happens] potential memory exhaustion during compression if the cache is super super large. very very edge case. higher compression levels also have high memory usage.
163
+ """
164
+
165
+
166
+ @timed_fn(logger=logger, name="Saving compile cache")
167
+ @safe_execute("Save failed", False)
168
+ def save_compile_cache() -> SaveStatus:
169
+ """Save local PyTorch compilation cache to b10fs using atomic journal pattern.
170
+
171
+ This function creates an archive of the local torch cache directory and
172
+ atomically saves it to b10fs using a journal pattern (write to temp file,
173
+ then rename). This ensures concurrent saves don't corrupt each other.
174
+
175
+ The function validates b10fs availability, checks if cache already exists
176
+ (early exit), performs initial space checks using pre-calculated requirements
177
+ for concurrent saves, starts background space monitoring, then runs compression
178
+ and copy operations in separate worker processes that can be terminated if disk
179
+ space becomes insufficient, finally performing an atomic rename to the final cache file.
180
+
181
+ Returns:
182
+ SaveStatus:
183
+ SaveStatus.SUCCESS if cache was successfully saved or already exists
184
+ SaveStatus.ERROR if b10fs is unavailable, insufficient disk space caused interruption,
185
+ no cache exists to save, or saving failed.
186
+ SaveStatus.SKIPPED if no cache exists to save or cache already exists in b10fs
187
+
188
+ Raises:
189
+ CacheValidationError: If b10fs is not enabled (caught and returns SaveStatus.ERROR).
190
+ CacheOperationInterrupted: If operations interrupted due to insufficient
191
+ disk space (caught and returns SaveStatus.ERROR).
192
+ ArchiveError: If archive creation fails (caught and returns SaveStatus.ERROR).
193
+ Exception: Any other errors during saving (caught and returns SaveStatus.ERROR).
194
+ """
195
+ with cache_operation("Save"):
196
+ # Cooperative cleanup of stale shared resources
197
+ cooperative_cleanup_b10fs()
198
+
199
+ b10fs_dir = Path(B10FS_CACHE_DIR)
200
+ torch_dir = Path(TORCH_CACHE_DIR)
201
+ work_dir = Path(LOCAL_WORK_DIR)
202
+
203
+ # Check if anything to save
204
+ if not torch_dir.exists() or not any(torch_dir.iterdir()):
205
+ logger.info("No torch cache to save")
206
+ return SaveStatus.SKIPPED
207
+
208
+ cache_filename = get_cache_filename()
209
+ final_file = (
210
+ b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
211
+ )
212
+
213
+ # Check for existing cache first (early exit)
214
+ if final_file.exists():
215
+ logger.info("Cache already exists in b10fs, skipping save")
216
+ return SaveStatus.SKIPPED
217
+
218
+ # Initial disk space checks using calculated space requirements
219
+ check_sufficient_disk_space(
220
+ work_dir, MAX_CACHE_SIZE_MB, "local temp file creation"
221
+ )
222
+ check_sufficient_disk_space(
223
+ b10fs_dir, REQUIRED_B10FS_SPACE_MB, "cache save to b10fs"
224
+ )
225
+ logger.debug(
226
+ f"Initial space checks passed: {MAX_CACHE_SIZE_MB:.1f}MB local, {REQUIRED_B10FS_SPACE_MB:.1f}MB b10fs"
227
+ )
228
+
229
+ temp_file = (
230
+ b10fs_dir
231
+ / f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
232
+ )
233
+
234
+ # Start background space monitoring
235
+ space_monitor = CacheSpaceMonitor(REQUIRED_B10FS_SPACE_MB, b10fs_dir)
236
+ space_monitor.start()
237
+
238
+ with tempfile.NamedTemporaryFile(
239
+ suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
240
+ ) as f:
241
+ local_temp = Path(f.name)
242
+ logger.debug(f"Created local temp file for archive: {local_temp}")
243
+
244
+ try:
245
+ with temp_file_cleanup(local_temp):
246
+ # Phase 1: Compression in separate process
247
+ logger.info(f"Starting compression: {torch_dir} -> {local_temp}")
248
+ run_monitored_process(
249
+ _cache_compression_worker,
250
+ (str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
251
+ space_monitor,
252
+ "compression",
152
253
  )
153
254
 
154
- # Fallback to direct execution without process isolation
155
- callback(source, dest, *callback_args, **callback_kwargs)
156
- logger.info("Transfer complete (unmonitored)")
157
- return TransferStatus.SUCCESS
255
+ b10fs_dir.mkdir(parents=True, exist_ok=True)
158
256
 
159
- except (FileNotFoundError, CacheFileNotFoundError) as e:
160
- logger.info(f"Transfer failed - file not found: {e}")
161
- return TransferStatus.DOES_NOT_EXIST
257
+ # Phase 2: Copy to b10fs in separate process
258
+ @critical_section_b10fs_file_lock("copy_in")
259
+ def _monitored_copy_to_b10fs():
260
+ logger.info(f"Starting copy to b10fs: {local_temp} -> {temp_file}")
261
+ run_monitored_process(
262
+ _cache_copy_worker,
263
+ (str(local_temp), str(temp_file)),
264
+ space_monitor,
265
+ "b10fs copy",
266
+ cleanup_func=lambda: safe_unlink(
267
+ temp_file, f"Failed to cleanup interrupted copy {temp_file}"
268
+ ),
269
+ )
270
+
271
+ _monitored_copy_to_b10fs()
272
+
273
+ # Phase 3: Atomic rename (fast, don't interrupt)
274
+ logger.info(
275
+ f"Renaming temp file to final cache file: {temp_file} -> {final_file}"
276
+ )
277
+ temp_file.rename(final_file)
278
+
279
+ logger.info("Cache save complete")
280
+ return SaveStatus.SUCCESS
162
281
 
163
282
  except CacheOperationInterrupted as e:
164
- logger.warning(f"Transfer interrupted: {e}")
165
- return TransferStatus.INTERRUPTED
283
+ logger.warning(f"Cache save interrupted: {e}")
284
+ return SaveStatus.ERROR
166
285
 
167
286
  finally:
168
- # Stop space monitor
169
- primary_monitor.stop()
287
+ space_monitor.stop()
288
+
289
+
290
+ @timed_fn(logger=logger, name="Transferring file")
291
+ @safe_execute("Transfer failed", TransferStatus.ERROR)
292
+ def transfer(source: str, dest: str) -> TransferStatus:
293
+ """Transfer a file from source to destination with space monitoring.
294
+
295
+ This function copies a file from source to destination using the same
296
+ monitored process approach as the cache operations. It monitors disk space
297
+ at the destination and can interrupt the transfer if space becomes insufficient.
298
+
299
+ Args:
300
+ source: Path to the source file to copy.
301
+ dest: Path to the destination where the file will be copied.
302
+
303
+ Returns:
304
+ TransferStatus:
305
+ TransferStatus.SUCCESS if transfer was successful
306
+ TransferStatus.ERROR if transfer failed due to insufficient disk space,
307
+ file not found, or other errors.
308
+
309
+ Raises:
310
+ CacheOperationInterrupted: If transfer interrupted due to insufficient
311
+ disk space (caught and returns TransferStatus.ERROR).
312
+ Exception: Any other errors during transfer (caught and returns TransferStatus.ERROR).
313
+ """
314
+ source_path = Path(source)
315
+ dest_path = Path(dest)
316
+
317
+ # Validate source file exists
318
+ if not source_path.exists():
319
+ logger.error(f"Source file does not exist: {source}")
320
+ return TransferStatus.ERROR
321
+
322
+ # Create destination directory if it doesn't exist
323
+ dest_path.parent.mkdir(parents=True, exist_ok=True)
324
+
325
+ # Determine appropriate space threshold based on destination directory
326
+ dest_dir = dest_path.parent
327
+ if str(dest_dir).startswith(B10FS_CACHE_DIR):
328
+ # Transferring to b10fs - use b10fs space requirements
329
+ space_threshold_mb = REQUIRED_B10FS_SPACE_MB
330
+ logger.debug(
331
+ f"Transfer to b10fs detected, using {space_threshold_mb:.1f}MB threshold"
332
+ )
333
+ else:
334
+ # Transferring to local directory - use local space requirements
335
+ space_threshold_mb = MIN_LOCAL_SPACE_MB
336
+ logger.debug(
337
+ f"Transfer to local directory detected, using {space_threshold_mb:.1f}MB threshold"
338
+ )
339
+
340
+ # Initial disk space check
341
+ check_sufficient_disk_space(dest_dir, space_threshold_mb, "file transfer")
342
+ logger.debug(
343
+ f"Initial space check passed: {space_threshold_mb:.1f}MB required at destination"
344
+ )
345
+
346
+ # Start background space monitoring for destination directory
347
+ space_monitor = CacheSpaceMonitor(space_threshold_mb, dest_dir)
348
+ space_monitor.start()
349
+
350
+ try:
351
+ # Run monitored copy process
352
+ logger.info(f"Starting transfer: {source} -> {dest}")
353
+ run_monitored_process(
354
+ _cache_copy_worker,
355
+ (str(source_path), str(dest_path)),
356
+ space_monitor,
357
+ "file transfer",
358
+ cleanup_func=lambda: safe_unlink(
359
+ dest_path, f"Failed to cleanup interrupted transfer {dest_path}"
360
+ ),
361
+ )
362
+
363
+ logger.info("File transfer complete")
364
+ return TransferStatus.SUCCESS
365
+
366
+ except CacheOperationInterrupted as e:
367
+ logger.warning(f"File transfer interrupted: {e}")
368
+ return TransferStatus.ERROR
369
+
370
+ finally:
371
+ space_monitor.stop()
372
+
373
+
374
+ @safe_execute("Clear failed", False)
375
+ def clear_local_cache() -> bool:
376
+ """Clear the local PyTorch compilation cache directory.
377
+
378
+ This function removes the entire local torch cache directory and all its
379
+ contents. This is useful for cleaning up disk space or forcing recompilation.
380
+
381
+ Returns:
382
+ bool: True if cache was successfully cleared or didn't exist, False if
383
+ clearing failed due to permissions or other filesystem errors.
384
+
385
+ Raises:
386
+ Exception: Any errors during directory removal (caught and returns False).
387
+ """
388
+ torch_dir = Path(TORCH_CACHE_DIR)
389
+ if not torch_dir.exists():
390
+ return True
391
+ shutil.rmtree(torch_dir)
392
+ return True
393
+
394
+
395
+ @worker_process("Compression was cancelled before starting")
396
+ def _cache_compression_worker(
397
+ torch_dir_str: str, local_temp_str: str, max_size_mb: int
398
+ ) -> None:
399
+ """Worker process that handles cache compression.
400
+
401
+ This function runs in a separate process to compress the torch cache directory
402
+ into an archive. It can be terminated externally if disk space becomes insufficient.
403
+
404
+ Args:
405
+ torch_dir_str: String path to the torch cache directory to compress.
406
+ local_temp_str: String path where the compressed archive will be created.
407
+ max_size_mb: Maximum allowed archive size in megabytes.
408
+ """
409
+ torch_dir = Path(torch_dir_str)
410
+ local_temp = Path(local_temp_str)
411
+
412
+ # Import here to avoid issues with multiprocessing
413
+ from .archive import create_archive
414
+
415
+ create_archive(torch_dir, local_temp, max_size_mb)
416
+
417
+
418
+ @worker_process("Copy was cancelled before starting")
419
+ def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
420
+ """Worker process that handles file copy to b10fs.
421
+
422
+ This function runs in a separate process to copy the compressed cache file
423
+ to the b10fs filesystem. It can be terminated externally if disk space becomes insufficient.
424
+
425
+ Args:
426
+ source_path_str: String path to the source file to copy.
427
+ dest_path_str: String path where the file will be copied.
428
+ """
429
+ source_path = Path(source_path_str)
430
+ dest_path = Path(dest_path_str)
431
+
432
+ shutil.copy2(source_path, dest_path)
433
+
434
+
435
+ @worker_process("Copy from b10fs was cancelled before starting")
436
+ def _cache_copy_from_b10fs_worker(source_path_str: str, dest_path_str: str) -> None:
437
+ """Worker process that handles file copy from b10fs to local machine.
438
+
439
+ This function runs in a separate process to copy the cache file from b10fs
440
+ to the local filesystem. It can be terminated externally if local disk space becomes insufficient.
441
+
442
+ Args:
443
+ source_path_str: String path to the source file in b10fs to copy.
444
+ dest_path_str: String path where the file will be copied locally.
445
+ """
446
+ source_path = Path(source_path_str)
447
+ dest_path = Path(dest_path_str)
448
+
449
+ shutil.copy2(source_path, dest_path)
450
+
451
+
452
+ def _cleanup_torch_dir(torch_dir: Path) -> None:
453
+ """Helper function to safely cleanup torch directory during interrupted extraction."""
454
+ try:
455
+ if torch_dir.exists():
456
+ shutil.rmtree(torch_dir)
457
+ logger.debug(f"Cleaned up torch directory: {torch_dir}")
458
+ except Exception as e:
459
+ logger.error(f"Failed to cleanup torch directory {torch_dir}: {e}")
460
+
461
+
462
+ @worker_process("Extraction was cancelled before starting")
463
+ def _cache_extract_worker(archive_path_str: str, dest_dir_str: str) -> None:
464
+ """Worker process that handles archive extraction.
465
+
466
+ This function runs in a separate process to extract the cache archive to
467
+ the torch cache directory. It can be terminated externally if local disk space becomes insufficient.
468
+
469
+ Args:
470
+ archive_path_str: String path to the archive file to extract.
471
+ dest_dir_str: String path to the directory where archive will be extracted.
472
+ """
473
+ archive_path = Path(archive_path_str)
474
+ dest_dir = Path(dest_dir_str)
475
+
476
+ # Import here to avoid issues with multiprocessing
477
+ from .archive import extract_archive
478
+
479
+ extract_archive(archive_path, dest_dir)
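
In 0.1.3 the generic callback-driven transfer() is replaced by dedicated load_compile_cache() and save_compile_cache() entry points plus a simple two-argument transfer(). A minimal usage sketch of the new surface, assuming these names are re-exported from the b10_transfer package root (the toy model and serving flow here are hypothetical):

import logging

import torch
import torch.nn as nn

from b10_transfer import LoadStatus, load_compile_cache, save_compile_cache

logging.basicConfig(level=logging.INFO)

# Warm the local torch cache from b10fs before the first compile.
if load_compile_cache() == LoadStatus.DOES_NOT_EXIST:
    logging.info("No shared compile cache yet; first request compiles cold")

model = torch.compile(nn.Linear(8, 8))  # placeholder for a real model
model(torch.randn(1, 8))  # triggers compilation, populating the torch cache

# Publish the cache for other pods. SKIPPED means there was nothing to
# save or an identical cache already exists in b10fs.
status = save_compile_cache()
logging.info("Save finished: %s", status)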
b10_transfer/space_monitor.py CHANGED
@@ -24,12 +24,6 @@ class CacheOperationInterrupted(Exception):
     pass


-class CacheFileNotFoundError(Exception):
-    """Raised when a cache file is not found during transfer operations."""
-
-    pass
-
-
 def worker_process(cancelled_message: str):
     """Decorator for worker process functions to handle common try/catch/result_queue pattern.

@@ -66,8 +60,6 @@ def worker_process(cancelled_message: str):
             # If we get here, the function completed successfully
             result_queue.put((WorkerStatus.SUCCESS.value, None))

-        except FileNotFoundError as e:
-            result_queue.put((WorkerStatus.FILE_NOT_FOUND.value, str(e)))
         except Exception as e:
             result_queue.put((WorkerStatus.ERROR.value, str(e)))

@@ -294,11 +286,6 @@ def run_monitored_process(
         if cleanup_func:
             cleanup_func()
         raise CacheOperationInterrupted(error_msg)
-    elif status == WorkerStatus.FILE_NOT_FOUND.value:
-        logger.info(
-            f"{operation_name} worker failed - file not found: {error_msg}"
-        )
-        raise CacheFileNotFoundError(error_msg)
     # status == WorkerStatus.SUCCESS.value - continue normally

     logger.debug(f"{operation_name} completed successfully")