b10-transfer 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
b10_transfer/__init__.py CHANGED
@@ -1,13 +1,14 @@
  """B10 Transfer - Lock-free PyTorch file transfer for Baseten."""
 
- from .core import load_compile_cache, save_compile_cache, clear_local_cache, transfer
+ from .cache import load_compile_cache, save_compile_cache, clear_local_cache
+ from .core import transfer
  from .utils import CacheError, CacheValidationError
  from .space_monitor import CacheOperationInterrupted
  from .info import get_cache_info, list_available_caches
  from .constants import OperationStatus
 
  # Version
- __version__ = "0.1.5"
+ __version__ = "0.1.7"
 
  __all__ = [
      "CacheError",
b10_transfer/cache.py ADDED
@@ -0,0 +1,361 @@
+ """Cache operations for PyTorch compilation artifacts.
+
+ This module provides functions for loading and saving PyTorch compilation cache
+ to/from b10fs shared storage using atomic operations and space monitoring.
+ """
+
+ import logging
+ import tempfile
+ from pathlib import Path
+
+ from .environment import get_cache_filename
+ from .cleanup import cooperative_cleanup_b10fs
+ from .utils import (
+     timed_fn,
+     critical_section_b10fs_file_lock,
+     safe_execute,
+     temp_file_cleanup,
+     cache_operation,
+     safe_unlink,
+ )
+ from .space_monitor import (
+     CacheSpaceMonitor,
+     CacheOperationInterrupted,
+     run_monitored_process,
+     worker_process,
+ )
+ from .constants import (
+     TORCH_CACHE_DIR,
+     B10FS_CACHE_DIR,
+     LOCAL_WORK_DIR,
+     MAX_CACHE_SIZE_MB,
+     REQUIRED_B10FS_SPACE_MB,
+     MIN_LOCAL_SPACE_MB,
+     CACHE_FILE_EXTENSION,
+     CACHE_LATEST_SUFFIX,
+     CACHE_INCOMPLETE_SUFFIX,
+     OperationStatus,
+ )
+ from .core import transfer
+
+ logger = logging.getLogger(__name__)
+
+
+ """
+ FIXME(SRAY):
+ What about the case in @b10-transfer/ where a single pod finishes an inference
+ request and the client then calls save_compile_cache, and while we are creating
+ the local archive, another inference call on the same pod kicks off and modifies
+ the torch cache? How would this be handled? Maybe just accept that the cache
+ will be recompiled/overwritten? Otherwise you'd need application-level
+ coordination to ensure the cache is not modified while we are creating the
+ archive, but that doesn't seem like a good idea in terms of adoption.
+
+ FIXME(SR):
+ More things to consider:
+ - [possible] What if b10fs dies *during* an op? Right now we only check for
+   b10fs availability at the beginning of the op... Add some constants instead
+   of just False for load().
+ - [possible, and really bad if it happens] Potential memory exhaustion during
+   compression if the cache is extremely large. A very rare edge case; higher
+   compression levels also have higher memory usage.
+ """
+
+
+ def _setup_cache_paths():
+     """Common setup for cache operations - returns paths and performs cleanup."""
+     # Cooperative cleanup of stale shared resources
+     cooperative_cleanup_b10fs()
+
+     b10fs_dir = Path(B10FS_CACHE_DIR)
+     torch_dir = Path(TORCH_CACHE_DIR)
+     work_dir = Path(LOCAL_WORK_DIR)
+
+     return b10fs_dir, torch_dir, work_dir
+
+
+ def _get_cache_file_paths(cache_filename: str, b10fs_dir: Path):
+     """Generate cache file paths for a given cache filename."""
+     final_file = (
+         b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
+     )
+     temp_file = (
+         b10fs_dir / f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
+     )
+     return final_file, temp_file
+
+
+ def _run_with_space_monitoring(
+     space_threshold_mb: float,
+     monitor_dir: Path,
+     operation_name: str,
+     worker_func,
+     worker_args: tuple,
+     cleanup_func=None,
+ ):
+     """Helper to run an operation with space monitoring."""
+     space_monitor = CacheSpaceMonitor(space_threshold_mb, monitor_dir)
+     space_monitor.start()
+
+     try:
+         logger.info(
+             f"Starting {operation_name}: {' -> '.join(str(arg) for arg in worker_args[:2])}"
+         )
+         run_monitored_process(
+             worker_func,
+             worker_args,
+             space_monitor,
+             operation_name,
+             cleanup_func=cleanup_func,
+         )
+     finally:
+         space_monitor.stop()
+
+
+ def _transfer_with_b10fs_lock(
+     source: str, dest: str, lock_type: str, cleanup_on_failure=True
+ ):
+     """Transfer a file with b10fs file locking and error handling."""
+
+     @critical_section_b10fs_file_lock(lock_type)
+     def _locked_transfer():
+         result = transfer(source, dest)
+         if result != OperationStatus.SUCCESS:
+             if cleanup_on_failure:
+                 safe_unlink(
+                     Path(dest), f"Failed to cleanup after failed transfer {dest}"
+                 )
+             raise Exception(f"Failed to transfer {source} -> {dest}")
+
+     _locked_transfer()
+
+
+ @timed_fn(logger=logger, name="Loading compile cache")
+ @safe_execute("Load failed", False)
+ def load_compile_cache() -> OperationStatus:
+     """Load PyTorch compilation cache from b10fs into the local torch cache directory.
+
+     This function implements a lock-free pattern to safely load cached PyTorch
+     compilation artifacts from the b10fs shared filesystem into the local torch
+     cache directory. It validates b10fs availability, checks for an existing
+     cache, and extracts the archive if needed.
+
+     The function monitors local disk space during both the copy from b10fs and
+     the extraction phase, interrupting operations if space falls below
+     MIN_LOCAL_SPACE_MB.
+
+     Returns:
+         OperationStatus:
+             OperationStatus.SUCCESS if the cache was successfully loaded.
+             OperationStatus.SKIPPED if the local torch cache is already populated.
+             OperationStatus.DOES_NOT_EXIST if no cache file was found in b10fs.
+             OperationStatus.ERROR if b10fs is unavailable, local disk space is
+                 insufficient, or loading failed.
+
+     Raises:
+         CacheValidationError: If b10fs is not enabled (caught and returns OperationStatus.ERROR).
+         CacheOperationInterrupted: If operations are interrupted due to insufficient
+             local disk space (caught and returns OperationStatus.ERROR).
+         Exception: Any other errors during loading (caught and returns OperationStatus.ERROR).
+     """
+     with cache_operation("Load"):
+         b10fs_dir, torch_dir, work_dir = _setup_cache_paths()
+
+         cache_filename = get_cache_filename()
+         final_file, _ = _get_cache_file_paths(cache_filename, b10fs_dir)
+         logger.debug(f"Looking for cache file: {final_file}")
+
+         if not final_file.exists():
+             logger.info("No cache file found in b10fs")
+             return OperationStatus.DOES_NOT_EXIST
+
+         # Skip if already loaded
+         if torch_dir.exists() and any(torch_dir.iterdir()):
+             logger.info("Torch cache already loaded, skipping extraction")
+             return OperationStatus.SKIPPED
+
+         # Create temp local copy
+         with tempfile.NamedTemporaryFile(
+             suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
+         ) as f:
+             temp_path = Path(f.name)
+             logger.debug(f"Created temporary file for cache: {temp_path}")
+
+         try:
+             with temp_file_cleanup(temp_path):
+                 # Phase 1: Copy from b10fs to local temp file
+                 _transfer_with_b10fs_lock(
+                     str(final_file),
+                     str(temp_path),
+                     "copy_out",
+                     cleanup_on_failure=False,
+                 )
+
+                 # Phase 2: Extract archive with space monitoring
+                 _run_with_space_monitoring(
+                     MIN_LOCAL_SPACE_MB,
+                     work_dir,
+                     "archive extraction",
+                     _cache_extract_worker,
+                     (str(temp_path), str(torch_dir)),
+                     cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
+                 )
+
+             logger.info("Cache load complete")
+             return OperationStatus.SUCCESS
+
+         except CacheOperationInterrupted as e:
+             logger.warning(f"Cache load interrupted: {e}")
+             return OperationStatus.ERROR
+
+
+ @timed_fn(logger=logger, name="Saving compile cache")
+ @safe_execute("Save failed", False)
+ def save_compile_cache() -> OperationStatus:
+     """Save the local PyTorch compilation cache to b10fs using an atomic journal pattern.
+
+     This function creates an archive of the local torch cache directory and
+     atomically saves it to b10fs using a journal pattern (write to a temp file,
+     then rename). This ensures concurrent saves don't corrupt each other.
+
+     The function validates b10fs availability, checks whether the cache already
+     exists (early exit), performs initial space checks using pre-calculated
+     requirements for concurrent saves, starts background space monitoring, then
+     runs compression and copy operations in separate worker processes that can
+     be terminated if disk space becomes insufficient, and finally performs an
+     atomic rename to the final cache file.
+
+     Returns:
+         OperationStatus:
+             OperationStatus.SUCCESS if the cache was successfully saved.
+             OperationStatus.SKIPPED if there is no local cache to save or the
+                 cache already exists in b10fs.
+             OperationStatus.ERROR if b10fs is unavailable, insufficient disk
+                 space interrupted the operation, or saving failed.
+
+     Raises:
+         CacheValidationError: If b10fs is not enabled (caught and returns OperationStatus.ERROR).
+         CacheOperationInterrupted: If operations are interrupted due to insufficient
+             disk space (caught and returns OperationStatus.ERROR).
+         ArchiveError: If archive creation fails (caught and returns OperationStatus.ERROR).
+         Exception: Any other errors during saving (caught and returns OperationStatus.ERROR).
+     """
+     with cache_operation("Save"):
+         b10fs_dir, torch_dir, work_dir = _setup_cache_paths()
+
+         # Check if anything to save
+         if not torch_dir.exists() or not any(torch_dir.iterdir()):
+             logger.info("No torch cache to save")
+             return OperationStatus.SKIPPED
+
+         cache_filename = get_cache_filename()
+         final_file, temp_file = _get_cache_file_paths(cache_filename, b10fs_dir)
+
+         # Check for existing cache first (early exit)
+         if final_file.exists():
+             logger.info("Cache already exists in b10fs, skipping save")
+             return OperationStatus.SKIPPED
+
+         with tempfile.NamedTemporaryFile(
+             suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
+         ) as f:
+             local_temp = Path(f.name)
+             logger.debug(f"Created local temp file for archive: {local_temp}")
+
+         try:
+             with temp_file_cleanup(local_temp):
+                 # Phase 1: Compression with space monitoring
+                 _run_with_space_monitoring(
+                     REQUIRED_B10FS_SPACE_MB,
+                     b10fs_dir,
+                     "compression",
+                     _cache_compression_worker,
+                     (str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
+                 )
+
+                 # Phase 2: Copy to b10fs with locking
+                 _transfer_with_b10fs_lock(
+                     str(local_temp), str(temp_file), "copy_in", cleanup_on_failure=True
+                 )
+
+                 # Phase 3: Atomic rename (fast, don't interrupt)
+                 logger.info(
+                     f"Renaming temp file to final cache file: {temp_file} -> {final_file}"
+                 )
+                 temp_file.rename(final_file)
+
+             logger.info("Cache save complete")
+             return OperationStatus.SUCCESS
+
+         except CacheOperationInterrupted as e:
+             logger.warning(f"Cache save interrupted: {e}")
+             return OperationStatus.ERROR
+
+
+ @safe_execute("Clear failed", False)
+ def clear_local_cache() -> bool:
+     """Clear the local PyTorch compilation cache directory.
+
+     This function removes the entire local torch cache directory and all its
+     contents. This is useful for cleaning up disk space or forcing recompilation.
+
+     Returns:
+         bool: True if cache was successfully cleared or didn't exist, False if
+             clearing failed due to permissions or other filesystem errors.
+
+     Raises:
+         Exception: Any errors during directory removal (caught and returns False).
+     """
+     torch_dir = Path(TORCH_CACHE_DIR)
+     if not torch_dir.exists():
+         return True
+     import shutil
+
+     shutil.rmtree(torch_dir)
+     return True
+
+
+ @worker_process("Compression was cancelled before starting")
+ def _cache_compression_worker(
+     torch_dir_str: str, local_temp_str: str, max_size_mb: int
+ ) -> None:
+     """Worker process that handles cache compression.
+
+     This function runs in a separate process to compress the torch cache directory
+     into an archive. It can be terminated externally if disk space becomes insufficient.
+
+     Args:
+         torch_dir_str: String path to the torch cache directory to compress.
+         local_temp_str: String path where the compressed archive will be created.
+         max_size_mb: Maximum allowed archive size in megabytes.
+     """
+     torch_dir = Path(torch_dir_str)
+     local_temp = Path(local_temp_str)
+
+     # Import here to avoid issues with multiprocessing
+     from .archive import create_archive
+
+     create_archive(torch_dir, local_temp, max_size_mb)
+
+
+ def _cleanup_torch_dir(torch_dir: Path) -> None:
+     """Helper function to safely cleanup torch directory during interrupted extraction."""
+     try:
+         if torch_dir.exists():
+             import shutil
+
+             shutil.rmtree(torch_dir)
+             logger.debug(f"Cleaned up torch directory: {torch_dir}")
+     except Exception as e:
+         logger.error(f"Failed to cleanup torch directory {torch_dir}: {e}")
+
+
+ @worker_process("Extraction was cancelled before starting")
+ def _cache_extract_worker(archive_path_str: str, dest_dir_str: str) -> None:
+     """Worker process that handles archive extraction.
+
+     This function runs in a separate process to extract the cache archive to
+     the torch cache directory. It can be terminated externally if local disk
+     space becomes insufficient.
+
+     Args:
+         archive_path_str: String path to the archive file to extract.
+         dest_dir_str: String path to the directory where archive will be extracted.
+     """
+     archive_path = Path(archive_path_str)
+     dest_dir = Path(dest_dir_str)
+
+     # Import here to avoid issues with multiprocessing
+     from .archive import extract_archive
+
+     extract_archive(archive_path, dest_dir)
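
Putting cache.py together: load at startup, save after the first compile. A hypothetical serving hook, hedged because the model and the b10fs mount are assumptions outside this package:

    # Sketch only: assumes b10fs is mounted/enabled and `model` is a torch module.
    import torch
    from b10_transfer import load_compile_cache, save_compile_cache, OperationStatus

    def warm_and_serve(model, batch):
        status = load_compile_cache()   # copy + extract from b10fs if present
        compiled = torch.compile(model)
        out = compiled(batch)           # first call may trigger compilation
        if status == OperationStatus.DOES_NOT_EXIST:
            # No shared cache existed; publish the artifacts this pod just built.
            save_compile_cache()
        return out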
b10_transfer/core.py CHANGED
@@ -1,19 +1,16 @@
- import os
+ """Core file transfer operations for b10-transfer.
+
+ This module provides generic file transfer functionality with space monitoring
+ and error handling for b10fs operations.
+ """
+
  import logging
- import tempfile
  import shutil
  from pathlib import Path
 
- import time
-
- from .environment import get_cache_filename
- from .cleanup import cooperative_cleanup_b10fs
  from .utils import (
      timed_fn,
-     critical_section_b10fs_file_lock,
      safe_execute,
-     temp_file_cleanup,
-     cache_operation,
      safe_unlink,
  )
  from .space_monitor import (
@@ -24,263 +21,15 @@ from .space_monitor import (
      worker_process,
  )
  from .constants import (
-     TORCH_CACHE_DIR,
      B10FS_CACHE_DIR,
-     LOCAL_WORK_DIR,
-     MAX_CACHE_SIZE_MB,
      REQUIRED_B10FS_SPACE_MB,
      MIN_LOCAL_SPACE_MB,
-     CACHE_FILE_EXTENSION,
-     CACHE_LATEST_SUFFIX,
-     CACHE_INCOMPLETE_SUFFIX,
      OperationStatus,
  )
 
  logger = logging.getLogger(__name__)
 
 
- def _setup_cache_paths():
-     """Common setup for cache operations - returns paths and performs cleanup."""
-     # Cooperative cleanup of stale shared resources
-     cooperative_cleanup_b10fs()
-
-     b10fs_dir = Path(B10FS_CACHE_DIR)
-     torch_dir = Path(TORCH_CACHE_DIR)
-     work_dir = Path(LOCAL_WORK_DIR)
-
-     return b10fs_dir, torch_dir, work_dir
-
-
- def _get_cache_file_paths(cache_filename: str, b10fs_dir: Path):
-     """Generate cache file paths for a given cache filename."""
-     final_file = (
-         b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
-     )
-     temp_file = (
-         b10fs_dir / f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
-     )
-     return final_file, temp_file
-
-
- def _run_with_space_monitoring(
-     space_threshold_mb: float,
-     monitor_dir: Path,
-     operation_name: str,
-     worker_func,
-     worker_args: tuple,
-     cleanup_func=None,
- ):
-     """Helper to run an operation with space monitoring."""
-     space_monitor = CacheSpaceMonitor(space_threshold_mb, monitor_dir)
-     space_monitor.start()
-
-     try:
-         logger.info(
-             f"Starting {operation_name}: {' -> '.join(str(arg) for arg in worker_args[:2])}"
-         )
-         run_monitored_process(
-             worker_func,
-             worker_args,
-             space_monitor,
-             operation_name,
-             cleanup_func=cleanup_func,
-         )
-     finally:
-         space_monitor.stop()
-
-
- def _transfer_with_b10fs_lock(
-     source: str, dest: str, lock_type: str, cleanup_on_failure=True
- ):
-     """Transfer a file with b10fs file locking and error handling."""
-
-     @critical_section_b10fs_file_lock(lock_type)
-     def _locked_transfer():
-         result = transfer(source, dest)
-         if result != OperationStatus.SUCCESS:
-             if cleanup_on_failure:
-                 safe_unlink(
-                     Path(dest), f"Failed to cleanup after failed transfer {dest}"
-                 )
-             raise Exception(f"Failed to transfer {source} -> {dest}")
-
-     _locked_transfer()
-
-
- @timed_fn(logger=logger, name="Loading compile cache")
- @safe_execute("Load failed", False)
- def load_compile_cache() -> OperationStatus:
-     """Load PyTorch compilation cache from b10fs to local torch cache directory.
-
-     This function implements a lock-free pattern to safely load cached PyTorch
-     compilation artifacts from the b10fs shared filesystem to the local torch
-     cache directory. It validates b10fs availability, checks for existing cache,
-     and extracts the archive if needed.
-
-     The function monitors local disk space during both the copy from b10fs and
-     extraction phases, interrupting operations if space falls below MIN_LOCAL_SPACE_MB.
-
-     Returns:
-         OperationStatus:
-             OperationStatus.SUCCESS if cache was successfully loaded
-             OperationStatus.SKIPPED if already exists
-             OperationStatus.ERROR if b10fs is unavailable, local disk space is insufficient, or loading failed.
-             OperationStatus.DOES_NOT_EXIST if no cache file was found.
-
-     Raises:
-         CacheValidationError: If b10fs is not enabled (caught and returns OperationStatus.ERROR).
-         CacheOperationInterrupted: If operations interrupted due to insufficient
-             local disk space (caught and returns OperationStatus.ERROR).
-         Exception: Any other errors during loading (caught and returns OperationStatus.ERROR).
-     """
-     with cache_operation("Load"):
-         b10fs_dir, torch_dir, work_dir = _setup_cache_paths()
-
-         cache_filename = get_cache_filename()
-         final_file, _ = _get_cache_file_paths(cache_filename, b10fs_dir)
-         logger.debug(f"Looking for cache file: {final_file}")
-
-         if not final_file.exists():
-             logger.info("No cache file found in b10fs")
-             return OperationStatus.DOES_NOT_EXIST
-
-         # Skip if already loaded
-         if torch_dir.exists() and any(torch_dir.iterdir()):
-             logger.info("Torch cache already loaded, skipping extraction")
-             return OperationStatus.SKIPPED
-
-         # Create temp local copy
-         with tempfile.NamedTemporaryFile(
-             suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
-         ) as f:
-             temp_path = Path(f.name)
-             logger.debug(f"Created temporary file for cache: {temp_path}")
-
-         try:
-             with temp_file_cleanup(temp_path):
-                 # Phase 1: Copy from b10fs to local temp file
-                 _transfer_with_b10fs_lock(
-                     str(final_file),
-                     str(temp_path),
-                     "copy_out",
-                     cleanup_on_failure=False,
-                 )
-
-                 # Phase 2: Extract archive with space monitoring
-                 _run_with_space_monitoring(
-                     MIN_LOCAL_SPACE_MB,
-                     work_dir,
-                     "archive extraction",
-                     _cache_extract_worker,
-                     (str(temp_path), str(torch_dir)),
-                     cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
-                 )
-
-             logger.info("Cache load complete")
-             return OperationStatus.SUCCESS
-
-         except CacheOperationInterrupted as e:
-             logger.warning(f"Cache load interrupted: {e}")
-             return OperationStatus.ERROR
-
-
- """
- FIXME(SRAY):
- What about the case in @b10-transfer/ where a single pod finishes an inference request,
- and then the client calls save_compile_cache. And while we are creating the local archive,
- another inference call on the same pod is kicked off, which then modifies the torch cache.
- How would this be handled? Maybe just accept that the cache will be recompiled/overwritten?
- Otherwise you'd need application level coordination to ensure that the cache is not modified
- while we are creating the archive, but this doesn't really seem like a good idea in terms of adoption.
-
- FIXME(SR):
- More things to consider:
- - [possible] What if b10fs dies *during* an op? right now we check for b10fs availability in the beginning of the op... Add some constants instead of just False for load().
- - [possible, and really bad if it happens] potential memory exhaustion during compression if the cache is super super large. very very edge case. higher compression levels also have high memory usage.
- """
-
-
- @timed_fn(logger=logger, name="Saving compile cache")
- @safe_execute("Save failed", False)
- def save_compile_cache() -> OperationStatus:
-     """Save local PyTorch compilation cache to b10fs using atomic journal pattern.
-
-     This function creates an archive of the local torch cache directory and
-     atomically saves it to b10fs using a journal pattern (write to temp file,
-     then rename). This ensures concurrent saves don't corrupt each other.
-
-     The function validates b10fs availability, checks if cache already exists
-     (early exit), performs initial space checks using pre-calculated requirements
-     for concurrent saves, starts background space monitoring, then runs compression
-     and copy operations in separate worker processes that can be terminated if disk
-     space becomes insufficient, finally performing an atomic rename to the final cache file.
-
-     Returns:
-         OperationStatus:
-             OperationStatus.SUCCESS if cache was successfully saved or already exists
-             OperationStatus.ERROR if b10fs is unavailable, insufficient disk space caused interruption,
-                 no cache exists to save, or saving failed.
-             OperationStatus.SKIPPED if no cache exists to save or cache already exists in b10fs
-
-     Raises:
-         CacheValidationError: If b10fs is not enabled (caught and returns OperationStatus.ERROR).
-         CacheOperationInterrupted: If operations interrupted due to insufficient
-             disk space (caught and returns OperationStatus.ERROR).
-         ArchiveError: If archive creation fails (caught and returns OperationStatus.ERROR).
-         Exception: Any other errors during saving (caught and returns OperationStatus.ERROR).
-     """
-     with cache_operation("Save"):
-         b10fs_dir, torch_dir, work_dir = _setup_cache_paths()
-
-         # Check if anything to save
-         if not torch_dir.exists() or not any(torch_dir.iterdir()):
-             logger.info("No torch cache to save")
-             return OperationStatus.SKIPPED
-
-         cache_filename = get_cache_filename()
-         final_file, temp_file = _get_cache_file_paths(cache_filename, b10fs_dir)
-
-         # Check for existing cache first (early exit)
-         if final_file.exists():
-             logger.info("Cache already exists in b10fs, skipping save")
-             return OperationStatus.SKIPPED
-
-         with tempfile.NamedTemporaryFile(
-             suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
-         ) as f:
-             local_temp = Path(f.name)
-             logger.debug(f"Created local temp file for archive: {local_temp}")
-
-         try:
-             with temp_file_cleanup(local_temp):
-                 # Phase 1: Compression with space monitoring
-                 _run_with_space_monitoring(
-                     REQUIRED_B10FS_SPACE_MB,
-                     b10fs_dir,
-                     "compression",
-                     _cache_compression_worker,
-                     (str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
-                 )
-
-                 # Phase 2: Copy to b10fs with locking
-                 _transfer_with_b10fs_lock(
-                     str(local_temp), str(temp_file), "copy_in", cleanup_on_failure=True
-                 )
-
-                 # Phase 3: Atomic rename (fast, don't interrupt)
-                 logger.info(
-                     f"Renaming temp file to final cache file: {temp_file} -> {final_file}"
-                 )
-                 temp_file.rename(final_file)
-
-             logger.info("Cache save complete")
-             return OperationStatus.SUCCESS
-
-         except CacheOperationInterrupted as e:
-             logger.warning(f"Cache save interrupted: {e}")
-             return OperationStatus.ERROR
-
-
  @timed_fn(logger=logger, name="Transferring file")
  @safe_execute("Transfer failed", OperationStatus.ERROR)
  def transfer(source: str, dest: str) -> OperationStatus:
@@ -365,56 +114,12 @@ def transfer(source: str, dest: str) -> OperationStatus:
          space_monitor.stop()
 
 
- @safe_execute("Clear failed", False)
- def clear_local_cache() -> bool:
-     """Clear the local PyTorch compilation cache directory.
-
-     This function removes the entire local torch cache directory and all its
-     contents. This is useful for cleaning up disk space or forcing recompilation.
-
-     Returns:
-         bool: True if cache was successfully cleared or didn't exist, False if
-             clearing failed due to permissions or other filesystem errors.
-
-     Raises:
-         Exception: Any errors during directory removal (caught and returns False).
-     """
-     torch_dir = Path(TORCH_CACHE_DIR)
-     if not torch_dir.exists():
-         return True
-     shutil.rmtree(torch_dir)
-     return True
-
-
- @worker_process("Compression was cancelled before starting")
- def _cache_compression_worker(
-     torch_dir_str: str, local_temp_str: str, max_size_mb: int
- ) -> None:
-     """Worker process that handles cache compression.
-
-     This function runs in a separate process to compress the torch cache directory
-     into an archive. It can be terminated externally if disk space becomes insufficient.
-
-     Args:
-         torch_dir_str: String path to the torch cache directory to compress.
-         local_temp_str: String path where the compressed archive will be created.
-         max_size_mb: Maximum allowed archive size in megabytes.
-     """
-     torch_dir = Path(torch_dir_str)
-     local_temp = Path(local_temp_str)
-
-     # Import here to avoid issues with multiprocessing
-     from .archive import create_archive
-
-     create_archive(torch_dir, local_temp, max_size_mb)
-
-
  @worker_process("Copy was cancelled before starting")
  def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
-     """Worker process that handles file copy to b10fs.
+     """Worker process that handles file copy operations.
 
-     This function runs in a separate process to copy the compressed cache file
-     to the b10fs filesystem. It can be terminated externally if disk space becomes insufficient.
+     This function runs in a separate process to copy files between locations.
+     It can be terminated externally if disk space becomes insufficient.
 
      Args:
          source_path_str: String path to the source file to copy.
@@ -424,33 +129,3 @@ def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
      dest_path = Path(dest_path_str)
 
      shutil.copy2(source_path, dest_path)
-
-
- def _cleanup_torch_dir(torch_dir: Path) -> None:
-     """Helper function to safely cleanup torch directory during interrupted extraction."""
-     try:
-         if torch_dir.exists():
-             shutil.rmtree(torch_dir)
-             logger.debug(f"Cleaned up torch directory: {torch_dir}")
-     except Exception as e:
-         logger.error(f"Failed to cleanup torch directory {torch_dir}: {e}")
-
-
- @worker_process("Extraction was cancelled before starting")
- def _cache_extract_worker(archive_path_str: str, dest_dir_str: str) -> None:
-     """Worker process that handles archive extraction.
-
-     This function runs in a separate process to extract the cache archive to
-     the torch cache directory. It can be terminated externally if local disk space becomes insufficient.
-
-     Args:
-         archive_path_str: String path to the archive file to extract.
-         dest_dir_str: String path to the directory where archive will be extracted.
-     """
-     archive_path = Path(archive_path_str)
-     dest_dir = Path(dest_dir_str)
-
-     # Import here to avoid issues with multiprocessing
-     from .archive import extract_archive
-
-     extract_archive(archive_path, dest_dir)
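
With the cache orchestration moved out, core.py now carries only the generic transfer() primitive and its copy worker. A hypothetical direct call (the paths are made up, and the full body of transfer() is elided from this diff):

    from b10_transfer import transfer, OperationStatus

    # transfer() runs the copy in a monitored worker process and reports a
    # status instead of raising, via @safe_execute("Transfer failed", ...).
    status = transfer("/tmp/weights.bin", "/b10fs/shared/weights.bin")
    if status != OperationStatus.SUCCESS:
        raise RuntimeError("b10fs transfer failed or was interrupted")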
b10_transfer/environment.py CHANGED
@@ -28,15 +28,14 @@ def get_cache_filename() -> str:
      """Get the cache filename prefix for the current environment.
 
      This function generates a cache filename prefix that includes the
-     environment key and hostname to ensure cache files are environment-specific
+     environment key to ensure cache files are environment-specific
      and unique per machine.
 
      Returns:
-         str: Cache filename prefix in format "cache_{environment_key}.{hostname}".
+         str: Cache filename prefix in format "cache_{environment_key}".
      """
      env_key = get_environment_key()
-     hostname = os.uname().nodename or os.getenv("HOSTNAME", UNKNOWN_HOSTNAME)
-     return f"cache_{env_key}.{hostname}"
+     return f"cache_{env_key}"
 
 
  def get_environment_key() -> str:
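
The net effect of this change: the archive name no longer embeds the pod hostname, so all pods with the same environment key share a single cache file in b10fs. A before/after sketch; the ".latest" suffix and ".tgz" extension are assumptions standing in for CACHE_LATEST_SUFFIX and CACHE_FILE_EXTENSION from constants.py:

    env_key = "a1b2c3"  # hypothetical get_environment_key() value

    old_prefix = f"cache_{env_key}.my-pod-name"   # 0.1.5: one cache per pod
    new_prefix = f"cache_{env_key}"               # 0.1.7: one cache per environment

    # Final archive in b10fs would then look like f"{new_prefix}.latest.tgz"
    # under the assumed suffix/extension values.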
{b10_transfer-0.1.5.dist-info → b10_transfer-0.1.7.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: b10-transfer
- Version: 0.1.5
+ Version: 0.1.7
  Summary: Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management
  License: MIT
  Keywords: pytorch,file-transfer,cache,machine-learning,inference
b10_transfer-0.1.7.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+ b10_transfer/__init__.py,sha256=OIS1vZWROizMUeBPNQ9efJfV2cNrBNtq68HBrryoViA,665
+ b10_transfer/archive.py,sha256=GKb0mi0-YeM7ch4FLAoOLHXw0T6LkRerYad2N2y9TYM,6400
+ b10_transfer/cache.py,sha256=B5fNCJkMIpUBwZuKMoQVbn0NeEuIrcAtYMk0gXkkOAM,13768
+ b10_transfer/cleanup.py,sha256=3RnqWNGMCcko5GQdq1Gr9VPpGzAF5J6x7xjIH9SNZ78,6226
+ b10_transfer/constants.py,sha256=iuLShDW6hInhyz2YTQ8CzBanqW4chCkQOAzPZkCtOoA,4322
+ b10_transfer/core.py,sha256=vsOcH0ve2GP-YBgHU58WgCEbx0h7dXn2R5sJErnQt8k,4437
+ b10_transfer/environment.py,sha256=LEos7wCt1KfQYGkl3XSUN-WsxCODZFop4AgMJuYw_cE,5512
+ b10_transfer/info.py,sha256=I3iOuImZ5r6DMJTDeBtVvzlSn6IuyPJbLJYUO_OF0ks,6299
+ b10_transfer/space_monitor.py,sha256=C_CKDH43bNsWdq60WStSZ3c_nQkWvScQmqU_SYHesew,10531
+ b10_transfer/utils.py,sha256=Stee0DFK-8MRRYNIocqaK64cJvfs4jPW3Mpx7zkWV6Y,11932
+ b10_transfer-0.1.7.dist-info/METADATA,sha256=M9fliL91oazjodOAy7-FoBPAgdZ87Qhj2r5tplLttrM,4108
+ b10_transfer-0.1.7.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+ b10_transfer-0.1.7.dist-info/RECORD,,
b10_transfer-0.1.5.dist-info/RECORD DELETED
@@ -1,12 +0,0 @@
- b10_transfer/__init__.py,sha256=S7FJhF-BMPbXBqjQbXIj_Dl0r0-kSQD91l-wSD7COTI,647
- b10_transfer/archive.py,sha256=GKb0mi0-YeM7ch4FLAoOLHXw0T6LkRerYad2N2y9TYM,6400
- b10_transfer/cleanup.py,sha256=3RnqWNGMCcko5GQdq1Gr9VPpGzAF5J6x7xjIH9SNZ78,6226
- b10_transfer/constants.py,sha256=iuLShDW6hInhyz2YTQ8CzBanqW4chCkQOAzPZkCtOoA,4322
- b10_transfer/core.py,sha256=ScD-O9ot-ciVhX121fCchDu8_vM7izSFMxPtenU5RS0,17381
- b10_transfer/environment.py,sha256=aC0biEMQrtHk0ke_3epdcq1X9J5fPmPpBVt0fH7XF2Y,5625
- b10_transfer/info.py,sha256=I3iOuImZ5r6DMJTDeBtVvzlSn6IuyPJbLJYUO_OF0ks,6299
- b10_transfer/space_monitor.py,sha256=C_CKDH43bNsWdq60WStSZ3c_nQkWvScQmqU_SYHesew,10531
- b10_transfer/utils.py,sha256=Stee0DFK-8MRRYNIocqaK64cJvfs4jPW3Mpx7zkWV6Y,11932
- b10_transfer-0.1.5.dist-info/METADATA,sha256=lPyE9VvN1srmMZxn7tCOPEEkPHRZE8JsdAVIOeW2SPs,4108
- b10_transfer-0.1.5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
- b10_transfer-0.1.5.dist-info/RECORD,,