b10-transfer 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- b10_transfer/__init__.py +4 -32
- b10_transfer/cleanup.py +1 -1
- b10_transfer/constants.py +2 -23
- b10_transfer/core.py +343 -118
- b10_transfer/space_monitor.py +1 -14
- b10_transfer-0.1.2.dist-info/METADATA +127 -0
- b10_transfer-0.1.2.dist-info/RECORD +12 -0
- b10_transfer/async_torch_cache.py +0 -62
- b10_transfer/async_transfers.py +0 -283
- b10_transfer/torch_cache.py +0 -388
- b10_transfer-0.1.0.dist-info/METADATA +0 -219
- b10_transfer-0.1.0.dist-info/RECORD +0 -15
- {b10_transfer-0.1.0.dist-info → b10_transfer-0.1.2.dist-info}/WHEEL +0 -0
b10_transfer/torch_cache.py
DELETED
@@ -1,388 +0,0 @@
|
|
1
|
-
"""PyTorch compilation cache management using the generic transfer system.
|
2
|
-
|
3
|
-
This module provides torch-specific cache operations (save/load) that use the
|
4
|
-
generic transfer infrastructure from core.py. It handles the torch-specific
|
5
|
-
logic like compression, extraction, and file naming while delegating the
|
6
|
-
robust transfer operations to the core transfer function.
|
7
|
-
"""
|
8
|
-
|
9
|
-
import os
|
10
|
-
import logging
|
11
|
-
import tempfile
|
12
|
-
import shutil
|
13
|
-
from pathlib import Path
|
14
|
-
|
15
|
-
from .core import transfer
|
16
|
-
from .environment import get_cache_filename
|
17
|
-
from .archive import create_archive, extract_archive
|
18
|
-
from .utils import (
|
19
|
-
timed_fn,
|
20
|
-
critical_section_b10fs_file_lock,
|
21
|
-
safe_execute,
|
22
|
-
temp_file_cleanup,
|
23
|
-
safe_unlink,
|
24
|
-
)
|
25
|
-
from .space_monitor import worker_process
|
26
|
-
from .constants import (
|
27
|
-
TORCH_CACHE_DIR,
|
28
|
-
B10FS_CACHE_DIR,
|
29
|
-
LOCAL_WORK_DIR,
|
30
|
-
MAX_CACHE_SIZE_MB,
|
31
|
-
CACHE_FILE_EXTENSION,
|
32
|
-
CACHE_LATEST_SUFFIX,
|
33
|
-
CACHE_INCOMPLETE_SUFFIX,
|
34
|
-
LoadStatus,
|
35
|
-
SaveStatus,
|
36
|
-
TransferStatus,
|
37
|
-
)
|
38
|
-
|
39
|
-
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
40
|
-
|
41
|
-
|
42
|
-
def torch_cache_save_callback(
    source_dir: Path, dest_file: Path, max_size_mb: int = None, *args, **kwargs
) -> None:
    """Callback for saving the torch cache: compress locally, then copy to b10fs.

    Steps performed:
    1. Compress ``source_dir`` into a temporary archive under LOCAL_WORK_DIR.
    2. Copy the archive next to ``dest_file`` under an "incomplete" name and
       rename it to ``dest_file`` once the copy has finished.

    Args:
        source_dir: Path to the torch cache directory to compress.
        dest_file: Path to the final cache file in b10fs.
        max_size_mb: Maximum allowed archive size in megabytes. Falls back to
            MAX_CACHE_SIZE_MB when None.
        *args: Additional positional arguments from the transfer system (ignored).
        **kwargs: Additional keyword arguments from the transfer system (ignored).

    Raises:
        Exception: Any error from compression, copy, or rename is re-raised
            after the partially written b10fs temp file has been removed.
    """
    # max_size_mb is a named parameter, so it can never also arrive through
    # **kwargs; the module default is the only possible fallback.
    if max_size_mb is None:
        max_size_mb = MAX_CACHE_SIZE_MB

    work_dir = Path(LOCAL_WORK_DIR)

    # Reserve a unique local file name; the handle is closed immediately and
    # the file is filled in by create_archive below.
    with tempfile.NamedTemporaryFile(
        suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
    ) as f:
        temp_archive = Path(f.name)

    logger.debug(f"Created temporary archive: {temp_archive}")

    # Tracks the b10fs temp file created by *this* call so that the failure
    # path only removes what it owns.
    temp_dest = None

    try:
        with temp_file_cleanup(temp_archive):
            # Step 1: Compress torch cache to temporary archive
            logger.info(f"Compressing torch cache: {source_dir} -> {temp_archive}")
            create_archive(source_dir, temp_archive, max_size_mb)

            # Step 2: Copy to b10fs under a temporary name, then rename
            b10fs_dir = dest_file.parent
            b10fs_dir.mkdir(parents=True, exist_ok=True)

            cache_filename = get_cache_filename()
            temp_dest = (
                b10fs_dir
                / f"{cache_filename}{CACHE_INCOMPLETE_SUFFIX}{CACHE_FILE_EXTENSION}"
            )

            logger.info(f"Copying to b10fs: {temp_archive} -> {temp_dest}")

            @critical_section_b10fs_file_lock("copy_in")
            def _atomic_copy_to_b10fs():
                shutil.copy2(temp_archive, temp_dest)
                # Rename to final destination once the copy is complete
                logger.info(f"Atomic rename: {temp_dest} -> {dest_file}")
                temp_dest.rename(dest_file)

            _atomic_copy_to_b10fs()

    except Exception:
        # Remove only the incomplete file this call may have left behind.
        # Globbing for every "*incomplete*" file in the shared directory could
        # delete an upload that another writer still has in flight.
        if temp_dest is not None and temp_dest.exists():
            safe_unlink(temp_dest, f"Failed to cleanup incomplete file {temp_dest}")
        raise
|
106
|
-
|
107
|
-
|
108
|
-
def torch_cache_load_callback(
    source_file: Path, dest_dir: Path, *args, **kwargs
) -> None:
    """Callback for loading the torch cache: copy from b10fs, then extract.

    The cache file is first copied to a temporary file under LOCAL_WORK_DIR
    and the local copy is then extracted into ``dest_dir``.

    Args:
        source_file: Path to the cache file in b10fs.
        dest_dir: Directory the archive is extracted into.
        *args: Additional positional arguments from the transfer system (ignored).
        **kwargs: Additional keyword arguments from the transfer system (ignored).

    Raises:
        FileNotFoundError: If ``source_file`` does not exist at copy time.
        Exception: Any other failure is re-raised after ``dest_dir`` (if it
            exists) has been removed on a best-effort basis.
    """
    scratch_dir = Path(LOCAL_WORK_DIR)

    # Reserve a unique local file name for the downloaded archive.
    with tempfile.NamedTemporaryFile(
        suffix=CACHE_FILE_EXTENSION, dir=scratch_dir, delete=False
    ) as handle:
        temp_archive = Path(handle.name)

    logger.debug(f"Created temporary file for cache copy: {temp_archive}")

    try:
        with temp_file_cleanup(temp_archive):

            @critical_section_b10fs_file_lock("copy_out")
            def _copy_from_b10fs():
                # Stage 1: pull the archive from b10fs into the local temp file.
                logger.info(f"Copying from b10fs: {source_file} -> {temp_archive}")
                if not source_file.exists():
                    raise FileNotFoundError(f"Cache file not found: {source_file}")
                shutil.copy2(source_file, temp_archive)

            _copy_from_b10fs()

            # Stage 2: unpack the local copy into the destination directory.
            logger.info(f"Extracting archive: {temp_archive} -> {dest_dir}")
            extract_archive(temp_archive, dest_dir)

    except Exception:
        # Nothing was written to the destination: propagate unchanged.
        if not dest_dir.exists():
            raise
        # Otherwise drop the possibly half-extracted directory first.
        try:
            shutil.rmtree(dest_dir)
            logger.debug(f"Cleaned up partial torch directory: {dest_dir}")
        except Exception as cleanup_error:
            logger.error(
                f"Failed to cleanup torch directory {dest_dir}: {cleanup_error}"
            )
        raise
|
160
|
-
|
161
|
-
|
162
|
-
@timed_fn(logger=logger, name="Loading compile cache")
@safe_execute("Load failed", LoadStatus.ERROR)
def load_compile_cache() -> LoadStatus:
    """Load the PyTorch compilation cache from b10fs into the local torch cache dir.

    Looks up the cache archive for the current environment in B10FS_CACHE_DIR
    and, unless TORCH_CACHE_DIR already has content, hands the copy-and-extract
    work to ``transfer`` with ``torch_cache_load_callback``.

    Returns:
        LoadStatus:
            LoadStatus.DOES_NOT_EXIST if no cache file was found in b10fs.
            LoadStatus.SKIPPED if the local torch cache directory is non-empty.
            LoadStatus.SUCCESS if the transfer reported success.
            LoadStatus.ERROR for any other transfer status. The
            ``safe_execute`` decorator is presumably what converts raised
            exceptions into this value - confirm in utils.
    """
    remote_dir = Path(B10FS_CACHE_DIR)
    local_dir = Path(TORCH_CACHE_DIR)

    archive_name = (
        f"{get_cache_filename()}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
    )
    cache_file = remote_dir / archive_name
    logger.debug(f"Looking for cache file: {cache_file}")

    if not cache_file.exists():
        logger.info("No cache file found in b10fs")
        return LoadStatus.DOES_NOT_EXIST

    # A non-empty local directory is treated as "already loaded".
    if local_dir.exists() and any(local_dir.iterdir()):
        logger.info("Torch cache already loaded, skipping extraction")
        return LoadStatus.SKIPPED

    outcome = transfer(
        source=cache_file,
        dest=local_dir,
        callback=torch_cache_load_callback,
        monitor_local=True,
        monitor_b10fs=False,  # reads do not consume b10fs space
    )

    # Map the generic transfer status onto the load-specific enum.
    if outcome != TransferStatus.SUCCESS:
        logger.error(f"Cache load failed with status: {outcome}")
        return LoadStatus.ERROR

    logger.info("Cache load complete")
    return LoadStatus.SUCCESS
|
219
|
-
|
220
|
-
|
221
|
-
@timed_fn(logger=logger, name="Saving compile cache")
@safe_execute("Save failed", SaveStatus.ERROR)
def save_compile_cache() -> SaveStatus:
    """Save the local PyTorch compilation cache to b10fs.

    Archives TORCH_CACHE_DIR and stores it in B10FS_CACHE_DIR under the
    environment-specific file name, delegating the compress-and-copy work to
    ``transfer`` with ``torch_cache_save_callback``.

    Returns:
        SaveStatus:
            SaveStatus.SKIPPED if the local torch cache directory is missing
            or empty, or if the cache file already exists in b10fs.
            SaveStatus.SUCCESS if the transfer reported success.
            SaveStatus.ERROR if the transfer was interrupted or reported any
            other status. The ``safe_execute`` decorator is presumably what
            converts raised exceptions into this value - confirm in utils.
    """
    b10fs_dir = Path(B10FS_CACHE_DIR)
    torch_dir = Path(TORCH_CACHE_DIR)

    # Nothing to save when the local cache directory is missing or empty.
    if not torch_dir.exists() or not any(torch_dir.iterdir()):
        logger.info("No torch cache to save")
        return SaveStatus.SKIPPED

    cache_filename = get_cache_filename()
    final_file = (
        b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
    )

    # An existing cache for this environment is never overwritten.
    if final_file.exists():
        logger.info("Cache already exists in b10fs, skipping save")
        return SaveStatus.SKIPPED

    result = transfer(
        source=torch_dir,
        dest=final_file,
        callback=torch_cache_save_callback,
        max_size_mb=MAX_CACHE_SIZE_MB,
        monitor_local=True,
        monitor_b10fs=True,
    )

    # Map the generic transfer status onto the save-specific enum.
    if result == TransferStatus.SUCCESS:
        logger.info("Cache save complete")
        return SaveStatus.SUCCESS

    # Both remaining outcomes map to ERROR; only the log message differs.
    if result == TransferStatus.INTERRUPTED:
        logger.warning("Cache save interrupted due to insufficient disk space")
    else:
        logger.error(f"Cache save failed with status: {result}")
    return SaveStatus.ERROR
|
283
|
-
|
284
|
-
|
285
|
-
@safe_execute("Clear failed", False)
def clear_local_cache() -> bool:
    """Remove the local PyTorch compilation cache directory and its contents.

    Returns:
        bool: True when the directory was removed or did not exist. The
        ``safe_execute`` decorator is presumably what yields False when
        removal raises - confirm in utils.
    """
    cache_dir = Path(TORCH_CACHE_DIR)
    if cache_dir.exists():
        shutil.rmtree(cache_dir)
    return True
|
304
|
-
|
305
|
-
|
306
|
-
# Worker functions for backward compatibility with existing monitored process system
|
307
|
-
# These are used if someone wants to use the old worker-based approach
|
308
|
-
|
309
|
-
|
310
|
-
@worker_process("Compression was cancelled before starting")
def _cache_compression_worker(
    torch_dir_str: str, local_temp_str: str, max_size_mb: int
) -> None:
    """Compress the torch cache directory into an archive.

    Paths are taken as strings and converted to ``Path`` objects here.
    NOTE(review): presumably strings are used so the arguments can cross a
    process boundary set up by ``worker_process`` - confirm in space_monitor.

    Args:
        torch_dir_str: Path of the torch cache directory to compress.
        local_temp_str: Path where the archive is created.
        max_size_mb: Maximum allowed archive size in megabytes.
    """
    create_archive(Path(torch_dir_str), Path(local_temp_str), max_size_mb)
|
328
|
-
|
329
|
-
|
330
|
-
@worker_process("Copy was cancelled before starting")
def _cache_copy_worker(source_path_str: str, dest_path_str: str) -> None:
    """Copy a file (with metadata, via ``shutil.copy2``) to its destination.

    Per its cancellation message and name, this is the copy-to-b10fs worker.

    Args:
        source_path_str: Path of the file to copy.
        dest_path_str: Path the file is copied to.
    """
    shutil.copy2(Path(source_path_str), Path(dest_path_str))
|
345
|
-
|
346
|
-
|
347
|
-
@worker_process("Copy from b10fs was cancelled before starting")
def _cache_copy_from_b10fs_worker(source_path_str: str, dest_path_str: str) -> None:
    """Copy a cache file from b10fs to a local path using ``shutil.copy2``.

    Args:
        source_path_str: Path of the source file in b10fs.
        dest_path_str: Local path the file is copied to.
    """
    shutil.copy2(Path(source_path_str), Path(dest_path_str))
|
362
|
-
|
363
|
-
|
364
|
-
@worker_process("Extraction was cancelled before starting")
def _cache_extract_worker(archive_path_str: str, dest_dir_str: str) -> None:
    """Extract a cache archive into the destination directory.

    Args:
        archive_path_str: Path of the archive file to extract.
        dest_dir_str: Directory the archive is extracted into.
    """
    extract_archive(Path(archive_path_str), Path(dest_dir_str))
|
379
|
-
|
380
|
-
|
381
|
-
def _cleanup_torch_dir(torch_dir: Path) -> None:
    """Best-effort removal of the torch directory; failures are logged, not raised."""
    try:
        if not torch_dir.exists():
            return
        shutil.rmtree(torch_dir)
        logger.debug(f"Cleaned up torch directory: {torch_dir}")
    except Exception as err:
        logger.error(f"Failed to cleanup torch directory {torch_dir}: {err}")
|
@@ -1,219 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.3
|
2
|
-
Name: b10-transfer
|
3
|
-
Version: 0.1.0
|
4
|
-
Summary: Distributed PyTorch compilation cache for Baseten - Environment-aware, lock-free compilation cache management
|
5
|
-
License: MIT
|
6
|
-
Keywords: pytorch,torch.compile,cache,machine-learning,inference
|
7
|
-
Author: Shounak Ray
|
8
|
-
Author-email: shounak.noreply@baseten.co
|
9
|
-
Maintainer: Fred Liu
|
10
|
-
Maintainer-email: fred.liu.noreply@baseten.co
|
11
|
-
Requires-Python: >=3.9,<4.0
|
12
|
-
Classifier: Development Status :: 4 - Beta
|
13
|
-
Classifier: Intended Audience :: Developers
|
14
|
-
Classifier: License :: OSI Approved :: MIT License
|
15
|
-
Classifier: Programming Language :: Python :: 3
|
16
|
-
Classifier: Programming Language :: Python :: 3.9
|
17
|
-
Classifier: Programming Language :: Python :: 3.10
|
18
|
-
Classifier: Programming Language :: Python :: 3.11
|
19
|
-
Classifier: Programming Language :: Python :: 3.12
|
20
|
-
Classifier: Programming Language :: Python :: 3.13
|
21
|
-
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
22
|
-
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
23
|
-
Requires-Dist: torch (>=2.0.0)
|
24
|
-
Requires-Dist: triton (>=2.0.0)
|
25
|
-
Project-URL: Documentation, https://docs.baseten.co/development/model/b10-transfer
|
26
|
-
Project-URL: Homepage, https://docs.baseten.co/development/model/b10-transfer
|
27
|
-
Project-URL: Repository, https://pypi.org/project/b10-transfer/
|
28
|
-
Description-Content-Type: text/markdown
|
29
|
-
|
30
|
-
https://www.notion.so/ml-infra/mega-base-cache-24291d247273805b8e20fe26677b7b0f
|
31
|
-
|
32
|
-
# B10 Transfer
|
33
|
-
|
34
|
-
PyTorch compilation cache for Baseten deployments.
|
35
|
-
|
36
|
-
## Usage
|
37
|
-
|
38
|
-
### Synchronous Operations (Blocking)
|
39
|
-
|
40
|
-
```python
|
41
|
-
import b10_transfer
|
42
|
-
|
43
|
-
# Inside model.load() function
|
44
|
-
def load():
|
45
|
-
# Load cache before torch.compile()
|
46
|
-
status = b10_transfer.load_compile_cache()
|
47
|
-
|
48
|
-
# ...
|
49
|
-
|
50
|
-
# Your model compilation
|
51
|
-
model = torch.compile(model)
|
52
|
-
# Warm up the model with dummy prompts and the arguments typically used in your requests (e.g., resolutions)
|
53
|
-
dummy_input = "What is the capital of France?"
|
54
|
-
model(dummy_input)
|
55
|
-
|
56
|
-
# ...
|
57
|
-
|
58
|
-
# Save cache after compilation
|
59
|
-
if status != b10_transfer.LoadStatus.SUCCESS:
|
60
|
-
b10_transfer.save_compile_cache()
|
61
|
-
```
|
62
|
-
|
63
|
-
### Asynchronous Operations (Non-blocking)
|
64
|
-
|
65
|
-
```python
|
66
|
-
import b10_transfer
|
67
|
-
|
68
|
-
def load_with_async_cache():
|
69
|
-
# Start async cache load (returns immediately with operation ID)
|
70
|
-
operation_id = b10_transfer.load_compile_cache_async()
|
71
|
-
|
72
|
-
# Check status periodically
|
73
|
-
while not b10_transfer.is_transfer_complete(operation_id):
|
74
|
-
status = b10_transfer.get_transfer_status(operation_id)
|
75
|
-
print(f"Cache load status: {status.status}")
|
76
|
-
time.sleep(1)
|
77
|
-
|
78
|
-
# Get final status
|
79
|
-
final_status = b10_transfer.get_transfer_status(operation_id)
|
80
|
-
if final_status.status == b10_transfer.AsyncTransferStatus.SUCCESS:
|
81
|
-
print("Cache loaded successfully!")
|
82
|
-
|
83
|
-
# Your model compilation...
|
84
|
-
model = torch.compile(model)
|
85
|
-
|
86
|
-
# Async save
|
87
|
-
save_op_id = b10_transfer.save_compile_cache_async()
|
88
|
-
|
89
|
-
# You can continue with other work while save happens in background
|
90
|
-
# Or wait for completion if needed
|
91
|
-
b10_transfer.wait_for_completion(save_op_id, timeout=300) # 5 minute timeout
|
92
|
-
|
93
|
-
# With progress callback
|
94
|
-
def on_progress(operation_id: str):
|
95
|
-
status = b10_transfer.get_transfer_status(operation_id)
|
96
|
-
print(f"Transfer {operation_id}: {status.status}")
|
97
|
-
|
98
|
-
operation_id = b10_transfer.load_compile_cache_async(progress_callback=on_progress)
|
99
|
-
```
|
100
|
-
|
101
|
-
### Generic Async Operations
|
102
|
-
|
103
|
-
You can also use the generic async system for custom transfer operations:
|
104
|
-
|
105
|
-
```python
|
106
|
-
import b10_transfer
|
107
|
-
from pathlib import Path
|
108
|
-
|
109
|
-
def my_custom_callback(source: Path, dest: Path):
|
110
|
-
# Your custom transfer logic here
|
111
|
-
# This could be any file operation, compression, etc.
|
112
|
-
shutil.copy2(source, dest)
|
113
|
-
|
114
|
-
# Start a generic async transfer
|
115
|
-
operation_id = b10_transfer.start_transfer_async(
|
116
|
-
source=Path("/source/file.txt"),
|
117
|
-
dest=Path("/dest/file.txt"),
|
118
|
-
callback=my_custom_callback,
|
119
|
-
operation_name="custom_file_copy",
|
120
|
-
monitor_local=True,
|
121
|
-
monitor_b10fs=False
|
122
|
-
)
|
123
|
-
|
124
|
-
# Use the same progress tracking as torch cache operations
|
125
|
-
b10_transfer.wait_for_completion(operation_id)
|
126
|
-
```
|
127
|
-
|
128
|
-
## Configuration
|
129
|
-
|
130
|
-
Configure via environment variables:
|
131
|
-
|
132
|
-
```bash
|
133
|
-
# Cache directories
|
134
|
-
export TORCH_CACHE_DIR="/tmp/torchinductor_root" # Default
|
135
|
-
export B10FS_CACHE_DIR="/cache/model/compile_cache" # Default
|
136
|
-
export LOCAL_WORK_DIR="/app" # Default
|
137
|
-
|
138
|
-
# Cache limits
|
139
|
-
export MAX_CACHE_SIZE_MB="1024" # 1GB default
|
140
|
-
```
|
141
|
-
|
142
|
-
## How It Works
|
143
|
-
|
144
|
-
### Environment-Specific Caching
|
145
|
-
|
146
|
-
The library automatically creates unique cache keys based on your environment:
|
147
|
-
|
148
|
-
```
|
149
|
-
torch-2.1.0_cuda-12.1_cc-8.6_triton-2.1.0 → cache_a1b2c3d4e5f6.latest.tar.gz
|
150
|
-
torch-2.0.1_cuda-11.8_cc-7.5_triton-2.0.1 → cache_x9y8z7w6v5u4.latest.tar.gz
|
151
|
-
torch-2.1.0_cpu_triton-none → cache_m1n2o3p4q5r6.latest.tar.gz
|
152
|
-
```
|
153
|
-
|
154
|
-
**Components used:**
|
155
|
-
- **PyTorch version** (e.g., `torch-2.1.0`)
|
156
|
-
- **CUDA version** (e.g., `cuda-12.1` or `cpu`)
|
157
|
-
- **GPU compute capability** (e.g., `cc-8.6` for A10G)
|
158
|
-
- **Triton version** (e.g., `triton-2.1.0` or `triton-none`)
|
159
|
-
|
160
|
-
### Cache Workflow
|
161
|
-
|
162
|
-
1. **Load Phase** (startup): Generate environment key, check for matching cache in B10FS, extract to local directory
|
163
|
-
2. **Save Phase** (after compilation): Create archive, atomic copy to B10FS with environment-specific filename
|
164
|
-
|
165
|
-
### Lock-Free Race Prevention
|
166
|
-
|
167
|
-
Uses journal pattern with atomic filesystem operations for parallel-safe cache saves.
|
168
|
-
|
169
|
-
## API Reference
|
170
|
-
|
171
|
-
### Synchronous Functions
|
172
|
-
|
173
|
-
- `load_compile_cache() -> LoadStatus`: Load cache from B10FS for current environment
|
174
|
-
- `save_compile_cache() -> SaveStatus`: Save cache to B10FS with environment-specific filename
|
175
|
-
- `clear_local_cache() -> bool`: Clear local cache directory
|
176
|
-
- `get_cache_info() -> Dict[str, Any]`: Get cache status information for current environment
|
177
|
-
- `list_available_caches() -> Dict[str, Any]`: List all cache files with environment details
|
178
|
-
|
179
|
-
### Generic Asynchronous Functions
|
180
|
-
|
181
|
-
- `start_transfer_async(source, dest, callback, operation_name, **kwargs) -> str`: Start any async transfer operation
|
182
|
-
- `get_transfer_status(operation_id: str) -> TransferProgress`: Get current status of async operation
|
183
|
-
- `is_transfer_complete(operation_id: str) -> bool`: Check if async operation has completed
|
184
|
-
- `wait_for_completion(operation_id: str, timeout=None) -> bool`: Wait for async operation to complete
|
185
|
-
- `cancel_transfer(operation_id: str) -> bool`: Attempt to cancel running operation
|
186
|
-
- `list_active_transfers() -> Dict[str, TransferProgress]`: Get all active transfer operations
|
187
|
-
|
188
|
-
### Torch Cache Async Functions
|
189
|
-
|
190
|
-
- `load_compile_cache_async(progress_callback=None) -> str`: Start async cache load, returns operation ID
|
191
|
-
- `save_compile_cache_async(progress_callback=None) -> str`: Start async cache save, returns operation ID
|
192
|
-
|
193
|
-
### Status Enums
|
194
|
-
|
195
|
-
- `LoadStatus`: SUCCESS, ERROR, DOES_NOT_EXIST, SKIPPED
|
196
|
-
- `SaveStatus`: SUCCESS, ERROR, SKIPPED
|
197
|
-
- `AsyncTransferStatus`: NOT_STARTED, IN_PROGRESS, SUCCESS, ERROR, INTERRUPTED, CANCELLED
|
198
|
-
|
199
|
-
### Data Classes
|
200
|
-
|
201
|
-
- `TransferProgress`: Contains operation_id, status, started_at, completed_at, error_message
|
202
|
-
|
203
|
-
### Exceptions
|
204
|
-
|
205
|
-
- `CacheError`: Base exception for cache operations
|
206
|
-
- `CacheValidationError`: Path validation or compatibility check failed
|
207
|
-
- `CacheOperationInterrupted`: Operation interrupted due to insufficient disk space
|
208
|
-
|
209
|
-
## Performance Impact
|
210
|
-
|
211
|
-
### Debugging
|
212
|
-
|
213
|
-
Enable debug logging:
|
214
|
-
|
215
|
-
```python
|
216
|
-
import logging
|
217
|
-
logging.getLogger('b10_transfer').setLevel(logging.DEBUG)
|
218
|
-
```
|
219
|
-
|
@@ -1,15 +0,0 @@
|
|
1
|
-
b10_transfer/__init__.py,sha256=Z_p771iwuROcCSNWKjUZ9j-V7ICmbtwr_qet5FCsnkQ,1400
|
2
|
-
b10_transfer/archive.py,sha256=GKb0mi0-YeM7ch4FLAoOLHXw0T6LkRerYad2N2y9TYM,6400
|
3
|
-
b10_transfer/async_torch_cache.py,sha256=4hMjVR44SLlGes25e_cjgMTywFfIYjH0TnUmg9o-iyI,1903
|
4
|
-
b10_transfer/async_transfers.py,sha256=luqdIStT_j4YduImY67HvX5WDurqV9Q5RjEyMI7bh1k,9476
|
5
|
-
b10_transfer/cleanup.py,sha256=xjKStmBjaarZPxhPTT1-Ds_pvUR7kdJw5Kp19BLvzzY,6224
|
6
|
-
b10_transfer/constants.py,sha256=R2JE_634Ri_9rf8adwiAzcfiej5weAGP1x1ccSZLX8k,4829
|
7
|
-
b10_transfer/core.py,sha256=d-aaQwKYqKIafBYBNahNcnOpwcanOSrWLwdzXpjVLBs,6350
|
8
|
-
b10_transfer/environment.py,sha256=aC0biEMQrtHk0ke_3epdcq1X9J5fPmPpBVt0fH7XF2Y,5625
|
9
|
-
b10_transfer/info.py,sha256=I3iOuImZ5r6DMJTDeBtVvzlSn6IuyPJbLJYUO_OF0ks,6299
|
10
|
-
b10_transfer/space_monitor.py,sha256=G_3wLSJa7HTCihSpLoow2oKo2cARJ2PtvY1XOQZl3-s,11028
|
11
|
-
b10_transfer/torch_cache.py,sha256=e41mDdnP_h61WNwB7TG5c4a7ecw0-K63ytJiKsX0keY,14907
|
12
|
-
b10_transfer/utils.py,sha256=Stee0DFK-8MRRYNIocqaK64cJvfs4jPW3Mpx7zkWV6Y,11932
|
13
|
-
b10_transfer-0.1.0.dist-info/METADATA,sha256=wc0a--Bgr-7filvyS4uUAic9fO1JJbKqc5iNp36A-iU,7502
|
14
|
-
b10_transfer-0.1.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
15
|
-
b10_transfer-0.1.0.dist-info/RECORD,,
|
File without changes
|