b10-transfer 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- b10_transfer/__init__.py +1 -1
- b10_transfer/archive.py +3 -4
- b10_transfer/cache.py +8 -12
- b10_transfer/cache_cli.py +4 -0
- b10_transfer/cleanup.py +9 -9
- b10_transfer/config.py +158 -0
- b10_transfer/constants.py +18 -101
- b10_transfer/core.py +3 -4
- b10_transfer/environment.py +0 -2
- b10_transfer/info.py +4 -5
- b10_transfer/logging_utils.py +0 -1
- b10_transfer/space_monitor.py +0 -1
- b10_transfer/utils.py +0 -2
- {b10_transfer-0.3.1.dist-info → b10_transfer-0.3.2.dist-info}/METADATA +1 -1
- b10_transfer-0.3.2.dist-info/RECORD +17 -0
- b10_transfer-0.3.1.dist-info/RECORD +0 -16
- {b10_transfer-0.3.1.dist-info → b10_transfer-0.3.2.dist-info}/WHEEL +0 -0
- {b10_transfer-0.3.1.dist-info → b10_transfer-0.3.2.dist-info}/entry_points.txt +0 -0
b10_transfer/__init__.py
CHANGED
b10_transfer/archive.py
CHANGED
@@ -1,10 +1,9 @@
|
|
1
1
|
import os
|
2
|
-
import logging
|
3
2
|
import subprocess
|
4
3
|
from pathlib import Path
|
5
4
|
|
6
5
|
from .utils import timed_fn, safe_unlink, CacheValidationError, validate_path_security
|
7
|
-
from .constants import MAX_CACHE_SIZE_MB
|
6
|
+
from .config import config
|
8
7
|
from .logging_utils import get_b10_logger
|
9
8
|
|
10
9
|
logger = get_b10_logger(__name__)
|
@@ -62,7 +61,7 @@ def _compress_directory_to_tar(source_dir: Path, target_file: Path) -> None:
|
|
62
61
|
|
63
62
|
@timed_fn(logger=logger, name="Creating archive")
|
64
63
|
def create_archive(
|
65
|
-
source_dir: Path, target_file: Path, max_size_mb: int = MAX_CACHE_SIZE_MB
|
64
|
+
source_dir: Path, target_file: Path, max_size_mb: int = config.MAX_CACHE_SIZE_MB
|
66
65
|
) -> None:
|
67
66
|
"""Create a compressed archive with path validation and size limits.
|
68
67
|
|
@@ -75,7 +74,7 @@ def create_archive(
|
|
75
74
|
allowed directories (/tmp/ or its parent).
|
76
75
|
target_file: Path where the archive will be created. Must be within
|
77
76
|
allowed directories (/app or /cache).
|
78
|
-
max_size_mb: Maximum allowed archive size in megabytes. Defaults to MAX_CACHE_SIZE_MB.
|
77
|
+
max_size_mb: Maximum allowed archive size in megabytes. Defaults to config.MAX_CACHE_SIZE_MB.
|
79
78
|
|
80
79
|
Raises:
|
81
80
|
CacheValidationError: If paths are outside allowed directories.
|
b10_transfer/cache.py
CHANGED
@@ -26,12 +26,8 @@ from .space_monitor import (
|
|
26
26
|
run_monitored_process,
|
27
27
|
worker_process,
|
28
28
|
)
|
29
|
+
from .config import config
|
29
30
|
from .constants import (
|
30
|
-
TORCH_CACHE_DIR,
|
31
|
-
B10FS_CACHE_DIR,
|
32
|
-
LOCAL_WORK_DIR,
|
33
|
-
MAX_CACHE_SIZE_MB,
|
34
|
-
REQUIRED_B10FS_SPACE_MB,
|
35
31
|
MIN_LOCAL_SPACE_MB,
|
36
32
|
CACHE_FILE_EXTENSION,
|
37
33
|
CACHE_LATEST_SUFFIX,
|
@@ -64,9 +60,9 @@ def _setup_cache_paths():
|
|
64
60
|
# Cooperative cleanup of stale shared resources
|
65
61
|
cooperative_cleanup_b10fs()
|
66
62
|
|
67
|
-
b10fs_dir = Path(B10FS_CACHE_DIR)
|
68
|
-
torch_dir = Path(TORCH_CACHE_DIR)
|
69
|
-
work_dir = Path(LOCAL_WORK_DIR)
|
63
|
+
b10fs_dir = Path(config.B10FS_CACHE_DIR)
|
64
|
+
torch_dir = Path(config.TORCH_CACHE_DIR)
|
65
|
+
work_dir = Path(config.LOCAL_WORK_DIR)
|
70
66
|
|
71
67
|
return b10fs_dir, torch_dir, work_dir
|
72
68
|
|
@@ -305,14 +301,14 @@ def save_compile_cache() -> OperationStatus:
|
|
305
301
|
with temp_file_cleanup(local_temp):
|
306
302
|
# Phase 1: Compression with space monitoring
|
307
303
|
logger.info(
|
308
|
-
f"[SAVING] Phase 1: Compressing torch cache directory ({torch_dir} -> {local_temp}, max size: {MAX_CACHE_SIZE_MB} MB)"
|
304
|
+
f"[SAVING] Phase 1: Compressing torch cache directory ({torch_dir} -> {local_temp}, max size: {config.MAX_CACHE_SIZE_MB} MB)"
|
309
305
|
)
|
310
306
|
_run_with_space_monitoring(
|
311
|
-
REQUIRED_B10FS_SPACE_MB,
|
307
|
+
config.REQUIRED_B10FS_SPACE_MB,
|
312
308
|
b10fs_dir,
|
313
309
|
"compression",
|
314
310
|
_cache_compression_worker,
|
315
|
-
(str(torch_dir), str(local_temp), MAX_CACHE_SIZE_MB),
|
311
|
+
(str(torch_dir), str(local_temp), config.MAX_CACHE_SIZE_MB),
|
316
312
|
)
|
317
313
|
|
318
314
|
# Phase 2: Copy to b10fs with locking
|
@@ -357,7 +353,7 @@ def clear_local_cache() -> bool:
|
|
357
353
|
Raises:
|
358
354
|
Exception: Any errors during directory removal (caught and returns False).
|
359
355
|
"""
|
360
|
-
torch_dir = Path(TORCH_CACHE_DIR)
|
356
|
+
torch_dir = Path(config.TORCH_CACHE_DIR)
|
361
357
|
if not torch_dir.exists():
|
362
358
|
logger.info(
|
363
359
|
f"[CLEARING] No torch cache directory found at {torch_dir}, nothing to clear"
|
b10_transfer/cache_cli.py
CHANGED
@@ -8,6 +8,7 @@ import urllib.request
|
|
8
8
|
from dataclasses import dataclass
|
9
9
|
|
10
10
|
from .cache import load_compile_cache, save_compile_cache
|
11
|
+
from .config import config
|
11
12
|
from .constants import OperationStatus
|
12
13
|
|
13
14
|
|
@@ -76,6 +77,9 @@ def _wait_for_ready(cfg: WaitCfg, logger: logging.Logger) -> bool:
|
|
76
77
|
|
77
78
|
|
78
79
|
def main() -> None:
|
80
|
+
vllm_cache_dir = os.getenv("VLLM_CACHE_ROOT", "~/.cache/vllm")
|
81
|
+
os.environ["TORCHINDUCTOR_CACHE"] = vllm_cache_dir
|
82
|
+
|
79
83
|
cfg = WaitCfg(
|
80
84
|
url=DEFAULT_URL,
|
81
85
|
timeout_s=DEFAULT_TIMEOUT_S,
|
b10_transfer/cleanup.py
CHANGED
@@ -7,10 +7,10 @@ lock files and incomplete cache files.
|
|
7
7
|
|
8
8
|
import fnmatch
|
9
9
|
import time
|
10
|
-
import logging
|
11
10
|
from pathlib import Path
|
12
|
-
from typing import List
|
11
|
+
from typing import List
|
13
12
|
|
13
|
+
from .config import config
|
14
14
|
from .constants import (
|
15
15
|
B10FS_CACHE_DIR,
|
16
16
|
CACHE_INCOMPLETE_SUFFIX,
|
@@ -119,7 +119,7 @@ def cooperative_cleanup_b10fs() -> None:
|
|
119
119
|
This function is safe to run concurrently from multiple pods as file
|
120
120
|
deletion operations are atomic and missing files are handled gracefully.
|
121
121
|
"""
|
122
|
-
b10fs_dir = Path(B10FS_CACHE_DIR)
|
122
|
+
b10fs_dir = Path(config.B10FS_CACHE_DIR)
|
123
123
|
if not b10fs_dir.exists():
|
124
124
|
logger.debug("[CLEANUP] b10fs cache directory doesn't exist, skipping cleanup")
|
125
125
|
return
|
@@ -153,26 +153,26 @@ def get_cleanup_info() -> dict:
|
|
153
153
|
dict: Dictionary containing cleanup configuration and statistics:
|
154
154
|
- lock_timeout_seconds: Current lock file cleanup threshold
|
155
155
|
- incomplete_timeout_seconds: Current incomplete file cleanup threshold
|
156
|
-
-
|
156
|
+
- b10_cache_dir: Path to b10fs cache directory
|
157
157
|
- b10fs_exists: Whether b10fs cache directory exists
|
158
158
|
- stale_locks_count: Number of lock files that would be cleaned
|
159
159
|
- stale_incomplete_count: Number of incomplete files that would be cleaned
|
160
160
|
"""
|
161
|
-
b10fs_dir = Path(B10FS_CACHE_DIR)
|
161
|
+
b10fs_dir = Path(config.B10FS_CACHE_DIR)
|
162
162
|
|
163
163
|
info = {
|
164
|
-
"lock_timeout_seconds": CLEANUP_LOCK_TIMEOUT_SECONDS,
|
165
|
-
"incomplete_timeout_seconds": CLEANUP_INCOMPLETE_TIMEOUT_SECONDS,
|
164
|
+
"lock_timeout_seconds": config.CLEANUP_LOCK_TIMEOUT_SECONDS,
|
165
|
+
"incomplete_timeout_seconds": config.CLEANUP_INCOMPLETE_TIMEOUT_SECONDS,
|
166
166
|
"b10fs_cache_dir": str(b10fs_dir),
|
167
167
|
"b10fs_exists": b10fs_dir.exists(),
|
168
168
|
"stale_locks_count": len(
|
169
|
-
_find_stale_files(b10fs_dir, "*.lock", CLEANUP_LOCK_TIMEOUT_SECONDS)
|
169
|
+
_find_stale_files(b10fs_dir, "*.lock", config.CLEANUP_LOCK_TIMEOUT_SECONDS)
|
170
170
|
),
|
171
171
|
"stale_incomplete_count": len(
|
172
172
|
_find_stale_files(
|
173
173
|
b10fs_dir,
|
174
174
|
f"*{CACHE_INCOMPLETE_SUFFIX}*",
|
175
|
-
CLEANUP_INCOMPLETE_TIMEOUT_SECONDS,
|
175
|
+
config.CLEANUP_INCOMPLETE_TIMEOUT_SECONDS,
|
176
176
|
)
|
177
177
|
),
|
178
178
|
}
|
b10_transfer/config.py
ADDED
@@ -0,0 +1,158 @@
|
|
1
|
+
import os
|
2
|
+
from typing import List
|
3
|
+
|
4
|
+
from .utils import (
|
5
|
+
get_current_username,
|
6
|
+
validate_path_security,
|
7
|
+
validate_boolean_env,
|
8
|
+
apply_cap,
|
9
|
+
)
|
10
|
+
from .constants import (
|
11
|
+
MAX_CACHE_SIZE_CAP_MB,
|
12
|
+
MAX_CONCURRENT_SAVES_CAP,
|
13
|
+
MIN_LOCAL_SPACE_MB,
|
14
|
+
LOCK_TIMEOUT_CAP_SECONDS,
|
15
|
+
INCOMPLETE_TIMEOUT_CAP_SECONDS,
|
16
|
+
REQUIRED_TORCH_CACHE_DIR_PREFIX,
|
17
|
+
)
|
18
|
+
|
19
|
+
|
20
|
+
class Config:
|
21
|
+
def _allowed_torch_cache_prefixes(self) -> List[str]:
|
22
|
+
home_cache = os.path.expanduser("~/.cache")
|
23
|
+
return ["/tmp/", "/cache/", f"{home_cache}"]
|
24
|
+
|
25
|
+
# --------- dynamic properties ---------
|
26
|
+
@property
|
27
|
+
def TORCH_CACHE_DIR(self) -> str:
|
28
|
+
"""
|
29
|
+
Validated torch compile cache directory.
|
30
|
+
|
31
|
+
Env:
|
32
|
+
- TORCHINDUCTOR_CACHE_DIR (optional)
|
33
|
+
Defaults to /tmp/torchinductor_<username> if not set.
|
34
|
+
"""
|
35
|
+
default_dir = f"/tmp/torchinductor_{get_current_username()}"
|
36
|
+
chosen = os.getenv("TORCHINDUCTOR_CACHE_DIR", default_dir)
|
37
|
+
return validate_path_security(
|
38
|
+
chosen,
|
39
|
+
self._allowed_torch_cache_prefixes(),
|
40
|
+
"TORCHINDUCTOR_CACHE_DIR",
|
41
|
+
)
|
42
|
+
|
43
|
+
@property
|
44
|
+
def B10FS_CACHE_DIR(self) -> str:
|
45
|
+
"""
|
46
|
+
Validated B10FS cache directory.
|
47
|
+
|
48
|
+
Env:
|
49
|
+
- B10FS_CACHE_DIR (optional)
|
50
|
+
Defaults to f"{REQUIRED_TORCH_CACHE_DIR_PREFIX}/compile_cache"
|
51
|
+
"""
|
52
|
+
default_dir = f"{REQUIRED_TORCH_CACHE_DIR_PREFIX}/compile_cache"
|
53
|
+
chosen = os.getenv("B10FS_CACHE_DIR", default_dir)
|
54
|
+
return validate_path_security(
|
55
|
+
chosen,
|
56
|
+
[REQUIRED_TORCH_CACHE_DIR_PREFIX],
|
57
|
+
"B10FS_CACHE_DIR",
|
58
|
+
)
|
59
|
+
|
60
|
+
@property
|
61
|
+
def LOCAL_WORK_DIR(self) -> str:
|
62
|
+
"""
|
63
|
+
Validated local work directory.
|
64
|
+
|
65
|
+
Env:
|
66
|
+
- LOCAL_WORK_DIR (optional, default: /app)
|
67
|
+
"""
|
68
|
+
chosen = os.getenv("LOCAL_WORK_DIR", "/app")
|
69
|
+
return validate_path_security(
|
70
|
+
chosen,
|
71
|
+
["/app/", "/tmp/", "/cache/"],
|
72
|
+
"LOCAL_WORK_DIR",
|
73
|
+
)
|
74
|
+
|
75
|
+
@property
|
76
|
+
def MAX_CACHE_SIZE_MB(self) -> int:
|
77
|
+
"""
|
78
|
+
Max size of a single cache archive (MB), capped for safety.
|
79
|
+
|
80
|
+
Env:
|
81
|
+
- MAX_CACHE_SIZE_MB (optional, default: 1024)
|
82
|
+
Caps:
|
83
|
+
- <= MAX_CACHE_SIZE_CAP_MB
|
84
|
+
"""
|
85
|
+
requested = int(os.getenv("MAX_CACHE_SIZE_MB", 1024))
|
86
|
+
return apply_cap(requested, MAX_CACHE_SIZE_CAP_MB, "MAX_CACHE_SIZE_MB")
|
87
|
+
|
88
|
+
@property
|
89
|
+
def MAX_CONCURRENT_SAVES(self) -> int:
|
90
|
+
"""
|
91
|
+
Max concurrent save operations, capped for safety.
|
92
|
+
|
93
|
+
Env:
|
94
|
+
- MAX_CONCURRENT_SAVES (optional, default: 50)
|
95
|
+
Caps:
|
96
|
+
- <= MAX_CONCURRENT_SAVES_CAP
|
97
|
+
"""
|
98
|
+
requested = int(os.getenv("MAX_CONCURRENT_SAVES", 50))
|
99
|
+
return apply_cap(requested, MAX_CONCURRENT_SAVES_CAP, "MAX_CONCURRENT_SAVES")
|
100
|
+
|
101
|
+
@property
|
102
|
+
def REQUIRED_B10FS_SPACE_MB(self) -> int:
|
103
|
+
"""
|
104
|
+
Estimated required space on B10FS (MB) based on concurrency and per-archive size.
|
105
|
+
Lower-bounded to ensure a sane minimum.
|
106
|
+
"""
|
107
|
+
return max(self.MAX_CONCURRENT_SAVES * self.MAX_CACHE_SIZE_MB, 100_000)
|
108
|
+
|
109
|
+
@property
|
110
|
+
def MIN_LOCAL_SPACE_MB(self) -> int:
|
111
|
+
"""Minimum required free space on local filesystem (MB)."""
|
112
|
+
return MIN_LOCAL_SPACE_MB
|
113
|
+
|
114
|
+
@property
|
115
|
+
def BASETEN_FS_ENABLED(self) -> bool:
|
116
|
+
"""
|
117
|
+
Whether Baseten FS features are enabled.
|
118
|
+
|
119
|
+
Env:
|
120
|
+
- BASETEN_FS_ENABLED (string "0" or "1", default "0")
|
121
|
+
"""
|
122
|
+
raw = os.getenv("BASETEN_FS_ENABLED", "0")
|
123
|
+
return validate_boolean_env(raw, "BASETEN_FS_ENABLED")
|
124
|
+
|
125
|
+
@property
|
126
|
+
def CLEANUP_LOCK_TIMEOUT_SECONDS(self) -> int:
|
127
|
+
"""
|
128
|
+
Timeout for cleaning up lock files (seconds).
|
129
|
+
|
130
|
+
Env:
|
131
|
+
- CLEANUP_LOCK_TIMEOUT_SECONDS (optional, default: 30)
|
132
|
+
Caps:
|
133
|
+
- <= LOCK_TIMEOUT_CAP_SECONDS
|
134
|
+
"""
|
135
|
+
requested = int(os.getenv("CLEANUP_LOCK_TIMEOUT_SECONDS", 30))
|
136
|
+
return apply_cap(
|
137
|
+
requested, LOCK_TIMEOUT_CAP_SECONDS, "CLEANUP_LOCK_TIMEOUT_SECONDS"
|
138
|
+
)
|
139
|
+
|
140
|
+
@property
|
141
|
+
def CLEANUP_INCOMPLETE_TIMEOUT_SECONDS(self) -> int:
|
142
|
+
"""
|
143
|
+
Timeout for cleaning up incomplete files (seconds).
|
144
|
+
|
145
|
+
Env:
|
146
|
+
- CLEANUP_INCOMPLETE_TIMEOUT_SECONDS (optional, default: 60)
|
147
|
+
Caps:
|
148
|
+
- <= INCOMPLETE_TIMEOUT_CAP_SECONDS
|
149
|
+
"""
|
150
|
+
requested = int(os.getenv("CLEANUP_INCOMPLETE_TIMEOUT_SECONDS", 60))
|
151
|
+
return apply_cap(
|
152
|
+
requested,
|
153
|
+
INCOMPLETE_TIMEOUT_CAP_SECONDS,
|
154
|
+
"CLEANUP_INCOMPLETE_TIMEOUT_SECONDS",
|
155
|
+
)
|
156
|
+
|
157
|
+
|
158
|
+
config = Config()
|
b10_transfer/constants.py
CHANGED
@@ -1,113 +1,30 @@
|
|
1
|
-
"""Configuration constants for b10-transfer.
|
2
|
-
|
3
|
-
This module defines configuration constants for the PyTorch compilation cache system.
|
4
|
-
Some values can be overridden by environment variables, but security caps are enforced
|
5
|
-
to prevent malicious or accidental misuse in production environments.
|
6
|
-
"""
|
7
|
-
|
8
|
-
import os
|
9
1
|
from enum import Enum, auto
|
10
2
|
|
11
|
-
#
|
12
|
-
|
13
|
-
|
14
|
-
validate_path_security,
|
15
|
-
validate_boolean_env,
|
16
|
-
apply_cap,
|
17
|
-
)
|
18
|
-
|
19
|
-
# Cache directories with security validation
|
20
|
-
|
21
|
-
# Validate TORCH_CACHE_DIR - allow /tmp and /cache paths
|
22
|
-
# TORCHINDUCTOR_CACHE_DIR is what torch uses by default. If it is not set, we use a different value.
|
23
|
-
_torch_cache_dir = os.getenv(
|
24
|
-
"TORCHINDUCTOR_CACHE_DIR", f"/tmp/torchinductor_{get_current_username()}"
|
25
|
-
)
|
26
|
-
TORCH_CACHE_DIR = validate_path_security(
|
27
|
-
_torch_cache_dir,
|
28
|
-
["/tmp/", "/cache/", f"{os.path.expanduser('~')}/.cache"],
|
29
|
-
"TORCHINDUCTOR_CACHE_DIR",
|
30
|
-
)
|
31
|
-
|
32
|
-
# B10FS cache directory validation
|
33
|
-
_REQUIRED_TORCH_CACHE_DIR_PREFIX = "/cache/model"
|
34
|
-
_b10fs_cache_dir = os.getenv(
|
35
|
-
"B10FS_CACHE_DIR", f"{_REQUIRED_TORCH_CACHE_DIR_PREFIX}/compile_cache"
|
36
|
-
)
|
37
|
-
B10FS_CACHE_DIR = validate_path_security(
|
38
|
-
_b10fs_cache_dir, [_REQUIRED_TORCH_CACHE_DIR_PREFIX], "B10FS_CACHE_DIR"
|
39
|
-
)
|
40
|
-
|
41
|
-
# Validate LOCAL_WORK_DIR - allow /app, /tmp, and /cache paths
|
42
|
-
_local_work_dir = os.getenv("LOCAL_WORK_DIR", "/app")
|
43
|
-
LOCAL_WORK_DIR = validate_path_security(
|
44
|
-
_local_work_dir, ["/app/", "/tmp/", "/cache/"], "LOCAL_WORK_DIR"
|
45
|
-
)
|
46
|
-
|
47
|
-
# Security caps to prevent resource exhaustion
|
48
|
-
_MAX_CACHE_SIZE_CAP_MB = 1 * 1024 # 1GB hard limit per cache archive
|
49
|
-
_MAX_CONCURRENT_SAVES_CAP = 100 # Maximum concurrent save operations (only used as estimate for b10fs space requirements/thresholding)
|
3
|
+
# ----- Hard caps & fixed thresholds (security / safety) -----
|
4
|
+
MAX_CACHE_SIZE_CAP_MB: int = 1 * 1024 # 1GB hard limit per cache archive
|
5
|
+
MAX_CONCURRENT_SAVES_CAP: int = 100 # Max concurrent save ops (estimate for space calc)
|
50
6
|
|
7
|
+
# Minimum required space on local disk
|
8
|
+
MIN_LOCAL_SPACE_MB: int = 50 * 1024 # 50GB
|
51
9
|
|
52
|
-
#
|
53
|
-
|
54
|
-
|
55
|
-
_user_max_cache_size, _MAX_CACHE_SIZE_CAP_MB, "MAX_CACHE_SIZE_MB"
|
56
|
-
)
|
10
|
+
# Cleanup hard limits
|
11
|
+
LOCK_TIMEOUT_CAP_SECONDS: int = 3600 # 1 hour hard limit
|
12
|
+
INCOMPLETE_TIMEOUT_CAP_SECONDS: int = 7200 # 2 hours hard limit
|
57
13
|
|
58
|
-
|
59
|
-
|
60
|
-
_user_max_concurrent_saves, _MAX_CONCURRENT_SAVES_CAP, "MAX_CONCURRENT_SAVES"
|
61
|
-
)
|
62
|
-
|
63
|
-
# Space requirements
|
64
|
-
MIN_LOCAL_SPACE_MB = 50 * 1024 # 50GB minimum space on local machine
|
65
|
-
REQUIRED_B10FS_SPACE_MB = max(MAX_CONCURRENT_SAVES * MAX_CACHE_SIZE_MB, 100_000)
|
66
|
-
|
67
|
-
# B10FS configuration
|
68
|
-
# The default is "0" (disabled) to prevent accidental enabling.
|
69
|
-
# But this does limit the ability to enable b10fs for debugging purposes.
|
70
|
-
# Probably should use B10FS_ENABLED instead for that.
|
71
|
-
_baseten_fs_enabled = os.getenv("BASETEN_FS_ENABLED", "0")
|
72
|
-
BASETEN_FS_ENABLED = validate_boolean_env(_baseten_fs_enabled, "BASETEN_FS_ENABLED")
|
14
|
+
# Allowed / required path patterns
|
15
|
+
REQUIRED_TORCH_CACHE_DIR_PREFIX: str = "/cache/model" # For B10FS cache dir validation
|
73
16
|
|
74
17
|
# File naming patterns
|
75
|
-
CACHE_FILE_EXTENSION = ".tar.gz"
|
76
|
-
CACHE_LATEST_SUFFIX = ".latest"
|
77
|
-
CACHE_INCOMPLETE_SUFFIX = ".incomplete"
|
78
|
-
CACHE_PREFIX = "cache_"
|
79
|
-
|
80
|
-
|
81
|
-
# Space monitoring settings
|
82
|
-
SPACE_MONITOR_CHECK_INTERVAL_SECONDS = (
|
83
|
-
0.5 # How often to check disk space during operations
|
84
|
-
)
|
85
|
-
|
86
|
-
# Cooperative cleanup settings
|
87
|
-
# Cache operations (load/save) should complete within ~15 seconds under normal conditions
|
88
|
-
_LOCK_TIMEOUT_CAP_SECONDS = 3600 # 1 hour hard limit
|
89
|
-
_INCOMPLETE_TIMEOUT_CAP_SECONDS = 7200 # 2 hours hard limit
|
90
|
-
|
91
|
-
# Lock file cleanup timeout (default: 2x expected operation time)
|
92
|
-
_user_lock_timeout = int(
|
93
|
-
os.getenv("CLEANUP_LOCK_TIMEOUT_SECONDS", "30")
|
94
|
-
) # 30 seconds default
|
95
|
-
CLEANUP_LOCK_TIMEOUT_SECONDS = apply_cap(
|
96
|
-
_user_lock_timeout, _LOCK_TIMEOUT_CAP_SECONDS, "CLEANUP_LOCK_TIMEOUT_SECONDS"
|
97
|
-
)
|
18
|
+
CACHE_FILE_EXTENSION: str = ".tar.gz"
|
19
|
+
CACHE_LATEST_SUFFIX: str = ".latest"
|
20
|
+
CACHE_INCOMPLETE_SUFFIX: str = ".incomplete"
|
21
|
+
CACHE_PREFIX: str = "cache_"
|
98
22
|
|
99
|
-
#
|
100
|
-
|
101
|
-
os.getenv("CLEANUP_INCOMPLETE_TIMEOUT_SECONDS", "60")
|
102
|
-
) # 1 minute default
|
103
|
-
CLEANUP_INCOMPLETE_TIMEOUT_SECONDS = apply_cap(
|
104
|
-
_user_incomplete_timeout,
|
105
|
-
_INCOMPLETE_TIMEOUT_CAP_SECONDS,
|
106
|
-
"CLEANUP_INCOMPLETE_TIMEOUT_SECONDS",
|
107
|
-
)
|
23
|
+
# Monitoring cadence
|
24
|
+
SPACE_MONITOR_CHECK_INTERVAL_SECONDS: float = 0.5
|
108
25
|
|
109
26
|
|
110
|
-
#
|
27
|
+
# ----- Enums -----
|
111
28
|
class WorkerStatus(Enum):
|
112
29
|
"""Status values for worker process results."""
|
113
30
|
|
@@ -122,4 +39,4 @@ class OperationStatus(Enum):
|
|
122
39
|
SUCCESS = auto()
|
123
40
|
ERROR = auto()
|
124
41
|
DOES_NOT_EXIST = auto() # Used by load operations when cache file not found
|
125
|
-
SKIPPED = auto() # Used by load/save
|
42
|
+
SKIPPED = auto() # Used by load/save ops when operation not needed
|
b10_transfer/core.py
CHANGED
@@ -20,9 +20,8 @@ from .space_monitor import (
|
|
20
20
|
run_monitored_process,
|
21
21
|
worker_process,
|
22
22
|
)
|
23
|
+
from .config import config
|
23
24
|
from .constants import (
|
24
|
-
B10FS_CACHE_DIR,
|
25
|
-
REQUIRED_B10FS_SPACE_MB,
|
26
25
|
MIN_LOCAL_SPACE_MB,
|
27
26
|
OperationStatus,
|
28
27
|
)
|
@@ -68,9 +67,9 @@ def transfer(source: str, dest: str) -> OperationStatus:
|
|
68
67
|
|
69
68
|
# Determine appropriate space threshold based on destination directory
|
70
69
|
dest_dir = dest_path.parent
|
71
|
-
if str(dest_dir).startswith(B10FS_CACHE_DIR):
|
70
|
+
if str(dest_dir).startswith(config.B10FS_CACHE_DIR):
|
72
71
|
# Transferring to b10fs - use b10fs space requirements
|
73
|
-
space_threshold_mb = REQUIRED_B10FS_SPACE_MB
|
72
|
+
space_threshold_mb = config.REQUIRED_B10FS_SPACE_MB
|
74
73
|
logger.debug(
|
75
74
|
f"[TRANSFER] Transfer to b10fs detected, using {space_threshold_mb:.1f}MB threshold"
|
76
75
|
)
|
b10_transfer/environment.py
CHANGED
b10_transfer/info.py
CHANGED
@@ -4,9 +4,8 @@ from typing import Dict, Any
|
|
4
4
|
|
5
5
|
from .environment import get_cache_filename, get_environment_key
|
6
6
|
from .archive import get_file_size_mb
|
7
|
+
from .config import config
|
7
8
|
from .constants import (
|
8
|
-
TORCH_CACHE_DIR,
|
9
|
-
B10FS_CACHE_DIR,
|
10
9
|
CACHE_PREFIX,
|
11
10
|
CACHE_LATEST_SUFFIX,
|
12
11
|
CACHE_FILE_EXTENSION,
|
@@ -91,8 +90,8 @@ def get_cache_info() -> Dict[str, Any]:
|
|
91
90
|
Raises:
|
92
91
|
No exceptions are raised; errors are handled gracefully with None values.
|
93
92
|
"""
|
94
|
-
torch_dir = Path(TORCH_CACHE_DIR)
|
95
|
-
b10fs_dir = Path(B10FS_CACHE_DIR)
|
93
|
+
torch_dir = Path(config.TORCH_CACHE_DIR)
|
94
|
+
b10fs_dir = Path(config.B10FS_CACHE_DIR)
|
96
95
|
cache_filename = get_cache_filename()
|
97
96
|
cache_file = (
|
98
97
|
b10fs_dir / f"{cache_filename}{CACHE_LATEST_SUFFIX}{CACHE_FILE_EXTENSION}"
|
@@ -143,7 +142,7 @@ def list_available_caches() -> Dict[str, Any]:
|
|
143
142
|
"error": "b10fs is not enabled",
|
144
143
|
}
|
145
144
|
|
146
|
-
b10fs_dir = Path(B10FS_CACHE_DIR)
|
145
|
+
b10fs_dir = Path(config.B10FS_CACHE_DIR)
|
147
146
|
|
148
147
|
if not b10fs_dir.exists():
|
149
148
|
return {
|
b10_transfer/logging_utils.py
CHANGED
b10_transfer/space_monitor.py
CHANGED
b10_transfer/utils.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: b10-transfer
|
3
|
-
Version: 0.3.1
|
3
|
+
Version: 0.3.2
|
4
4
|
Summary: Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management
|
5
5
|
License: MIT
|
6
6
|
Keywords: pytorch,file-transfer,cache,machine-learning,inference
|
@@ -0,0 +1,17 @@
|
|
1
|
+
b10_transfer/__init__.py,sha256=zjzYISk0NzEHPk5aQnoDq9ZXDY-ylsdcPwFbL4bvzds,729
|
2
|
+
b10_transfer/archive.py,sha256=RGk7pmOdF24aATKygkFOAfHB-90arnW67nj-WURZfcw,6424
|
3
|
+
b10_transfer/cache.py,sha256=VbAQx935rqdMXu6ejloa6Jw3n2KxkSMp3VLrNrgP--k,17480
|
4
|
+
b10_transfer/cache_cli.py,sha256=29H7HPKOvHP2LVI6J1Gad42iTXAZIwVqjvrZyIuCb_Q,3378
|
5
|
+
b10_transfer/cleanup.py,sha256=skC9KrIpTgv9NKMqz8PcoeCIcMDOoNiT8tHC5fZuK00,6390
|
6
|
+
b10_transfer/config.py,sha256=z_7emRsb-IG7_KbGy9jLtzkbIkLGEbKpWDUPWC_PB58,4661
|
7
|
+
b10_transfer/constants.py,sha256=oBfAvw2QyCRS1rFD9g1kDaToQqRX2bcnCVBUnCYD8uQ,1323
|
8
|
+
b10_transfer/core.py,sha256=r79CI8Kpw9FyT17qYOze4tg4UyoL_EQXiucX5YiqXEM,4659
|
9
|
+
b10_transfer/environment.py,sha256=7DcEFmuxEMWwRM0M92q8al5rx8mpGY8dZ81Hokhb9B0,5577
|
10
|
+
b10_transfer/info.py,sha256=MR6gXvL3gBImnafuDgbOsZAHy9_akwZVlyIPGu2t8jQ,6345
|
11
|
+
b10_transfer/logging_utils.py,sha256=c7iKNK9daNsrkHgsKl32rXscqqmWsq3a8ttz9M5ev3o,3460
|
12
|
+
b10_transfer/space_monitor.py,sha256=24rClldo6EYaeikEla3_q4IwwI0r3DEPzaIbfU4hTRE,10746
|
13
|
+
b10_transfer/utils.py,sha256=eyCckvl11xFOdsac4pjLEAleoahhIlBqAr81mwoGO-o,12036
|
14
|
+
b10_transfer-0.3.2.dist-info/METADATA,sha256=I-LwjB-64v3_tWXga2zt-texaz_55cFBGxjg2pAZR4Y,4108
|
15
|
+
b10_transfer-0.3.2.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
16
|
+
b10_transfer-0.3.2.dist-info/entry_points.txt,sha256=AFH8EfkeBv6ZuarnaUvbTjvySylf3GCSioYqB_ijrH8,65
|
17
|
+
b10_transfer-0.3.2.dist-info/RECORD,,
|
@@ -1,16 +0,0 @@
|
|
1
|
-
b10_transfer/__init__.py,sha256=90KV_leqq1fVtM3oZAZh4v8vdIcnhATmxlqlrqRMel8,729
|
2
|
-
b10_transfer/archive.py,sha256=gunAZ6oTUz0SxodUCho1uP-MNLuQyuhMKFusx7E0xDg,6439
|
3
|
-
b10_transfer/cache.py,sha256=k759Cs4IIUdAJ90ctK1m8ws3Fceje8ckug09L8X6W5I,17518
|
4
|
-
b10_transfer/cache_cli.py,sha256=aTM59jNNpEQ0m95YLYn9d1yjSTXZkzYAMszGqrDGmt4,3228
|
5
|
-
b10_transfer/cleanup.py,sha256=IAjRlpzCcXVrokTXdOsm_EJo_-U5kiy8KJoip8V4vvk,6345
|
6
|
-
b10_transfer/constants.py,sha256=0RWa4ZusICpYoN_V0u4yj4GZfcouQ6PjfCvSRVUamJU,4368
|
7
|
-
b10_transfer/core.py,sha256=Ny4lViiPup4Zb_OCx_Z2IcbcWq42Ymj_zF7xyVroHxE,4668
|
8
|
-
b10_transfer/environment.py,sha256=NkIs3EevhfATMZS0KL7f1w3SSoxAF9rCrA9qYkhbV7s,5602
|
9
|
-
b10_transfer/info.py,sha256=WlpSQNEKi93d3EYo2HnbbzrLazOhK9dpX5w7Kf6DVfA,6339
|
10
|
-
b10_transfer/logging_utils.py,sha256=vnjnuVsVO5bibHSHiF5sYhHqfprIezU0fM3YaCugMC0,3488
|
11
|
-
b10_transfer/space_monitor.py,sha256=-hFc9f29K-7gF1vDZRrKDa-FrHkmD6OBLsV8l8knrMM,10761
|
12
|
-
b10_transfer/utils.py,sha256=XG4dGLAlQQyUPeIK48OzuDkz1SOpdEs2_GrXKd8fr5M,12061
|
13
|
-
b10_transfer-0.3.1.dist-info/METADATA,sha256=jFlRoRUeru5LfMwkn5lvxIbumYBpooncVHbeUzvxBXc,4108
|
14
|
-
b10_transfer-0.3.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
15
|
-
b10_transfer-0.3.1.dist-info/entry_points.txt,sha256=AFH8EfkeBv6ZuarnaUvbTjvySylf3GCSioYqB_ijrH8,65
|
16
|
-
b10_transfer-0.3.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|