b10-transfer 0.1.7.tar.gz → 0.1.8.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {b10_transfer-0.1.7 → b10_transfer-0.1.8}/PKG-INFO +1 -1
- {b10_transfer-0.1.7 → b10_transfer-0.1.8}/pyproject.toml +1 -1
- {b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/__init__.py +3 -1
- {b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/archive.py +2 -1
- {b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/cache.py +95 -16
- {b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/cleanup.py +8 -7
- {b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/core.py +13 -8
- {b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/environment.py +7 -3
- {b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/info.py +3 -1
- b10_transfer-0.1.8/src/b10_transfer/logging_utils.py +117 -0
- {b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/space_monitor.py +16 -11
- {b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/utils.py +12 -10
- {b10_transfer-0.1.7 → b10_transfer-0.1.8}/README.md +0 -0
- {b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/constants.py +0 -0
{b10_transfer-0.1.7 → b10_transfer-0.1.8}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: b10-transfer
-Version: 0.1.7
+Version: 0.1.8
 Summary: Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management
 License: MIT
 Keywords: pytorch,file-transfer,cache,machine-learning,inference
{b10_transfer-0.1.7 → b10_transfer-0.1.8}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "b10-transfer"
-version = "0.1.7"
+version = "0.1.8"
 description = "Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management"
 authors = ["Shounak Ray <shounak.noreply@baseten.co>", "Fred Liu <fred.liu.noreply@baseten.co>"]
 maintainers = ["Fred Liu <fred.liu.noreply@baseten.co>", "Shounak Ray <shounak.noreply@baseten.co>"]
{b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/__init__.py

@@ -6,9 +6,10 @@ from .utils import CacheError, CacheValidationError
 from .space_monitor import CacheOperationInterrupted
 from .info import get_cache_info, list_available_caches
 from .constants import OperationStatus
+from .logging_utils import get_b10_logger
 
 # Version
-__version__ = "0.1.7"
+__version__ = "0.1.8"
 
 __all__ = [
     "CacheError",
@@ -21,4 +22,5 @@ __all__ = [
     "transfer",
     "get_cache_info",
     "list_available_caches",
+    "get_b10_logger",
 ]
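The __init__.py change bumps the package version and re-exports the new logger factory alongside the existing public API. A minimal usage sketch, assuming b10-transfer 0.1.8 is installed (the log message text is illustrative, not taken from the package):

    # Sketch: importing the new export added in 0.1.8
    import b10_transfer

    print(b10_transfer.__version__)  # "0.1.8"

    logger = b10_transfer.get_b10_logger(__name__)
    logger.info("example message routed through the b10-transfer colored logger")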
{b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/archive.py

@@ -5,8 +5,9 @@ from pathlib import Path
 
 from .utils import timed_fn, safe_unlink, CacheValidationError, validate_path_security
 from .constants import MAX_CACHE_SIZE_MB
+from .logging_utils import get_b10_logger
 
-logger =
+logger = get_b10_logger(__name__)
 
 
 class ArchiveError(Exception):
{b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/cache.py

@@ -8,6 +8,8 @@ import logging
 import tempfile
 from pathlib import Path
 
+from .logging_utils import get_b10_logger
+
 from .environment import get_cache_filename
 from .cleanup import cooperative_cleanup_b10fs
 from .utils import (
@@ -38,7 +40,7 @@ from .constants import (
 )
 from .core import transfer
 
-logger =
+logger = get_b10_logger(__name__)
 
 
 """
@@ -94,7 +96,7 @@ def _run_with_space_monitoring(
 
     try:
         logger.info(
-            f"Starting {operation_name}: {' -> '.join(str(arg) for arg in worker_args[:2])}"
+            f"[MONITORING] Starting {operation_name} with space monitoring: {' -> '.join(str(arg) for arg in worker_args[:2])}"
         )
         run_monitored_process(
             worker_func,
@@ -114,14 +116,29 @@ def _transfer_with_b10fs_lock(
 
     @critical_section_b10fs_file_lock(lock_type)
     def _locked_transfer():
+        # Get file size for logging
+        source_path = Path(source)
+        source_size_mb = (
+            source_path.stat().st_size / (1024 * 1024) if source_path.exists() else 0
+        )
+        logger.info(
+            f"[TRANSFER] Starting locked transfer: {source} -> {dest} (size: {source_size_mb:.2f} MB, lock: {lock_type})"
+        )
+
         result = transfer(source, dest)
         if result != OperationStatus.SUCCESS:
+            logger.error(f"[TRANSFER] Transfer failed with status: {result}")
             if cleanup_on_failure:
+                logger.info(
+                    f"[TRANSFER] Cleaning up failed transfer destination: {dest}"
+                )
                 safe_unlink(
                     Path(dest), f"Failed to cleanup after failed transfer {dest}"
                 )
             raise Exception(f"Failed to transfer {source} -> {dest}")
 
+        logger.info(f"[TRANSFER] Transfer completed successfully: {source} -> {dest}")
+
     _locked_transfer()
 
 
@@ -156,15 +173,20 @@ def load_compile_cache() -> OperationStatus:
 
     cache_filename = get_cache_filename()
     final_file, _ = _get_cache_file_paths(cache_filename, b10fs_dir)
-    logger.
+    logger.info(f"[LOADING] Searching for cache file: {final_file}")
 
     if not final_file.exists():
-        logger.info("No cache file found in b10fs")
+        logger.info(f"[LOADING] No cache file found in b10fs at: {final_file}")
         return OperationStatus.DOES_NOT_EXIST
 
     # Skip if already loaded
     if torch_dir.exists() and any(torch_dir.iterdir()):
-
+        size_mb = sum(
+            f.stat().st_size for f in torch_dir.rglob("*") if f.is_file()
+        ) / (1024 * 1024)
+        logger.info(
+            f"[LOADING] Torch cache already exists at {torch_dir}, skipping extraction (size: {size_mb:.2f} MB)"
+        )
        return OperationStatus.SKIPPED
 
    # Create temp local copy
@@ -172,11 +194,14 @@ def load_compile_cache() -> OperationStatus:
        suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
    ) as f:
        temp_path = Path(f.name)
-       logger.
+       logger.info(f"[LOADING] Created temporary file for cache download: {temp_path}")
 
    try:
        with temp_file_cleanup(temp_path):
            # Phase 1: Copy from b10fs to local temp file
+           logger.info(
+               f"[LOADING] Phase 1: Copying cache from b10fs to local temp file ({final_file} -> {temp_path})"
+           )
            _transfer_with_b10fs_lock(
                str(final_file),
                str(temp_path),
@@ -185,6 +210,9 @@ def load_compile_cache() -> OperationStatus:
            )
 
            # Phase 2: Extract archive with space monitoring
+           logger.info(
+               f"[LOADING] Phase 2: Extracting cache archive to torch directory ({temp_path} -> {torch_dir})"
+           )
            _run_with_space_monitoring(
                MIN_LOCAL_SPACE_MB,
                work_dir,
@@ -194,11 +222,22 @@ def load_compile_cache() -> OperationStatus:
                cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
            )
 
-
+           # Calculate final cache size for logging
+           final_size_mb = (
+               sum(f.stat().st_size for f in torch_dir.rglob("*") if f.is_file())
+               / (1024 * 1024)
+               if torch_dir.exists()
+               else 0
+           )
+           logger.info(
+               f"[LOADING] Cache load completed successfully (final size: {final_size_mb:.2f} MB)"
+           )
            return OperationStatus.SUCCESS
 
    except CacheOperationInterrupted as e:
-       logger.warning(
+       logger.warning(
+           f"[LOADING] Cache load interrupted due to insufficient disk space: {e}"
+       )
        return OperationStatus.ERROR
 
 
@@ -236,7 +275,7 @@ def save_compile_cache() -> OperationStatus:
 
    # Check if anything to save
    if not torch_dir.exists() or not any(torch_dir.iterdir()):
-       logger.info("No torch cache to save")
+       logger.info(f"[SAVING] No torch cache found at {torch_dir} to save")
        return OperationStatus.SKIPPED
 
    cache_filename = get_cache_filename()
@@ -244,18 +283,30 @@ def save_compile_cache() -> OperationStatus:
 
    # Check for existing cache first (early exit)
    if final_file.exists():
-
+       file_size_mb = final_file.stat().st_size / (1024 * 1024)
+       logger.info(
+           f"[SAVING] Cache already exists in b10fs at {final_file} (size: {file_size_mb:.2f} MB), skipping save"
+       )
        return OperationStatus.SKIPPED
 
    with tempfile.NamedTemporaryFile(
        suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
    ) as f:
        local_temp = Path(f.name)
-
+       # Calculate source cache size for logging
+       source_size_mb = sum(
+           f.stat().st_size for f in torch_dir.rglob("*") if f.is_file()
+       ) / (1024 * 1024)
+       logger.info(
+           f"[SAVING] Created local temp file for archive: {local_temp} (source cache size: {source_size_mb:.2f} MB)"
+       )
 
    try:
        with temp_file_cleanup(local_temp):
            # Phase 1: Compression with space monitoring
+           logger.info(
+               f"[SAVING] Phase 1: Compressing torch cache directory ({torch_dir} -> {local_temp}, max size: {MAX_CACHE_SIZE_MB} MB)"
+           )
            _run_with_space_monitoring(
                REQUIRED_B10FS_SPACE_MB,
                b10fs_dir,
@@ -265,21 +316,30 @@ def save_compile_cache() -> OperationStatus:
            )
 
            # Phase 2: Copy to b10fs with locking
+           compressed_size_mb = local_temp.stat().st_size / (1024 * 1024)
+           logger.info(
+               f"[SAVING] Phase 2: Copying compressed archive to b10fs ({local_temp} -> {temp_file}, size: {compressed_size_mb:.2f} MB)"
+           )
            _transfer_with_b10fs_lock(
                str(local_temp), str(temp_file), "copy_in", cleanup_on_failure=True
            )
 
            # Phase 3: Atomic rename (fast, don't interrupt)
            logger.info(
-               f"
+               f"[SAVING] Phase 3: Atomically renaming temp file to final cache file: {temp_file} -> {final_file}"
            )
            temp_file.rename(final_file)
 
-
+           final_file_size_mb = final_file.stat().st_size / (1024 * 1024)
+           logger.info(
+               f"[SAVING] Cache save completed successfully (final file: {final_file}, size: {final_file_size_mb:.2f} MB)"
+           )
            return OperationStatus.SUCCESS
 
    except CacheOperationInterrupted as e:
-       logger.warning(
+       logger.warning(
+           f"[SAVING] Cache save interrupted due to insufficient disk space: {e}"
+       )
        return OperationStatus.ERROR
 
 
@@ -299,10 +359,23 @@ def clear_local_cache() -> bool:
    """
    torch_dir = Path(TORCH_CACHE_DIR)
    if not torch_dir.exists():
+       logger.info(
+           f"[CLEARING] No torch cache directory found at {torch_dir}, nothing to clear"
+       )
        return True
+
+   # Calculate size before clearing for logging
+   size_mb = sum(f.stat().st_size for f in torch_dir.rglob("*") if f.is_file()) / (
+       1024 * 1024
+   )
+   logger.info(
+       f"[CLEARING] Removing torch cache directory: {torch_dir} (size: {size_mb:.2f} MB)"
+   )
+
    import shutil
 
    shutil.rmtree(torch_dir)
+   logger.info(f"[CLEARING] Successfully cleared torch cache directory: {torch_dir}")
    return True
 
 
@@ -326,6 +399,8 @@ def _cache_compression_worker(
    # Import here to avoid issues with multiprocessing
    from .archive import create_archive
 
+   # Note: We can't use the main logger here due to multiprocessing
+   # The create_archive function should handle its own logging
    create_archive(torch_dir, local_temp, max_size_mb)
 
 
@@ -336,9 +411,11 @@ def _cleanup_torch_dir(torch_dir: Path) -> None:
        import shutil
 
        shutil.rmtree(torch_dir)
-       logger.
+       logger.info(
+           f"[CLEANUP] Successfully cleaned up torch directory: {torch_dir}"
+       )
    except Exception as e:
-       logger.error(f"Failed to cleanup torch directory {torch_dir}: {e}")
+       logger.error(f"[CLEANUP] Failed to cleanup torch directory {torch_dir}: {e}")
 
 
 @worker_process("Extraction was cancelled before starting")
@@ -358,4 +435,6 @@ def _cache_extract_worker(archive_path_str: str, dest_dir_str: str) -> None:
    # Import here to avoid issues with multiprocessing
    from .archive import extract_archive
 
+   # Note: We can't use the main logger here due to multiprocessing
+   # The extract_archive function should handle its own logging
    extract_archive(archive_path, dest_dir)
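Several of the cache.py additions above repeat the same "directory size in MB" computation (sum of file sizes under the torch cache directory divided by 1024 * 1024), used only to enrich log messages. The helper below is hypothetical and not part of the package; it is just a sketch of that repeated pattern in isolation:

    # Hypothetical helper illustrating the size-in-MB pattern used in the new log lines;
    # b10-transfer 0.1.8 inlines this computation rather than defining such a function.
    from pathlib import Path


    def directory_size_mb(directory: Path) -> float:
        """Total size of all regular files under `directory`, in MB."""
        if not directory.exists():
            return 0.0
        total_bytes = sum(f.stat().st_size for f in directory.rglob("*") if f.is_file())
        return total_bytes / (1024 * 1024)


    if __name__ == "__main__":
        print(f"{directory_size_mb(Path('.')):.2f} MB")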
{b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/cleanup.py

@@ -18,8 +18,9 @@ from .constants import (
     CLEANUP_INCOMPLETE_TIMEOUT_SECONDS,
 )
 from .utils import safe_execute, safe_unlink
+from .logging_utils import get_b10_logger
 
-logger =
+logger = get_b10_logger(__name__)
 
 
 @safe_execute("Failed to find stale files", [])
@@ -48,14 +49,14 @@ def _find_stale_files(
         # Skip directories - we only want files
         if not file_path.is_file():
             logger.warning(
-                f"Found non-file in b10fs cache directory: {file_path}, skipping consideration for deletion in cleanup phase."
+                f"[CLEANUP] Found non-file in b10fs cache directory: {file_path}, skipping consideration for deletion in cleanup phase."
             )
             continue
 
         # Check if filename matches pattern for the type of file we're looking for
         if not fnmatch.fnmatch(file_path.name, pattern):
             logger.warning(
-                f"Found non-matching file in b10fs cache directory: {file_path}, skipping consideration for deletion in cleanup phase."
+                f"[CLEANUP] Found non-matching file in b10fs cache directory: {file_path}, skipping consideration for deletion in cleanup phase."
             )
             continue
 
@@ -91,7 +92,7 @@ def _cleanup_files(files: List[Path], file_type: str) -> int:
             )
             cleaned_count += 1
             logger.debug(
-                f"Cleaned stale {file_type} file: {file_path.name} (age: {file_age:.1f}s)"
+                f"[CLEANUP] Cleaned stale {file_type} file: {file_path.name} (age: {file_age:.1f}s)"
             )
         except OSError:
             # File might have been deleted by another pod
@@ -120,7 +121,7 @@ def cooperative_cleanup_b10fs() -> None:
     """
     b10fs_dir = Path(B10FS_CACHE_DIR)
     if not b10fs_dir.exists():
-        logger.debug("b10fs cache directory doesn't exist, skipping cleanup")
+        logger.debug("[CLEANUP] b10fs cache directory doesn't exist, skipping cleanup")
         return
 
     # Find and clean stale lock files
@@ -138,11 +139,11 @@ def cooperative_cleanup_b10fs() -> None:
     total_cleaned = cleaned_locks + cleaned_incomplete
     if total_cleaned > 0:
         logger.info(
-            f"Cooperative cleanup: removed {cleaned_locks} stale locks, "
+            f"[CLEANUP] Cooperative cleanup completed: removed {cleaned_locks} stale locks, "
             f"{cleaned_incomplete} incomplete files"
         )
     else:
-        logger.debug("Cooperative cleanup: no stale files found")
+        logger.debug("[CLEANUP] Cooperative cleanup completed: no stale files found")
 
 
 def get_cleanup_info() -> dict:
|
@@ -26,8 +26,9 @@ from .constants import (
|
|
26
26
|
MIN_LOCAL_SPACE_MB,
|
27
27
|
OperationStatus,
|
28
28
|
)
|
29
|
+
from .logging_utils import get_b10_logger
|
29
30
|
|
30
|
-
logger =
|
31
|
+
logger = get_b10_logger(__name__)
|
31
32
|
|
32
33
|
|
33
34
|
@timed_fn(logger=logger, name="Transferring file")
|
@@ -59,7 +60,7 @@ def transfer(source: str, dest: str) -> OperationStatus:
|
|
59
60
|
|
60
61
|
# Validate source file exists
|
61
62
|
if not source_path.exists():
|
62
|
-
logger.error(f"Source file does not exist: {source}")
|
63
|
+
logger.error(f"[TRANSFER] Source file does not exist: {source}")
|
63
64
|
return OperationStatus.ERROR
|
64
65
|
|
65
66
|
# Create destination directory if it doesn't exist
|
@@ -71,19 +72,19 @@ def transfer(source: str, dest: str) -> OperationStatus:
|
|
71
72
|
# Transferring to b10fs - use b10fs space requirements
|
72
73
|
space_threshold_mb = REQUIRED_B10FS_SPACE_MB
|
73
74
|
logger.debug(
|
74
|
-
f"Transfer to b10fs detected, using {space_threshold_mb:.1f}MB threshold"
|
75
|
+
f"[TRANSFER] Transfer to b10fs detected, using {space_threshold_mb:.1f}MB threshold"
|
75
76
|
)
|
76
77
|
else:
|
77
78
|
# Transferring to local directory - use local space requirements
|
78
79
|
space_threshold_mb = MIN_LOCAL_SPACE_MB
|
79
80
|
logger.debug(
|
80
|
-
f"Transfer to local directory detected, using {space_threshold_mb:.1f}MB threshold"
|
81
|
+
f"[TRANSFER] Transfer to local directory detected, using {space_threshold_mb:.1f}MB threshold"
|
81
82
|
)
|
82
83
|
|
83
84
|
# Initial disk space check
|
84
85
|
check_sufficient_disk_space(dest_dir, space_threshold_mb, "file transfer")
|
85
86
|
logger.debug(
|
86
|
-
f"Initial space check passed: {space_threshold_mb:.1f}MB required at destination"
|
87
|
+
f"[TRANSFER] Initial space check passed: {space_threshold_mb:.1f}MB required at destination"
|
87
88
|
)
|
88
89
|
|
89
90
|
# Start background space monitoring for destination directory
|
@@ -92,7 +93,7 @@ def transfer(source: str, dest: str) -> OperationStatus:
|
|
92
93
|
|
93
94
|
try:
|
94
95
|
# Run monitored copy process
|
95
|
-
logger.info(f"Starting transfer: {source} -> {dest}")
|
96
|
+
logger.info(f"[TRANSFER] Starting file transfer: {source} -> {dest}")
|
96
97
|
run_monitored_process(
|
97
98
|
_cache_copy_worker,
|
98
99
|
(str(source_path), str(dest_path)),
|
@@ -103,11 +104,15 @@ def transfer(source: str, dest: str) -> OperationStatus:
|
|
103
104
|
),
|
104
105
|
)
|
105
106
|
|
106
|
-
logger.info(
|
107
|
+
logger.info(
|
108
|
+
f"[TRANSFER] File transfer completed successfully: {source} -> {dest}"
|
109
|
+
)
|
107
110
|
return OperationStatus.SUCCESS
|
108
111
|
|
109
112
|
except CacheOperationInterrupted as e:
|
110
|
-
logger.warning(
|
113
|
+
logger.warning(
|
114
|
+
f"[TRANSFER] File transfer interrupted due to insufficient disk space: {e}"
|
115
|
+
)
|
111
116
|
return OperationStatus.ERROR
|
112
117
|
|
113
118
|
finally:
|
{b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/environment.py

@@ -18,7 +18,9 @@ except ImportError:
     torch = None
     TORCH_AVAILABLE = False
 
-
+from .logging_utils import get_b10_logger
+
+logger = get_b10_logger(__name__)
 
 KEY_LENGTH = 16
 UNKNOWN_HOSTNAME = "unknown-host"
@@ -85,10 +87,12 @@ def get_environment_key() -> str:
         return hashlib.sha256(node_json.encode("utf-8")).hexdigest()[:KEY_LENGTH]
 
     except (ImportError, RuntimeError, AssertionError) as e:
-        logger.error(f"GPU environment unavailable: {e}")
+        logger.error(f"[ENVIRONMENT] GPU environment unavailable: {e}")
         raise RuntimeError(f"Cannot generate environment key: {e}") from e
     except Exception as e:
-        logger.error(
+        logger.error(
+            f"[ENVIRONMENT] Unexpected error during environment key generation: {e}"
+        )
         raise RuntimeError(f"Environment key generation failed: {e}") from e
 
 
{b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/info.py

@@ -13,7 +13,9 @@ from .constants import (
 )
 from .utils import safe_execute, _is_b10fs_enabled
 
-
+from .logging_utils import get_b10_logger
+
+logger = get_b10_logger(__name__)
 
 
 @safe_execute("Failed to calculate local cache size", None)
b10_transfer-0.1.8/src/b10_transfer/logging_utils.py (new file)

@@ -0,0 +1,117 @@
+"""Centralized logging utilities for b10-transfer package with colored output."""
+
+import logging
+from typing import Optional
+
+
+class ColoredFormatter(logging.Formatter):
+    """Custom formatter that adds colors and b10-transfer prefix to log messages."""
+
+    # ANSI color codes
+    COLORS = {
+        "cyan": "\033[96m",
+        "green": "\033[92m",
+        "red": "\033[91m",
+        "yellow": "\033[93m",
+        "reset": "\033[0m",
+    }
+
+    def format(self, record):
+        # Add the b10-transfer prefix to the message
+        original_msg = record.getMessage()
+
+        # Determine color based on log level and message content
+        color = self._get_message_color(record, original_msg)
+
+        # Format the message with color and prefix
+        colored_msg = f"{self.COLORS[color]}[b10-transfer log] {original_msg}{self.COLORS['reset']}"
+
+        # Temporarily replace the message for formatting
+        record.msg = colored_msg
+        record.args = ()
+
+        # Use the parent formatter
+        formatted = super().format(record)
+
+        return formatted
+
+    def _get_message_color(self, record, message: str) -> str:
+        """Determine the appropriate color for the log message."""
+        # Red for errors and failures
+        if record.levelno >= logging.ERROR:
+            return "red"
+
+        # Red for warning messages that indicate failures
+        if record.levelno == logging.WARNING and any(
+            keyword in message.lower()
+            for keyword in ["failed", "error", "interrupted", "cancelled", "abort"]
+        ):
+            return "red"
+
+        # Green for success messages
+        if any(
+            keyword in message.lower()
+            for keyword in [
+                "completed successfully",
+                "success",
+                "complete",
+                "finished",
+                "saved",
+                "loaded",
+                "extracted",
+                "compressed",
+                "transferred",
+                "cleared successfully",
+            ]
+        ):
+            return "green"
+
+        # Default to cyan
+        return "cyan"
+
+
+def get_b10_logger(name: str) -> logging.Logger:
+    """Get a logger configured with b10-transfer colored formatting.
+
+    Args:
+        name: The logger name (typically __name__)
+
+    Returns:
+        Logger configured with colored b10-transfer formatting
+    """
+    logger = logging.getLogger(name)
+
+    # Only add handler if it doesn't already exist
+    if not any(
+        isinstance(h, logging.StreamHandler)
+        and isinstance(h.formatter, ColoredFormatter)
+        for h in logger.handlers
+    ):
+        # Create handler with colored formatter
+        handler = logging.StreamHandler()
+        formatter = ColoredFormatter("%(levelname)s - %(message)s")
+        handler.setFormatter(formatter)
+
+        # Add handler to logger
+        logger.addHandler(handler)
+        logger.setLevel(logging.INFO)
+
+        # Prevent duplicate messages from parent loggers
+        logger.propagate = False
+
+    return logger
+
+
+def log_success(logger: logging.Logger, message: str):
+    """Log a success message that will be colored green."""
+    logger.info(message)
+
+
+def log_failure(logger: logging.Logger, message: str, level: int = logging.ERROR):
+    """Log a failure message that will be colored red."""
+    logger.log(level, message)
+
+
+def log_info(logger: logging.Logger, message: str):
+    """Log an info message that will be colored cyan."""
+    logger.info(message)
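The new module's public surface is small: get_b10_logger() attaches a single stream handler with the ColoredFormatter, which picks red for errors and failure-flavored warnings, green for success-flavored messages, and cyan otherwise. A short usage sketch (module path and functions as defined in the diff above; the message strings are illustrative):

    # Sketch: exercising the colored logger from the new logging_utils module
    from b10_transfer.logging_utils import get_b10_logger, log_failure, log_success

    logger = get_b10_logger("b10_transfer.example")

    logger.info("routine status message")                                # cyan (default)
    log_success(logger, "cache save completed successfully")             # green (success keyword)
    log_failure(logger, "cache save failed: insufficient disk space")    # red (ERROR level)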
{b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/space_monitor.py

@@ -14,8 +14,9 @@ from multiprocessing import Process, Queue
 from functools import wraps
 
 from .constants import WorkerStatus, SPACE_MONITOR_CHECK_INTERVAL_SECONDS
+from .logging_utils import get_b10_logger
 
-logger =
+logger = get_b10_logger(__name__)
 
 
 class CacheOperationInterrupted(Exception):
@@ -161,7 +162,7 @@ class CacheSpaceMonitor:
         self.thread = threading.Thread(target=self._monitor, daemon=True)
         self.thread.start()
         logger.debug(
-            f"Started space monitor for {self.path} (required: {self.required_space_mb:.1f}MB)"
+            f"[MONITORING] Started space monitor for {self.path} (required: {self.required_space_mb:.1f}MB)"
         )
 
     def _monitor(self) -> None:
@@ -170,18 +171,18 @@ class CacheSpaceMonitor:
            try:
                available_mb = get_available_disk_space_mb(self.path)
                logger.debug(
-                   f"[
+                   f"[MONITORING] Available space: {available_mb:.1f}MB (required: {self.required_space_mb:.1f}MB)"
                )
 
                if available_mb < self.required_space_mb:
                    logger.error(
-                       f"CRITICAL: Space ({available_mb:.1f}MB) below required {self.required_space_mb:.1f}MB. Signaling stop!"
+                       f"[MONITORING] CRITICAL: Space ({available_mb:.1f}MB) below required {self.required_space_mb:.1f}MB. Signaling stop!"
                    )
                    self.stop_operation.set()
                    break
 
            except Exception as e:
-               logger.warning(f"Space monitor error: {e}")
+               logger.warning(f"[MONITORING] Space monitor error: {e}")
 
            time.sleep(self.check_interval)
 
@@ -197,7 +198,7 @@ class CacheSpaceMonitor:
         """Stop the background monitoring thread."""
         self.stop_operation.set()
         if self.thread is not None:
-            logger.debug("Stopped space monitor")
+            logger.debug("[MONITORING] Stopped space monitor")
 
 
 def cleanup_process(
@@ -217,7 +218,7 @@ def cleanup_process(
     process.terminate()
     process.join(timeout=timeout)
     if process.is_alive():
-        logger.warning(f"Force killing {operation_name} process")
+        logger.warning(f"[MONITORING] Force killing {operation_name} process")
         process.kill()
         process.join()
 
@@ -260,7 +261,9 @@ def run_monitored_process(
         # Monitor the process
         while process.is_alive():
             if space_monitor.should_stop():
-                logger.warning(
+                logger.warning(
+                    f"[MONITORING] Low disk space detected, cancelling {operation_name}"
+                )
                 stop_event.set()
                 cleanup_process(process, operation_name)
 
@@ -280,7 +283,9 @@ def run_monitored_process(
         if not result_queue.empty():
             status, error_msg = result_queue.get()
             if status == WorkerStatus.ERROR.value:
-                logger.error(
+                logger.error(
+                    f"[MONITORING] {operation_name} worker failed: {error_msg}"
+                )
                 raise Exception(error_msg)
             elif status == WorkerStatus.CANCELLED.value:
                 if cleanup_func:
@@ -288,12 +293,12 @@ def run_monitored_process(
                 raise CacheOperationInterrupted(error_msg)
             # status == WorkerStatus.SUCCESS.value - continue normally
 
-        logger.debug(f"{operation_name} completed successfully")
+        logger.debug(f"[MONITORING] {operation_name} completed successfully")
 
     except Exception as e:
         # Ensure process is cleaned up
         cleanup_process(process, operation_name)
 
         if not isinstance(e, CacheOperationInterrupted):
-            logger.error(f"{operation_name} failed: {e}")
+            logger.error(f"[MONITORING] {operation_name} failed: {e}")
         raise
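Taken together, the space_monitor.py hunks show the watchdog pattern the retagged log lines describe: a daemon thread periodically checks free space and sets a stop event that the main loop polls via should_stop(). The standalone sketch below illustrates that pattern only; it is not the package's CacheSpaceMonitor implementation, and the names are hypothetical:

    # Sketch of a free-space watchdog: signal an event when space drops below a threshold.
    import shutil
    import threading
    import time


    def watch_free_space(path: str, required_mb: float, stop_event: threading.Event,
                         check_interval: float = 1.0) -> None:
        """Set stop_event once free space at `path` falls below `required_mb`."""
        while not stop_event.is_set():
            free_mb = shutil.disk_usage(path).free / (1024 * 1024)
            if free_mb < required_mb:
                stop_event.set()  # tell the main operation to cancel
                break
            time.sleep(check_interval)


    stop = threading.Event()
    threading.Thread(target=watch_free_space, args=("/tmp", 100.0, stop), daemon=True).start()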
{b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/utils.py

@@ -6,7 +6,9 @@ from pathlib import Path
 from contextlib import contextmanager
 from typing import Generator, Any
 
-
+from .logging_utils import get_b10_logger
+
+logger = get_b10_logger(__name__)
 
 # Lock file settings
 LOCK_WAIT_SLEEP_SECONDS = 1.0  # How long to wait between lock file checks
@@ -137,7 +139,7 @@ def apply_cap(value: int, cap: int, name: str) -> int:
     """
     if value > cap:
         logger.warning(
-            f"{name} capped at {cap} (requested {value}) for security/stability"
+            f"[UTILS] {name} capped at {cap} (requested {value}) for security/stability"
         )
         return cap
     return value
@@ -159,11 +161,11 @@ def timed_fn(logger=logger, name=None):
 
     def decorator(fn):
         def wrapper(*args, **kwargs):
-            logger.info(f"{name or fn.__name__} started")
+            logger.info(f"[TIMING] {name or fn.__name__} started")
             start = time.perf_counter()
             result = fn(*args, **kwargs)
             logger.info(
-                f"{name or fn.__name__} finished in {time.perf_counter() - start:.2f}s"
+                f"[TIMING] {name or fn.__name__} finished in {time.perf_counter() - start:.2f}s"
             )
             return result
 
@@ -193,7 +195,7 @@ def safe_execute(error_message: str, default_return: Any = None):
            try:
                return func(*args, **kwargs)
            except Exception as e:
-               logger.error(f"{error_message}: {e}")
+               logger.error(f"[ERROR] {error_message}: {e}")
                return default_return
 
        return wrapper
@@ -233,7 +235,7 @@ def critical_section_b10fs_file_lock(name):
 
        lock_file = lock_dir / f"{name}.lock"
        while lock_file.exists():
-           logger.debug("Waiting for lock file to be released...")
+           logger.debug("[LOCKING] Waiting for lock file to be released...")
            time.sleep(LOCK_WAIT_SLEEP_SECONDS)
 
        try:
@@ -267,9 +269,9 @@ def safe_unlink(
    try:
        file_path.unlink(missing_ok=True)
        if success_message:
-           logger.debug(success_message)
+           logger.debug(f"[UTILS] {success_message}")
    except Exception as e:
-       logger.error(f"{error_message}: {e}")
+       logger.error(f"[UTILS] {error_message}: {e}")
 
 
 @contextmanager
@@ -348,8 +350,8 @@ def cache_operation(operation_name: str) -> Generator[None, None, None]:
        _validate_b10fs_available()
        yield
    except CacheValidationError as e:
-       logger.debug(f"{operation_name} failed: {e}")
+       logger.debug(f"[OPERATION] {operation_name} failed: {e}")
        raise
    except Exception as e:
-       logger.debug(f"{operation_name} failed: {e}")
+       logger.debug(f"[OPERATION] {operation_name} failed: {e}")
        raise
{b10_transfer-0.1.7 → b10_transfer-0.1.8}/README.md: file without changes
{b10_transfer-0.1.7 → b10_transfer-0.1.8}/src/b10_transfer/constants.py: file without changes