b10-transfer 0.1.7__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: b10-transfer
- Version: 0.1.7
+ Version: 0.2.0
  Summary: Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management
  License: MIT
  Keywords: pytorch,file-transfer,cache,machine-learning,inference
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
  [tool.poetry]
  name = "b10-transfer"
- version = "0.1.7"
+ version = "0.2.0"
  description = "Distributed PyTorch file transfer for Baseten - Environment-aware, lock-free file transfer management"
  authors = ["Shounak Ray <shounak.noreply@baseten.co>", "Fred Liu <fred.liu.noreply@baseten.co>"]
  maintainers = ["Fred Liu <fred.liu.noreply@baseten.co>", "Shounak Ray <shounak.noreply@baseten.co>"]
@@ -28,6 +28,9 @@ classifiers = [
  ]
  packages = [{include = "b10_transfer", from = "src"}]
 
+ [tool.poetry.scripts]
+ b10-transfer = "b10_transfer.cache_cli:main"
+
  [tool.poetry.dependencies]
  python = "^3.9"
  torch = ">=2.0.0"
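Version 0.2.0 also registers a console script: the entry above maps a `b10-transfer` command to `b10_transfer.cache_cli:main`, so installing the wheel should put that command on PATH. A minimal sketch of what the entry point resolves to, assuming a standard install; only the names in the scripts table above come from the package itself:

    # Sketch only: invoking the installed `b10-transfer` command is assumed to be
    # equivalent to calling the declared entry-point target directly.
    from b10_transfer.cache_cli import main  # target named in [tool.poetry.scripts]

    if __name__ == "__main__":
        main()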
@@ -6,9 +6,10 @@ from .utils import CacheError, CacheValidationError
  from .space_monitor import CacheOperationInterrupted
  from .info import get_cache_info, list_available_caches
  from .constants import OperationStatus
+ from .logging_utils import get_b10_logger
 
  # Version
- __version__ = "0.1.7"
+ __version__ = "0.2.0"
 
  __all__ = [
  "CacheError",
@@ -21,4 +22,5 @@ __all__ = [
  "transfer",
  "get_cache_info",
  "list_available_caches",
+ "get_b10_logger",
  ]
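With this change `get_b10_logger` becomes part of the public API, importable from the package root. A minimal usage sketch, assuming b10-transfer 0.2.0 is installed; the logger name and message are illustrative:

    # Sketch only: the helper is re-exported from the package root as shown in __all__ above.
    from b10_transfer import get_b10_logger

    logger = get_b10_logger(__name__)
    logger.info("warming the torch compile cache")  # rendered with the colored "[b10-transfer log]" prefix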
@@ -5,8 +5,9 @@ from pathlib import Path
 
  from .utils import timed_fn, safe_unlink, CacheValidationError, validate_path_security
  from .constants import MAX_CACHE_SIZE_MB
+ from .logging_utils import get_b10_logger
 
- logger = logging.getLogger(__name__)
+ logger = get_b10_logger(__name__)
 
 
  class ArchiveError(Exception):
@@ -8,6 +8,8 @@ import logging
  import tempfile
  from pathlib import Path
 
+ from .logging_utils import get_b10_logger
+
  from .environment import get_cache_filename
  from .cleanup import cooperative_cleanup_b10fs
  from .utils import (
@@ -38,7 +40,7 @@ from .constants import (
  )
  from .core import transfer
 
- logger = logging.getLogger(__name__)
+ logger = get_b10_logger(__name__)
 
 
  """
@@ -94,7 +96,7 @@ def _run_with_space_monitoring(
 
  try:
  logger.info(
- f"Starting {operation_name}: {' -> '.join(str(arg) for arg in worker_args[:2])}"
+ f"[MONITORING] Starting {operation_name} with space monitoring: {' -> '.join(str(arg) for arg in worker_args[:2])}"
  )
  run_monitored_process(
  worker_func,
@@ -114,14 +116,29 @@ def _transfer_with_b10fs_lock(
 
  @critical_section_b10fs_file_lock(lock_type)
  def _locked_transfer():
+ # Get file size for logging
+ source_path = Path(source)
+ source_size_mb = (
+ source_path.stat().st_size / (1024 * 1024) if source_path.exists() else 0
+ )
+ logger.info(
+ f"[TRANSFER] Starting locked transfer: {source} -> {dest} (size: {source_size_mb:.2f} MB, lock: {lock_type})"
+ )
+
  result = transfer(source, dest)
  if result != OperationStatus.SUCCESS:
+ logger.error(f"[TRANSFER] Transfer failed with status: {result}")
  if cleanup_on_failure:
+ logger.info(
+ f"[TRANSFER] Cleaning up failed transfer destination: {dest}"
+ )
  safe_unlink(
  Path(dest), f"Failed to cleanup after failed transfer {dest}"
  )
  raise Exception(f"Failed to transfer {source} -> {dest}")
 
+ logger.info(f"[TRANSFER] Transfer completed successfully: {source} -> {dest}")
+
  _locked_transfer()
 
 
@@ -156,15 +173,20 @@ def load_compile_cache() -> OperationStatus:
 
  cache_filename = get_cache_filename()
  final_file, _ = _get_cache_file_paths(cache_filename, b10fs_dir)
- logger.debug(f"Looking for cache file: {final_file}")
+ logger.info(f"[LOADING] Searching for cache file: {final_file}")
 
  if not final_file.exists():
- logger.info("No cache file found in b10fs")
+ logger.info(f"[LOADING] No cache file found in b10fs at: {final_file}")
  return OperationStatus.DOES_NOT_EXIST
 
  # Skip if already loaded
  if torch_dir.exists() and any(torch_dir.iterdir()):
- logger.info("Torch cache already loaded, skipping extraction")
+ size_mb = sum(
+ f.stat().st_size for f in torch_dir.rglob("*") if f.is_file()
+ ) / (1024 * 1024)
+ logger.info(
+ f"[LOADING] Torch cache already exists at {torch_dir}, skipping extraction (size: {size_mb:.2f} MB)"
+ )
  return OperationStatus.SKIPPED
 
  # Create temp local copy
@@ -172,11 +194,14 @@
  suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
  ) as f:
  temp_path = Path(f.name)
- logger.debug(f"Created temporary file for cache: {temp_path}")
+ logger.info(f"[LOADING] Created temporary file for cache download: {temp_path}")
 
  try:
  with temp_file_cleanup(temp_path):
  # Phase 1: Copy from b10fs to local temp file
+ logger.info(
+ f"[LOADING] Phase 1: Copying cache from b10fs to local temp file ({final_file} -> {temp_path})"
+ )
  _transfer_with_b10fs_lock(
  str(final_file),
  str(temp_path),
@@ -185,6 +210,9 @@
  )
 
  # Phase 2: Extract archive with space monitoring
+ logger.info(
+ f"[LOADING] Phase 2: Extracting cache archive to torch directory ({temp_path} -> {torch_dir})"
+ )
  _run_with_space_monitoring(
  MIN_LOCAL_SPACE_MB,
  work_dir,
@@ -194,11 +222,22 @@
  cleanup_func=lambda: _cleanup_torch_dir(torch_dir),
  )
 
- logger.info("Cache load complete")
+ # Calculate final cache size for logging
+ final_size_mb = (
+ sum(f.stat().st_size for f in torch_dir.rglob("*") if f.is_file())
+ / (1024 * 1024)
+ if torch_dir.exists()
+ else 0
+ )
+ logger.info(
+ f"[LOADING] Cache load completed successfully (final size: {final_size_mb:.2f} MB)"
+ )
  return OperationStatus.SUCCESS
 
  except CacheOperationInterrupted as e:
- logger.warning(f"Cache load interrupted: {e}")
+ logger.warning(
+ f"[LOADING] Cache load interrupted due to insufficient disk space: {e}"
+ )
  return OperationStatus.ERROR
 
 
@@ -236,7 +275,7 @@ def save_compile_cache() -> OperationStatus:
 
  # Check if anything to save
  if not torch_dir.exists() or not any(torch_dir.iterdir()):
- logger.info("No torch cache to save")
+ logger.info(f"[SAVING] No torch cache found at {torch_dir} to save")
  return OperationStatus.SKIPPED
 
  cache_filename = get_cache_filename()
@@ -244,18 +283,30 @@
 
  # Check for existing cache first (early exit)
  if final_file.exists():
- logger.info("Cache already exists in b10fs, skipping save")
+ file_size_mb = final_file.stat().st_size / (1024 * 1024)
+ logger.info(
+ f"[SAVING] Cache already exists in b10fs at {final_file} (size: {file_size_mb:.2f} MB), skipping save"
+ )
  return OperationStatus.SKIPPED
 
  with tempfile.NamedTemporaryFile(
  suffix=CACHE_FILE_EXTENSION, dir=work_dir, delete=False
  ) as f:
  local_temp = Path(f.name)
- logger.debug(f"Created local temp file for archive: {local_temp}")
+ # Calculate source cache size for logging
+ source_size_mb = sum(
+ f.stat().st_size for f in torch_dir.rglob("*") if f.is_file()
+ ) / (1024 * 1024)
+ logger.info(
+ f"[SAVING] Created local temp file for archive: {local_temp} (source cache size: {source_size_mb:.2f} MB)"
+ )
 
  try:
  with temp_file_cleanup(local_temp):
  # Phase 1: Compression with space monitoring
+ logger.info(
+ f"[SAVING] Phase 1: Compressing torch cache directory ({torch_dir} -> {local_temp}, max size: {MAX_CACHE_SIZE_MB} MB)"
+ )
  _run_with_space_monitoring(
  REQUIRED_B10FS_SPACE_MB,
  b10fs_dir,
@@ -265,21 +316,30 @@
  )
 
  # Phase 2: Copy to b10fs with locking
+ compressed_size_mb = local_temp.stat().st_size / (1024 * 1024)
+ logger.info(
+ f"[SAVING] Phase 2: Copying compressed archive to b10fs ({local_temp} -> {temp_file}, size: {compressed_size_mb:.2f} MB)"
+ )
  _transfer_with_b10fs_lock(
  str(local_temp), str(temp_file), "copy_in", cleanup_on_failure=True
  )
 
  # Phase 3: Atomic rename (fast, don't interrupt)
  logger.info(
- f"Renaming temp file to final cache file: {temp_file} -> {final_file}"
+ f"[SAVING] Phase 3: Atomically renaming temp file to final cache file: {temp_file} -> {final_file}"
  )
  temp_file.rename(final_file)
 
- logger.info("Cache save complete")
+ final_file_size_mb = final_file.stat().st_size / (1024 * 1024)
+ logger.info(
+ f"[SAVING] Cache save completed successfully (final file: {final_file}, size: {final_file_size_mb:.2f} MB)"
+ )
  return OperationStatus.SUCCESS
 
  except CacheOperationInterrupted as e:
- logger.warning(f"Cache save interrupted: {e}")
+ logger.warning(
+ f"[SAVING] Cache save interrupted due to insufficient disk space: {e}"
+ )
  return OperationStatus.ERROR
 
 
@@ -299,10 +359,23 @@ def clear_local_cache() -> bool:
  """
  torch_dir = Path(TORCH_CACHE_DIR)
  if not torch_dir.exists():
+ logger.info(
+ f"[CLEARING] No torch cache directory found at {torch_dir}, nothing to clear"
+ )
  return True
+
+ # Calculate size before clearing for logging
+ size_mb = sum(f.stat().st_size for f in torch_dir.rglob("*") if f.is_file()) / (
+ 1024 * 1024
+ )
+ logger.info(
+ f"[CLEARING] Removing torch cache directory: {torch_dir} (size: {size_mb:.2f} MB)"
+ )
+
  import shutil
 
  shutil.rmtree(torch_dir)
+ logger.info(f"[CLEARING] Successfully cleared torch cache directory: {torch_dir}")
  return True
 
 
@@ -326,6 +399,8 @@ def _cache_compression_worker(
  # Import here to avoid issues with multiprocessing
  from .archive import create_archive
 
+ # Note: We can't use the main logger here due to multiprocessing
+ # The create_archive function should handle its own logging
  create_archive(torch_dir, local_temp, max_size_mb)
 
 
@@ -336,9 +411,11 @@ def _cleanup_torch_dir(torch_dir: Path) -> None:
  import shutil
 
  shutil.rmtree(torch_dir)
- logger.debug(f"Cleaned up torch directory: {torch_dir}")
+ logger.info(
+ f"[CLEANUP] Successfully cleaned up torch directory: {torch_dir}"
+ )
  except Exception as e:
- logger.error(f"Failed to cleanup torch directory {torch_dir}: {e}")
+ logger.error(f"[CLEANUP] Failed to cleanup torch directory {torch_dir}: {e}")
 
 
  @worker_process("Extraction was cancelled before starting")
@@ -358,4 +435,6 @@ def _cache_extract_worker(archive_path_str: str, dest_dir_str: str) -> None:
  # Import here to avoid issues with multiprocessing
  from .archive import extract_archive
 
+ # Note: We can't use the main logger here due to multiprocessing
+ # The extract_archive function should handle its own logging
  extract_archive(archive_path, dest_dir)
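The hunks above add size and phase details to the log output without changing the public cache API: load_compile_cache() and save_compile_cache() still return an OperationStatus, and clear_local_cache() still returns a bool. A rough usage sketch; the import path is an assumption (only part of __all__ is visible in this diff) and the status handling is illustrative:

    # Sketch only: assumes the cache entry points are importable from the package root.
    from b10_transfer import load_compile_cache, save_compile_cache, OperationStatus

    status = load_compile_cache()  # copy from b10fs, then extract into the torch cache dir
    if status == OperationStatus.ERROR:
        print("cache load failed or was interrupted; continuing with a cold cache")

    # ... run the torch.compile workload ...

    if save_compile_cache() == OperationStatus.SUCCESS:
        print("compile cache archived back to b10fs")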
@@ -0,0 +1,124 @@
+ # src/b10_tcache/cli.py
+ from __future__ import annotations
+
+ import logging
+ import os
+ import sys
+ import time
+ import urllib.error
+ import urllib.request
+ from dataclasses import dataclass
+
+
+ @dataclass(frozen=True)
+ class WaitCfg:
+ url: str
+ timeout_s: float
+ interval_s: float
+ loglevel: str
+
+
+ DEFAULT_URL = os.getenv("B10_TRANSFER_VLLM_URL", "http://127.0.0.1:8000/v1/models")
+ DEFAULT_TIMEOUT_S = float(os.getenv("B10_TRANSFER_TIMEOUT_S", "1800")) # 30m default
+ DEFAULT_INTERVAL_S = float(os.getenv("B10_TRANSFER_INTERVAL_S", "2"))
+ DEFAULT_LOGLEVEL = os.getenv("B10_TRANSFER_CLI_LOGLEVEL", "INFO").upper()
+
+
+ VLLM_CACHE_DIR = os.getenv("VLLM_CACHE_ROOT", "~/.cache/vllm")
+
+
+ def _setup_logging(level: str) -> logging.Logger:
+ logging.basicConfig(
+ level=getattr(logging, level, logging.INFO),
+ format="%(asctime)s | %(levelname)s | %(message)s",
+ )
+ return logging.getLogger("b10_transfer.cli")
+
+
+ def _http_ok(url: str, logger: logging.Logger) -> bool:
+ """
+ Return True if vLLM readiness looks good.
+
+ We consider it 'ready' if GET <url> returns 200.
+ """
+ try:
+ req = urllib.request.Request(url, method="GET")
+ with urllib.request.urlopen(req, timeout=5) as resp:
+ if resp.status != 200:
+ return False
+ return True
+ except (urllib.error.URLError, urllib.error.HTTPError) as e:
+ logger.debug("Readiness probe failed: %s", e)
+ return False
+ except Exception as e:
+ logger.debug("Unexpected readiness error: %s", e)
+ return False
+
+
+ def _wait_for_ready(cfg: WaitCfg, logger: logging.Logger) -> bool:
+ t0 = time.monotonic()
+ logger.info(
+ "Waiting for vLLM readiness at %s (timeout=%.0fs, interval=%.1fs)",
+ cfg.url,
+ cfg.timeout_s,
+ cfg.interval_s,
+ )
+
+ while True:
+ if _http_ok(cfg.url, logger):
+ logger.info("vLLM reported ready at %s", cfg.url)
+ return True
+ if time.monotonic() - t0 > cfg.timeout_s:
+ logger.error(
+ "Timed out after %.0fs waiting for vLLM readiness.", cfg.timeout_s
+ )
+ return False
+
+ time.sleep(cfg.interval_s)
+
+
+ def main() -> None:
+ # Configure torch compile cache location
+ os.environ["TORCHINDUCTOR_CACHE_DIR"] = VLLM_CACHE_DIR
+
+ # Import here to allow environment variables to be set before the imported script uses them
+ from cache import load_compile_cache, save_compile_cache
+
+ cfg = WaitCfg(
+ url=DEFAULT_URL,
+ timeout_s=DEFAULT_TIMEOUT_S,
+ interval_s=DEFAULT_INTERVAL_S,
+ loglevel=DEFAULT_LOGLEVEL,
+ )
+
+ logger = _setup_logging(cfg.loglevel)
+
+ # 1) Preload any existing cache (non-fatal on error)
+ try:
+ logger.info("Calling load_compile_cache() …")
+ load_compile_cache()
+ logger.info("load_compile_cache() returned.")
+ except Exception as e:
+ logger.exception("load_compile_cache() failed: %s", e)
+
+ # 2) Wait for vLLM HTTP to be ready
+ try:
+ ready = _wait_for_ready(cfg, logger)
+ except Exception as e:
+ logger.exception("Readiness wait crashed: %s", e)
+ sys.exit(3)
+
+ if not ready:
+ # Loop timed out. Safe exit.
+ sys.exit(4)
+
+ # 3) Save compile cache
+ try:
+ logger.info("Calling save_compile_cache() …")
+ save_compile_cache()
+ logger.info("save_compile_cache() completed.")
+ except Exception as e:
+ logger.exception("save_compile_cache() failed: %s", e)
+ sys.exit(5)
+
+ logger.info("vLLM automatic torch compile cache done.")
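The CLI takes no arguments; it is configured entirely through the environment variables read at module import (B10_TRANSFER_VLLM_URL, B10_TRANSFER_TIMEOUT_S, B10_TRANSFER_INTERVAL_S, B10_TRANSFER_CLI_LOGLEVEL, and VLLM_CACHE_ROOT for the torch cache location). A rough sketch of driving it programmatically; the values are examples rather than defaults, and setting them before the import matters because the DEFAULT_* constants are evaluated at import time:

    # Sketch only: illustrative values for the environment variables the CLI reads.
    import os

    os.environ["B10_TRANSFER_VLLM_URL"] = "http://127.0.0.1:8000/v1/models"
    os.environ["B10_TRANSFER_TIMEOUT_S"] = "900"   # give vLLM 15 minutes to come up
    os.environ["B10_TRANSFER_INTERVAL_S"] = "5"    # poll readiness every 5 seconds
    os.environ["B10_TRANSFER_CLI_LOGLEVEL"] = "DEBUG"

    from b10_transfer.cache_cli import main  # entry-point target from [tool.poetry.scripts]

    main()  # load cache, wait for the readiness URL to return 200, then save cache; exits non-zero on failure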
@@ -18,8 +18,9 @@ from .constants import (
  CLEANUP_INCOMPLETE_TIMEOUT_SECONDS,
  )
  from .utils import safe_execute, safe_unlink
+ from .logging_utils import get_b10_logger
 
- logger = logging.getLogger(__name__)
+ logger = get_b10_logger(__name__)
 
 
  @safe_execute("Failed to find stale files", [])
@@ -48,14 +49,14 @@ def _find_stale_files(
  # Skip directories - we only want files
  if not file_path.is_file():
  logger.warning(
- f"Found non-file in b10fs cache directory: {file_path}, skipping consideration for deletion in cleanup phase."
+ f"[CLEANUP] Found non-file in b10fs cache directory: {file_path}, skipping consideration for deletion in cleanup phase."
  )
  continue
 
  # Check if filename matches pattern for the type of file we're looking for
  if not fnmatch.fnmatch(file_path.name, pattern):
  logger.warning(
- f"Found non-matching file in b10fs cache directory: {file_path}, skipping consideration for deletion in cleanup phase."
+ f"[CLEANUP] Found non-matching file in b10fs cache directory: {file_path}, skipping consideration for deletion in cleanup phase."
  )
  continue
 
@@ -91,7 +92,7 @@ def _cleanup_files(files: List[Path], file_type: str) -> int:
  )
  cleaned_count += 1
  logger.debug(
- f"Cleaned stale {file_type} file: {file_path.name} (age: {file_age:.1f}s)"
+ f"[CLEANUP] Cleaned stale {file_type} file: {file_path.name} (age: {file_age:.1f}s)"
  )
  except OSError:
  # File might have been deleted by another pod
@@ -120,7 +121,7 @@ def cooperative_cleanup_b10fs() -> None:
  """
  b10fs_dir = Path(B10FS_CACHE_DIR)
  if not b10fs_dir.exists():
- logger.debug("b10fs cache directory doesn't exist, skipping cleanup")
+ logger.debug("[CLEANUP] b10fs cache directory doesn't exist, skipping cleanup")
  return
 
  # Find and clean stale lock files
@@ -138,11 +139,11 @@ def cooperative_cleanup_b10fs() -> None:
  total_cleaned = cleaned_locks + cleaned_incomplete
  if total_cleaned > 0:
  logger.info(
- f"Cooperative cleanup: removed {cleaned_locks} stale locks, "
+ f"[CLEANUP] Cooperative cleanup completed: removed {cleaned_locks} stale locks, "
  f"{cleaned_incomplete} incomplete files"
  )
  else:
- logger.debug("Cooperative cleanup: no stale files found")
+ logger.debug("[CLEANUP] Cooperative cleanup completed: no stale files found")
 
 
  def get_cleanup_info() -> dict:
@@ -26,8 +26,9 @@ from .constants import (
  MIN_LOCAL_SPACE_MB,
  OperationStatus,
  )
+ from .logging_utils import get_b10_logger
 
- logger = logging.getLogger(__name__)
+ logger = get_b10_logger(__name__)
 
 
  @timed_fn(logger=logger, name="Transferring file")
@@ -59,7 +60,7 @@ def transfer(source: str, dest: str) -> OperationStatus:
 
  # Validate source file exists
  if not source_path.exists():
- logger.error(f"Source file does not exist: {source}")
+ logger.error(f"[TRANSFER] Source file does not exist: {source}")
  return OperationStatus.ERROR
 
  # Create destination directory if it doesn't exist
@@ -71,19 +72,19 @@ def transfer(source: str, dest: str) -> OperationStatus:
  # Transferring to b10fs - use b10fs space requirements
  space_threshold_mb = REQUIRED_B10FS_SPACE_MB
  logger.debug(
- f"Transfer to b10fs detected, using {space_threshold_mb:.1f}MB threshold"
+ f"[TRANSFER] Transfer to b10fs detected, using {space_threshold_mb:.1f}MB threshold"
  )
  else:
  # Transferring to local directory - use local space requirements
  space_threshold_mb = MIN_LOCAL_SPACE_MB
  logger.debug(
- f"Transfer to local directory detected, using {space_threshold_mb:.1f}MB threshold"
+ f"[TRANSFER] Transfer to local directory detected, using {space_threshold_mb:.1f}MB threshold"
  )
 
  # Initial disk space check
  check_sufficient_disk_space(dest_dir, space_threshold_mb, "file transfer")
  logger.debug(
- f"Initial space check passed: {space_threshold_mb:.1f}MB required at destination"
+ f"[TRANSFER] Initial space check passed: {space_threshold_mb:.1f}MB required at destination"
  )
 
  # Start background space monitoring for destination directory
@@ -92,7 +93,7 @@ def transfer(source: str, dest: str) -> OperationStatus:
 
  try:
  # Run monitored copy process
- logger.info(f"Starting transfer: {source} -> {dest}")
+ logger.info(f"[TRANSFER] Starting file transfer: {source} -> {dest}")
  run_monitored_process(
  _cache_copy_worker,
  (str(source_path), str(dest_path)),
@@ -103,11 +104,15 @@ def transfer(source: str, dest: str) -> OperationStatus:
  ),
  )
 
- logger.info("File transfer complete")
+ logger.info(
+ f"[TRANSFER] File transfer completed successfully: {source} -> {dest}"
+ )
  return OperationStatus.SUCCESS
 
  except CacheOperationInterrupted as e:
- logger.warning(f"File transfer interrupted: {e}")
+ logger.warning(
+ f"[TRANSFER] File transfer interrupted due to insufficient disk space: {e}"
+ )
  return OperationStatus.ERROR
 
  finally:
@@ -18,7 +18,9 @@ except ImportError:
  torch = None
  TORCH_AVAILABLE = False
 
- logger = logging.getLogger(__name__)
+ from .logging_utils import get_b10_logger
+
+ logger = get_b10_logger(__name__)
 
  KEY_LENGTH = 16
  UNKNOWN_HOSTNAME = "unknown-host"
@@ -85,10 +87,12 @@ def get_environment_key() -> str:
  return hashlib.sha256(node_json.encode("utf-8")).hexdigest()[:KEY_LENGTH]
 
  except (ImportError, RuntimeError, AssertionError) as e:
- logger.error(f"GPU environment unavailable: {e}")
+ logger.error(f"[ENVIRONMENT] GPU environment unavailable: {e}")
  raise RuntimeError(f"Cannot generate environment key: {e}") from e
  except Exception as e:
- logger.error(f"Unexpected error during environment key generation: {e}")
+ logger.error(
+ f"[ENVIRONMENT] Unexpected error during environment key generation: {e}"
+ )
  raise RuntimeError(f"Environment key generation failed: {e}") from e
 
 
@@ -13,7 +13,9 @@ from .constants import (
  )
  from .utils import safe_execute, _is_b10fs_enabled
 
- logger = logging.getLogger(__name__)
+ from .logging_utils import get_b10_logger
+
+ logger = get_b10_logger(__name__)
 
 
  @safe_execute("Failed to calculate local cache size", None)
@@ -0,0 +1,117 @@
+ """Centralized logging utilities for b10-transfer package with colored output."""
+
+ import logging
+ from typing import Optional
+
+
+ class ColoredFormatter(logging.Formatter):
+ """Custom formatter that adds colors and b10-transfer prefix to log messages."""
+
+ # ANSI color codes
+ COLORS = {
+ "cyan": "\033[96m",
+ "green": "\033[92m",
+ "red": "\033[91m",
+ "yellow": "\033[93m",
+ "reset": "\033[0m",
+ }
+
+ def format(self, record):
+ # Add the b10-transfer prefix to the message
+ original_msg = record.getMessage()
+
+ # Determine color based on log level and message content
+ color = self._get_message_color(record, original_msg)
+
+ # Format the message with color and prefix
+ colored_msg = f"{self.COLORS[color]}[b10-transfer log] {original_msg}{self.COLORS['reset']}"
+
+ # Temporarily replace the message for formatting
+ record.msg = colored_msg
+ record.args = ()
+
+ # Use the parent formatter
+ formatted = super().format(record)
+
+ return formatted
+
+ def _get_message_color(self, record, message: str) -> str:
+ """Determine the appropriate color for the log message."""
+ # Red for errors and failures
+ if record.levelno >= logging.ERROR:
+ return "red"
+
+ # Red for warning messages that indicate failures
+ if record.levelno == logging.WARNING and any(
+ keyword in message.lower()
+ for keyword in ["failed", "error", "interrupted", "cancelled", "abort"]
+ ):
+ return "red"
+
+ # Green for success messages
+ if any(
+ keyword in message.lower()
+ for keyword in [
+ "completed successfully",
+ "success",
+ "complete",
+ "finished",
+ "saved",
+ "loaded",
+ "extracted",
+ "compressed",
+ "transferred",
+ "cleared successfully",
+ ]
+ ):
+ return "green"
+
+ # Default to cyan
+ return "cyan"
+
+
+ def get_b10_logger(name: str) -> logging.Logger:
+ """Get a logger configured with b10-transfer colored formatting.
+
+ Args:
+ name: The logger name (typically __name__)
+
+ Returns:
+ Logger configured with colored b10-transfer formatting
+ """
+ logger = logging.getLogger(name)
+
+ # Only add handler if it doesn't already exist
+ if not any(
+ isinstance(h, logging.StreamHandler)
+ and isinstance(h.formatter, ColoredFormatter)
+ for h in logger.handlers
+ ):
+ # Create handler with colored formatter
+ handler = logging.StreamHandler()
+ formatter = ColoredFormatter("%(levelname)s - %(message)s")
+ handler.setFormatter(formatter)
+
+ # Add handler to logger
+ logger.addHandler(handler)
+ logger.setLevel(logging.INFO)
+
+ # Prevent duplicate messages from parent loggers
+ logger.propagate = False
+
+ return logger
+
+
+ def log_success(logger: logging.Logger, message: str):
+ """Log a success message that will be colored green."""
+ logger.info(message)
+
+
+ def log_failure(logger: logging.Logger, message: str, level: int = logging.ERROR):
+ """Log a failure message that will be colored red."""
+ logger.log(level, message)
+
+
+ def log_info(logger: logging.Logger, message: str):
+ """Log an info message that will be colored cyan."""
+ logger.info(message)
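The formatter chooses a color from the message text as well as the level, so two INFO calls can render differently. A small behavioral sketch against the module above; the messages are illustrative:

    # Sketch only: exercises the keyword-based coloring implemented in ColoredFormatter above.
    from b10_transfer.logging_utils import get_b10_logger

    log = get_b10_logger("demo")
    log.info("Phase 1: compressing torch cache")     # no success keyword -> cyan
    log.info("Cache save completed successfully")    # matches a success keyword -> green
    log.warning("File transfer interrupted")         # warning with a failure keyword -> red
    log.error("Unexpected error during cleanup")     # ERROR level -> red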
@@ -14,8 +14,9 @@ from multiprocessing import Process, Queue
  from functools import wraps
 
  from .constants import WorkerStatus, SPACE_MONITOR_CHECK_INTERVAL_SECONDS
+ from .logging_utils import get_b10_logger
 
- logger = logging.getLogger(__name__)
+ logger = get_b10_logger(__name__)
 
 
  class CacheOperationInterrupted(Exception):
@@ -161,7 +162,7 @@ class CacheSpaceMonitor:
  self.thread = threading.Thread(target=self._monitor, daemon=True)
  self.thread.start()
  logger.debug(
- f"Started space monitor for {self.path} (required: {self.required_space_mb:.1f}MB)"
+ f"[MONITORING] Started space monitor for {self.path} (required: {self.required_space_mb:.1f}MB)"
  )
 
  def _monitor(self) -> None:
@@ -170,18 +171,18 @@ class CacheSpaceMonitor:
  try:
  available_mb = get_available_disk_space_mb(self.path)
  logger.debug(
- f"[SpaceMonitor] Available space: {available_mb:.1f}MB (required: {self.required_space_mb:.1f}MB)"
+ f"[MONITORING] Available space: {available_mb:.1f}MB (required: {self.required_space_mb:.1f}MB)"
  )
 
  if available_mb < self.required_space_mb:
  logger.error(
- f"CRITICAL: Space ({available_mb:.1f}MB) below required {self.required_space_mb:.1f}MB. Signaling stop!"
+ f"[MONITORING] CRITICAL: Space ({available_mb:.1f}MB) below required {self.required_space_mb:.1f}MB. Signaling stop!"
  )
  self.stop_operation.set()
  break
 
  except Exception as e:
- logger.warning(f"Space monitor error: {e}")
+ logger.warning(f"[MONITORING] Space monitor error: {e}")
 
  time.sleep(self.check_interval)
 
@@ -197,7 +198,7 @@
  """Stop the background monitoring thread."""
  self.stop_operation.set()
  if self.thread is not None:
- logger.debug("Stopped space monitor")
+ logger.debug("[MONITORING] Stopped space monitor")
 
 
  def cleanup_process(
@@ -217,7 +218,7 @@ def cleanup_process(
  process.terminate()
  process.join(timeout=timeout)
  if process.is_alive():
- logger.warning(f"Force killing {operation_name} process")
+ logger.warning(f"[MONITORING] Force killing {operation_name} process")
  process.kill()
  process.join()
 
@@ -260,7 +261,9 @@
  # Monitor the process
  while process.is_alive():
  if space_monitor.should_stop():
- logger.warning(f"Low disk space detected, cancelling {operation_name}")
+ logger.warning(
+ f"[MONITORING] Low disk space detected, cancelling {operation_name}"
+ )
  stop_event.set()
  cleanup_process(process, operation_name)
 
@@ -280,7 +283,9 @@
  if not result_queue.empty():
  status, error_msg = result_queue.get()
  if status == WorkerStatus.ERROR.value:
- logger.error(f"{operation_name} worker failed: {error_msg}")
+ logger.error(
+ f"[MONITORING] {operation_name} worker failed: {error_msg}"
+ )
  raise Exception(error_msg)
  elif status == WorkerStatus.CANCELLED.value:
  if cleanup_func:
@@ -288,12 +293,12 @@
  raise CacheOperationInterrupted(error_msg)
  # status == WorkerStatus.SUCCESS.value - continue normally
 
- logger.debug(f"{operation_name} completed successfully")
+ logger.debug(f"[MONITORING] {operation_name} completed successfully")
 
  except Exception as e:
  # Ensure process is cleaned up
  cleanup_process(process, operation_name)
 
  if not isinstance(e, CacheOperationInterrupted):
- logger.error(f"{operation_name} failed: {e}")
+ logger.error(f"[MONITORING] {operation_name} failed: {e}")
  raise
@@ -6,7 +6,9 @@ from pathlib import Path
  from contextlib import contextmanager
  from typing import Generator, Any
 
- logger = logging.getLogger(__name__)
+ from .logging_utils import get_b10_logger
+
+ logger = get_b10_logger(__name__)
 
  # Lock file settings
  LOCK_WAIT_SLEEP_SECONDS = 1.0 # How long to wait between lock file checks
@@ -137,7 +139,7 @@ def apply_cap(value: int, cap: int, name: str) -> int:
  """
  if value > cap:
  logger.warning(
- f"{name} capped at {cap} (requested {value}) for security/stability"
+ f"[UTILS] {name} capped at {cap} (requested {value}) for security/stability"
  )
  return cap
  return value
@@ -159,11 +161,11 @@ def timed_fn(logger=logger, name=None):
 
  def decorator(fn):
  def wrapper(*args, **kwargs):
- logger.info(f"{name or fn.__name__} started")
+ logger.info(f"[TIMING] {name or fn.__name__} started")
  start = time.perf_counter()
  result = fn(*args, **kwargs)
  logger.info(
- f"{name or fn.__name__} finished in {time.perf_counter() - start:.2f}s"
+ f"[TIMING] {name or fn.__name__} finished in {time.perf_counter() - start:.2f}s"
  )
  return result
 
@@ -193,7 +195,7 @@ def safe_execute(error_message: str, default_return: Any = None):
  try:
  return func(*args, **kwargs)
  except Exception as e:
- logger.error(f"{error_message}: {e}")
+ logger.error(f"[ERROR] {error_message}: {e}")
  return default_return
 
  return wrapper
@@ -233,7 +235,7 @@ def critical_section_b10fs_file_lock(name):
 
  lock_file = lock_dir / f"{name}.lock"
  while lock_file.exists():
- logger.debug("Waiting for lock file to be released...")
+ logger.debug("[LOCKING] Waiting for lock file to be released...")
  time.sleep(LOCK_WAIT_SLEEP_SECONDS)
 
  try:
@@ -267,9 +269,9 @@ def safe_unlink(
  try:
  file_path.unlink(missing_ok=True)
  if success_message:
- logger.debug(success_message)
+ logger.debug(f"[UTILS] {success_message}")
  except Exception as e:
- logger.error(f"{error_message}: {e}")
+ logger.error(f"[UTILS] {error_message}: {e}")
 
 
  @contextmanager
@@ -348,8 +350,8 @@ def cache_operation(operation_name: str) -> Generator[None, None, None]:
  _validate_b10fs_available()
  yield
  except CacheValidationError as e:
- logger.debug(f"{operation_name} failed: {e}")
+ logger.debug(f"[OPERATION] {operation_name} failed: {e}")
  raise
  except Exception as e:
- logger.debug(f"{operation_name} failed: {e}")
+ logger.debug(f"[OPERATION] {operation_name} failed: {e}")
  raise
File without changes