b10_transfer-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- b10_transfer/__init__.py +51 -0
- b10_transfer/archive.py +175 -0
- b10_transfer/async_torch_cache.py +62 -0
- b10_transfer/async_transfers.py +275 -0
- b10_transfer/cleanup.py +179 -0
- b10_transfer/constants.py +149 -0
- b10_transfer/core.py +160 -0
- b10_transfer/environment.py +134 -0
- b10_transfer/info.py +172 -0
- b10_transfer/space_monitor.py +299 -0
- b10_transfer/torch_cache.py +376 -0
- b10_transfer/utils.py +355 -0
- b10_transfer-0.0.1.dist-info/METADATA +219 -0
- b10_transfer-0.0.1.dist-info/RECORD +15 -0
- b10_transfer-0.0.1.dist-info/WHEEL +4 -0
b10_transfer/utils.py
ADDED
@@ -0,0 +1,355 @@
import os
import time
import logging
import getpass
from pathlib import Path
from contextlib import contextmanager
from typing import Generator, Any

logger = logging.getLogger(__name__)

# Lock file settings
LOCK_WAIT_SLEEP_SECONDS = 1.0  # How long to wait between lock file checks


class CacheError(Exception):
    """Base cache operation error."""

    pass


class CacheValidationError(CacheError):
    """Path validation or compatibility check failed."""

    pass


def get_current_username() -> str:
    """
    Get the current username using getpass.getuser().

    This uses the same method as PyTorch for consistency.

    Returns:
        str: Current username.

    Raises:
        RuntimeError: If unable to determine the current username.
    """
    try:
        return getpass.getuser()
    except Exception as e:
        raise RuntimeError(f"Unable to determine current username: {e}") from e


def validate_path_security(
    path: str,
    allowed_prefixes: list[str],
    name: str,
    exception_class: type = EnvironmentError,
) -> str:
    """
    Validate that a path is secure and within allowed directory prefixes.

    This function prevents directory traversal attacks and ensures paths
    are within expected locations for security. It handles symlinks like
    macOS /tmp -> /private/tmp by resolving both path and prefixes.

    Args:
        path: The path string to validate.
        allowed_prefixes: List of allowed directory prefix strings.
        name: Name of the configuration for error messages.
        exception_class: Exception class to raise on validation failure.
            Defaults to EnvironmentError.

    Returns:
        str: The validated resolved path.

    Raises:
        exception_class: If path is outside allowed prefixes or contains
            unsafe components.
    """
    if not path:
        raise exception_class(f"{name} cannot be empty")

    # Convert to Path and resolve to handle symlinks and relative paths
    try:
        resolved_path = str(Path(path).resolve())
    except (OSError, ValueError) as e:
        raise exception_class(f"{name} path resolution failed: {e}")

    # Check for directory traversal attempts
    if ".." in path or path != path.strip():
        raise exception_class(f"{name} contains unsafe path components: {path}")

    # Validate against allowed prefixes
    # Handle symlinks like macOS /tmp -> /private/tmp by checking both resolved and canonical forms
    path_matches = False
    for prefix in allowed_prefixes:
        # Check resolved path against resolved prefix
        try:
            resolved_prefix = str(Path(prefix).resolve())
            if resolved_path.startswith(resolved_prefix):
                path_matches = True
                break
        except (OSError, ValueError):
            # If prefix resolution fails, fall back to string comparison
            if resolved_path.startswith(prefix):
                path_matches = True
                break

    if not path_matches:
        raise exception_class(
            f"{name} path '{resolved_path}' must start with one of: {allowed_prefixes}"
        )

    return resolved_path


def validate_boolean_env(env_var: str, name: str) -> str:
    """
    Validate that an environment variable contains a safe boolean-like value.

    Args:
        env_var: The environment variable value to validate.
        name: Name of the configuration for error messages.

    Returns:
        str: The validated environment variable value.

    Raises:
        CacheValidationError: If the value is not a recognized boolean string.
    """
    valid_values = {"0", "1", "true", "false", "True", "False", ""}
    if env_var not in valid_values:
        raise CacheValidationError(
            f"{name} must be one of {valid_values}, got: {env_var}"
        )
    return env_var


def apply_cap(value: int, cap: int, name: str) -> int:
    """
    Apply a security cap to user-provided values.

    Not amazing (doesn't prevent the user from modifying the pip package
    source code), but it at least prevents an accidental environment variable
    setting from causing resource exhaustion.
    """
    if value > cap:
        logger.warning(
            f"{name} capped at {cap} (requested {value}) for security/stability"
        )
        return cap
    return value


def timed_fn(logger=logger, name=None):
    """Decorator to log function execution time.

    This decorator logs when a function starts and finishes, including the
    total execution time in seconds.

    Args:
        logger: Logger instance to use for logging. Defaults to module logger.
        name: Custom name to use in log messages. If None, uses function name.

    Returns:
        Decorator function that wraps the target function with timing logic.
    """

    def decorator(fn):
        def wrapper(*args, **kwargs):
            logger.info(f"{name or fn.__name__} started")
            start = time.perf_counter()
            result = fn(*args, **kwargs)
            logger.info(
                f"{name or fn.__name__} finished in {time.perf_counter() - start:.2f}s"
            )
            return result

        return wrapper

    return decorator


def safe_execute(error_message: str, default_return: Any = None):
    """Decorator to safely execute a function with error handling.

    This decorator catches all exceptions from the wrapped function and logs
    them with a custom error message, then returns a default value instead
    of propagating the exception.

    Args:
        error_message: Message to log when an exception occurs.
        default_return: Value to return if the function raises an exception.
            Defaults to None.

    Returns:
        Decorator function that wraps the target function with error handling.
    """

    def decorator(func):
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                logger.error(f"{error_message}: {e}")
                return default_return

        return wrapper

    return decorator


# TODO(SR): Make the 1-second sleep a configurable parameter + document what it does.
# FIXME(SR): There's a race condition here. If a process creates a lock file (say
# because it is copying the cache in or out) and the pod/replica crashes for whatever
# reason, the lock file will never be released. This is bad because a bunch of other
# replicas will then be blocked from doing anything (loading in the cache or saving
# out the cache). We either need to ENSURE that the lock file is released if the
# pod/replica crashes, or after a certain amount of time, OR enforce retry-timeout
# logic so that other replicas proceed with reading from/writing to the cache once
# they have been held up by the lock file for N attempts or seconds.
# Just a thought... need to think more + test this out.
def critical_section_b10fs_file_lock(name):
    """Decorator to ensure a critical section for b10fs file operations.

    This decorator ensures that the decorated function runs in a critical section
    where no other b10fs file operations can interfere. It uses a lock file to
    synchronize access.

    Args:
        name: The name of the operation, used for the lock file name.

    Returns:
        The decorated function with critical section handling.
    """

    def decorator(func):
        def wrapper(*args, **kwargs):
            # Import here to avoid circular dependency
            from .constants import B10FS_CACHE_DIR

            lock_dir = Path(B10FS_CACHE_DIR)
            lock_dir.mkdir(parents=True, exist_ok=True)

            lock_file = lock_dir / f"{name}.lock"
            while lock_file.exists():
                logger.debug("Waiting for lock file to be released...")
                time.sleep(LOCK_WAIT_SLEEP_SECONDS)

            try:
                lock_file.touch()
                return func(*args, **kwargs)
            finally:
                lock_file.unlink(missing_ok=True)

        return wrapper

    return decorator


def safe_unlink(
    file_path: Path, error_message: str, success_message: str = None
) -> None:
    """Safely unlink a file with dead-mount filesystem protection.

    This function attempts to delete a file while gracefully handling cases
    where the filesystem (like b10fs) becomes unavailable or dead during
    the operation. It uses missing_ok=True to handle missing files.

    Args:
        file_path: Path to the file to delete.
        error_message: Message to log if deletion fails.
        success_message: Optional message to log if deletion succeeds.

    Raises:
        No exceptions are raised; all errors are caught and logged.
    """
    try:
        file_path.unlink(missing_ok=True)
        if success_message:
            logger.debug(success_message)
    except Exception as e:
        logger.error(f"{error_message}: {e}")


@contextmanager
def temp_file_cleanup(temp_path: Path) -> Generator[Path, None, None]:
    """Context manager for a temporary file with automatic safe cleanup.

    This context manager ensures that temporary files are cleaned up even
    if the filesystem becomes unavailable during the operation. It uses
    safe_unlink to handle dead-mount scenarios gracefully.

    Args:
        temp_path: Path to the temporary file to manage.

    Yields:
        Path: The temporary file path for use within the context.

    Raises:
        Cleanup errors are handled gracefully and logged but not raised.
    """
    try:
        yield temp_path
    finally:
        safe_unlink(temp_path, f"Failed to delete temporary file {temp_path}")


def _is_b10fs_enabled() -> bool:
    """Check if the b10fs filesystem is enabled via environment variable.

    This function checks the BASETEN_FS_ENABLED environment variable to
    determine if the b10fs shared filesystem is available for cache operations.

    Returns:
        bool: True if BASETEN_FS_ENABLED is set to "1" or "True", False otherwise.
    """
    # Import here to avoid circular dependency
    from .constants import BASETEN_FS_ENABLED

    return BASETEN_FS_ENABLED in ("1", "True", "true")


def _validate_b10fs_available() -> None:
    """Validate that the b10fs filesystem is available for cache operations.

    This function checks if b10fs is enabled and raises an exception if not.
    It should be called before any operations that require b10fs access.

    Raises:
        CacheValidationError: If b10fs is not enabled (BASETEN_FS_ENABLED
            is not set to "1" or "True").
    """
    if not _is_b10fs_enabled():
        raise CacheValidationError(
            "b10fs is not enabled. Set BASETEN_FS_ENABLED=1 or BASETEN_FS_ENABLED=True to enable cache operations."
        )


@contextmanager
def cache_operation(operation_name: str) -> Generator[None, None, None]:
    """Context manager for cache operations with b10fs validation and error handling.

    This context manager validates that b10fs is available before executing
    cache operations and provides consistent error logging. It should wrap
    any operations that require b10fs access.

    Args:
        operation_name: Name of the operation for error logging (e.g., "Load", "Save").

    Yields:
        None: Context for the operation to execute.

    Raises:
        CacheValidationError: If b10fs is not available (re-raised after logging).
        Exception: Any other errors during the operation (re-raised after logging).
    """
    try:
        _validate_b10fs_available()
        yield
    except CacheValidationError as e:
        logger.debug(f"{operation_name} failed: {e}")
        raise
    except Exception as e:
        logger.debug(f"{operation_name} failed: {e}")
        raise
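The decorators above are meant to be stacked around transfer operations. A minimal usage sketch (hypothetical code, not shipped in the wheel; `save_to_b10fs` is an invented name):

```python
# Hypothetical usage sketch for the helpers above (not part of the package).
import logging

from b10_transfer.utils import (
    critical_section_b10fs_file_lock,
    safe_execute,
    timed_fn,
)

logging.basicConfig(level=logging.INFO)


@safe_execute("cache save failed", default_return=False)  # never raises; returns False on error
@timed_fn(name="save_to_b10fs")  # logs start/finish and elapsed seconds
@critical_section_b10fs_file_lock("save")  # serializes via <B10FS_CACHE_DIR>/save.lock
def save_to_b10fs() -> bool:
    ...  # archive + copy work would go here
    return True
```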
b10_transfer-0.0.1.dist-info/METADATA
ADDED
@@ -0,0 +1,219 @@
Metadata-Version: 2.3
Name: b10-transfer
Version: 0.0.1
Summary: Distributed PyTorch compilation cache for Baseten - Environment-aware, lock-free compilation cache management
License: MIT
Keywords: pytorch,torch.compile,cache,machine-learning,inference
Author: Shounak Ray
Author-email: shounak.noreply@baseten.co
Maintainer: Fred Liu
Maintainer-email: fred.liu.noreply@baseten.co
Requires-Python: >=3.9,<4.0
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Dist: torch (>=2.0.0)
Requires-Dist: triton (>=2.0.0)
Project-URL: Documentation, https://docs.baseten.co/development/model/b10-transfer
Project-URL: Homepage, https://docs.baseten.co/development/model/b10-transfer
Project-URL: Repository, https://pypi.org/project/b10-transfer/
Description-Content-Type: text/markdown

https://www.notion.so/ml-infra/mega-base-cache-24291d247273805b8e20fe26677b7b0f

# B10 Transfer

PyTorch compilation cache for Baseten deployments.

## Usage

### Synchronous Operations (Blocking)

```python
import b10_transfer
import torch

# Inside model.load() function
def load():
    # Load cache before torch.compile()
    status = b10_transfer.load_compile_cache()

    # ...

    # Your model compilation
    model = torch.compile(model)
    # Warm up the model with dummy prompts and arguments that would typically
    # be used in your requests (e.g. resolutions)
    dummy_input = "What is the capital of France?"
    model(dummy_input)

    # ...

    # Save cache after compilation (skipped if a cache was already loaded)
    if status != b10_transfer.LoadStatus.SUCCESS:
        b10_transfer.save_compile_cache()
```

### Asynchronous Operations (Non-blocking)

```python
import time

import b10_transfer
import torch

def load_with_async_cache():
    # Start async cache load (returns immediately with operation ID)
    operation_id = b10_transfer.load_compile_cache_async()

    # Check status periodically
    while not b10_transfer.is_transfer_complete(operation_id):
        status = b10_transfer.get_transfer_status(operation_id)
        print(f"Cache load status: {status.status}")
        time.sleep(1)

    # Get final status
    final_status = b10_transfer.get_transfer_status(operation_id)
    if final_status.status == b10_transfer.AsyncTransferStatus.SUCCESS:
        print("Cache loaded successfully!")

    # Your model compilation...
    model = torch.compile(model)

    # Async save
    save_op_id = b10_transfer.save_compile_cache_async()

    # You can continue with other work while the save happens in the background,
    # or wait for completion if needed
    b10_transfer.wait_for_completion(save_op_id, timeout=300)  # 5 minute timeout

# With progress callback
def on_progress(operation_id: str):
    status = b10_transfer.get_transfer_status(operation_id)
    print(f"Transfer {operation_id}: {status.status}")

operation_id = b10_transfer.load_compile_cache_async(progress_callback=on_progress)
```

### Generic Async Operations

You can also use the generic async system for custom transfer operations:

```python
import shutil
from pathlib import Path

import b10_transfer

def my_custom_callback(source: Path, dest: Path):
    # Your custom transfer logic here
    # This could be any file operation, compression, etc.
    shutil.copy2(source, dest)

# Start a generic async transfer
operation_id = b10_transfer.start_transfer_async(
    source=Path("/source/file.txt"),
    dest=Path("/dest/file.txt"),
    callback=my_custom_callback,
    operation_name="custom_file_copy",
    monitor_local=True,
    monitor_b10fs=False,
)

# Use the same progress tracking as torch cache operations
b10_transfer.wait_for_completion(operation_id)
```

## Configuration

Configure via environment variables:

```bash
# Cache directories
export TORCH_CACHE_DIR="/tmp/torchinductor_root"    # Default
export B10FS_CACHE_DIR="/cache/model/compile_cache" # Default
export LOCAL_WORK_DIR="/app"                        # Default

# Cache limits
export MAX_CACHE_SIZE_MB="1024" # 1GB default
```
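For reference, a sketch of how these variables are presumably consumed (names and defaults mirror the table above; `b10_transfer/constants.py` in the wheel is the authoritative source):

```python
# Illustrative only: how the documented defaults would map to environment reads.
import os

TORCH_CACHE_DIR = os.environ.get("TORCH_CACHE_DIR", "/tmp/torchinductor_root")
B10FS_CACHE_DIR = os.environ.get("B10FS_CACHE_DIR", "/cache/model/compile_cache")
LOCAL_WORK_DIR = os.environ.get("LOCAL_WORK_DIR", "/app")
MAX_CACHE_SIZE_MB = int(os.environ.get("MAX_CACHE_SIZE_MB", "1024"))
```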

## How It Works

### Environment-Specific Caching

The library automatically creates unique cache keys based on your environment (see the sketch after the component list below):

```
torch-2.1.0_cuda-12.1_cc-8.6_triton-2.1.0 → cache_a1b2c3d4e5f6.latest.tar.gz
torch-2.0.1_cuda-11.8_cc-7.5_triton-2.0.1 → cache_x9y8z7w6v5u4.latest.tar.gz
torch-2.1.0_cpu_triton-none → cache_m1n2o3p4q5r6.latest.tar.gz
```

**Components used:**
- **PyTorch version** (e.g., `torch-2.1.0`)
- **CUDA version** (e.g., `cuda-12.1` or `cpu`)
- **GPU compute capability** (e.g., `cc-8.6` for A100)
- **Triton version** (e.g., `triton-2.1.0` or `triton-none`)
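A sketch of how such a key could be derived from the components above (illustrative; the shipped logic lives in `b10_transfer/environment.py` and may differ in detail):

```python
import hashlib

import torch


def environment_key() -> str:
    # Collect the environment components listed above.
    parts = [f"torch-{torch.__version__}"]
    if torch.cuda.is_available():
        major, minor = torch.cuda.get_device_capability()
        parts += [f"cuda-{torch.version.cuda}", f"cc-{major}.{minor}"]
    else:
        parts.append("cpu")
    try:
        import triton

        parts.append(f"triton-{triton.__version__}")
    except ImportError:
        parts.append("triton-none")

    env = "_".join(parts)  # e.g. torch-2.1.0_cuda-12.1_cc-8.6_triton-2.1.0
    digest = hashlib.sha256(env.encode()).hexdigest()[:12]
    return f"cache_{digest}.latest.tar.gz"
```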

### Cache Workflow

1. **Load Phase** (startup): Generate environment key, check for matching cache in B10FS, extract to local directory
2. **Save Phase** (after compilation): Create archive, atomic copy to B10FS with environment-specific filename

### Lock-Free Race Prevention

Uses journal pattern with atomic filesystem operations for parallel-safe cache saves.
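In outline, the idea works like the following sketch (assumed mechanics, not the package's exact code): writers stage the archive under a unique temporary name, then publish it with a single atomic rename, so concurrent readers never observe a partially written file.

```python
import os
import uuid
from pathlib import Path


def publish_atomically(payload: bytes, dest: Path) -> None:
    # Stage under a unique name: partial writes are only visible here.
    tmp = dest.with_name(f"{dest.name}.{uuid.uuid4().hex}.incomplete")
    tmp.write_bytes(payload)
    # os.replace is atomic within a single POSIX filesystem, so `dest`
    # flips from absent/old content to the new archive in one step.
    os.replace(tmp, dest)
```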

## API Reference

### Synchronous Functions

- `load_compile_cache() -> LoadStatus`: Load cache from B10FS for current environment
- `save_compile_cache() -> SaveStatus`: Save cache to B10FS with environment-specific filename
- `clear_local_cache() -> bool`: Clear local cache directory
- `get_cache_info() -> Dict[str, Any]`: Get cache status information for current environment
- `list_available_caches() -> Dict[str, Any]`: List all cache files with environment details

### Generic Asynchronous Functions

- `start_transfer_async(source, dest, callback, operation_name, **kwargs) -> str`: Start any async transfer operation
- `get_transfer_status(operation_id: str) -> TransferProgress`: Get current status of async operation
- `is_transfer_complete(operation_id: str) -> bool`: Check if async operation has completed
- `wait_for_completion(operation_id: str, timeout=None) -> bool`: Wait for async operation to complete
- `cancel_transfer(operation_id: str) -> bool`: Attempt to cancel running operation
- `list_active_transfers() -> Dict[str, TransferProgress]`: Get all active transfer operations

### Torch Cache Async Functions

- `load_compile_cache_async(progress_callback=None) -> str`: Start async cache load, returns operation ID
- `save_compile_cache_async(progress_callback=None) -> str`: Start async cache save, returns operation ID

### Status Enums

- `LoadStatus`: SUCCESS, ERROR, DOES_NOT_EXIST, SKIPPED
- `SaveStatus`: SUCCESS, ERROR, SKIPPED
- `AsyncTransferStatus`: NOT_STARTED, IN_PROGRESS, SUCCESS, ERROR, INTERRUPTED, CANCELLED

### Data Classes

- `TransferProgress`: Contains operation_id, status, started_at, completed_at, error_message
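The shape implied by that field list, as a sketch (see `b10_transfer/async_transfers.py` for the actual definition; the timestamp types are assumptions):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class TransferProgress:
    operation_id: str
    status: "AsyncTransferStatus"  # one of the enum values listed above
    started_at: Optional[float] = None  # assumed to be a timestamp
    completed_at: Optional[float] = None
    error_message: Optional[str] = None
```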

### Exceptions

- `CacheError`: Base exception for cache operations
- `CacheValidationError`: Path validation or compatibility check failed
- `CacheOperationInterrupted`: Operation interrupted due to insufficient disk space

## Performance Impact

### Debugging

Enable debug logging:

```python
import logging
logging.getLogger('b10_transfer').setLevel(logging.DEBUG)
```
b10_transfer-0.0.1.dist-info/RECORD
ADDED
@@ -0,0 +1,15 @@
b10_transfer/__init__.py,sha256=o1ej-OtAOsfrJbvh5C3PnqxW2qfcO7l8rllVD-07lXE,1400
b10_transfer/archive.py,sha256=GKb0mi0-YeM7ch4FLAoOLHXw0T6LkRerYad2N2y9TYM,6400
b10_transfer/async_torch_cache.py,sha256=4hMjVR44SLlGes25e_cjgMTywFfIYjH0TnUmg9o-iyI,1903
b10_transfer/async_transfers.py,sha256=AAML562qYzF9NyX9AdfiJ0OcQw6vXr985IZWXZSot9Q,9083
b10_transfer/cleanup.py,sha256=xjKStmBjaarZPxhPTT1-Ds_pvUR7kdJw5Kp19BLvzzY,6224
b10_transfer/constants.py,sha256=KjSUO6heScDJXQwFlHdeNV4KBBqKz7CKeJzo44-9qMM,4745
b10_transfer/core.py,sha256=BOnA6FXkZRm74_CtQBMudpx3q7HTEGEORUV26fb6cvQ,5920
b10_transfer/environment.py,sha256=aC0biEMQrtHk0ke_3epdcq1X9J5fPmPpBVt0fH7XF2Y,5625
b10_transfer/info.py,sha256=I3iOuImZ5r6DMJTDeBtVvzlSn6IuyPJbLJYUO_OF0ks,6299
b10_transfer/space_monitor.py,sha256=5pwW643KAHI3mtT61hYf29953UD9LekzWFF1K-QeYbw,10529
b10_transfer/torch_cache.py,sha256=Oe_OeUPGAlmK9wY-L9w4aPaXOoMnL_kD596hew6ETcw,14192
b10_transfer/utils.py,sha256=Stee0DFK-8MRRYNIocqaK64cJvfs4jPW3Mpx7zkWV6Y,11932
b10_transfer-0.0.1.dist-info/METADATA,sha256=hESeWyidAEbtWkIgepBn1Cxlo9--jIj9vcLxM4zP7lY,7502
b10_transfer-0.0.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
b10_transfer-0.0.1.dist-info/RECORD,,