PyPI - unifiedefficientloader - Versions diffs - 0.2.2__tar.gz → 0.2.3__tar.gz - Mend

unifiedefficientloader 0.2.2.tar.gz → 0.2.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

{unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: unifiedefficientloader
-Version: 0.2.2
+Version: 0.2.3
 Summary: A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
 Author: silveroxides
 License: MIT
@@ -111,6 +111,33 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
             loader.mark_processed(key)
 ```
+### Direct-to-GPU Streaming (Zero-Copy)
+For the fastest loading on CUDA devices, pass `direct_gpu=True`. This builds a pipeline that pre-allocates a pool of pinned host buffers and a pool of GPU memory slabs. Each tensor is read from disk directly into a pinned buffer and then copied asynchronously to the GPU on a dedicated CUDA stream, so the PCIe transfer overlaps with disk I/O. `direct_gpu=True` forces `low_memory=True`, and the loader falls back to the CPU path with a warning when CUDA is unavailable.
+```python
+from unifiedefficientloader import UnifiedSafetensorsLoader
+with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
+    keys_to_load = loader.keys()
+    # async_stream automatically coordinates disk -> pinned buffer -> GPU slab -> tensor view
+    stream = loader.async_stream(
+        keys_to_load,
+        batch_size=8,
+        prefetch_batches=2,
+        direct_gpu=True # optional here since we passed it in __init__
+    )
+    for batch in stream:
+        for key, gpu_tensor in batch:
+            # gpu_tensor is already on the GPU!
+            assert gpu_tensor.device.type == "cuda"
+            # ... process gpu_tensor ...
+            loader.mark_processed(key)
+```
 ### Tensor/Dict Conversion
 ```python

{unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/README.md RENAMED Viewed

@@ -85,6 +85,33 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
             loader.mark_processed(key)
 ```
+### Direct-to-GPU Streaming (Zero-Copy)
+For the fastest loading on CUDA devices, pass `direct_gpu=True`. This builds a pipeline that pre-allocates a pool of pinned host buffers and a pool of GPU memory slabs. Each tensor is read from disk directly into a pinned buffer and then copied asynchronously to the GPU on a dedicated CUDA stream, so the PCIe transfer overlaps with disk I/O. `direct_gpu=True` forces `low_memory=True`, and the loader falls back to the CPU path with a warning when CUDA is unavailable.
+```python
+from unifiedefficientloader import UnifiedSafetensorsLoader
+with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
+    keys_to_load = loader.keys()
+    # async_stream automatically coordinates disk -> pinned buffer -> GPU slab -> tensor view
+    stream = loader.async_stream(
+        keys_to_load,
+        batch_size=8,
+        prefetch_batches=2,
+        direct_gpu=True # optional here since we passed it in __init__
+    )
+    for batch in stream:
+        for key, gpu_tensor in batch:
+            # gpu_tensor is already on the GPU!
+            assert gpu_tensor.device.type == "cuda"
+            # ... process gpu_tensor ...
+            loader.mark_processed(key)
+```
 ### Tensor/Dict Conversion
 ```python

{unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "unifiedefficientloader"
-version = "0.2.2"
+version = "0.2.3"
 description = "A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts."
 readme = "README.md"
 authors = [
@@ -34,4 +34,9 @@ log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(linen
 log_cli_date_format = "%Y-%m-%d %H:%M:%S"
 filterwarnings = [
     "ignore:.*argument 'device' of Tensor.*:DeprecationWarning"
-]
+]
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["unifiedefficientloader*"]
+exclude = ["reference"]

{unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/setup.py RENAMED Viewed

@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 """Minimal setup.py for backward compatibility with legacy pip install workflows."""
 from setuptools import setup

unifiedefficientloader-0.2.3/tests/test_direct_gpu.py ADDED Viewed

@@ -0,0 +1,95 @@
+import os
+import tempfile
+import pytest
+try:
+    import torch
+    from safetensors.torch import save_file
+    HAS_TORCH = True
+except ImportError:
+    HAS_TORCH = False
+from unifiedefficientloader import MemoryEfficientSafeOpen
+@pytest.fixture
+def sample_safetensors():
+    if not HAS_TORCH:
+        pytest.skip("Requires torch and safetensors")
+    with tempfile.NamedTemporaryFile(suffix=".safetensors", delete=False) as f:
+        path = f.name
+    tensors = {
+        "weight1": torch.randn(10, 10),
+        "weight2": torch.randn(20, 20),
+        "bias": torch.zeros(10),
+    }
+    save_file(tensors, path)
+    yield path, tensors
+    if os.path.exists(path):
+        os.remove(path)
+@pytest.mark.skipif(not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA")
+def test_direct_gpu_streaming(sample_safetensors):
+    path, original_tensors = sample_safetensors
+    loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
+    # Test load_all which uses async_stream under the hood
+    loaded_tensors = loader.load_all()
+    for key, orig_tensor in original_tensors.items():
+        assert key in loaded_tensors
+        loaded_tensor = loaded_tensors[key]
+        # Verify it's on GPU
+        assert loaded_tensor.device.type == "cuda"
+        # Verify data matches
+        torch.testing.assert_close(loaded_tensor.cpu(), orig_tensor)
+    loader.close()
+@pytest.mark.skipif(not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA")
+def test_direct_gpu_async_stream(sample_safetensors):
+    path, original_tensors = sample_safetensors
+    loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
+    stream = loader.async_stream(
+        keys=list(original_tensors.keys()),
+        batch_size=2,
+        prefetch_batches=1,
+        direct_gpu=True
+    )
+    loaded_count = 0
+    for batch in stream:
+        for key, tensor in batch:
+            assert tensor.device.type == "cuda"
+            torch.testing.assert_close(tensor.cpu(), original_tensors[key])
+            loaded_count += 1
+    assert loaded_count == len(original_tensors)
+    loader.close()
+@pytest.mark.skipif(not HAS_TORCH, reason="Requires torch")
+def test_direct_gpu_fallback_no_cuda(sample_safetensors, monkeypatch):
+    # Force cuda to be unavailable
+    monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
+    path, original_tensors = sample_safetensors
+    # Should fall back to the CPU path instead of raising (the loader only logs a warning)
+    loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
+    loaded_tensors = loader.load_all()
+    for key, orig_tensor in original_tensors.items():
+        loaded_tensor = loaded_tensors[key]
+        assert loaded_tensor.device.type == "cpu"
+        torch.testing.assert_close(loaded_tensor, orig_tensor)
+    loader.close()

{unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader/__init__.py RENAMED Viewed

@@ -9,7 +9,7 @@ def check_dependencies(*packages):
     for pkg in packages:
         if importlib.util.find_spec(pkg) is None:
             missing.append(pkg)
     if missing:
         missing_str = ", ".join(missing)
         raise ImportError(
@@ -23,11 +23,13 @@ check_dependencies("torch")
 from .memory_efficient_loader import UnifiedSafetensorsLoader, MemoryEfficientSafeOpen
 from .tensor_utils import dict_to_tensor, tensor_to_dict
 from .pinned_transfer import transfer_to_gpu_pinned, set_verbose, get_pinned_transfer_stats, reset_pinned_transfer_stats
+from .gpu_buffer_pool import GpuBufferPool
+from .pinned_buffer_pool import PinnedBufferPool
 from .logging_utils import (
     setup_logging,
-    MINIMAL_LEVEL,
-    NORMAL_LEVEL,
-    VERBOSE_LEVEL,
+    MINIMAL_LEVEL,
+    NORMAL_LEVEL,
+    VERBOSE_LEVEL,
     DEBUG_LEVEL,
     debug,
     verbose,
@@ -47,6 +49,8 @@ __all__ = [
     "set_verbose",
     "get_pinned_transfer_stats",
     "reset_pinned_transfer_stats",
+    "GpuBufferPool",
+    "PinnedBufferPool",
     "setup_logging",
     "MINIMAL_LEVEL",
     "NORMAL_LEVEL",

unifiedefficientloader-0.2.3/unifiedefficientloader/gpu_buffer_pool.py ADDED Viewed

@@ -0,0 +1,40 @@
+"""
+GPU memory buffer pool for direct-to-GPU streaming.
+Maintains a pool of pre-allocated GPU tensors to avoid allocation overhead
+and ensure strictly bounded VRAM usage during streaming.
+"""
+from typing import Tuple, Optional
+import torch
+from . import logging_utils
+logger = logging_utils.get_logger(__name__)
+class GpuBufferPool:
+    """Manages a pool of fixed-size GPU memory buffers."""
+    def __init__(self, size_bytes: int, num_buffers: int, device: str = "cuda"):
+        import torch
+        import queue
+        self.device = device
+        self.size_bytes = size_bytes
+        self.num_buffers = num_buffers
+        logging_utils.verbose(f"Initializing GpuBufferPool: {num_buffers} buffers of {size_bytes / (1024**2):.2f} MB each on {device}.")
+        self.buffers = []
+        for _ in range(num_buffers):
+            buf = torch.empty(size_bytes, dtype=torch.uint8, device=device)
+            self.buffers.append(buf)
+        self.free_queue = queue.Queue()
+        for i in range(num_buffers):
+            self.free_queue.put(i)
+    def acquire(self) -> Tuple[int, 'torch.Tensor']:
+        """Acquire a free buffer. Blocks if empty."""
+        idx = self.free_queue.get()
+        return idx, self.buffers[idx]
+    def release(self, idx: int):
+        """Release buffer back to pool."""
+        self.free_queue.put(idx)

{unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader/memory_efficient_loader.py RENAMED Viewed

@@ -53,26 +53,36 @@ class UnifiedSafetensorsLoader:
     """
     @logging_utils.log_debug
-    def __init__(self, filename: str, low_memory: bool = False):
+    def __init__(self, filename: str, low_memory: bool = False, direct_gpu: bool = False):
         """Initialize the loader.
         Args:
             filename: Path to safetensors file
             low_memory: If True, use streaming mode; if False, preload all tensors
+            direct_gpu: If True, stream directly to GPU pinned/slab memory (requires low_memory=True)
         """
         torch = _ensure_torch()
         safe_open = _ensure_safetensors()
         self.filename = filename
         self.low_memory = low_memory
+        self.direct_gpu = direct_gpu
+        if self.direct_gpu and not self.low_memory:
+            logging_utils.warning("direct_gpu=True requires low_memory=True. Forcing low_memory=True.")
+            self.low_memory = True
         self._tensors: Dict[str, 'torch.Tensor'] = {}
+        self._gpu_buffer_indices: Dict[str, int] = {}
+        self._gpu_pool = None
         self._all_keys = []
         self._file = None
         self._header = None
         self._header_size = None
         self._metadata: Dict[str, str] = {}
-        if low_memory:
+        if self.low_memory:
             # Streaming mode: read header only
             self._header, self._header_size = self._read_header()
             self._file = None # Opened lazily to support multiprocessing DataLoader
@@ -83,7 +93,7 @@ class UnifiedSafetensorsLoader:
             logging_utils.verbose(f"Found {len(self._all_keys)} tensors (streaming mode)")
         else:
             # Standard mode: preload all tensors
-            with safe_open(filename, framework="pt", device="cpu") as f:
+            with safe_open(self.filename, framework="pt", device="cpu") as f:
                 self._metadata = f.metadata() or {}
                 self._all_keys = list(f.keys())
                 logging_utils.normal(f"Loading {len(self._all_keys)} tensors from source file...")
@@ -180,12 +190,17 @@ class UnifiedSafetensorsLoader:
         """Mark a tensor as processed, freeing memory if in low-memory mode.
         In standard mode, optionally deletes from cache.
-        In low-memory mode, this is a no-op (tensor was never cached).
+        In low-memory mode, frees GPU buffer back to pool if direct_gpu.
         """
         if not self.low_memory and key in self._tensors:
             del self._tensors[key]
             gc.collect()
+        if self.direct_gpu and key in self._gpu_buffer_indices:
+            idx = self._gpu_buffer_indices.pop(key)
+            if self._gpu_pool:
+                self._gpu_pool.release(idx)
     def _read_header(self):
         """Read and parse the safetensors header."""
         with open(self.filename, "rb") as f:
@@ -279,6 +294,7 @@ class UnifiedSafetensorsLoader:
             batch_size: Number of tensors to yield in each batch
             prefetch_batches: Number of batches to pre-fetch in background
             pin_memory: If True, tensors will be pinned in CPU memory (sequentially in main thread)
+            direct_gpu: Stream via pinned buffer directly to GPU
         Yields:
             List of (key, tensor) tuples
@@ -286,44 +302,134 @@ class UnifiedSafetensorsLoader:
         import threading
         import queue
         from concurrent.futures import ThreadPoolExecutor
+        import os
         torch = _ensure_torch()
         thread_local = threading.local()
+        # Initialize GPU slab and Pinned Buffer Pool if direct_gpu
+        pinned_pool = None
+        cuda_stream = None
+        if self.direct_gpu and torch.cuda.is_available():
+            try:
+                from .gpu_buffer_pool import GpuBufferPool
+                from .pinned_buffer_pool import PinnedBufferPool
+                # Pre-calculate required slab size
+                max_tensor_bytes = 0
+                for k in keys:
+                    meta = self._header[k]
+                    start, end = meta["data_offsets"]
+                    sz = end - start
+                    max_tensor_bytes = max(max_tensor_bytes, sz)
+                # Initialize pools (size of largest tensor)
+                # We need a larger pool to allow the GPU to lag behind the CPU without stalling
+                max_workers = min(16, max(4, batch_size))
+                max_in_flight = max(max_workers, prefetch_batches * batch_size)
+                # Double the buffers for a smooth pipeline
+                num_buffers = (max_in_flight + max_workers) * 2 + 2
+                # Assign pool to instance to survive the generator lifetime
+                if not getattr(self, '_gpu_pool', None):
+                    self._gpu_pool = GpuBufferPool(max_tensor_bytes, num_buffers)
+                pinned_pool = PinnedBufferPool(max_tensor_bytes, num_buffers)
+                cuda_stream = torch.cuda.Stream()
+                logging_utils.normal(f"Direct GPU pipeline initialized: {num_buffers} buffers, max {max_tensor_bytes / (1024**2):.1f}MB each (Total VRAM: {(num_buffers*max_tensor_bytes)/(1024**2):.1f}MB)")
+            except Exception as e:
+                logging_utils.warning(f"Failed to initialize direct GPU pipeline: {e}. Falling back.")
+                self.direct_gpu = False
+                pinned_pool = None
+        elif self.direct_gpu:
+            logging_utils.warning("direct_gpu=True requested but CUDA is not available. Falling back to CPU.")
+            self.direct_gpu = False
         def get_file_handle():
             if not hasattr(thread_local, 'file'):
                 thread_local.file = open(self.filename, "rb")
             return thread_local.file
         def _worker_load(key):
+            buf_idx = None
+            gpu_idx = None
             try:
-                # Direct thread-safe read
                 metadata = self._header[key]
                 offset_start, offset_end = metadata["data_offsets"]
-                if offset_start != offset_end:
-                    f = get_file_handle()
-                    f.seek(self._header_size + 8 + offset_start)
-                    tensor_bytes = bytearray(offset_end - offset_start)
-                    f.readinto(tensor_bytes)
-                else:
-                    tensor_bytes = None
+                sz = offset_end - offset_start
-                tensor = self._deserialize_tensor(tensor_bytes, metadata)
-                return key, tensor, None
+                if self.direct_gpu and sz > 0:
+                    # Direct GPU Pipeline Path
+                    buf_idx, pinned_buf = pinned_pool.acquire()
+                    try:
+                        # Schedule GPU transfer
+                        gpu_idx, gpu_buf = self._gpu_pool.acquire()
+                        try:
+                            # Read into pinned memory directly (Zero-Copy CPU path)
+                            import ctypes
+                            view = pinned_buf[:sz]
+                            # Create a ctypes c_uint8 array spanning the pinned buffer memory
+                            # This allows f.readinto() to write bytes directly to the torch tensor memory
+                            c_uint8_array = (ctypes.c_uint8 * sz).from_address(view.data_ptr())
+                            f = get_file_handle()
+                            f.seek(self._header_size + 8 + offset_start)
+                            f.readinto(c_uint8_array)
+                            gpu_view = gpu_buf[:sz]
+                            with torch.cuda.stream(cuda_stream):
+                                gpu_view.copy_(view, non_blocking=True)
+                                # Create event to track when copy finishes
+                                event = torch.cuda.Event()
+                                event.record()
+                            # No wait happens here: the recorded event is returned with the
+                            # result so the consumer can synchronize on it before the pinned
+                            # buffer is released back to the pool for reuse.
+                            # In direct_gpu, the tensor is the gpu_view.
+                            return key, gpu_view, metadata, buf_idx, gpu_idx, event
+                        except Exception as e:
+                            # If reading or copying fails, release GPU buffer
+                            self._gpu_pool.release(gpu_idx)
+                            raise e
+                    except Exception as e:
+                        # If acquiring GPU buffer fails, release pinned buffer
+                        pinned_pool.release(buf_idx)
+                        raise e
+                else:
+                    # Standard CPU Path
+                    if offset_start != offset_end:
+                        f = get_file_handle()
+                        f.seek(self._header_size + 8 + offset_start)
+                        tensor_bytes = bytearray(offset_end - offset_start)
+                        f.readinto(tensor_bytes)
+                    else:
+                        tensor_bytes = None
+                    tensor = self._deserialize_tensor(tensor_bytes, metadata)
+                    return key, tensor, None, None, None, None
             except Exception as e:
-                # Fallback info for main thread
-                return key, None, e
+                return key, None, e, None, None, None
+        max_workers = min(16, max(4, batch_size))
+        max_in_flight = max(max_workers, prefetch_batches * batch_size)
         # Queue for individual (key, tensor) pairs
-        # Size it to hold enough for prefetch_batches
-        q = queue.Queue(maxsize=prefetch_batches * batch_size)
+        # Size it to hold enough for prefetch_batches PLUS max_workers to prevent stalling
+        q = queue.Queue(maxsize=max_in_flight + max_workers)
         def _producer():
-            # Use a reasonable number of workers for I/O bound tasks
-            max_workers = min(16, max(4, batch_size))
-            # Limit task submission to maintain backpressure on memory
-            max_in_flight = max(max_workers, prefetch_batches * batch_size)
             with ThreadPoolExecutor(max_workers=max_workers) as executor:
                 futures = []
                 key_iter = iter(keys)
@@ -351,19 +457,26 @@ class UnifiedSafetensorsLoader:
             q.put(None) # Sentinel
+        producer_thread = threading.local()
         producer_thread = threading.Thread(target=_producer, daemon=True)
         producer_thread.start()
         batch = []
+        pending_pinned = [] # Track (event, buf_idx) to release later
         while True:
             res = q.get()
             if res is None:
+                # Synchronize and cleanup any remaining buffers on exit
+                for ev, idx in pending_pinned:
+                    ev.synchronize()
+                    pinned_pool.release(idx)
                 if batch:
                     yield batch
                 break
-            k, t, err = res
-            if err is not None:
+            k, t, err, buf_idx, gpu_idx, event = res
+            if err is not None and not isinstance(err, dict):
                 logging_utils.warning(f"Async load failed for {k}, falling back to sync: {err}")
                 # Fallback synchronous load
                 try:
@@ -372,8 +485,32 @@ class UnifiedSafetensorsLoader:
                     logging_utils.error(f"Sync fallback also failed for {k}: {sync_err}")
                     raise sync_err
+            if buf_idx is not None and event is not None:
+                # Don't block here! Yield the tensor with its event.
+                # Only release the PREVIOUS batch's buffers.
+                # This creates a sliding window of safety.
+                while len(pending_pinned) >= (max_in_flight + 1):
+                    ev, idx = pending_pinned.pop(0)
+                    ev.synchronize() # Wait only if we MUST reuse a buffer
+                    pinned_pool.release(idx)
+                pending_pinned.append((event, buf_idx))
+                # Register GPU index for cleanup
+                self._gpu_buffer_indices[k] = gpu_idx
+                # Reshape GPU view to tensor
+                meta = err # we reused err for metadata in direct_gpu path
+                dtype = self._get_torch_dtype(meta["dtype"])
+                shape = meta["shape"]
+                if meta["dtype"] in ["F8_E5M2", "F8_E4M3"]:
+                    t = self._convert_float8(t, meta["dtype"], shape)
+                else:
+                    t = t.view(dtype).reshape(shape)
             # Pin memory sequentially in the main thread to avoid OS-level lock contention
-            if pin_memory and t.device.type == 'cpu':
+            elif pin_memory and t.device.type == 'cpu':
                 try:
                     t = t.pin_memory()
                 except Exception as e:

unifiedefficientloader-0.2.3/unifiedefficientloader/pinned_buffer_pool.py ADDED Viewed

@@ -0,0 +1,39 @@
+"""
+Pinned memory pool manager.
+Provides pre-allocated pinned buffers to avoid per-tensor allocation overhead.
+"""
+from typing import Optional
+import torch
+from . import logging_utils
+logger = logging_utils.get_logger(__name__)
+class PinnedBufferPool:
+    """Manages a pool of pinned memory buffers for fast disk-to-GPU transfer."""
+    def __init__(self, size_bytes: int, num_buffers: int):
+        import torch
+        self.size_bytes = size_bytes
+        self.num_buffers = num_buffers
+        logging_utils.verbose(f"Initializing PinnedBufferPool: {num_buffers} buffers of {size_bytes / (1024**2):.2f} MB each.")
+        self.buffers = []
+        for _ in range(num_buffers):
+            buf = torch.empty(size_bytes, dtype=torch.uint8, pin_memory=True)
+            self.buffers.append(buf)
+        import queue
+        self.free_queue = queue.Queue()
+        for i in range(num_buffers):
+            self.free_queue.put(i)
+    def acquire(self) -> tuple[int, 'torch.Tensor']:
+        """Acquire a free buffer. Blocks if empty."""
+        idx = self.free_queue.get()
+        return idx, self.buffers[idx]
+    def release(self, idx: int):
+        """Release buffer back to pool."""
+        self.free_queue.put(idx)

{unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader/pinned_transfer.py RENAMED Viewed

@@ -44,7 +44,8 @@ def reset_pinned_transfer_stats():
 def transfer_to_gpu_pinned(
     tensor,
     device: str = 'cuda',
-    dtype = None
+    dtype = None,
+    non_blocking: bool = True
 ):
     """Transfer tensor to GPU using pinned memory for faster transfer."""
     torch = _ensure_torch()
@@ -53,22 +54,22 @@ def transfer_to_gpu_pinned(
     # Skip if not a CPU tensor or CUDA unavailable
     if tensor.device.type != 'cpu' or not torch.cuda.is_available():
         if dtype is not None:
-            return tensor.to(device=device, dtype=dtype)
-        return tensor.to(device=device)
+            return tensor.to(device=device, dtype=dtype, non_blocking=non_blocking)
+        return tensor.to(device=device, non_blocking=non_blocking)
     # Skip if target is not CUDA
     if not str(device).startswith('cuda'):
         if dtype is not None:
-            return tensor.to(device=device, dtype=dtype)
-        return tensor.to(device=device)
+            return tensor.to(device=device, dtype=dtype, non_blocking=non_blocking)
+        return tensor.to(device=device, non_blocking=non_blocking)
     try:
         pinned = tensor.pin_memory()
         if dtype is not None:
-            result = pinned.to(device=device, dtype=dtype, non_blocking=True)
+            result = pinned.to(device=device, dtype=dtype, non_blocking=non_blocking)
         else:
-            result = pinned.to(device=device, non_blocking=True)
+            result = pinned.to(device=device, non_blocking=non_blocking)
         torch.cuda.current_stream().synchronize()
@@ -95,5 +96,5 @@ def transfer_to_gpu_pinned(
             logging_utils.verbose(msg)
         if dtype is not None:
-            return tensor.to(device=device, dtype=dtype)
-        return tensor.to(device=device)
+            return tensor.to(device=device, dtype=dtype, non_blocking=non_blocking)
+        return tensor.to(device=device, non_blocking=non_blocking)

{unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: unifiedefficientloader
-Version: 0.2.2
+Version: 0.2.3
 Summary: A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
 Author: silveroxides
 License: MIT
@@ -111,6 +111,33 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
             loader.mark_processed(key)
 ```
+### Direct-to-GPU Streaming (Zero-Copy)
+For the fastest loading on CUDA devices, pass `direct_gpu=True`. This builds a pipeline that pre-allocates a pool of pinned host buffers and a pool of GPU memory slabs. Each tensor is read from disk directly into a pinned buffer and then copied asynchronously to the GPU on a dedicated CUDA stream, so the PCIe transfer overlaps with disk I/O. `direct_gpu=True` forces `low_memory=True`, and the loader falls back to the CPU path with a warning when CUDA is unavailable.
+```python
+from unifiedefficientloader import UnifiedSafetensorsLoader
+with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
+    keys_to_load = loader.keys()
+    # async_stream automatically coordinates disk -> pinned buffer -> GPU slab -> tensor view
+    stream = loader.async_stream(
+        keys_to_load,
+        batch_size=8,
+        prefetch_batches=2,
+        direct_gpu=True # optional here since we passed it in __init__
+    )
+    for batch in stream:
+        for key, gpu_tensor in batch:
+            # gpu_tensor is already on the GPU!
+            assert gpu_tensor.device.type == "cuda"
+            # ... process gpu_tensor ...
+            loader.mark_processed(key)
+```
 ### Tensor/Dict Conversion
 ```python

{unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/SOURCES.txt RENAMED Viewed

@@ -2,11 +2,14 @@ LICENSE
 README.md
 pyproject.toml
 setup.py
+tests/test_direct_gpu.py
 tests/test_logging.py
 tests/test_utils.py
 unifiedefficientloader/__init__.py
+unifiedefficientloader/gpu_buffer_pool.py
 unifiedefficientloader/logging_utils.py
 unifiedefficientloader/memory_efficient_loader.py
+unifiedefficientloader/pinned_buffer_pool.py
 unifiedefficientloader/pinned_transfer.py
 unifiedefficientloader/tensor_utils.py
 unifiedefficientloader.egg-info/PKG-INFO