unifiedefficientloader 0.2.1__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21)
  1. {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/PKG-INFO +28 -1
  2. {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/README.md +27 -0
  3. {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/pyproject.toml +7 -2
  4. {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/setup.py +1 -0
  5. unifiedefficientloader-0.2.3/tests/test_direct_gpu.py +95 -0
  6. unifiedefficientloader-0.2.3/tests/test_logging.py +51 -0
  7. {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader/__init__.py +32 -2
  8. unifiedefficientloader-0.2.3/unifiedefficientloader/gpu_buffer_pool.py +40 -0
  9. unifiedefficientloader-0.2.3/unifiedefficientloader/logging_utils.py +117 -0
  10. {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader/memory_efficient_loader.py +188 -48
  11. unifiedefficientloader-0.2.3/unifiedefficientloader/pinned_buffer_pool.py +39 -0
  12. {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader/pinned_transfer.py +28 -17
  13. {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader/tensor_utils.py +8 -5
  14. {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/PKG-INFO +28 -1
  15. {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/SOURCES.txt +5 -0
  16. {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/LICENSE +0 -0
  17. {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/setup.cfg +0 -0
  18. {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/tests/test_utils.py +0 -0
  19. {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/dependency_links.txt +0 -0
  20. {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/requires.txt +0 -0
  21. {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unifiedefficientloader
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Summary: A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
5
5
  Author: silveroxides
6
6
  License: MIT
@@ -111,6 +111,33 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
111
111
  loader.mark_processed(key)
112
112
  ```
113
113
 
114
+ ### Direct-to-GPU Streaming (Zero-Copy)
115
+
116
+ For the absolute fastest loading times on CUDA devices, use the `direct_gpu=True` flag. This creates a pipeline that pre-allocates pinned memory pools and GPU memory slabs. Tensors are loaded from disk directly into pinned buffers, and immediately asynchronously copied to the GPU using CUDA streams, hiding the PCIe transfer latency completely behind the disk I/O.
117
+
118
+ ```python
119
+ from unifiedefficientloader import UnifiedSafetensorsLoader
120
+
121
+ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
122
+ keys_to_load = loader.keys()
123
+
124
+ # async_stream will automatically coordinate disk -> pinned buffer -> GPU slab -> tensor header
125
+ stream = loader.async_stream(
126
+ keys_to_load,
127
+ batch_size=8,
128
+ prefetch_batches=2,
129
+ direct_gpu=True # optional here since we passed it in __init__
130
+ )
131
+
132
+ for batch in stream:
133
+ for key, gpu_tensor in batch:
134
+ # gpu_tensor is already on the GPU!
135
+ assert gpu_tensor.device.type == "cuda"
136
+
137
+ # ... process gpu_tensor ...
138
+ loader.mark_processed(key)
139
+ ```
140
+
114
141
  ### Tensor/Dict Conversion
115
142
 
116
143
  ```python
@@ -85,6 +85,33 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
85
85
  loader.mark_processed(key)
86
86
  ```
87
87
 
88
+ ### Direct-to-GPU Streaming (Zero-Copy)
89
+
90
+ For the absolute fastest loading times on CUDA devices, use the `direct_gpu=True` flag. This creates a pipeline that pre-allocates pinned memory pools and GPU memory slabs. Tensors are loaded from disk directly into pinned buffers, and immediately asynchronously copied to the GPU using CUDA streams, hiding the PCIe transfer latency completely behind the disk I/O.
91
+
92
+ ```python
93
+ from unifiedefficientloader import UnifiedSafetensorsLoader
94
+
95
+ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
96
+ keys_to_load = loader.keys()
97
+
98
+ # async_stream will automatically coordinate disk -> pinned buffer -> GPU slab -> tensor header
99
+ stream = loader.async_stream(
100
+ keys_to_load,
101
+ batch_size=8,
102
+ prefetch_batches=2,
103
+ direct_gpu=True # optional here since we passed it in __init__
104
+ )
105
+
106
+ for batch in stream:
107
+ for key, gpu_tensor in batch:
108
+ # gpu_tensor is already on the GPU!
109
+ assert gpu_tensor.device.type == "cuda"
110
+
111
+ # ... process gpu_tensor ...
112
+ loader.mark_processed(key)
113
+ ```
114
+
88
115
  ### Tensor/Dict Conversion
89
116
 
90
117
  ```python
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "unifiedefficientloader"
7
- version = "0.2.1"
7
+ version = "0.2.3"
8
8
  description = "A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts."
9
9
  readme = "README.md"
10
10
  authors = [
@@ -34,4 +34,9 @@ log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(linen
34
34
  log_cli_date_format = "%Y-%m-%d %H:%M:%S"
35
35
  filterwarnings = [
36
36
  "ignore:.*argument 'device' of Tensor.*:DeprecationWarning"
37
- ]
37
+ ]
38
+
39
+ [tool.setuptools.packages.find]
40
+ where = ["."]
41
+ include = ["unifiedefficientloader*"]
42
+ exclude = ["reference"]
@@ -1,3 +1,4 @@
1
+ #!/usr/bin/env python
1
2
  """Minimal setup.py for backward compatibility with legacy pip install workflows."""
2
3
  from setuptools import setup
3
4
 
@@ -0,0 +1,95 @@
1
+ import os
2
+ import tempfile
3
+ import pytest
4
+
5
+ try:
6
+ import torch
7
+ from safetensors.torch import save_file
8
+ HAS_TORCH = True
9
+ except ImportError:
10
+ HAS_TORCH = False
11
+
12
+ from unifiedefficientloader import MemoryEfficientSafeOpen
13
+
14
+ @pytest.fixture
15
+ def sample_safetensors():
16
+ if not HAS_TORCH:
17
+ pytest.skip("Requires torch and safetensors")
18
+
19
+ with tempfile.NamedTemporaryFile(suffix=".safetensors", delete=False) as f:
20
+ path = f.name
21
+
22
+ tensors = {
23
+ "weight1": torch.randn(10, 10),
24
+ "weight2": torch.randn(20, 20),
25
+ "bias": torch.zeros(10),
26
+ }
27
+ save_file(tensors, path)
28
+
29
+ yield path, tensors
30
+
31
+ if os.path.exists(path):
32
+ os.remove(path)
33
+
34
+ @pytest.mark.skipif(not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA")
35
+ def test_direct_gpu_streaming(sample_safetensors):
36
+ path, original_tensors = sample_safetensors
37
+
38
+ loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
39
+
40
+ # Test load_all which uses async_stream under the hood
41
+ loaded_tensors = loader.load_all()
42
+
43
+ for key, orig_tensor in original_tensors.items():
44
+ assert key in loaded_tensors
45
+ loaded_tensor = loaded_tensors[key]
46
+
47
+ # Verify it's on GPU
48
+ assert loaded_tensor.device.type == "cuda"
49
+
50
+ # Verify data matches
51
+ torch.testing.assert_close(loaded_tensor.cpu(), orig_tensor)
52
+
53
+ loader.close()
54
+
55
+ @pytest.mark.skipif(not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA")
56
+ def test_direct_gpu_async_stream(sample_safetensors):
57
+ path, original_tensors = sample_safetensors
58
+
59
+ loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
60
+
61
+ stream = loader.async_stream(
62
+ keys=list(original_tensors.keys()),
63
+ batch_size=2,
64
+ prefetch_batches=1,
65
+ direct_gpu=True
66
+ )
67
+
68
+ loaded_count = 0
69
+ for batch in stream:
70
+ for key, tensor in batch:
71
+ assert tensor.device.type == "cuda"
72
+ torch.testing.assert_close(tensor.cpu(), original_tensors[key])
73
+ loaded_count += 1
74
+
75
+ assert loaded_count == len(original_tensors)
76
+ loader.close()
77
+
78
+ @pytest.mark.skipif(not HAS_TORCH, reason="Requires torch")
79
+ def test_direct_gpu_fallback_no_cuda(sample_safetensors, monkeypatch):
80
+ # Force cuda to be unavailable
81
+ monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
82
+
83
+ path, original_tensors = sample_safetensors
84
+
85
+ # Should fallback to CPU silently
86
+ loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
87
+
88
+ loaded_tensors = loader.load_all()
89
+
90
+ for key, orig_tensor in original_tensors.items():
91
+ loaded_tensor = loaded_tensors[key]
92
+ assert loaded_tensor.device.type == "cpu"
93
+ torch.testing.assert_close(loaded_tensor, orig_tensor)
94
+
95
+ loader.close()
@@ -0,0 +1,51 @@
1
+ import os
2
+ import torch
3
+ from unifiedefficientloader import (
4
+ UnifiedSafetensorsLoader,
5
+ setup_logging,
6
+ VERBOSE_LEVEL,
7
+ DEBUG_LEVEL
8
+ )
9
+ from safetensors.torch import save_file
10
+
11
+ def test_logging():
12
+ # 1. Create a dummy safetensors file
13
+ dummy_file = "test_logging.safetensors"
14
+ tensors = {
15
+ "weight1": torch.randn(10, 10),
16
+ "weight2": torch.randn(5, 5)
17
+ }
18
+ save_file(tensors, dummy_file)
19
+
20
+ try:
21
+ # 2. Test NORMAL logging (default)
22
+ print("\n--- Testing NORMAL Logging (Default) ---")
23
+ setup_logging("NORMAL")
24
+ with UnifiedSafetensorsLoader(dummy_file, low_memory=False) as loader:
25
+ _ = loader.get_tensor("weight1")
26
+
27
+ # 3. Test VERBOSE logging
28
+ print("\n--- Testing VERBOSE Logging ---")
29
+ setup_logging("VERBOSE")
30
+ with UnifiedSafetensorsLoader(dummy_file, low_memory=True) as loader:
31
+ _ = loader.get_tensor("weight1")
32
+ _ = loader.get_tensor("weight2")
33
+
34
+ # 4. Test DEBUG logging (includes function traces)
35
+ print("\n--- Testing DEBUG Logging ---")
36
+ setup_logging("DEBUG")
37
+ with UnifiedSafetensorsLoader(dummy_file, low_memory=True) as loader:
38
+ _ = loader.get_tensor("weight1")
39
+
40
+ # 5. Test MINIMAL logging
41
+ print("\n--- Testing MINIMAL Logging ---")
42
+ setup_logging("MINIMAL")
43
+ with UnifiedSafetensorsLoader(dummy_file, low_memory=False) as loader:
44
+ _ = loader.get_tensor("weight1")
45
+
46
+ finally:
47
+ if os.path.exists(dummy_file):
48
+ os.remove(dummy_file)
49
+
50
+ if __name__ == "__main__":
51
+ test_logging()
@@ -9,7 +9,7 @@ def check_dependencies(*packages):
9
9
  for pkg in packages:
10
10
  if importlib.util.find_spec(pkg) is None:
11
11
  missing.append(pkg)
12
-
12
+
13
13
  if missing:
14
14
  missing_str = ", ".join(missing)
15
15
  raise ImportError(
@@ -23,6 +23,22 @@ check_dependencies("torch")
23
23
  from .memory_efficient_loader import UnifiedSafetensorsLoader, MemoryEfficientSafeOpen
24
24
  from .tensor_utils import dict_to_tensor, tensor_to_dict
25
25
  from .pinned_transfer import transfer_to_gpu_pinned, set_verbose, get_pinned_transfer_stats, reset_pinned_transfer_stats
26
+ from .gpu_buffer_pool import GpuBufferPool
27
+ from .pinned_buffer_pool import PinnedBufferPool
28
+ from .logging_utils import (
29
+ setup_logging,
30
+ MINIMAL_LEVEL,
31
+ NORMAL_LEVEL,
32
+ VERBOSE_LEVEL,
33
+ DEBUG_LEVEL,
34
+ debug,
35
+ verbose,
36
+ normal,
37
+ info,
38
+ minimal,
39
+ warning,
40
+ error
41
+ )
26
42
 
27
43
  __all__ = [
28
44
  "UnifiedSafetensorsLoader",
@@ -33,4 +49,18 @@ __all__ = [
33
49
  "set_verbose",
34
50
  "get_pinned_transfer_stats",
35
51
  "reset_pinned_transfer_stats",
36
- ]
52
+ "GpuBufferPool",
53
+ "PinnedBufferPool",
54
+ "setup_logging",
55
+ "MINIMAL_LEVEL",
56
+ "NORMAL_LEVEL",
57
+ "VERBOSE_LEVEL",
58
+ "DEBUG_LEVEL",
59
+ "debug",
60
+ "verbose",
61
+ "normal",
62
+ "info",
63
+ "minimal",
64
+ "warning",
65
+ "error",
66
+ ]
@@ -0,0 +1,40 @@
1
+ """
2
+ GPU memory buffer pool for direct-to-GPU streaming.
3
+
4
+ Maintains a pool of pre-allocated GPU tensors to avoid allocation overhead
5
+ and ensure strictly bounded VRAM usage during streaming.
6
+ """
7
+ from typing import Tuple, Optional
8
+ import torch
9
+ from . import logging_utils
10
+
11
+ logger = logging_utils.get_logger(__name__)
12
+
13
+ class GpuBufferPool:
14
+ """Manages a pool of fixed-size GPU memory buffers."""
15
+ def __init__(self, size_bytes: int, num_buffers: int, device: str = "cuda"):
16
+ import torch
17
+ import queue
18
+ self.device = device
19
+ self.size_bytes = size_bytes
20
+ self.num_buffers = num_buffers
21
+
22
+ logging_utils.verbose(f"Initializing GpuBufferPool: {num_buffers} buffers of {size_bytes / (1024**2):.2f} MB each on {device}.")
23
+
24
+ self.buffers = []
25
+ for _ in range(num_buffers):
26
+ buf = torch.empty(size_bytes, dtype=torch.uint8, device=device)
27
+ self.buffers.append(buf)
28
+
29
+ self.free_queue = queue.Queue()
30
+ for i in range(num_buffers):
31
+ self.free_queue.put(i)
32
+
33
+ def acquire(self) -> Tuple[int, 'torch.Tensor']:
34
+ """Acquire a free buffer. Blocks if empty."""
35
+ idx = self.free_queue.get()
36
+ return idx, self.buffers[idx]
37
+
38
+ def release(self, idx: int):
39
+ """Release buffer back to pool."""
40
+ self.free_queue.put(idx)
@@ -0,0 +1,117 @@
1
+ import logging
2
+ import sys
3
+ import functools
4
+
5
+ # Custom Levels
6
+ # MINIMAL (30): WARNING+ (Reduced)
7
+ # NORMAL (20): INFO+ (Default)
8
+ # VERBOSE (15): Custom+ (Increased)
9
+ # DEBUG (10): DEBUG+ (Every function call)
10
+
11
+ MINIMAL_LEVEL = 30 # Use logging.WARNING
12
+ NORMAL_LEVEL = 20 # Use logging.INFO
13
+ VERBOSE_LEVEL = 15 # Custom level between INFO and DEBUG
14
+ DEBUG_LEVEL = 10 # logging.DEBUG
15
+
16
+ logging.addLevelName(VERBOSE_LEVEL, "VERBOSE")
17
+ logging.addLevelName(MINIMAL_LEVEL, "MINIMAL")
18
+
19
+ class CustomFormatter(logging.Formatter):
20
+ def format(self, record):
21
+ # Save original format to restore it later
22
+ orig_fmt = self._style._fmt
23
+
24
+ if record.levelno <= DEBUG_LEVEL:
25
+ # Debug: Full trace info
26
+ self._style._fmt = "[%(levelname)s] %(name)s:%(lineno)d - %(message)s"
27
+ elif record.levelno <= VERBOSE_LEVEL:
28
+ # Verbose: Detail
29
+ self._style._fmt = "[%(levelname)s] %(message)s"
30
+ elif record.levelno <= NORMAL_LEVEL:
31
+ # Normal: Standard output
32
+ self._style._fmt = "%(message)s"
33
+ else:
34
+ # Minimal/Warning
35
+ self._style._fmt = "[%(levelname)s] %(message)s"
36
+
37
+ result = super().format(record)
38
+
39
+ # Restore original format
40
+ self._style._fmt = orig_fmt
41
+ return result
42
+
43
+ def setup_logging(verbose_arg: str = "NORMAL"):
44
+ """
45
+ Setup logging based on verbosity name.
46
+ """
47
+ level_map = {
48
+ "DEBUG": DEBUG_LEVEL,
49
+ "VERBOSE": VERBOSE_LEVEL,
50
+ "NORMAL": NORMAL_LEVEL,
51
+ "MINIMAL": MINIMAL_LEVEL
52
+ }
53
+
54
+ level = level_map.get(verbose_arg.upper(), NORMAL_LEVEL)
55
+
56
+ logger = logging.getLogger("unifiedefficientloader")
57
+ logger.setLevel(level)
58
+
59
+ # Clear existing handlers to prevent duplicates
60
+ if logger.handlers:
61
+ logger.handlers.clear()
62
+
63
+ handler = logging.StreamHandler(sys.stdout)
64
+ handler.setFormatter(CustomFormatter())
65
+ logger.addHandler(handler)
66
+
67
+ return logger
68
+
69
+ def get_logger(name=None):
70
+ if name:
71
+ if not name.startswith("unifiedefficientloader"):
72
+ name = f"unifiedefficientloader.{name}"
73
+ return logging.getLogger(name)
74
+ return logging.getLogger("unifiedefficientloader")
75
+
76
+ # Decorator for DEBUG level tracing
77
+ def log_debug(func):
78
+ """Decorator to log function entry/exit with args (DEBUG level only)."""
79
+ @functools.wraps(func)
80
+ def wrapper(*args, **kwargs):
81
+ # We only want to construct the string if debug is enabled to save perf
82
+ logger = get_logger(func.__module__.split('.')[-1])
83
+ if logger.isEnabledFor(DEBUG_LEVEL):
84
+ arg_str = ", ".join([repr(a) for a in args])
85
+ kw_str = ", ".join([f"{k}={v!r}" for k, v in kwargs.items()])
86
+ all_args = ", ".join(filter(None, [arg_str, kw_str]))
87
+ logger.log(DEBUG_LEVEL, f"CALL {func.__name__}({all_args})")
88
+
89
+ result = func(*args, **kwargs)
90
+
91
+ if logger.isEnabledFor(DEBUG_LEVEL):
92
+ logger.log(DEBUG_LEVEL, f"RET {func.__name__} -> {type(result)}")
93
+ return result
94
+ return wrapper
95
+
96
+ # Convenience wrappers
97
+ def debug(msg, *args, **kwargs):
98
+ get_logger().log(DEBUG_LEVEL, msg, *args, **kwargs)
99
+
100
+ def verbose(msg, *args, **kwargs):
101
+ get_logger().log(VERBOSE_LEVEL, msg, *args, **kwargs)
102
+
103
+ def normal(msg, *args, **kwargs):
104
+ get_logger().log(NORMAL_LEVEL, msg, *args, **kwargs)
105
+
106
+ def info(msg, *args, **kwargs):
107
+ """Alias for normal/INFO level logging."""
108
+ normal(msg, *args, **kwargs)
109
+
110
+ def minimal(msg, *args, **kwargs):
111
+ get_logger().log(MINIMAL_LEVEL, msg, *args, **kwargs)
112
+
113
+ def warning(msg, *args, **kwargs):
114
+ get_logger().warning(msg, *args, **kwargs)
115
+
116
+ def error(msg, *args, **kwargs):
117
+ get_logger().error(msg, *args, **kwargs)
@@ -7,10 +7,11 @@ Requires `torch`, `safetensors`, and optionally `tqdm`.
7
7
  import gc
8
8
  import json
9
9
  import struct
10
- import logging
11
10
  from typing import Dict, Optional, Tuple
12
11
 
13
- logger = logging.getLogger(__name__)
12
+ from . import logging_utils
13
+
14
+ logger = logging_utils.get_logger(__name__)
14
15
 
15
16
  def _ensure_torch():
16
17
  try:
@@ -51,46 +52,57 @@ class UnifiedSafetensorsLoader:
51
52
  loader.mark_processed(key) # Frees memory in low_memory mode
52
53
  """
53
54
 
54
- def __init__(self, filename: str, low_memory: bool = False):
55
+ @logging_utils.log_debug
56
+ def __init__(self, filename: str, low_memory: bool = False, direct_gpu: bool = False):
55
57
  """Initialize the loader.
56
58
 
57
59
  Args:
58
60
  filename: Path to safetensors file
59
61
  low_memory: If True, use streaming mode; if False, preload all tensors
62
+ direct_gpu: If True, stream directly to GPU pinned/slab memory (requires low_memory=True)
60
63
  """
61
64
  torch = _ensure_torch()
62
65
  safe_open = _ensure_safetensors()
63
-
66
+
64
67
  self.filename = filename
65
68
  self.low_memory = low_memory
69
+ self.direct_gpu = direct_gpu
70
+
71
+ if self.direct_gpu and not self.low_memory:
72
+ logging_utils.warning("direct_gpu=True requires low_memory=True. Forcing low_memory=True.")
73
+ self.low_memory = True
74
+
66
75
  self._tensors: Dict[str, 'torch.Tensor'] = {}
76
+ self._gpu_buffer_indices: Dict[str, int] = {}
77
+ self._gpu_pool = None
78
+
67
79
  self._all_keys = []
68
80
  self._file = None
69
81
  self._header = None
70
82
  self._header_size = None
71
83
  self._metadata: Dict[str, str] = {}
72
84
 
73
- if low_memory:
85
+ if self.low_memory:
74
86
  # Streaming mode: read header only
75
87
  self._header, self._header_size = self._read_header()
76
88
  self._file = None # Opened lazily to support multiprocessing DataLoader
77
89
  self._all_keys = [k for k in self._header.keys() if k != "__metadata__"]
78
90
  # Extract metadata from header (safetensors stores it under __metadata__ key)
79
91
  self._metadata = self._header.get("__metadata__", {})
80
- logger.debug(f"Initialized Low-memory mode: parsed header of size {self._header_size} bytes.")
81
- logger.debug(f"Found {len(self._all_keys)} tensors (streaming mode)")
92
+ logging_utils.verbose(f"Initialized Low-memory mode: parsed header of size {self._header_size} bytes.")
93
+ logging_utils.verbose(f"Found {len(self._all_keys)} tensors (streaming mode)")
82
94
  else:
83
95
  # Standard mode: preload all tensors
84
- with safe_open(filename, framework="pt", device="cpu") as f:
96
+ with safe_open(self.filename, framework="pt", device="cpu") as f:
85
97
  self._metadata = f.metadata() or {}
86
98
  self._all_keys = list(f.keys())
87
- print(f"Loading {len(self._all_keys)} tensors from source file...")
99
+ logging_utils.normal(f"Loading {len(self._all_keys)} tensors from source file...")
88
100
  try:
89
101
  from tqdm import tqdm
90
- iterator = tqdm(self._all_keys, desc="Loading tensors")
102
+ iterator = tqdm(self._all_keys, desc="Loading tensors", disable=not logger.isEnabledFor(logging_utils.NORMAL_LEVEL))
91
103
  except ImportError:
92
104
  iterator = self._all_keys
93
-
105
+
94
106
  for key in iterator:
95
107
  self._tensors[key] = f.get_tensor(key)
96
108
 
@@ -141,6 +153,7 @@ class UnifiedSafetensorsLoader:
141
153
  """Get tensor ndim without loading tensor data."""
142
154
  return len(self.get_shape(key))
143
155
 
156
+ @logging_utils.log_debug
144
157
  def get_tensor(self, key: str) -> 'torch.Tensor':
145
158
  """Get a tensor by key.
146
159
 
@@ -162,7 +175,7 @@ class UnifiedSafetensorsLoader:
162
175
  offset_start, offset_end = metadata["data_offsets"]
163
176
 
164
177
  if offset_start != offset_end:
165
- logger.debug(f"Loading tensor '{key}' from offset {offset_start} to {offset_end} ({(offset_end - offset_start)} bytes)")
178
+ logging_utils.debug(f"Loading tensor '{key}' from offset {offset_start} to {offset_end} ({(offset_end - offset_start)} bytes)")
166
179
  self._file.seek(self._header_size + 8 + offset_start)
167
180
  # Use bytearray to create a writable buffer, avoiding PyTorch warning
168
181
  # about non-writable tensors from read-only bytes.
@@ -177,12 +190,17 @@ class UnifiedSafetensorsLoader:
177
190
  """Mark a tensor as processed, freeing memory if in low-memory mode.
178
191
 
179
192
  In standard mode, optionally deletes from cache.
180
- In low-memory mode, this is a no-op (tensor was never cached).
193
+ In low-memory mode, frees GPU buffer back to pool if direct_gpu.
181
194
  """
182
195
  if not self.low_memory and key in self._tensors:
183
196
  del self._tensors[key]
184
197
  gc.collect()
185
198
 
199
+ if self.direct_gpu and key in self._gpu_buffer_indices:
200
+ idx = self._gpu_buffer_indices.pop(key)
201
+ if self._gpu_pool:
202
+ self._gpu_pool.release(idx)
203
+
186
204
  def _read_header(self):
187
205
  """Read and parse the safetensors header."""
188
206
  with open(self.filename, "rb") as f:
@@ -270,61 +288,152 @@ class UnifiedSafetensorsLoader:
270
288
 
271
289
  def async_stream(self, keys: list, batch_size: int = 1, prefetch_batches: int = 2, pin_memory: bool = False):
272
290
  """Asynchronously stream tensors from disk.
273
-
291
+
274
292
  Args:
275
293
  keys: List of tensor keys to load
276
294
  batch_size: Number of tensors to yield in each batch
277
295
  prefetch_batches: Number of batches to pre-fetch in background
278
296
  pin_memory: If True, tensors will be pinned in CPU memory (sequentially in main thread)
279
-
297
+ direct_gpu: Stream via pinned buffer directly to GPU
298
+
280
299
  Yields:
281
300
  List of (key, tensor) tuples
282
301
  """
283
302
  import threading
284
303
  import queue
285
304
  from concurrent.futures import ThreadPoolExecutor
305
+ import os
286
306
 
287
307
  torch = _ensure_torch()
288
308
  thread_local = threading.local()
289
309
 
310
+ # Initialize GPU slab and Pinned Buffer Pool if direct_gpu
311
+ pinned_pool = None
312
+ cuda_stream = None
313
+
314
+ if self.direct_gpu and torch.cuda.is_available():
315
+ try:
316
+ from .gpu_buffer_pool import GpuBufferPool
317
+ from .pinned_buffer_pool import PinnedBufferPool
318
+
319
+ # Pre-calculate required slab size
320
+ max_tensor_bytes = 0
321
+ for k in keys:
322
+ meta = self._header[k]
323
+ start, end = meta["data_offsets"]
324
+ sz = end - start
325
+ max_tensor_bytes = max(max_tensor_bytes, sz)
326
+
327
+ # Initialize pools (size of largest tensor)
328
+ # We need a larger pool to allow the GPU to lag behind the CPU without stalling
329
+ max_workers = min(16, max(4, batch_size))
330
+ max_in_flight = max(max_workers, prefetch_batches * batch_size)
331
+
332
+ # Double the buffers for a smooth pipeline
333
+ num_buffers = (max_in_flight + max_workers) * 2 + 2
334
+
335
+ # Assign pool to instance to survive the generator lifetime
336
+ if not getattr(self, '_gpu_pool', None):
337
+ self._gpu_pool = GpuBufferPool(max_tensor_bytes, num_buffers)
338
+
339
+ pinned_pool = PinnedBufferPool(max_tensor_bytes, num_buffers)
340
+ cuda_stream = torch.cuda.Stream()
341
+
342
+ logging_utils.normal(f"Direct GPU pipeline initialized: {num_buffers} buffers, max {max_tensor_bytes / (1024**2):.1f}MB each (Total VRAM: {(num_buffers*max_tensor_bytes)/(1024**2):.1f}MB)")
343
+
344
+ except Exception as e:
345
+ logging_utils.warning(f"Failed to initialize direct GPU pipeline: {e}. Falling back.")
346
+ self.direct_gpu = False
347
+ pinned_pool = None
348
+ elif self.direct_gpu:
349
+ logging_utils.warning("direct_gpu=True requested but CUDA is not available. Falling back to CPU.")
350
+ self.direct_gpu = False
351
+
290
352
  def get_file_handle():
291
353
  if not hasattr(thread_local, 'file'):
292
354
  thread_local.file = open(self.filename, "rb")
293
355
  return thread_local.file
294
356
 
295
357
  def _worker_load(key):
358
+ buf_idx = None
359
+ gpu_idx = None
296
360
  try:
297
- # Direct thread-safe read
298
361
  metadata = self._header[key]
299
362
  offset_start, offset_end = metadata["data_offsets"]
300
- if offset_start != offset_end:
301
- f = get_file_handle()
302
- f.seek(self._header_size + 8 + offset_start)
303
- tensor_bytes = bytearray(offset_end - offset_start)
304
- f.readinto(tensor_bytes)
305
- else:
306
- tensor_bytes = None
363
+ sz = offset_end - offset_start
364
+
365
+ if self.direct_gpu and sz > 0:
366
+ # Direct GPU Pipeline Path
367
+ buf_idx, pinned_buf = pinned_pool.acquire()
307
368
 
308
- tensor = self._deserialize_tensor(tensor_bytes, metadata)
309
- return key, tensor, None
369
+ try:
370
+ # Schedule GPU transfer
371
+ gpu_idx, gpu_buf = self._gpu_pool.acquire()
372
+
373
+ try:
374
+ # Read into pinned memory directly (Zero-Copy CPU path)
375
+ import ctypes
376
+ view = pinned_buf[:sz]
377
+
378
+ # Create a ctypes c_uint8 array spanning the pinned buffer memory
379
+ # This allows f.readinto() to write bytes directly to the torch tensor memory
380
+ c_uint8_array = (ctypes.c_uint8 * sz).from_address(view.data_ptr())
381
+
382
+ f = get_file_handle()
383
+ f.seek(self._header_size + 8 + offset_start)
384
+ f.readinto(c_uint8_array)
385
+
386
+ gpu_view = gpu_buf[:sz]
387
+
388
+ with torch.cuda.stream(cuda_stream):
389
+ gpu_view.copy_(view, non_blocking=True)
390
+
391
+ # Create event to track when copy finishes
392
+ event = torch.cuda.Event()
393
+ event.record()
394
+
395
+ # Critical: wait for stream before allowing worker to finish
396
+ # If worker finishes, buffer might be overwritten by next worker
397
+ # if pool sizing is tight.
398
+ # In direct_gpu, the tensor is the gpu_view.
399
+ return key, gpu_view, metadata, buf_idx, gpu_idx, event
400
+
401
+ except Exception as e:
402
+ # If reading or copying fails, release GPU buffer
403
+ self._gpu_pool.release(gpu_idx)
404
+ raise e
405
+
406
+ except Exception as e:
407
+ # If acquiring GPU buffer fails, release pinned buffer
408
+ pinned_pool.release(buf_idx)
409
+ raise e
410
+ else:
411
+ # Standard CPU Path
412
+ if offset_start != offset_end:
413
+ f = get_file_handle()
414
+ f.seek(self._header_size + 8 + offset_start)
415
+ tensor_bytes = bytearray(offset_end - offset_start)
416
+ f.readinto(tensor_bytes)
417
+ else:
418
+ tensor_bytes = None
419
+
420
+ tensor = self._deserialize_tensor(tensor_bytes, metadata)
421
+ return key, tensor, None, None, None, None
310
422
  except Exception as e:
311
- # Fallback info for main thread
312
- return key, None, e
423
+ return key, None, e, None, None, None
424
+
425
+ max_workers = min(16, max(4, batch_size))
426
+ max_in_flight = max(max_workers, prefetch_batches * batch_size)
313
427
 
314
428
  # Queue for individual (key, tensor) pairs
315
- # Size it to hold enough for prefetch_batches
316
- q = queue.Queue(maxsize=prefetch_batches * batch_size)
317
-
429
+ # Size it to hold enough for prefetch_batches PLUS max_workers to prevent stalling
430
+ q = queue.Queue(maxsize=max_in_flight + max_workers)
431
+
318
432
  def _producer():
319
- # Use a reasonable number of workers for I/O bound tasks
320
- max_workers = min(16, max(4, batch_size))
321
- # Limit task submission to maintain backpressure on memory
322
- max_in_flight = max(max_workers, prefetch_batches * batch_size)
323
-
324
433
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
325
434
  futures = []
326
435
  key_iter = iter(keys)
327
-
436
+
328
437
  # Fill the pipeline
329
438
  for _ in range(max_in_flight):
330
439
  try:
@@ -332,50 +441,81 @@ class UnifiedSafetensorsLoader:
332
441
  futures.append(executor.submit(_worker_load, k))
333
442
  except StopIteration:
334
443
  break
335
-
444
+
336
445
  while futures:
337
446
  # Maintain order by taking the first future
338
447
  f = futures.pop(0)
339
448
  result = f.result() # Blocks until this specific tensor is loaded
340
449
  q.put(result) # Blocks if the consumption queue is full
341
-
450
+
342
451
  # Submit next task if available
343
452
  try:
344
453
  k = next(key_iter)
345
454
  futures.append(executor.submit(_worker_load, k))
346
455
  except StopIteration:
347
456
  pass
348
-
457
+
349
458
  q.put(None) # Sentinel
350
459
 
460
+ producer_thread = threading.local()
351
461
  producer_thread = threading.Thread(target=_producer, daemon=True)
352
462
  producer_thread.start()
353
463
 
354
464
  batch = []
465
+ pending_pinned = [] # Track (event, buf_idx) to release later
466
+
355
467
  while True:
356
468
  res = q.get()
357
469
  if res is None:
470
+ # Synchronize and cleanup any remaining buffers on exit
471
+ for ev, idx in pending_pinned:
472
+ ev.synchronize()
473
+ pinned_pool.release(idx)
358
474
  if batch:
359
475
  yield batch
360
476
  break
361
-
362
- k, t, err = res
363
- if err is not None:
364
- logger.warning(f"Async load failed for {k}, falling back to sync: {err}")
477
+
478
+ k, t, err, buf_idx, gpu_idx, event = res
479
+ if err is not None and not isinstance(err, dict):
480
+ logging_utils.warning(f"Async load failed for {k}, falling back to sync: {err}")
365
481
  # Fallback synchronous load
366
482
  try:
367
483
  t = self.get_tensor(k)
368
484
  except Exception as sync_err:
369
- logger.error(f"Sync fallback also failed for {k}: {sync_err}")
485
+ logging_utils.error(f"Sync fallback also failed for {k}: {sync_err}")
370
486
  raise sync_err
371
-
487
+
488
+ if buf_idx is not None and event is not None:
489
+ # Don't block here! Yield the tensor with its event.
490
+ # Only release the PREVIOUS batch's buffers.
491
+ # This creates a sliding window of safety.
492
+ while len(pending_pinned) >= (max_in_flight + 1):
493
+ ev, idx = pending_pinned.pop(0)
494
+ ev.synchronize() # Wait only if we MUST reuse a buffer
495
+ pinned_pool.release(idx)
496
+
497
+ pending_pinned.append((event, buf_idx))
498
+
499
+ # Register GPU index for cleanup
500
+ self._gpu_buffer_indices[k] = gpu_idx
501
+
502
+ # Reshape GPU view to tensor
503
+ meta = err # we reused err for metadata in direct_gpu path
504
+ dtype = self._get_torch_dtype(meta["dtype"])
505
+ shape = meta["shape"]
506
+
507
+ if meta["dtype"] in ["F8_E5M2", "F8_E4M3"]:
508
+ t = self._convert_float8(t, meta["dtype"], shape)
509
+ else:
510
+ t = t.view(dtype).reshape(shape)
511
+
372
512
  # Pin memory sequentially in the main thread to avoid OS-level lock contention
373
- if pin_memory and t.device.type == 'cpu':
513
+ elif pin_memory and t.device.type == 'cpu':
374
514
  try:
375
515
  t = t.pin_memory()
376
516
  except Exception as e:
377
- logger.warning(f"Failed to pin memory for {k}: {e}")
378
-
517
+ logging_utils.warning(f"Failed to pin memory for {k}: {e}")
518
+
379
519
  batch.append((k, t))
380
520
  if len(batch) == batch_size:
381
521
  yield batch
@@ -0,0 +1,39 @@
1
+ """
2
+ Pinned memory pool manager.
3
+
4
+ Provides pre-allocated pinned buffers to avoid per-tensor allocation overhead.
5
+ """
6
+ from typing import Optional
7
+ import torch
8
+ from . import logging_utils
9
+
10
+ logger = logging_utils.get_logger(__name__)
11
+
12
class PinnedBufferPool:
    """Manages a pool of pinned memory buffers for fast disk-to-GPU transfer.

    All buffers are allocated once, up front, as 1-D ``torch.uint8`` tensors
    created with ``pin_memory=True``. Callers borrow a buffer with
    :meth:`acquire` and hand it back with :meth:`release`; buffers are
    identified by their integer index into ``self.buffers``.
    """

    def __init__(self, size_bytes: int, num_buffers: int):
        """Allocate ``num_buffers`` pinned buffers of ``size_bytes`` bytes each.

        Args:
            size_bytes: Capacity of every buffer, in bytes.
            num_buffers: Number of buffers to pre-allocate (at least 1).

        Raises:
            ValueError: If ``num_buffers`` is smaller than 1 or ``size_bytes``
                is negative.
        """
        # Function-scope import: the module header does not import ``queue``.
        import queue

        # A pool without buffers can never satisfy acquire() and would make
        # the first caller block forever, so reject it up front.
        if num_buffers < 1:
            raise ValueError(f"num_buffers must be >= 1, got {num_buffers}")
        if size_bytes < 0:
            raise ValueError(f"size_bytes must be >= 0, got {size_bytes}")

        self.size_bytes = size_bytes
        self.num_buffers = num_buffers

        logging_utils.verbose(f"Initializing PinnedBufferPool: {num_buffers} buffers of {size_bytes / (1024**2):.2f} MB each.")

        self.buffers = [
            torch.empty(size_bytes, dtype=torch.uint8, pin_memory=True)
            for _ in range(num_buffers)
        ]

        # Indices of buffers that are currently free; every buffer starts free.
        self.free_queue = queue.Queue()
        for idx in range(num_buffers):
            self.free_queue.put(idx)

    def acquire(self, timeout: Optional[float] = None) -> "tuple[int, torch.Tensor]":
        """Acquire a free buffer. Blocks if empty.

        Args:
            timeout: Maximum number of seconds to wait for a free buffer.
                ``None`` (the default) waits indefinitely.

        Returns:
            A ``(index, buffer)`` pair; pass ``index`` to :meth:`release`
            once the buffer is no longer needed.

        Raises:
            queue.Empty: If ``timeout`` is given and no buffer became free
                within that time.
        """
        idx = self.free_queue.get(timeout=timeout)
        return idx, self.buffers[idx]

    def release(self, idx: int):
        """Release buffer back to pool.

        Args:
            idx: Index previously returned by :meth:`acquire`.

        Raises:
            IndexError: If ``idx`` does not identify a buffer of this pool.
        """
        # Reject bad indices here rather than letting them surface later in
        # acquire(); a negative index would otherwise silently alias a buffer.
        if not 0 <= idx < len(self.buffers):
            raise IndexError(
                f"buffer index {idx} out of range for pool of {len(self.buffers)} buffers"
            )
        self.free_queue.put(idx)
39
+
@@ -4,10 +4,11 @@ Pinned memory utilities for faster CPU→GPU tensor transfers.
4
4
  Pinned (page-locked) memory enables faster DMA transfers to GPU.
5
5
  Uses PyTorch's native pin_memory() with non_blocking transfers.
6
6
  """
7
- import logging
8
7
  from typing import Optional
9
8
 
10
- logger = logging.getLogger(__name__)
9
+ from . import logging_utils
10
+
11
+ logger = logging_utils.get_logger(__name__)
11
12
 
12
13
  def _ensure_torch():
13
14
  try:
@@ -21,9 +22,14 @@ _verbose = False
21
22
  _pinned_transfer_stats = {"pinned": 0, "fallback": 0}
22
23
 
23
24
  def set_verbose(enabled: bool):
24
- """Enable/disable verbose output for pinned transfers."""
25
+ """
26
+ Enable/disable verbose output for pinned transfers.
27
+ Also adjusts logging level to VERBOSE if enabled.
28
+ """
25
29
  global _verbose
26
30
  _verbose = enabled
31
+ if enabled:
32
+ logging_utils.setup_logging("VERBOSE")
27
33
 
28
34
  def get_pinned_transfer_stats():
29
35
  """Return pinned transfer statistics for verification."""
@@ -34,10 +40,12 @@ def reset_pinned_transfer_stats():
34
40
  global _pinned_transfer_stats
35
41
  _pinned_transfer_stats = {"pinned": 0, "fallback": 0}
36
42
 
43
+ @logging_utils.log_debug
37
44
  def transfer_to_gpu_pinned(
38
45
  tensor,
39
46
  device: str = 'cuda',
40
- dtype = None
47
+ dtype = None,
48
+ non_blocking: bool = True
41
49
  ):
42
50
  """Transfer tensor to GPU using pinned memory for faster transfer."""
43
51
  torch = _ensure_torch()
@@ -46,44 +54,47 @@ def transfer_to_gpu_pinned(
46
54
  # Skip if not a CPU tensor or CUDA unavailable
47
55
  if tensor.device.type != 'cpu' or not torch.cuda.is_available():
48
56
  if dtype is not None:
49
- return tensor.to(device=device, dtype=dtype)
50
- return tensor.to(device=device)
57
+ return tensor.to(device=device, dtype=dtype, non_blocking=non_blocking)
58
+ return tensor.to(device=device, non_blocking=non_blocking)
51
59
 
52
60
  # Skip if target is not CUDA
53
61
  if not str(device).startswith('cuda'):
54
62
  if dtype is not None:
55
- return tensor.to(device=device, dtype=dtype)
56
- return tensor.to(device=device)
63
+ return tensor.to(device=device, dtype=dtype, non_blocking=non_blocking)
64
+ return tensor.to(device=device, non_blocking=non_blocking)
57
65
 
58
66
  try:
59
67
  pinned = tensor.pin_memory()
60
68
 
61
69
  if dtype is not None:
62
- result = pinned.to(device=device, dtype=dtype, non_blocking=True)
70
+ result = pinned.to(device=device, dtype=dtype, non_blocking=non_blocking)
63
71
  else:
64
- result = pinned.to(device=device, non_blocking=True)
72
+ result = pinned.to(device=device, non_blocking=non_blocking)
65
73
 
66
74
  torch.cuda.current_stream().synchronize()
67
75
 
68
76
  # One-time confirmation on first success
69
77
  if _pinned_transfer_stats["pinned"] == 0:
70
- logger.debug("[pinned_transfer] Pinned memory active - faster GPU transfers enabled")
78
+ logging_utils.verbose("[pinned_transfer] Pinned memory active - faster GPU transfers enabled")
71
79
 
72
80
  _pinned_transfer_stats["pinned"] += 1
81
+
82
+ msg = f"[pinned_transfer] Pinned: {tensor.shape} ({tensor.numel() * tensor.element_size() / 1024:.1f} KB)"
73
83
  if _verbose:
74
- logger.debug(f"[pinned_transfer] Pinned: {tensor.shape} ({tensor.numel() * tensor.element_size() / 1024:.1f} KB)")
84
+ logging_utils.normal(msg)
75
85
  else:
76
- logger.debug(f"[pinned_transfer] Transferred tensor {tensor.shape} to {device} via pinned memory")
86
+ logging_utils.verbose(msg)
77
87
 
78
88
  return result
79
89
 
80
90
  except Exception as e:
81
91
  _pinned_transfer_stats["fallback"] += 1
92
+ msg = f"[pinned_transfer] Fallback transfer to {device} due to error: {e}"
82
93
  if _verbose:
83
- logger.debug(f"[pinned_transfer] Fallback: {e}")
94
+ logging_utils.warning(msg)
84
95
  else:
85
- logger.debug(f"[pinned_transfer] Fallback transfer to {device} due to error: {e}")
96
+ logging_utils.verbose(msg)
86
97
 
87
98
  if dtype is not None:
88
- return tensor.to(device=device, dtype=dtype)
89
- return tensor.to(device=device)
99
+ return tensor.to(device=device, dtype=dtype, non_blocking=non_blocking)
100
+ return tensor.to(device=device, non_blocking=non_blocking)
@@ -5,10 +5,11 @@ Provides serialization helpers for dictionary/tensor conversion.
5
5
  Requires `torch`.
6
6
  """
7
7
  import json
8
- import logging
9
8
  from typing import Dict, Tuple
10
9
 
11
- logger = logging.getLogger(__name__)
10
+ from . import logging_utils
11
+
12
+ logger = logging_utils.get_logger(__name__)
12
13
 
13
14
  def _ensure_torch():
14
15
  try:
@@ -18,6 +19,7 @@ def _ensure_torch():
18
19
  raise ImportError("The 'torch' package is required but not installed. Please install it.")
19
20
 
20
21
 
22
+ @logging_utils.log_debug
21
23
  def dict_to_tensor(data_dict: dict):
22
24
  """
23
25
  Convert a dictionary to a torch.uint8 tensor containing JSON bytes.
@@ -32,9 +34,10 @@ def dict_to_tensor(data_dict: dict):
32
34
  json_str = json.dumps(data_dict)
33
35
  byte_data = json_str.encode("utf-8")
34
36
  tensor_data = torch.tensor(list(byte_data), dtype=torch.uint8)
35
- logger.debug(f"dict_to_tensor: serialized dict to uint8 tensor of shape {tensor_data.shape}")
37
+ logging_utils.debug(f"dict_to_tensor: serialized dict to uint8 tensor of shape {tensor_data.shape}")
36
38
  return tensor_data
37
39
 
40
+ @logging_utils.log_debug
38
41
  def tensor_to_dict(tensor_data) -> dict:
39
42
  """
40
43
  Convert a torch.uint8 tensor containing JSON bytes to a dictionary.
@@ -50,5 +53,5 @@ def tensor_to_dict(tensor_data) -> dict:
50
53
  byte_data = bytes(tensor_data.tolist())
51
54
  json_str = byte_data.decode("utf-8")
52
55
  data_dict = json.loads(json_str)
53
- logger.debug(f"tensor_to_dict: deserialized tensor of shape {tensor_data.shape} to dict with keys: {list(data_dict.keys())}")
54
- return data_dict
56
+ logging_utils.debug(f"tensor_to_dict: deserialized tensor of shape {tensor_data.shape} to dict with keys: {list(data_dict.keys())}")
57
+ return data_dict
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unifiedefficientloader
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Summary: A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
5
5
  Author: silveroxides
6
6
  License: MIT
@@ -111,6 +111,33 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
111
111
  loader.mark_processed(key)
112
112
  ```
113
113
 
114
+ ### Direct-to-GPU Streaming (Zero-Copy)
115
+
116
+ For the fastest loading times on CUDA devices, use the `direct_gpu=True` flag. This creates a pipeline that pre-allocates a pool of pinned host buffers and GPU memory slabs. Tensors are read from disk directly into the pinned buffers and then copied asynchronously to the GPU on CUDA streams, so the PCIe transfer overlaps with disk I/O instead of adding to the load time.
117
+
118
+ ```python
119
+ from unifiedefficientloader import UnifiedSafetensorsLoader
120
+
121
+ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
122
+ keys_to_load = loader.keys()
123
+
124
+ # async_stream will automatically coordinate disk -> pinned buffer -> GPU slab -> tensor header
125
+ stream = loader.async_stream(
126
+ keys_to_load,
127
+ batch_size=8,
128
+ prefetch_batches=2,
129
+ direct_gpu=True # optional here since we passed it in __init__
130
+ )
131
+
132
+ for batch in stream:
133
+ for key, gpu_tensor in batch:
134
+ # gpu_tensor is already on the GPU!
135
+ assert gpu_tensor.device.type == "cuda"
136
+
137
+ # ... process gpu_tensor ...
138
+ loader.mark_processed(key)
139
+ ```
140
+
114
141
  ### Tensor/Dict Conversion
115
142
 
116
143
  ```python
@@ -2,9 +2,14 @@ LICENSE
2
2
  README.md
3
3
  pyproject.toml
4
4
  setup.py
5
+ tests/test_direct_gpu.py
6
+ tests/test_logging.py
5
7
  tests/test_utils.py
6
8
  unifiedefficientloader/__init__.py
9
+ unifiedefficientloader/gpu_buffer_pool.py
10
+ unifiedefficientloader/logging_utils.py
7
11
  unifiedefficientloader/memory_efficient_loader.py
12
+ unifiedefficientloader/pinned_buffer_pool.py
8
13
  unifiedefficientloader/pinned_transfer.py
9
14
  unifiedefficientloader/tensor_utils.py
10
15
  unifiedefficientloader.egg-info/PKG-INFO