unifiedefficientloader 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21) hide show
  1. {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/PKG-INFO +28 -1
  2. {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/README.md +27 -0
  3. {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/pyproject.toml +7 -2
  4. {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/setup.py +1 -0
  5. unifiedefficientloader-0.2.3/tests/test_direct_gpu.py +95 -0
  6. {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader/__init__.py +8 -4
  7. unifiedefficientloader-0.2.3/unifiedefficientloader/gpu_buffer_pool.py +40 -0
  8. {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader/memory_efficient_loader.py +163 -26
  9. unifiedefficientloader-0.2.3/unifiedefficientloader/pinned_buffer_pool.py +39 -0
  10. {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader/pinned_transfer.py +10 -9
  11. {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/PKG-INFO +28 -1
  12. {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/SOURCES.txt +3 -0
  13. {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/LICENSE +0 -0
  14. {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/setup.cfg +0 -0
  15. {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/tests/test_logging.py +0 -0
  16. {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/tests/test_utils.py +0 -0
  17. {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader/logging_utils.py +0 -0
  18. {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader/tensor_utils.py +0 -0
  19. {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/dependency_links.txt +0 -0
  20. {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/requires.txt +0 -0
  21. {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unifiedefficientloader
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
5
5
  Author: silveroxides
6
6
  License: MIT
@@ -111,6 +111,33 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
111
111
  loader.mark_processed(key)
112
112
  ```
113
113
 
114
+ ### Direct-to-GPU Streaming (Zero-Copy)
115
+
116
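+ For the fastest loading times on CUDA devices, pass `direct_gpu=True`. This builds a pipeline that pre-allocates a pool of pinned host buffers and a pool of GPU buffers (slabs). Each tensor is read from disk directly into a pinned buffer and then copied asynchronously to the GPU on a CUDA stream, so the PCIe transfer overlaps with disk I/O instead of adding to it. If CUDA is unavailable, the loader logs a warning and falls back to CPU loading. Call `loader.mark_processed(key)` once you are done with a tensor so that its GPU buffer is returned to the pool.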
117
+
118
+ ```python
119
+ from unifiedefficientloader import UnifiedSafetensorsLoader
120
+
121
+ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
122
+ keys_to_load = loader.keys()
123
+
124
+ # async_stream coordinates the full pipeline: disk -> pinned buffer -> GPU slab -> typed tensor view
125
+ stream = loader.async_stream(
126
+ keys_to_load,
127
+ batch_size=8,
128
+ prefetch_batches=2,
129
+ direct_gpu=True # optional here since we passed it in __init__
130
+ )
131
+
132
+ for batch in stream:
133
+ for key, gpu_tensor in batch:
134
+ # gpu_tensor is already on the GPU!
135
+ assert gpu_tensor.device.type == "cuda"
136
+
137
+ # ... process gpu_tensor ...
138
+ loader.mark_processed(key)
139
+ ```
140
+
114
141
  ### Tensor/Dict Conversion
115
142
 
116
143
  ```python
@@ -85,6 +85,33 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
85
85
  loader.mark_processed(key)
86
86
  ```
87
87
 
88
+ ### Direct-to-GPU Streaming (Zero-Copy)
89
+
90
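+ For the fastest loading times on CUDA devices, pass `direct_gpu=True`. This builds a pipeline that pre-allocates a pool of pinned host buffers and a pool of GPU buffers (slabs). Each tensor is read from disk directly into a pinned buffer and then copied asynchronously to the GPU on a CUDA stream, so the PCIe transfer overlaps with disk I/O instead of adding to it. If CUDA is unavailable, the loader logs a warning and falls back to CPU loading. Call `loader.mark_processed(key)` once you are done with a tensor so that its GPU buffer is returned to the pool.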
91
+
92
+ ```python
93
+ from unifiedefficientloader import UnifiedSafetensorsLoader
94
+
95
+ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
96
+ keys_to_load = loader.keys()
97
+
98
+ # async_stream coordinates the full pipeline: disk -> pinned buffer -> GPU slab -> typed tensor view
99
+ stream = loader.async_stream(
100
+ keys_to_load,
101
+ batch_size=8,
102
+ prefetch_batches=2,
103
+ direct_gpu=True # optional here since we passed it in __init__
104
+ )
105
+
106
+ for batch in stream:
107
+ for key, gpu_tensor in batch:
108
+ # gpu_tensor is already on the GPU!
109
+ assert gpu_tensor.device.type == "cuda"
110
+
111
+ # ... process gpu_tensor ...
112
+ loader.mark_processed(key)
113
+ ```
114
+
88
115
  ### Tensor/Dict Conversion
89
116
 
90
117
  ```python
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "unifiedefficientloader"
7
- version = "0.2.2"
7
+ version = "0.2.3"
8
8
  description = "A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts."
9
9
  readme = "README.md"
10
10
  authors = [
@@ -34,4 +34,9 @@ log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(linen
34
34
  log_cli_date_format = "%Y-%m-%d %H:%M:%S"
35
35
  filterwarnings = [
36
36
  "ignore:.*argument 'device' of Tensor.*:DeprecationWarning"
37
- ]
37
+ ]
38
+
39
+ [tool.setuptools.packages.find]
40
+ where = ["."]
41
+ include = ["unifiedefficientloader*"]
42
+ exclude = ["reference"]
@@ -1,3 +1,4 @@
1
+ #!/usr/bin/env python
1
2
  """Minimal setup.py for backward compatibility with legacy pip install workflows."""
2
3
  from setuptools import setup
3
4
 
@@ -0,0 +1,95 @@
1
+ import os
2
+ import tempfile
3
+ import pytest
4
+
5
+ try:
6
+ import torch
7
+ from safetensors.torch import save_file
8
+ HAS_TORCH = True
9
+ except ImportError:
10
+ HAS_TORCH = False
11
+
12
+ from unifiedefficientloader import MemoryEfficientSafeOpen
13
+
14
+ @pytest.fixture
15
+ def sample_safetensors():
16
+ if not HAS_TORCH:
17
+ pytest.skip("Requires torch and safetensors")
18
+
19
+ with tempfile.NamedTemporaryFile(suffix=".safetensors", delete=False) as f:
20
+ path = f.name
21
+
22
+ tensors = {
23
+ "weight1": torch.randn(10, 10),
24
+ "weight2": torch.randn(20, 20),
25
+ "bias": torch.zeros(10),
26
+ }
27
+ save_file(tensors, path)
28
+
29
+ yield path, tensors
30
+
31
+ if os.path.exists(path):
32
+ os.remove(path)
33
+
34
+ @pytest.mark.skipif(not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA")
35
+ def test_direct_gpu_streaming(sample_safetensors):
36
+ path, original_tensors = sample_safetensors
37
+
38
+ loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
39
+
40
+ # Test load_all which uses async_stream under the hood
41
+ loaded_tensors = loader.load_all()
42
+
43
+ for key, orig_tensor in original_tensors.items():
44
+ assert key in loaded_tensors
45
+ loaded_tensor = loaded_tensors[key]
46
+
47
+ # Verify it's on GPU
48
+ assert loaded_tensor.device.type == "cuda"
49
+
50
+ # Verify data matches
51
+ torch.testing.assert_close(loaded_tensor.cpu(), orig_tensor)
52
+
53
+ loader.close()
54
+
55
+ @pytest.mark.skipif(not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA")
56
+ def test_direct_gpu_async_stream(sample_safetensors):
57
+ path, original_tensors = sample_safetensors
58
+
59
+ loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
60
+
61
+ stream = loader.async_stream(
62
+ keys=list(original_tensors.keys()),
63
+ batch_size=2,
64
+ prefetch_batches=1,
65
+ direct_gpu=True
66
+ )
67
+
68
+ loaded_count = 0
69
+ for batch in stream:
70
+ for key, tensor in batch:
71
+ assert tensor.device.type == "cuda"
72
+ torch.testing.assert_close(tensor.cpu(), original_tensors[key])
73
+ loaded_count += 1
74
+
75
+ assert loaded_count == len(original_tensors)
76
+ loader.close()
77
+
78
+ @pytest.mark.skipif(not HAS_TORCH, reason="Requires torch")
79
+ def test_direct_gpu_fallback_no_cuda(sample_safetensors, monkeypatch):
80
+ # Force cuda to be unavailable
81
+ monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
82
+
83
+ path, original_tensors = sample_safetensors
84
+
85
+ # Should fallback to CPU silently
86
+ loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
87
+
88
+ loaded_tensors = loader.load_all()
89
+
90
+ for key, orig_tensor in original_tensors.items():
91
+ loaded_tensor = loaded_tensors[key]
92
+ assert loaded_tensor.device.type == "cpu"
93
+ torch.testing.assert_close(loaded_tensor, orig_tensor)
94
+
95
+ loader.close()
@@ -9,7 +9,7 @@ def check_dependencies(*packages):
9
9
  for pkg in packages:
10
10
  if importlib.util.find_spec(pkg) is None:
11
11
  missing.append(pkg)
12
-
12
+
13
13
  if missing:
14
14
  missing_str = ", ".join(missing)
15
15
  raise ImportError(
@@ -23,11 +23,13 @@ check_dependencies("torch")
23
23
  from .memory_efficient_loader import UnifiedSafetensorsLoader, MemoryEfficientSafeOpen
24
24
  from .tensor_utils import dict_to_tensor, tensor_to_dict
25
25
  from .pinned_transfer import transfer_to_gpu_pinned, set_verbose, get_pinned_transfer_stats, reset_pinned_transfer_stats
26
+ from .gpu_buffer_pool import GpuBufferPool
27
+ from .pinned_buffer_pool import PinnedBufferPool
26
28
  from .logging_utils import (
27
29
  setup_logging,
28
- MINIMAL_LEVEL,
29
- NORMAL_LEVEL,
30
- VERBOSE_LEVEL,
30
+ MINIMAL_LEVEL,
31
+ NORMAL_LEVEL,
32
+ VERBOSE_LEVEL,
31
33
  DEBUG_LEVEL,
32
34
  debug,
33
35
  verbose,
@@ -47,6 +49,8 @@ __all__ = [
47
49
  "set_verbose",
48
50
  "get_pinned_transfer_stats",
49
51
  "reset_pinned_transfer_stats",
52
+ "GpuBufferPool",
53
+ "PinnedBufferPool",
50
54
  "setup_logging",
51
55
  "MINIMAL_LEVEL",
52
56
  "NORMAL_LEVEL",
@@ -0,0 +1,40 @@
1
+ """
2
+ GPU memory buffer pool for direct-to-GPU streaming.
3
+
4
+ Maintains a pool of pre-allocated GPU tensors to avoid allocation overhead
5
+ and ensure strictly bounded VRAM usage during streaming.
6
+ """
7
+ from typing import Tuple, Optional
8
+ import torch
9
+ from . import logging_utils
10
+
11
+ logger = logging_utils.get_logger(__name__)
12
+
13
+ class GpuBufferPool:
14
+ """Manages a pool of fixed-size GPU memory buffers."""
15
+ def __init__(self, size_bytes: int, num_buffers: int, device: str = "cuda"):
16
+ import torch
17
+ import queue
18
+ self.device = device
19
+ self.size_bytes = size_bytes
20
+ self.num_buffers = num_buffers
21
+
22
+ logging_utils.verbose(f"Initializing GpuBufferPool: {num_buffers} buffers of {size_bytes / (1024**2):.2f} MB each on {device}.")
23
+
24
+ self.buffers = []
25
+ for _ in range(num_buffers):
26
+ buf = torch.empty(size_bytes, dtype=torch.uint8, device=device)
27
+ self.buffers.append(buf)
28
+
29
+ self.free_queue = queue.Queue()
30
+ for i in range(num_buffers):
31
+ self.free_queue.put(i)
32
+
33
+ def acquire(self) -> Tuple[int, 'torch.Tensor']:
34
+ """Acquire a free buffer. Blocks if empty."""
35
+ idx = self.free_queue.get()
36
+ return idx, self.buffers[idx]
37
+
38
+ def release(self, idx: int):
39
+ """Release buffer back to pool."""
40
+ self.free_queue.put(idx)
@@ -53,26 +53,36 @@ class UnifiedSafetensorsLoader:
53
53
  """
54
54
 
55
55
  @logging_utils.log_debug
56
- def __init__(self, filename: str, low_memory: bool = False):
56
+ def __init__(self, filename: str, low_memory: bool = False, direct_gpu: bool = False):
57
57
  """Initialize the loader.
58
58
 
59
59
  Args:
60
60
  filename: Path to safetensors file
61
61
  low_memory: If True, use streaming mode; if False, preload all tensors
62
+ direct_gpu: If True, stream tensors through pinned host buffers into pre-allocated GPU buffers. Requires low_memory=True; if low_memory is False it is forced to True with a warning.
62
63
  """
63
64
  torch = _ensure_torch()
64
65
  safe_open = _ensure_safetensors()
65
66
 
66
67
  self.filename = filename
67
68
  self.low_memory = low_memory
69
+ self.direct_gpu = direct_gpu
70
+
71
+ if self.direct_gpu and not self.low_memory:
72
+ logging_utils.warning("direct_gpu=True requires low_memory=True. Forcing low_memory=True.")
73
+ self.low_memory = True
74
+
68
75
  self._tensors: Dict[str, 'torch.Tensor'] = {}
76
+ self._gpu_buffer_indices: Dict[str, int] = {}
77
+ self._gpu_pool = None
78
+
69
79
  self._all_keys = []
70
80
  self._file = None
71
81
  self._header = None
72
82
  self._header_size = None
73
83
  self._metadata: Dict[str, str] = {}
74
84
 
75
- if low_memory:
85
+ if self.low_memory:
76
86
  # Streaming mode: read header only
77
87
  self._header, self._header_size = self._read_header()
78
88
  self._file = None # Opened lazily to support multiprocessing DataLoader
@@ -83,7 +93,7 @@ class UnifiedSafetensorsLoader:
83
93
  logging_utils.verbose(f"Found {len(self._all_keys)} tensors (streaming mode)")
84
94
  else:
85
95
  # Standard mode: preload all tensors
86
- with safe_open(filename, framework="pt", device="cpu") as f:
96
+ with safe_open(self.filename, framework="pt", device="cpu") as f:
87
97
  self._metadata = f.metadata() or {}
88
98
  self._all_keys = list(f.keys())
89
99
  logging_utils.normal(f"Loading {len(self._all_keys)} tensors from source file...")
@@ -180,12 +190,17 @@ class UnifiedSafetensorsLoader:
180
190
  """Mark a tensor as processed, freeing memory if in low-memory mode.
181
191
 
182
192
  In standard mode, optionally deletes from cache.
183
- In low-memory mode, this is a no-op (tensor was never cached).
193
+ In low-memory mode, returns the tensor's GPU buffer to the pool if direct_gpu is enabled; otherwise this is a no-op.
184
194
  """
185
195
  if not self.low_memory and key in self._tensors:
186
196
  del self._tensors[key]
187
197
  gc.collect()
188
198
 
199
+ if self.direct_gpu and key in self._gpu_buffer_indices:
200
+ idx = self._gpu_buffer_indices.pop(key)
201
+ if self._gpu_pool:
202
+ self._gpu_pool.release(idx)
203
+
189
204
  def _read_header(self):
190
205
  """Read and parse the safetensors header."""
191
206
  with open(self.filename, "rb") as f:
@@ -279,6 +294,7 @@ class UnifiedSafetensorsLoader:
279
294
  batch_size: Number of tensors to yield in each batch
280
295
  prefetch_batches: Number of batches to pre-fetch in background
281
296
  pin_memory: If True, tensors will be pinned in CPU memory (sequentially in main thread)
297
+ direct_gpu: Stream via pinned buffer directly to GPU
282
298
 
283
299
  Yields:
284
300
  List of (key, tensor) tuples
@@ -286,44 +302,134 @@ class UnifiedSafetensorsLoader:
286
302
  import threading
287
303
  import queue
288
304
  from concurrent.futures import ThreadPoolExecutor
305
+ import os
289
306
 
290
307
  torch = _ensure_torch()
291
308
  thread_local = threading.local()
292
309
 
310
+ # Initialize GPU slab and Pinned Buffer Pool if direct_gpu
311
+ pinned_pool = None
312
+ cuda_stream = None
313
+
314
+ if self.direct_gpu and torch.cuda.is_available():
315
+ try:
316
+ from .gpu_buffer_pool import GpuBufferPool
317
+ from .pinned_buffer_pool import PinnedBufferPool
318
+
319
+ # Pre-calculate required slab size
320
+ max_tensor_bytes = 0
321
+ for k in keys:
322
+ meta = self._header[k]
323
+ start, end = meta["data_offsets"]
324
+ sz = end - start
325
+ max_tensor_bytes = max(max_tensor_bytes, sz)
326
+
327
+ # Initialize pools (size of largest tensor)
328
+ # We need a larger pool to allow the GPU to lag behind the CPU without stalling
329
+ max_workers = min(16, max(4, batch_size))
330
+ max_in_flight = max(max_workers, prefetch_batches * batch_size)
331
+
332
+ # Double the buffers for a smooth pipeline
333
+ num_buffers = (max_in_flight + max_workers) * 2 + 2
334
+
335
+ # Assign pool to instance to survive the generator lifetime
336
+ if not getattr(self, '_gpu_pool', None):
337
+ self._gpu_pool = GpuBufferPool(max_tensor_bytes, num_buffers)
338
+
339
+ pinned_pool = PinnedBufferPool(max_tensor_bytes, num_buffers)
340
+ cuda_stream = torch.cuda.Stream()
341
+
342
+ logging_utils.normal(f"Direct GPU pipeline initialized: {num_buffers} buffers, max {max_tensor_bytes / (1024**2):.1f}MB each (Total VRAM: {(num_buffers*max_tensor_bytes)/(1024**2):.1f}MB)")
343
+
344
+ except Exception as e:
345
+ logging_utils.warning(f"Failed to initialize direct GPU pipeline: {e}. Falling back.")
346
+ self.direct_gpu = False
347
+ pinned_pool = None
348
+ elif self.direct_gpu:
349
+ logging_utils.warning("direct_gpu=True requested but CUDA is not available. Falling back to CPU.")
350
+ self.direct_gpu = False
351
+
293
352
  def get_file_handle():
294
353
  if not hasattr(thread_local, 'file'):
295
354
  thread_local.file = open(self.filename, "rb")
296
355
  return thread_local.file
297
356
 
298
357
  def _worker_load(key):
358
+ buf_idx = None
359
+ gpu_idx = None
299
360
  try:
300
- # Direct thread-safe read
301
361
  metadata = self._header[key]
302
362
  offset_start, offset_end = metadata["data_offsets"]
303
- if offset_start != offset_end:
304
- f = get_file_handle()
305
- f.seek(self._header_size + 8 + offset_start)
306
- tensor_bytes = bytearray(offset_end - offset_start)
307
- f.readinto(tensor_bytes)
308
- else:
309
- tensor_bytes = None
363
+ sz = offset_end - offset_start
310
364
 
311
- tensor = self._deserialize_tensor(tensor_bytes, metadata)
312
- return key, tensor, None
365
+ if self.direct_gpu and sz > 0:
366
+ # Direct GPU Pipeline Path
367
+ buf_idx, pinned_buf = pinned_pool.acquire()
368
+
369
+ try:
370
+ # Schedule GPU transfer
371
+ gpu_idx, gpu_buf = self._gpu_pool.acquire()
372
+
373
+ try:
374
+ # Read into pinned memory directly (Zero-Copy CPU path)
375
+ import ctypes
376
+ view = pinned_buf[:sz]
377
+
378
+ # Create a ctypes c_uint8 array spanning the pinned buffer memory
379
+ # This allows f.readinto() to write bytes directly to the torch tensor memory
380
+ c_uint8_array = (ctypes.c_uint8 * sz).from_address(view.data_ptr())
381
+
382
+ f = get_file_handle()
383
+ f.seek(self._header_size + 8 + offset_start)
384
+ f.readinto(c_uint8_array)
385
+
386
+ gpu_view = gpu_buf[:sz]
387
+
388
+ with torch.cuda.stream(cuda_stream):
389
+ gpu_view.copy_(view, non_blocking=True)
390
+
391
+ # Create event to track when copy finishes
392
+ event = torch.cuda.Event()
393
+ event.record()
394
+
395
+ # Note: the worker does not wait for the copy here; it returns immediately.
396
+ # The pinned buffer is only released later, after the consumer synchronizes on
397
+ # this event, which is meant to keep the next worker from overwriting it mid-transfer.
398
+ # In direct_gpu mode, the returned tensor is the gpu_view.
399
+ return key, gpu_view, metadata, buf_idx, gpu_idx, event
400
+
401
+ except Exception as e:
402
+ # If reading or copying fails, release GPU buffer
403
+ self._gpu_pool.release(gpu_idx)
404
+ raise e
405
+
406
+ except Exception as e:
407
+ # If acquiring GPU buffer fails, release pinned buffer
408
+ pinned_pool.release(buf_idx)
409
+ raise e
410
+ else:
411
+ # Standard CPU Path
412
+ if offset_start != offset_end:
413
+ f = get_file_handle()
414
+ f.seek(self._header_size + 8 + offset_start)
415
+ tensor_bytes = bytearray(offset_end - offset_start)
416
+ f.readinto(tensor_bytes)
417
+ else:
418
+ tensor_bytes = None
419
+
420
+ tensor = self._deserialize_tensor(tensor_bytes, metadata)
421
+ return key, tensor, None, None, None, None
313
422
  except Exception as e:
314
- # Fallback info for main thread
315
- return key, None, e
423
+ return key, None, e, None, None, None
424
+
425
+ max_workers = min(16, max(4, batch_size))
426
+ max_in_flight = max(max_workers, prefetch_batches * batch_size)
316
427
 
317
428
  # Queue for individual (key, tensor) pairs
318
- # Size it to hold enough for prefetch_batches
319
- q = queue.Queue(maxsize=prefetch_batches * batch_size)
429
+ # Size it to hold enough for prefetch_batches PLUS max_workers to prevent stalling
430
+ q = queue.Queue(maxsize=max_in_flight + max_workers)
320
431
 
321
432
  def _producer():
322
- # Use a reasonable number of workers for I/O bound tasks
323
- max_workers = min(16, max(4, batch_size))
324
- # Limit task submission to maintain backpressure on memory
325
- max_in_flight = max(max_workers, prefetch_batches * batch_size)
326
-
327
433
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
328
434
  futures = []
329
435
  key_iter = iter(keys)
@@ -351,19 +457,26 @@ class UnifiedSafetensorsLoader:
351
457
 
352
458
  q.put(None) # Sentinel
353
459
 
460
+ producer_thread = threading.local()
354
461
  producer_thread = threading.Thread(target=_producer, daemon=True)
355
462
  producer_thread.start()
356
463
 
357
464
  batch = []
465
+ pending_pinned = [] # Track (event, buf_idx) to release later
466
+
358
467
  while True:
359
468
  res = q.get()
360
469
  if res is None:
470
+ # Synchronize and cleanup any remaining buffers on exit
471
+ for ev, idx in pending_pinned:
472
+ ev.synchronize()
473
+ pinned_pool.release(idx)
361
474
  if batch:
362
475
  yield batch
363
476
  break
364
477
 
365
- k, t, err = res
366
- if err is not None:
478
+ k, t, err, buf_idx, gpu_idx, event = res
479
+ if err is not None and not isinstance(err, dict):
367
480
  logging_utils.warning(f"Async load failed for {k}, falling back to sync: {err}")
368
481
  # Fallback synchronous load
369
482
  try:
@@ -372,8 +485,32 @@ class UnifiedSafetensorsLoader:
372
485
  logging_utils.error(f"Sync fallback also failed for {k}: {sync_err}")
373
486
  raise sync_err
374
487
 
488
+ if buf_idx is not None and event is not None:
489
+ # Don't block here! Yield the tensor with its event.
490
+ # Only release the PREVIOUS batch's buffers.
491
+ # This creates a sliding window of safety.
492
+ while len(pending_pinned) >= (max_in_flight + 1):
493
+ ev, idx = pending_pinned.pop(0)
494
+ ev.synchronize() # Wait only if we MUST reuse a buffer
495
+ pinned_pool.release(idx)
496
+
497
+ pending_pinned.append((event, buf_idx))
498
+
499
+ # Register GPU index for cleanup
500
+ self._gpu_buffer_indices[k] = gpu_idx
501
+
502
+ # Reshape GPU view to tensor
503
+ meta = err # we reused err for metadata in direct_gpu path
504
+ dtype = self._get_torch_dtype(meta["dtype"])
505
+ shape = meta["shape"]
506
+
507
+ if meta["dtype"] in ["F8_E5M2", "F8_E4M3"]:
508
+ t = self._convert_float8(t, meta["dtype"], shape)
509
+ else:
510
+ t = t.view(dtype).reshape(shape)
511
+
375
512
  # Pin memory sequentially in the main thread to avoid OS-level lock contention
376
- if pin_memory and t.device.type == 'cpu':
513
+ elif pin_memory and t.device.type == 'cpu':
377
514
  try:
378
515
  t = t.pin_memory()
379
516
  except Exception as e:
@@ -0,0 +1,39 @@
1
+ """
2
+ Pinned memory pool manager.
3
+
4
+ Provides pre-allocated pinned buffers to avoid per-tensor allocation overhead.
5
+ """
6
+ from typing import Optional
7
+ import torch
8
+ from . import logging_utils
9
+
10
+ logger = logging_utils.get_logger(__name__)
11
+
12
+ class PinnedBufferPool:
13
+ """Manages a pool of pinned memory buffers for fast disk-to-GPU transfer."""
14
+ def __init__(self, size_bytes: int, num_buffers: int):
15
+ import torch
16
+ self.size_bytes = size_bytes
17
+ self.num_buffers = num_buffers
18
+
19
+ logging_utils.verbose(f"Initializing PinnedBufferPool: {num_buffers} buffers of {size_bytes / (1024**2):.2f} MB each.")
20
+
21
+ self.buffers = []
22
+ for _ in range(num_buffers):
23
+ buf = torch.empty(size_bytes, dtype=torch.uint8, pin_memory=True)
24
+ self.buffers.append(buf)
25
+
26
+ import queue
27
+ self.free_queue = queue.Queue()
28
+ for i in range(num_buffers):
29
+ self.free_queue.put(i)
30
+
31
+ def acquire(self) -> tuple[int, 'torch.Tensor']:
32
+ """Acquire a free buffer. Blocks if empty."""
33
+ idx = self.free_queue.get()
34
+ return idx, self.buffers[idx]
35
+
36
+ def release(self, idx: int):
37
+ """Release buffer back to pool."""
38
+ self.free_queue.put(idx)
39
+
@@ -44,7 +44,8 @@ def reset_pinned_transfer_stats():
44
44
  def transfer_to_gpu_pinned(
45
45
  tensor,
46
46
  device: str = 'cuda',
47
- dtype = None
47
+ dtype = None,
48
+ non_blocking: bool = True
48
49
  ):
49
50
  """Transfer tensor to GPU using pinned memory for faster transfer."""
50
51
  torch = _ensure_torch()
@@ -53,22 +54,22 @@ def transfer_to_gpu_pinned(
53
54
  # Skip if not a CPU tensor or CUDA unavailable
54
55
  if tensor.device.type != 'cpu' or not torch.cuda.is_available():
55
56
  if dtype is not None:
56
- return tensor.to(device=device, dtype=dtype)
57
- return tensor.to(device=device)
57
+ return tensor.to(device=device, dtype=dtype, non_blocking=non_blocking)
58
+ return tensor.to(device=device, non_blocking=non_blocking)
58
59
 
59
60
  # Skip if target is not CUDA
60
61
  if not str(device).startswith('cuda'):
61
62
  if dtype is not None:
62
- return tensor.to(device=device, dtype=dtype)
63
- return tensor.to(device=device)
63
+ return tensor.to(device=device, dtype=dtype, non_blocking=non_blocking)
64
+ return tensor.to(device=device, non_blocking=non_blocking)
64
65
 
65
66
  try:
66
67
  pinned = tensor.pin_memory()
67
68
 
68
69
  if dtype is not None:
69
- result = pinned.to(device=device, dtype=dtype, non_blocking=True)
70
+ result = pinned.to(device=device, dtype=dtype, non_blocking=non_blocking)
70
71
  else:
71
- result = pinned.to(device=device, non_blocking=True)
72
+ result = pinned.to(device=device, non_blocking=non_blocking)
72
73
 
73
74
  torch.cuda.current_stream().synchronize()
74
75
 
@@ -95,5 +96,5 @@ def transfer_to_gpu_pinned(
95
96
  logging_utils.verbose(msg)
96
97
 
97
98
  if dtype is not None:
98
- return tensor.to(device=device, dtype=dtype)
99
- return tensor.to(device=device)
99
+ return tensor.to(device=device, dtype=dtype, non_blocking=non_blocking)
100
+ return tensor.to(device=device, non_blocking=non_blocking)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unifiedefficientloader
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
5
5
  Author: silveroxides
6
6
  License: MIT
@@ -111,6 +111,33 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
111
111
  loader.mark_processed(key)
112
112
  ```
113
113
 
114
+ ### Direct-to-GPU Streaming (Zero-Copy)
115
+
116
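+ For the fastest loading times on CUDA devices, pass `direct_gpu=True`. This builds a pipeline that pre-allocates a pool of pinned host buffers and a pool of GPU buffers (slabs). Each tensor is read from disk directly into a pinned buffer and then copied asynchronously to the GPU on a CUDA stream, so the PCIe transfer overlaps with disk I/O instead of adding to it. If CUDA is unavailable, the loader logs a warning and falls back to CPU loading. Call `loader.mark_processed(key)` once you are done with a tensor so that its GPU buffer is returned to the pool.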
117
+
118
+ ```python
119
+ from unifiedefficientloader import UnifiedSafetensorsLoader
120
+
121
+ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
122
+ keys_to_load = loader.keys()
123
+
124
+ # async_stream coordinates the full pipeline: disk -> pinned buffer -> GPU slab -> typed tensor view
125
+ stream = loader.async_stream(
126
+ keys_to_load,
127
+ batch_size=8,
128
+ prefetch_batches=2,
129
+ direct_gpu=True # optional here since we passed it in __init__
130
+ )
131
+
132
+ for batch in stream:
133
+ for key, gpu_tensor in batch:
134
+ # gpu_tensor is already on the GPU!
135
+ assert gpu_tensor.device.type == "cuda"
136
+
137
+ # ... process gpu_tensor ...
138
+ loader.mark_processed(key)
139
+ ```
140
+
114
141
  ### Tensor/Dict Conversion
115
142
 
116
143
  ```python
@@ -2,11 +2,14 @@ LICENSE
2
2
  README.md
3
3
  pyproject.toml
4
4
  setup.py
5
+ tests/test_direct_gpu.py
5
6
  tests/test_logging.py
6
7
  tests/test_utils.py
7
8
  unifiedefficientloader/__init__.py
9
+ unifiedefficientloader/gpu_buffer_pool.py
8
10
  unifiedefficientloader/logging_utils.py
9
11
  unifiedefficientloader/memory_efficient_loader.py
12
+ unifiedefficientloader/pinned_buffer_pool.py
10
13
  unifiedefficientloader/pinned_transfer.py
11
14
  unifiedefficientloader/tensor_utils.py
12
15
  unifiedefficientloader.egg-info/PKG-INFO