unifiedefficientloader 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/PKG-INFO +28 -1
- {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/README.md +27 -0
- {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/pyproject.toml +7 -2
- {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/setup.py +1 -0
- unifiedefficientloader-0.2.3/tests/test_direct_gpu.py +95 -0
- {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader/__init__.py +8 -4
- unifiedefficientloader-0.2.3/unifiedefficientloader/gpu_buffer_pool.py +40 -0
- {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader/memory_efficient_loader.py +163 -26
- unifiedefficientloader-0.2.3/unifiedefficientloader/pinned_buffer_pool.py +39 -0
- {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader/pinned_transfer.py +10 -9
- {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/PKG-INFO +28 -1
- {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/SOURCES.txt +3 -0
- {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/LICENSE +0 -0
- {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/setup.cfg +0 -0
- {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/tests/test_logging.py +0 -0
- {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/tests/test_utils.py +0 -0
- {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader/logging_utils.py +0 -0
- {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader/tensor_utils.py +0 -0
- {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/dependency_links.txt +0 -0
- {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/requires.txt +0 -0
- {unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: unifiedefficientloader
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
|
|
5
5
|
Author: silveroxides
|
|
6
6
|
License: MIT
|
|
@@ -111,6 +111,33 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
|
|
|
111
111
|
loader.mark_processed(key)
|
|
112
112
|
```
|
|
113
113
|
|
|
114
|
+
### Direct-to-GPU Streaming (Zero-Copy)
|
|
115
|
+
|
|
116
|
+
For the absolute fastest loading times on CUDA devices, use the `direct_gpu=True` flag. This creates a pipeline that pre-allocates pinned memory pools and GPU memory slabs. Tensors are loaded from disk directly into pinned buffers, and immediately asynchronously copied to the GPU using CUDA streams, hiding the PCIe transfer latency completely behind the disk I/O.
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
from unifiedefficientloader import UnifiedSafetensorsLoader
|
|
120
|
+
|
|
121
|
+
with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
|
|
122
|
+
keys_to_load = loader.keys()
|
|
123
|
+
|
|
124
|
+
# async_stream will automatically coordinate disk -> pinned buffer -> GPU slab -> tensor header
|
|
125
|
+
stream = loader.async_stream(
|
|
126
|
+
keys_to_load,
|
|
127
|
+
batch_size=8,
|
|
128
|
+
prefetch_batches=2,
|
|
129
|
+
direct_gpu=True # optional here since we passed it in __init__
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
for batch in stream:
|
|
133
|
+
for key, gpu_tensor in batch:
|
|
134
|
+
# gpu_tensor is already on the GPU!
|
|
135
|
+
assert gpu_tensor.device.type == "cuda"
|
|
136
|
+
|
|
137
|
+
# ... process gpu_tensor ...
|
|
138
|
+
loader.mark_processed(key)
|
|
139
|
+
```
|
|
140
|
+
|
|
114
141
|
### Tensor/Dict Conversion
|
|
115
142
|
|
|
116
143
|
```python
|
|
@@ -85,6 +85,33 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
|
|
|
85
85
|
loader.mark_processed(key)
|
|
86
86
|
```
|
|
87
87
|
|
|
88
|
+
### Direct-to-GPU Streaming (Zero-Copy)
|
|
89
|
+
|
|
90
|
+
For the absolute fastest loading times on CUDA devices, use the `direct_gpu=True` flag. This creates a pipeline that pre-allocates pinned memory pools and GPU memory slabs. Tensors are loaded from disk directly into pinned buffers, and immediately asynchronously copied to the GPU using CUDA streams, hiding the PCIe transfer latency completely behind the disk I/O.
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from unifiedefficientloader import UnifiedSafetensorsLoader
|
|
94
|
+
|
|
95
|
+
with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
|
|
96
|
+
keys_to_load = loader.keys()
|
|
97
|
+
|
|
98
|
+
# async_stream will automatically coordinate disk -> pinned buffer -> GPU slab -> tensor header
|
|
99
|
+
stream = loader.async_stream(
|
|
100
|
+
keys_to_load,
|
|
101
|
+
batch_size=8,
|
|
102
|
+
prefetch_batches=2,
|
|
103
|
+
direct_gpu=True # optional here since we passed it in __init__
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
for batch in stream:
|
|
107
|
+
for key, gpu_tensor in batch:
|
|
108
|
+
# gpu_tensor is already on the GPU!
|
|
109
|
+
assert gpu_tensor.device.type == "cuda"
|
|
110
|
+
|
|
111
|
+
# ... process gpu_tensor ...
|
|
112
|
+
loader.mark_processed(key)
|
|
113
|
+
```
|
|
114
|
+
|
|
88
115
|
### Tensor/Dict Conversion
|
|
89
116
|
|
|
90
117
|
```python
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "unifiedefficientloader"
|
|
7
|
-
version = "0.2.2"
|
|
7
|
+
version = "0.2.3"
|
|
8
8
|
description = "A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
authors = [
|
|
@@ -34,4 +34,9 @@ log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(linen
|
|
|
34
34
|
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
|
|
35
35
|
filterwarnings = [
|
|
36
36
|
"ignore:.*argument 'device' of Tensor.*:DeprecationWarning"
|
|
37
|
-
]
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[tool.setuptools.packages.find]
|
|
40
|
+
where = ["."]
|
|
41
|
+
include = ["unifiedefficientloader*"]
|
|
42
|
+
exclude = ["reference"]
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tempfile
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
import torch
|
|
7
|
+
from safetensors.torch import save_file
|
|
8
|
+
HAS_TORCH = True
|
|
9
|
+
except ImportError:
|
|
10
|
+
HAS_TORCH = False
|
|
11
|
+
|
|
12
|
+
from unifiedefficientloader import MemoryEfficientSafeOpen
|
|
13
|
+
|
|
14
|
+
@pytest.fixture
|
|
15
|
+
def sample_safetensors():
|
|
16
|
+
if not HAS_TORCH:
|
|
17
|
+
pytest.skip("Requires torch and safetensors")
|
|
18
|
+
|
|
19
|
+
with tempfile.NamedTemporaryFile(suffix=".safetensors", delete=False) as f:
|
|
20
|
+
path = f.name
|
|
21
|
+
|
|
22
|
+
tensors = {
|
|
23
|
+
"weight1": torch.randn(10, 10),
|
|
24
|
+
"weight2": torch.randn(20, 20),
|
|
25
|
+
"bias": torch.zeros(10),
|
|
26
|
+
}
|
|
27
|
+
save_file(tensors, path)
|
|
28
|
+
|
|
29
|
+
yield path, tensors
|
|
30
|
+
|
|
31
|
+
if os.path.exists(path):
|
|
32
|
+
os.remove(path)
|
|
33
|
+
|
|
34
|
+
@pytest.mark.skipif(not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA")
|
|
35
|
+
def test_direct_gpu_streaming(sample_safetensors):
|
|
36
|
+
path, original_tensors = sample_safetensors
|
|
37
|
+
|
|
38
|
+
loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
|
|
39
|
+
|
|
40
|
+
# Test load_all which uses async_stream under the hood
|
|
41
|
+
loaded_tensors = loader.load_all()
|
|
42
|
+
|
|
43
|
+
for key, orig_tensor in original_tensors.items():
|
|
44
|
+
assert key in loaded_tensors
|
|
45
|
+
loaded_tensor = loaded_tensors[key]
|
|
46
|
+
|
|
47
|
+
# Verify it's on GPU
|
|
48
|
+
assert loaded_tensor.device.type == "cuda"
|
|
49
|
+
|
|
50
|
+
# Verify data matches
|
|
51
|
+
torch.testing.assert_close(loaded_tensor.cpu(), orig_tensor)
|
|
52
|
+
|
|
53
|
+
loader.close()
|
|
54
|
+
|
|
55
|
+
@pytest.mark.skipif(not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA")
|
|
56
|
+
def test_direct_gpu_async_stream(sample_safetensors):
|
|
57
|
+
path, original_tensors = sample_safetensors
|
|
58
|
+
|
|
59
|
+
loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
|
|
60
|
+
|
|
61
|
+
stream = loader.async_stream(
|
|
62
|
+
keys=list(original_tensors.keys()),
|
|
63
|
+
batch_size=2,
|
|
64
|
+
prefetch_batches=1,
|
|
65
|
+
direct_gpu=True
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
loaded_count = 0
|
|
69
|
+
for batch in stream:
|
|
70
|
+
for key, tensor in batch:
|
|
71
|
+
assert tensor.device.type == "cuda"
|
|
72
|
+
torch.testing.assert_close(tensor.cpu(), original_tensors[key])
|
|
73
|
+
loaded_count += 1
|
|
74
|
+
|
|
75
|
+
assert loaded_count == len(original_tensors)
|
|
76
|
+
loader.close()
|
|
77
|
+
|
|
78
|
+
@pytest.mark.skipif(not HAS_TORCH, reason="Requires torch")
|
|
79
|
+
def test_direct_gpu_fallback_no_cuda(sample_safetensors, monkeypatch):
|
|
80
|
+
# Force cuda to be unavailable
|
|
81
|
+
monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
|
|
82
|
+
|
|
83
|
+
path, original_tensors = sample_safetensors
|
|
84
|
+
|
|
85
|
+
# Should fallback to CPU silently
|
|
86
|
+
loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
|
|
87
|
+
|
|
88
|
+
loaded_tensors = loader.load_all()
|
|
89
|
+
|
|
90
|
+
for key, orig_tensor in original_tensors.items():
|
|
91
|
+
loaded_tensor = loaded_tensors[key]
|
|
92
|
+
assert loaded_tensor.device.type == "cpu"
|
|
93
|
+
torch.testing.assert_close(loaded_tensor, orig_tensor)
|
|
94
|
+
|
|
95
|
+
loader.close()
|
{unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader/__init__.py
RENAMED
|
@@ -9,7 +9,7 @@ def check_dependencies(*packages):
|
|
|
9
9
|
for pkg in packages:
|
|
10
10
|
if importlib.util.find_spec(pkg) is None:
|
|
11
11
|
missing.append(pkg)
|
|
12
|
-
|
|
12
|
+
|
|
13
13
|
if missing:
|
|
14
14
|
missing_str = ", ".join(missing)
|
|
15
15
|
raise ImportError(
|
|
@@ -23,11 +23,13 @@ check_dependencies("torch")
|
|
|
23
23
|
from .memory_efficient_loader import UnifiedSafetensorsLoader, MemoryEfficientSafeOpen
|
|
24
24
|
from .tensor_utils import dict_to_tensor, tensor_to_dict
|
|
25
25
|
from .pinned_transfer import transfer_to_gpu_pinned, set_verbose, get_pinned_transfer_stats, reset_pinned_transfer_stats
|
|
26
|
+
from .gpu_buffer_pool import GpuBufferPool
|
|
27
|
+
from .pinned_buffer_pool import PinnedBufferPool
|
|
26
28
|
from .logging_utils import (
|
|
27
29
|
setup_logging,
|
|
28
|
-
MINIMAL_LEVEL,
|
|
29
|
-
NORMAL_LEVEL,
|
|
30
|
-
VERBOSE_LEVEL,
|
|
30
|
+
MINIMAL_LEVEL,
|
|
31
|
+
NORMAL_LEVEL,
|
|
32
|
+
VERBOSE_LEVEL,
|
|
31
33
|
DEBUG_LEVEL,
|
|
32
34
|
debug,
|
|
33
35
|
verbose,
|
|
@@ -47,6 +49,8 @@ __all__ = [
|
|
|
47
49
|
"set_verbose",
|
|
48
50
|
"get_pinned_transfer_stats",
|
|
49
51
|
"reset_pinned_transfer_stats",
|
|
52
|
+
"GpuBufferPool",
|
|
53
|
+
"PinnedBufferPool",
|
|
50
54
|
"setup_logging",
|
|
51
55
|
"MINIMAL_LEVEL",
|
|
52
56
|
"NORMAL_LEVEL",
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""
|
|
2
|
+
GPU memory buffer pool for direct-to-GPU streaming.
|
|
3
|
+
|
|
4
|
+
Maintains a pool of pre-allocated GPU tensors to avoid allocation overhead
|
|
5
|
+
and ensure strictly bounded VRAM usage during streaming.
|
|
6
|
+
"""
|
|
7
|
+
from typing import Tuple, Optional
|
|
8
|
+
import torch
|
|
9
|
+
from . import logging_utils
|
|
10
|
+
|
|
11
|
+
logger = logging_utils.get_logger(__name__)
|
|
12
|
+
|
|
13
|
+
class GpuBufferPool:
|
|
14
|
+
"""Manages a pool of fixed-size GPU memory buffers."""
|
|
15
|
+
def __init__(self, size_bytes: int, num_buffers: int, device: str = "cuda"):
|
|
16
|
+
import torch
|
|
17
|
+
import queue
|
|
18
|
+
self.device = device
|
|
19
|
+
self.size_bytes = size_bytes
|
|
20
|
+
self.num_buffers = num_buffers
|
|
21
|
+
|
|
22
|
+
logging_utils.verbose(f"Initializing GpuBufferPool: {num_buffers} buffers of {size_bytes / (1024**2):.2f} MB each on {device}.")
|
|
23
|
+
|
|
24
|
+
self.buffers = []
|
|
25
|
+
for _ in range(num_buffers):
|
|
26
|
+
buf = torch.empty(size_bytes, dtype=torch.uint8, device=device)
|
|
27
|
+
self.buffers.append(buf)
|
|
28
|
+
|
|
29
|
+
self.free_queue = queue.Queue()
|
|
30
|
+
for i in range(num_buffers):
|
|
31
|
+
self.free_queue.put(i)
|
|
32
|
+
|
|
33
|
+
def acquire(self) -> Tuple[int, 'torch.Tensor']:
|
|
34
|
+
"""Acquire a free buffer. Blocks if empty."""
|
|
35
|
+
idx = self.free_queue.get()
|
|
36
|
+
return idx, self.buffers[idx]
|
|
37
|
+
|
|
38
|
+
def release(self, idx: int):
|
|
39
|
+
"""Release buffer back to pool."""
|
|
40
|
+
self.free_queue.put(idx)
|
|
@@ -53,26 +53,36 @@ class UnifiedSafetensorsLoader:
|
|
|
53
53
|
"""
|
|
54
54
|
|
|
55
55
|
@logging_utils.log_debug
|
|
56
|
-
def __init__(self, filename: str, low_memory: bool = False):
|
|
56
|
+
def __init__(self, filename: str, low_memory: bool = False, direct_gpu: bool = False):
|
|
57
57
|
"""Initialize the loader.
|
|
58
58
|
|
|
59
59
|
Args:
|
|
60
60
|
filename: Path to safetensors file
|
|
61
61
|
low_memory: If True, use streaming mode; if False, preload all tensors
|
|
62
|
+
direct_gpu: If True, stream directly to GPU pinned/slab memory (requires low_memory=True)
|
|
62
63
|
"""
|
|
63
64
|
torch = _ensure_torch()
|
|
64
65
|
safe_open = _ensure_safetensors()
|
|
65
66
|
|
|
66
67
|
self.filename = filename
|
|
67
68
|
self.low_memory = low_memory
|
|
69
|
+
self.direct_gpu = direct_gpu
|
|
70
|
+
|
|
71
|
+
if self.direct_gpu and not self.low_memory:
|
|
72
|
+
logging_utils.warning("direct_gpu=True requires low_memory=True. Forcing low_memory=True.")
|
|
73
|
+
self.low_memory = True
|
|
74
|
+
|
|
68
75
|
self._tensors: Dict[str, 'torch.Tensor'] = {}
|
|
76
|
+
self._gpu_buffer_indices: Dict[str, int] = {}
|
|
77
|
+
self._gpu_pool = None
|
|
78
|
+
|
|
69
79
|
self._all_keys = []
|
|
70
80
|
self._file = None
|
|
71
81
|
self._header = None
|
|
72
82
|
self._header_size = None
|
|
73
83
|
self._metadata: Dict[str, str] = {}
|
|
74
84
|
|
|
75
|
-
if low_memory:
|
|
85
|
+
if self.low_memory:
|
|
76
86
|
# Streaming mode: read header only
|
|
77
87
|
self._header, self._header_size = self._read_header()
|
|
78
88
|
self._file = None # Opened lazily to support multiprocessing DataLoader
|
|
@@ -83,7 +93,7 @@ class UnifiedSafetensorsLoader:
|
|
|
83
93
|
logging_utils.verbose(f"Found {len(self._all_keys)} tensors (streaming mode)")
|
|
84
94
|
else:
|
|
85
95
|
# Standard mode: preload all tensors
|
|
86
|
-
with safe_open(filename, framework="pt", device="cpu") as f:
|
|
96
|
+
with safe_open(self.filename, framework="pt", device="cpu") as f:
|
|
87
97
|
self._metadata = f.metadata() or {}
|
|
88
98
|
self._all_keys = list(f.keys())
|
|
89
99
|
logging_utils.normal(f"Loading {len(self._all_keys)} tensors from source file...")
|
|
@@ -180,12 +190,17 @@ class UnifiedSafetensorsLoader:
|
|
|
180
190
|
"""Mark a tensor as processed, freeing memory if in low-memory mode.
|
|
181
191
|
|
|
182
192
|
In standard mode, optionally deletes from cache.
|
|
183
|
-
In low-memory mode,
|
|
193
|
+
In low-memory mode, frees GPU buffer back to pool if direct_gpu.
|
|
184
194
|
"""
|
|
185
195
|
if not self.low_memory and key in self._tensors:
|
|
186
196
|
del self._tensors[key]
|
|
187
197
|
gc.collect()
|
|
188
198
|
|
|
199
|
+
if self.direct_gpu and key in self._gpu_buffer_indices:
|
|
200
|
+
idx = self._gpu_buffer_indices.pop(key)
|
|
201
|
+
if self._gpu_pool:
|
|
202
|
+
self._gpu_pool.release(idx)
|
|
203
|
+
|
|
189
204
|
def _read_header(self):
|
|
190
205
|
"""Read and parse the safetensors header."""
|
|
191
206
|
with open(self.filename, "rb") as f:
|
|
@@ -279,6 +294,7 @@ class UnifiedSafetensorsLoader:
|
|
|
279
294
|
batch_size: Number of tensors to yield in each batch
|
|
280
295
|
prefetch_batches: Number of batches to pre-fetch in background
|
|
281
296
|
pin_memory: If True, tensors will be pinned in CPU memory (sequentially in main thread)
|
|
297
|
+
direct_gpu: Stream via pinned buffer directly to GPU
|
|
282
298
|
|
|
283
299
|
Yields:
|
|
284
300
|
List of (key, tensor) tuples
|
|
@@ -286,44 +302,134 @@ class UnifiedSafetensorsLoader:
|
|
|
286
302
|
import threading
|
|
287
303
|
import queue
|
|
288
304
|
from concurrent.futures import ThreadPoolExecutor
|
|
305
|
+
import os
|
|
289
306
|
|
|
290
307
|
torch = _ensure_torch()
|
|
291
308
|
thread_local = threading.local()
|
|
292
309
|
|
|
310
|
+
# Initialize GPU slab and Pinned Buffer Pool if direct_gpu
|
|
311
|
+
pinned_pool = None
|
|
312
|
+
cuda_stream = None
|
|
313
|
+
|
|
314
|
+
if self.direct_gpu and torch.cuda.is_available():
|
|
315
|
+
try:
|
|
316
|
+
from .gpu_buffer_pool import GpuBufferPool
|
|
317
|
+
from .pinned_buffer_pool import PinnedBufferPool
|
|
318
|
+
|
|
319
|
+
# Pre-calculate required slab size
|
|
320
|
+
max_tensor_bytes = 0
|
|
321
|
+
for k in keys:
|
|
322
|
+
meta = self._header[k]
|
|
323
|
+
start, end = meta["data_offsets"]
|
|
324
|
+
sz = end - start
|
|
325
|
+
max_tensor_bytes = max(max_tensor_bytes, sz)
|
|
326
|
+
|
|
327
|
+
# Initialize pools (size of largest tensor)
|
|
328
|
+
# We need a larger pool to allow the GPU to lag behind the CPU without stalling
|
|
329
|
+
max_workers = min(16, max(4, batch_size))
|
|
330
|
+
max_in_flight = max(max_workers, prefetch_batches * batch_size)
|
|
331
|
+
|
|
332
|
+
# Double the buffers for a smooth pipeline
|
|
333
|
+
num_buffers = (max_in_flight + max_workers) * 2 + 2
|
|
334
|
+
|
|
335
|
+
# Assign pool to instance to survive the generator lifetime
|
|
336
|
+
if not getattr(self, '_gpu_pool', None):
|
|
337
|
+
self._gpu_pool = GpuBufferPool(max_tensor_bytes, num_buffers)
|
|
338
|
+
|
|
339
|
+
pinned_pool = PinnedBufferPool(max_tensor_bytes, num_buffers)
|
|
340
|
+
cuda_stream = torch.cuda.Stream()
|
|
341
|
+
|
|
342
|
+
logging_utils.normal(f"Direct GPU pipeline initialized: {num_buffers} buffers, max {max_tensor_bytes / (1024**2):.1f}MB each (Total VRAM: {(num_buffers*max_tensor_bytes)/(1024**2):.1f}MB)")
|
|
343
|
+
|
|
344
|
+
except Exception as e:
|
|
345
|
+
logging_utils.warning(f"Failed to initialize direct GPU pipeline: {e}. Falling back.")
|
|
346
|
+
self.direct_gpu = False
|
|
347
|
+
pinned_pool = None
|
|
348
|
+
elif self.direct_gpu:
|
|
349
|
+
logging_utils.warning("direct_gpu=True requested but CUDA is not available. Falling back to CPU.")
|
|
350
|
+
self.direct_gpu = False
|
|
351
|
+
|
|
293
352
|
def get_file_handle():
|
|
294
353
|
if not hasattr(thread_local, 'file'):
|
|
295
354
|
thread_local.file = open(self.filename, "rb")
|
|
296
355
|
return thread_local.file
|
|
297
356
|
|
|
298
357
|
def _worker_load(key):
|
|
358
|
+
buf_idx = None
|
|
359
|
+
gpu_idx = None
|
|
299
360
|
try:
|
|
300
|
-
# Direct thread-safe read
|
|
301
361
|
metadata = self._header[key]
|
|
302
362
|
offset_start, offset_end = metadata["data_offsets"]
|
|
303
|
-
|
|
304
|
-
f = get_file_handle()
|
|
305
|
-
f.seek(self._header_size + 8 + offset_start)
|
|
306
|
-
tensor_bytes = bytearray(offset_end - offset_start)
|
|
307
|
-
f.readinto(tensor_bytes)
|
|
308
|
-
else:
|
|
309
|
-
tensor_bytes = None
|
|
363
|
+
sz = offset_end - offset_start
|
|
310
364
|
|
|
311
|
-
|
|
312
|
-
|
|
365
|
+
if self.direct_gpu and sz > 0:
|
|
366
|
+
# Direct GPU Pipeline Path
|
|
367
|
+
buf_idx, pinned_buf = pinned_pool.acquire()
|
|
368
|
+
|
|
369
|
+
try:
|
|
370
|
+
# Schedule GPU transfer
|
|
371
|
+
gpu_idx, gpu_buf = self._gpu_pool.acquire()
|
|
372
|
+
|
|
373
|
+
try:
|
|
374
|
+
# Read into pinned memory directly (Zero-Copy CPU path)
|
|
375
|
+
import ctypes
|
|
376
|
+
view = pinned_buf[:sz]
|
|
377
|
+
|
|
378
|
+
# Create a ctypes c_uint8 array spanning the pinned buffer memory
|
|
379
|
+
# This allows f.readinto() to write bytes directly to the torch tensor memory
|
|
380
|
+
c_uint8_array = (ctypes.c_uint8 * sz).from_address(view.data_ptr())
|
|
381
|
+
|
|
382
|
+
f = get_file_handle()
|
|
383
|
+
f.seek(self._header_size + 8 + offset_start)
|
|
384
|
+
f.readinto(c_uint8_array)
|
|
385
|
+
|
|
386
|
+
gpu_view = gpu_buf[:sz]
|
|
387
|
+
|
|
388
|
+
with torch.cuda.stream(cuda_stream):
|
|
389
|
+
gpu_view.copy_(view, non_blocking=True)
|
|
390
|
+
|
|
391
|
+
# Create event to track when copy finishes
|
|
392
|
+
event = torch.cuda.Event()
|
|
393
|
+
event.record()
|
|
394
|
+
|
|
395
|
+
# Critical: wait for stream before allowing worker to finish
|
|
396
|
+
# If worker finishes, buffer might be overwritten by next worker
|
|
397
|
+
# if pool sizing is tight.
|
|
398
|
+
# In direct_gpu, the tensor is the gpu_view.
|
|
399
|
+
return key, gpu_view, metadata, buf_idx, gpu_idx, event
|
|
400
|
+
|
|
401
|
+
except Exception as e:
|
|
402
|
+
# If reading or copying fails, release GPU buffer
|
|
403
|
+
self._gpu_pool.release(gpu_idx)
|
|
404
|
+
raise e
|
|
405
|
+
|
|
406
|
+
except Exception as e:
|
|
407
|
+
# If acquiring GPU buffer fails, release pinned buffer
|
|
408
|
+
pinned_pool.release(buf_idx)
|
|
409
|
+
raise e
|
|
410
|
+
else:
|
|
411
|
+
# Standard CPU Path
|
|
412
|
+
if offset_start != offset_end:
|
|
413
|
+
f = get_file_handle()
|
|
414
|
+
f.seek(self._header_size + 8 + offset_start)
|
|
415
|
+
tensor_bytes = bytearray(offset_end - offset_start)
|
|
416
|
+
f.readinto(tensor_bytes)
|
|
417
|
+
else:
|
|
418
|
+
tensor_bytes = None
|
|
419
|
+
|
|
420
|
+
tensor = self._deserialize_tensor(tensor_bytes, metadata)
|
|
421
|
+
return key, tensor, None, None, None, None
|
|
313
422
|
except Exception as e:
|
|
314
|
-
|
|
315
|
-
|
|
423
|
+
return key, None, e, None, None, None
|
|
424
|
+
|
|
425
|
+
max_workers = min(16, max(4, batch_size))
|
|
426
|
+
max_in_flight = max(max_workers, prefetch_batches * batch_size)
|
|
316
427
|
|
|
317
428
|
# Queue for individual (key, tensor) pairs
|
|
318
|
-
# Size it to hold enough for prefetch_batches
|
|
319
|
-
q = queue.Queue(maxsize=
|
|
429
|
+
# Size it to hold enough for prefetch_batches PLUS max_workers to prevent stalling
|
|
430
|
+
q = queue.Queue(maxsize=max_in_flight + max_workers)
|
|
320
431
|
|
|
321
432
|
def _producer():
|
|
322
|
-
# Use a reasonable number of workers for I/O bound tasks
|
|
323
|
-
max_workers = min(16, max(4, batch_size))
|
|
324
|
-
# Limit task submission to maintain backpressure on memory
|
|
325
|
-
max_in_flight = max(max_workers, prefetch_batches * batch_size)
|
|
326
|
-
|
|
327
433
|
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
328
434
|
futures = []
|
|
329
435
|
key_iter = iter(keys)
|
|
@@ -351,19 +457,26 @@ class UnifiedSafetensorsLoader:
|
|
|
351
457
|
|
|
352
458
|
q.put(None) # Sentinel
|
|
353
459
|
|
|
460
|
+
producer_thread = threading.local()
|
|
354
461
|
producer_thread = threading.Thread(target=_producer, daemon=True)
|
|
355
462
|
producer_thread.start()
|
|
356
463
|
|
|
357
464
|
batch = []
|
|
465
|
+
pending_pinned = [] # Track (event, buf_idx) to release later
|
|
466
|
+
|
|
358
467
|
while True:
|
|
359
468
|
res = q.get()
|
|
360
469
|
if res is None:
|
|
470
|
+
# Synchronize and cleanup any remaining buffers on exit
|
|
471
|
+
for ev, idx in pending_pinned:
|
|
472
|
+
ev.synchronize()
|
|
473
|
+
pinned_pool.release(idx)
|
|
361
474
|
if batch:
|
|
362
475
|
yield batch
|
|
363
476
|
break
|
|
364
477
|
|
|
365
|
-
k, t, err = res
|
|
366
|
-
if err is not None:
|
|
478
|
+
k, t, err, buf_idx, gpu_idx, event = res
|
|
479
|
+
if err is not None and not isinstance(err, dict):
|
|
367
480
|
logging_utils.warning(f"Async load failed for {k}, falling back to sync: {err}")
|
|
368
481
|
# Fallback synchronous load
|
|
369
482
|
try:
|
|
@@ -372,8 +485,32 @@ class UnifiedSafetensorsLoader:
|
|
|
372
485
|
logging_utils.error(f"Sync fallback also failed for {k}: {sync_err}")
|
|
373
486
|
raise sync_err
|
|
374
487
|
|
|
488
|
+
if buf_idx is not None and event is not None:
|
|
489
|
+
# Don't block here! Yield the tensor with its event.
|
|
490
|
+
# Only release the PREVIOUS batch's buffers.
|
|
491
|
+
# This creates a sliding window of safety.
|
|
492
|
+
while len(pending_pinned) >= (max_in_flight + 1):
|
|
493
|
+
ev, idx = pending_pinned.pop(0)
|
|
494
|
+
ev.synchronize() # Wait only if we MUST reuse a buffer
|
|
495
|
+
pinned_pool.release(idx)
|
|
496
|
+
|
|
497
|
+
pending_pinned.append((event, buf_idx))
|
|
498
|
+
|
|
499
|
+
# Register GPU index for cleanup
|
|
500
|
+
self._gpu_buffer_indices[k] = gpu_idx
|
|
501
|
+
|
|
502
|
+
# Reshape GPU view to tensor
|
|
503
|
+
meta = err # we reused err for metadata in direct_gpu path
|
|
504
|
+
dtype = self._get_torch_dtype(meta["dtype"])
|
|
505
|
+
shape = meta["shape"]
|
|
506
|
+
|
|
507
|
+
if meta["dtype"] in ["F8_E5M2", "F8_E4M3"]:
|
|
508
|
+
t = self._convert_float8(t, meta["dtype"], shape)
|
|
509
|
+
else:
|
|
510
|
+
t = t.view(dtype).reshape(shape)
|
|
511
|
+
|
|
375
512
|
# Pin memory sequentially in the main thread to avoid OS-level lock contention
|
|
376
|
-
|
|
513
|
+
elif pin_memory and t.device.type == 'cpu':
|
|
377
514
|
try:
|
|
378
515
|
t = t.pin_memory()
|
|
379
516
|
except Exception as e:
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pinned memory pool manager.
|
|
3
|
+
|
|
4
|
+
Provides pre-allocated pinned buffers to avoid per-tensor allocation overhead.
|
|
5
|
+
"""
|
|
6
|
+
from typing import Optional
|
|
7
|
+
import torch
|
|
8
|
+
from . import logging_utils
|
|
9
|
+
|
|
10
|
+
logger = logging_utils.get_logger(__name__)
|
|
11
|
+
|
|
12
|
+
class PinnedBufferPool:
|
|
13
|
+
"""Manages a pool of pinned memory buffers for fast disk-to-GPU transfer."""
|
|
14
|
+
def __init__(self, size_bytes: int, num_buffers: int):
|
|
15
|
+
import torch
|
|
16
|
+
self.size_bytes = size_bytes
|
|
17
|
+
self.num_buffers = num_buffers
|
|
18
|
+
|
|
19
|
+
logging_utils.verbose(f"Initializing PinnedBufferPool: {num_buffers} buffers of {size_bytes / (1024**2):.2f} MB each.")
|
|
20
|
+
|
|
21
|
+
self.buffers = []
|
|
22
|
+
for _ in range(num_buffers):
|
|
23
|
+
buf = torch.empty(size_bytes, dtype=torch.uint8, pin_memory=True)
|
|
24
|
+
self.buffers.append(buf)
|
|
25
|
+
|
|
26
|
+
import queue
|
|
27
|
+
self.free_queue = queue.Queue()
|
|
28
|
+
for i in range(num_buffers):
|
|
29
|
+
self.free_queue.put(i)
|
|
30
|
+
|
|
31
|
+
def acquire(self) -> tuple[int, 'torch.Tensor']:
|
|
32
|
+
"""Acquire a free buffer. Blocks if empty."""
|
|
33
|
+
idx = self.free_queue.get()
|
|
34
|
+
return idx, self.buffers[idx]
|
|
35
|
+
|
|
36
|
+
def release(self, idx: int):
|
|
37
|
+
"""Release buffer back to pool."""
|
|
38
|
+
self.free_queue.put(idx)
|
|
39
|
+
|
|
@@ -44,7 +44,8 @@ def reset_pinned_transfer_stats():
|
|
|
44
44
|
def transfer_to_gpu_pinned(
|
|
45
45
|
tensor,
|
|
46
46
|
device: str = 'cuda',
|
|
47
|
-
dtype = None
|
|
47
|
+
dtype = None,
|
|
48
|
+
non_blocking: bool = True
|
|
48
49
|
):
|
|
49
50
|
"""Transfer tensor to GPU using pinned memory for faster transfer."""
|
|
50
51
|
torch = _ensure_torch()
|
|
@@ -53,22 +54,22 @@ def transfer_to_gpu_pinned(
|
|
|
53
54
|
# Skip if not a CPU tensor or CUDA unavailable
|
|
54
55
|
if tensor.device.type != 'cpu' or not torch.cuda.is_available():
|
|
55
56
|
if dtype is not None:
|
|
56
|
-
return tensor.to(device=device, dtype=dtype)
|
|
57
|
-
return tensor.to(device=device)
|
|
57
|
+
return tensor.to(device=device, dtype=dtype, non_blocking=non_blocking)
|
|
58
|
+
return tensor.to(device=device, non_blocking=non_blocking)
|
|
58
59
|
|
|
59
60
|
# Skip if target is not CUDA
|
|
60
61
|
if not str(device).startswith('cuda'):
|
|
61
62
|
if dtype is not None:
|
|
62
|
-
return tensor.to(device=device, dtype=dtype)
|
|
63
|
-
return tensor.to(device=device)
|
|
63
|
+
return tensor.to(device=device, dtype=dtype, non_blocking=non_blocking)
|
|
64
|
+
return tensor.to(device=device, non_blocking=non_blocking)
|
|
64
65
|
|
|
65
66
|
try:
|
|
66
67
|
pinned = tensor.pin_memory()
|
|
67
68
|
|
|
68
69
|
if dtype is not None:
|
|
69
|
-
result = pinned.to(device=device, dtype=dtype, non_blocking=True)
|
|
70
|
+
result = pinned.to(device=device, dtype=dtype, non_blocking=non_blocking)
|
|
70
71
|
else:
|
|
71
|
-
result = pinned.to(device=device, non_blocking=True)
|
|
72
|
+
result = pinned.to(device=device, non_blocking=non_blocking)
|
|
72
73
|
|
|
73
74
|
torch.cuda.current_stream().synchronize()
|
|
74
75
|
|
|
@@ -95,5 +96,5 @@ def transfer_to_gpu_pinned(
|
|
|
95
96
|
logging_utils.verbose(msg)
|
|
96
97
|
|
|
97
98
|
if dtype is not None:
|
|
98
|
-
return tensor.to(device=device, dtype=dtype)
|
|
99
|
-
return tensor.to(device=device)
|
|
99
|
+
return tensor.to(device=device, dtype=dtype, non_blocking=non_blocking)
|
|
100
|
+
return tensor.to(device=device, non_blocking=non_blocking)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: unifiedefficientloader
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
|
|
5
5
|
Author: silveroxides
|
|
6
6
|
License: MIT
|
|
@@ -111,6 +111,33 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
|
|
|
111
111
|
loader.mark_processed(key)
|
|
112
112
|
```
|
|
113
113
|
|
|
114
|
+
### Direct-to-GPU Streaming (Zero-Copy)
|
|
115
|
+
|
|
116
|
+
For the absolute fastest loading times on CUDA devices, use the `direct_gpu=True` flag. This creates a pipeline that pre-allocates pinned memory pools and GPU memory slabs. Tensors are loaded from disk directly into pinned buffers, and immediately asynchronously copied to the GPU using CUDA streams, hiding the PCIe transfer latency completely behind the disk I/O.
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
from unifiedefficientloader import UnifiedSafetensorsLoader
|
|
120
|
+
|
|
121
|
+
with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
|
|
122
|
+
keys_to_load = loader.keys()
|
|
123
|
+
|
|
124
|
+
# async_stream will automatically coordinate disk -> pinned buffer -> GPU slab -> tensor header
|
|
125
|
+
stream = loader.async_stream(
|
|
126
|
+
keys_to_load,
|
|
127
|
+
batch_size=8,
|
|
128
|
+
prefetch_batches=2,
|
|
129
|
+
direct_gpu=True # optional here since we passed it in __init__
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
for batch in stream:
|
|
133
|
+
for key, gpu_tensor in batch:
|
|
134
|
+
# gpu_tensor is already on the GPU!
|
|
135
|
+
assert gpu_tensor.device.type == "cuda"
|
|
136
|
+
|
|
137
|
+
# ... process gpu_tensor ...
|
|
138
|
+
loader.mark_processed(key)
|
|
139
|
+
```
|
|
140
|
+
|
|
114
141
|
### Tensor/Dict Conversion
|
|
115
142
|
|
|
116
143
|
```python
|
|
@@ -2,11 +2,14 @@ LICENSE
|
|
|
2
2
|
README.md
|
|
3
3
|
pyproject.toml
|
|
4
4
|
setup.py
|
|
5
|
+
tests/test_direct_gpu.py
|
|
5
6
|
tests/test_logging.py
|
|
6
7
|
tests/test_utils.py
|
|
7
8
|
unifiedefficientloader/__init__.py
|
|
9
|
+
unifiedefficientloader/gpu_buffer_pool.py
|
|
8
10
|
unifiedefficientloader/logging_utils.py
|
|
9
11
|
unifiedefficientloader/memory_efficient_loader.py
|
|
12
|
+
unifiedefficientloader/pinned_buffer_pool.py
|
|
10
13
|
unifiedefficientloader/pinned_transfer.py
|
|
11
14
|
unifiedefficientloader/tensor_utils.py
|
|
12
15
|
unifiedefficientloader.egg-info/PKG-INFO
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unifiedefficientloader-0.2.2 → unifiedefficientloader-0.2.3}/unifiedefficientloader/tensor_utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|