unifiedefficientloader 0.2.3__tar.gz → 0.4.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/LICENSE +1 -1
- {unifiedefficientloader-0.2.3/unifiedefficientloader.egg-info → unifiedefficientloader-0.4.4}/PKG-INFO +103 -22
- unifiedefficientloader-0.2.3/PKG-INFO → unifiedefficientloader-0.4.4/README.md +77 -44
- {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/pyproject.toml +7 -7
- unifiedefficientloader-0.4.4/setup.py +29 -0
- {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/tests/test_direct_gpu.py +30 -22
- unifiedefficientloader-0.4.4/tests/test_incremental_writer.py +249 -0
- unifiedefficientloader-0.4.4/tests/test_mmap.py +205 -0
- unifiedefficientloader-0.4.4/tests/test_unified_data_loader.py +57 -0
- {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/tests/test_utils.py +24 -24
- {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader/__init__.py +7 -1
- unifiedefficientloader-0.4.4/unifiedefficientloader/incremental_writer.py +296 -0
- {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader/memory_efficient_loader.py +188 -97
- {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader/pinned_buffer_pool.py +21 -3
- unifiedefficientloader-0.4.4/unifiedefficientloader/tensor_utils.py +126 -0
- unifiedefficientloader-0.4.4/unifiedefficientloader/uel/__init__.py +48 -0
- unifiedefficientloader-0.4.4/unifiedefficientloader/uel/control.py +57 -0
- unifiedefficientloader-0.4.4/unifiedefficientloader/uel/host_buffer.py +4 -0
- unifiedefficientloader-0.4.4/unifiedefficientloader/uel/model_mmap.py +62 -0
- unifiedefficientloader-0.4.4/unifiedefficientloader/uel/model_vbar.py +4 -0
- unifiedefficientloader-0.4.4/unifiedefficientloader/uel/torch.py +12 -0
- unifiedefficientloader-0.4.4/unifiedefficientloader/unified_data_loader.py +295 -0
- unifiedefficientloader-0.2.3/README.md → unifiedefficientloader-0.4.4/unifiedefficientloader.egg-info/PKG-INFO +126 -19
- {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader.egg-info/SOURCES.txt +12 -1
- {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader.egg-info/requires.txt +1 -0
- unifiedefficientloader-0.2.3/setup.py +0 -6
- unifiedefficientloader-0.2.3/unifiedefficientloader/tensor_utils.py +0 -57
- {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/setup.cfg +0 -0
- {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/tests/test_logging.py +0 -0
- {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader/gpu_buffer_pool.py +0 -0
- {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader/logging_utils.py +0 -0
- {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader/pinned_transfer.py +0 -0
- {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader.egg-info/dependency_links.txt +0 -0
- {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader.egg-info/top_level.txt +0 -0
|
@@ -1,17 +1,39 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: unifiedefficientloader
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.4.4
|
|
4
4
|
Summary: A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
|
|
5
5
|
Author: silveroxides
|
|
6
|
-
License: MIT
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 silveroxides
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
7
28
|
Classifier: Development Status :: 4 - Beta
|
|
8
29
|
Classifier: Intended Audience :: Developers
|
|
9
30
|
Classifier: Programming Language :: Python :: 3
|
|
10
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
-
Classifier: Operating System :: OS Independent
|
|
31
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
32
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
12
33
|
Requires-Python: >=3.9
|
|
13
34
|
Description-Content-Type: text/markdown
|
|
14
35
|
License-File: LICENSE
|
|
36
|
+
Requires-Dist: comfy-aimdo==0.3.0
|
|
15
37
|
Provides-Extra: torch
|
|
16
38
|
Requires-Dist: torch; extra == "torch"
|
|
17
39
|
Provides-Extra: safetensors
|
|
@@ -28,6 +50,10 @@ Dynamic: license-file
|
|
|
28
50
|
|
|
29
51
|
A unified interface for loading safetensors, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
|
|
30
52
|
|
|
53
|
+
## Documentation
|
|
54
|
+
|
|
55
|
+
Full API reference and guides in [docs/](docs/index.md).
|
|
56
|
+
|
|
31
57
|
## Installation
|
|
32
58
|
|
|
33
59
|
You can install this package via pip. Since it heavily relies on `torch` and `safetensors` but doesn't strictly force them as hard dependencies for package building/installation, make sure you have them installed in your environment:
|
|
@@ -56,6 +82,28 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
|
|
|
56
82
|
loader.mark_processed(key) # Frees memory
|
|
57
83
|
```
|
|
58
84
|
|
|
85
|
+
### Incremental Safetensors Writer
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from unifiedefficientloader import UnifiedSafetensorsLoader, IncrementalSafetensorsWriter
|
|
90
|
+
|
|
91
|
+
# Initialize Writer
|
|
92
|
+
writer = IncrementalSafetensorsWriter(output_path, metadata=metadata)
|
|
93
|
+
writer.__enter__()
|
|
94
|
+
|
|
95
|
+
# Load model tensors and process them.
|
|
96
|
+
with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
|
|
97
|
+
for key in loader.keys():
|
|
98
|
+
tensor = loader.get_tensor(key)
|
|
99
|
+
# Process tensor...
|
|
100
|
+
writer.write(key, tensor)
|
|
101
|
+
del tensor
|
|
102
|
+
loader.mark_processed(key) # Frees memory
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
```
|
|
106
|
+
|
|
59
107
|
### Loading Specific Tensors Dynamically (Header Analysis)
|
|
60
108
|
|
|
61
109
|
You can analyze the file's header without loading the entire multi-gigabyte safetensors file into memory. This allows you to locate specific data (like embedded JSON dictionaries stored as `uint8` tensors) and load *only* those specific tensors directly from their file offsets.
|
|
@@ -70,13 +118,13 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
|
|
|
70
118
|
key for key, info in loader._header.items()
|
|
71
119
|
if isinstance(info, dict) and info.get("dtype") == "U8"
|
|
72
120
|
]
|
|
73
|
-
|
|
121
|
+
|
|
74
122
|
# 2. Load ONLY those specific tensors using their keys
|
|
75
123
|
for key in uint8_tensor_keys:
|
|
76
|
-
# get_tensor dynamically reads only the bytes for this tensor
|
|
124
|
+
# get_tensor dynamically reads only the bytes for this tensor
|
|
77
125
|
# based on the offsets found in the header
|
|
78
126
|
loaded_tensor = loader.get_tensor(key)
|
|
79
|
-
|
|
127
|
+
|
|
80
128
|
# 3. Decode the uint8 tensor back into a Python dictionary
|
|
81
129
|
extracted_dict = tensor_to_dict(loaded_tensor)
|
|
82
130
|
print(f"Decoded {key}:", extracted_dict)
|
|
@@ -91,26 +139,51 @@ from unifiedefficientloader import UnifiedSafetensorsLoader, transfer_to_gpu_pin
|
|
|
91
139
|
|
|
92
140
|
with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
|
|
93
141
|
keys_to_load = loader.keys()
|
|
94
|
-
|
|
142
|
+
|
|
95
143
|
# Create the continuous streaming generator
|
|
96
144
|
# prefetch_batches controls how many batches to buffer in memory
|
|
97
145
|
stream = loader.async_stream(
|
|
98
|
-
keys_to_load,
|
|
99
|
-
batch_size=8,
|
|
100
|
-
prefetch_batches=2,
|
|
146
|
+
keys_to_load,
|
|
147
|
+
batch_size=8,
|
|
148
|
+
prefetch_batches=2,
|
|
101
149
|
pin_memory=True
|
|
102
150
|
)
|
|
103
|
-
|
|
151
|
+
|
|
104
152
|
# Iterate directly over the generator
|
|
105
153
|
for batch in stream:
|
|
106
154
|
for key, pinned_tensor in batch:
|
|
107
155
|
# Transfer directly to GPU via DMA (pinning is already done)
|
|
108
156
|
gpu_tensor = transfer_to_gpu_pinned(pinned_tensor, device="cuda")
|
|
109
|
-
|
|
157
|
+
|
|
110
158
|
# ... process gpu_tensor ...
|
|
111
159
|
loader.mark_processed(key)
|
|
112
160
|
```
|
|
113
161
|
|
|
162
|
+
### Unified Data Loader
|
|
163
|
+
|
|
164
|
+
A high-performance, threaded alternative to PyTorch's standard `DataLoader`. It eliminates multiprocessing IPC overhead and features a zero-copy pipeline capable of streaming batches directly from pinned CPU memory to VRAM (`direct_gpu=True`).
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
from unifiedefficientloader import UnifiedDataLoader
|
|
168
|
+
from torchvision import datasets, transforms
|
|
169
|
+
|
|
170
|
+
dataset = datasets.FakeData(transform=transforms.ToTensor())
|
|
171
|
+
|
|
172
|
+
# Replaces torch.utils.data.DataLoader
|
|
173
|
+
# Pre-allocates pinned buffer pools and streams directly to GPU
|
|
174
|
+
loader = UnifiedDataLoader(
|
|
175
|
+
dataset,
|
|
176
|
+
batch_size=32,
|
|
177
|
+
shuffle=True,
|
|
178
|
+
num_workers=4,
|
|
179
|
+
direct_gpu=True
|
|
180
|
+
)
|
|
181
|
+
|
|
182
|
+
for batch_image, batch_label in loader:
|
|
183
|
+
# batch is already on the GPU (device="cuda")
|
|
184
|
+
pass
|
|
185
|
+
```
|
|
186
|
+
|
|
114
187
|
### Direct-to-GPU Streaming (Zero-Copy)
|
|
115
188
|
|
|
116
189
|
For the absolute fastest loading times on CUDA devices, use the `direct_gpu=True` flag. This creates a pipeline that pre-allocates pinned memory pools and GPU memory slabs. Tensors are loaded from disk directly into pinned buffers, and immediately asynchronously copied to the GPU using CUDA streams, hiding the PCIe transfer latency completely behind the disk I/O.
|
|
@@ -119,25 +192,33 @@ For the absolute fastest loading times on CUDA devices, use the `direct_gpu=True
|
|
|
119
192
|
from unifiedefficientloader import UnifiedSafetensorsLoader
|
|
120
193
|
|
|
121
194
|
with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
|
|
122
|
-
keys_to_load = loader.keys()
|
|
123
|
-
|
|
124
|
-
# async_stream will automatically coordinate disk -> pinned buffer -> GPU slab -> tensor header
|
|
125
195
|
stream = loader.async_stream(
|
|
126
|
-
|
|
196
|
+
loader.keys(),
|
|
127
197
|
batch_size=8,
|
|
128
198
|
prefetch_batches=2,
|
|
129
|
-
direct_gpu=True # optional here since we passed it in __init__
|
|
130
199
|
)
|
|
131
|
-
|
|
132
200
|
for batch in stream:
|
|
133
201
|
for key, gpu_tensor in batch:
|
|
134
|
-
# gpu_tensor is already on the GPU
|
|
202
|
+
# gpu_tensor is already on the GPU
|
|
135
203
|
assert gpu_tensor.device.type == "cuda"
|
|
136
|
-
|
|
137
204
|
# ... process gpu_tensor ...
|
|
138
|
-
loader.mark_processed(key)
|
|
205
|
+
loader.mark_processed(key) # releases GPU buffer back to pool
|
|
139
206
|
```
|
|
140
207
|
|
|
208
|
+
### Zero-Copy MMAP Loading
|
|
209
|
+
|
|
210
|
+
`use_mmap=True` maps the file into virtual memory via the `uel` native extension. No data is copied into RAM — PyTorch holds a direct pointer into OS page cache.
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
from unifiedefficientloader import UnifiedSafetensorsLoader
|
|
214
|
+
|
|
215
|
+
with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, use_mmap=True) as loader:
|
|
216
|
+
state_dict = loader.load_all()
|
|
217
|
+
# all tensors are zero-copy views into mapped memory
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
Requires the `uel` native extension to be compiled. Falls back silently to standard IO if unavailable. See [docs/mmap.md](docs/mmap.md) and [docs/building.md](docs/building.md).
|
|
221
|
+
|
|
141
222
|
### Tensor/Dict Conversion
|
|
142
223
|
|
|
143
224
|
```python
|
|
@@ -1,33 +1,11 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: unifiedefficientloader
|
|
3
|
-
Version: 0.2.3
|
|
4
|
-
Summary: A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
|
|
5
|
-
Author: silveroxides
|
|
6
|
-
License: MIT
|
|
7
|
-
Classifier: Development Status :: 4 - Beta
|
|
8
|
-
Classifier: Intended Audience :: Developers
|
|
9
|
-
Classifier: Programming Language :: Python :: 3
|
|
10
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
-
Classifier: Operating System :: OS Independent
|
|
12
|
-
Requires-Python: >=3.9
|
|
13
|
-
Description-Content-Type: text/markdown
|
|
14
|
-
License-File: LICENSE
|
|
15
|
-
Provides-Extra: torch
|
|
16
|
-
Requires-Dist: torch; extra == "torch"
|
|
17
|
-
Provides-Extra: safetensors
|
|
18
|
-
Requires-Dist: safetensors; extra == "safetensors"
|
|
19
|
-
Provides-Extra: tqdm
|
|
20
|
-
Requires-Dist: tqdm; extra == "tqdm"
|
|
21
|
-
Provides-Extra: all
|
|
22
|
-
Requires-Dist: torch; extra == "all"
|
|
23
|
-
Requires-Dist: safetensors; extra == "all"
|
|
24
|
-
Requires-Dist: tqdm; extra == "all"
|
|
25
|
-
Dynamic: license-file
|
|
26
|
-
|
|
27
1
|
# unifiedefficientloader
|
|
28
2
|
|
|
29
3
|
A unified interface for loading safetensors, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
|
|
30
4
|
|
|
5
|
+
## Documentation
|
|
6
|
+
|
|
7
|
+
Full API reference and guides in [docs/](docs/index.md).
|
|
8
|
+
|
|
31
9
|
## Installation
|
|
32
10
|
|
|
33
11
|
You can install this package via pip. Since it heavily relies on `torch` and `safetensors` but doesn't strictly force them as hard dependencies for package building/installation, make sure you have them installed in your environment:
|
|
@@ -56,6 +34,28 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
|
|
|
56
34
|
loader.mark_processed(key) # Frees memory
|
|
57
35
|
```
|
|
58
36
|
|
|
37
|
+
### Incremental Safetensors Writer
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from unifiedefficientloader import UnifiedSafetensorsLoader, IncrementalSafetensorsWriter
|
|
42
|
+
|
|
43
|
+
# Initialize Writer
|
|
44
|
+
writer = IncrementalSafetensorsWriter(output_path, metadata=metadata)
|
|
45
|
+
writer.__enter__()
|
|
46
|
+
|
|
47
|
+
# Load model tensors and process them.
|
|
48
|
+
with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
|
|
49
|
+
for key in loader.keys():
|
|
50
|
+
tensor = loader.get_tensor(key)
|
|
51
|
+
# Process tensor...
|
|
52
|
+
writer.write(key, tensor)
|
|
53
|
+
del tensor
|
|
54
|
+
loader.mark_processed(key) # Frees memory
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
|
|
59
59
|
### Loading Specific Tensors Dynamically (Header Analysis)
|
|
60
60
|
|
|
61
61
|
You can analyze the file's header without loading the entire multi-gigabyte safetensors file into memory. This allows you to locate specific data (like embedded JSON dictionaries stored as `uint8` tensors) and load *only* those specific tensors directly from their file offsets.
|
|
@@ -70,13 +70,13 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
|
|
|
70
70
|
key for key, info in loader._header.items()
|
|
71
71
|
if isinstance(info, dict) and info.get("dtype") == "U8"
|
|
72
72
|
]
|
|
73
|
-
|
|
73
|
+
|
|
74
74
|
# 2. Load ONLY those specific tensors using their keys
|
|
75
75
|
for key in uint8_tensor_keys:
|
|
76
|
-
# get_tensor dynamically reads only the bytes for this tensor
|
|
76
|
+
# get_tensor dynamically reads only the bytes for this tensor
|
|
77
77
|
# based on the offsets found in the header
|
|
78
78
|
loaded_tensor = loader.get_tensor(key)
|
|
79
|
-
|
|
79
|
+
|
|
80
80
|
# 3. Decode the uint8 tensor back into a Python dictionary
|
|
81
81
|
extracted_dict = tensor_to_dict(loaded_tensor)
|
|
82
82
|
print(f"Decoded {key}:", extracted_dict)
|
|
@@ -91,26 +91,51 @@ from unifiedefficientloader import UnifiedSafetensorsLoader, transfer_to_gpu_pin
|
|
|
91
91
|
|
|
92
92
|
with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
|
|
93
93
|
keys_to_load = loader.keys()
|
|
94
|
-
|
|
94
|
+
|
|
95
95
|
# Create the continuous streaming generator
|
|
96
96
|
# prefetch_batches controls how many batches to buffer in memory
|
|
97
97
|
stream = loader.async_stream(
|
|
98
|
-
keys_to_load,
|
|
99
|
-
batch_size=8,
|
|
100
|
-
prefetch_batches=2,
|
|
98
|
+
keys_to_load,
|
|
99
|
+
batch_size=8,
|
|
100
|
+
prefetch_batches=2,
|
|
101
101
|
pin_memory=True
|
|
102
102
|
)
|
|
103
|
-
|
|
103
|
+
|
|
104
104
|
# Iterate directly over the generator
|
|
105
105
|
for batch in stream:
|
|
106
106
|
for key, pinned_tensor in batch:
|
|
107
107
|
# Transfer directly to GPU via DMA (pinning is already done)
|
|
108
108
|
gpu_tensor = transfer_to_gpu_pinned(pinned_tensor, device="cuda")
|
|
109
|
-
|
|
109
|
+
|
|
110
110
|
# ... process gpu_tensor ...
|
|
111
111
|
loader.mark_processed(key)
|
|
112
112
|
```
|
|
113
113
|
|
|
114
|
+
### Unified Data Loader
|
|
115
|
+
|
|
116
|
+
A high-performance, threaded alternative to PyTorch's standard `DataLoader`. It eliminates multiprocessing IPC overhead and features a zero-copy pipeline capable of streaming batches directly from pinned CPU memory to VRAM (`direct_gpu=True`).
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
from unifiedefficientloader import UnifiedDataLoader
|
|
120
|
+
from torchvision import datasets, transforms
|
|
121
|
+
|
|
122
|
+
dataset = datasets.FakeData(transform=transforms.ToTensor())
|
|
123
|
+
|
|
124
|
+
# Replaces torch.utils.data.DataLoader
|
|
125
|
+
# Pre-allocates pinned buffer pools and streams directly to GPU
|
|
126
|
+
loader = UnifiedDataLoader(
|
|
127
|
+
dataset,
|
|
128
|
+
batch_size=32,
|
|
129
|
+
shuffle=True,
|
|
130
|
+
num_workers=4,
|
|
131
|
+
direct_gpu=True
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
for batch_image, batch_label in loader:
|
|
135
|
+
# batch is already on the GPU (device="cuda")
|
|
136
|
+
pass
|
|
137
|
+
```
|
|
138
|
+
|
|
114
139
|
### Direct-to-GPU Streaming (Zero-Copy)
|
|
115
140
|
|
|
116
141
|
For the absolute fastest loading times on CUDA devices, use the `direct_gpu=True` flag. This creates a pipeline that pre-allocates pinned memory pools and GPU memory slabs. Tensors are loaded from disk directly into pinned buffers, and immediately asynchronously copied to the GPU using CUDA streams, hiding the PCIe transfer latency completely behind the disk I/O.
|
|
@@ -119,25 +144,33 @@ For the absolute fastest loading times on CUDA devices, use the `direct_gpu=True
|
|
|
119
144
|
from unifiedefficientloader import UnifiedSafetensorsLoader
|
|
120
145
|
|
|
121
146
|
with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
|
|
122
|
-
keys_to_load = loader.keys()
|
|
123
|
-
|
|
124
|
-
# async_stream will automatically coordinate disk -> pinned buffer -> GPU slab -> tensor header
|
|
125
147
|
stream = loader.async_stream(
|
|
126
|
-
|
|
148
|
+
loader.keys(),
|
|
127
149
|
batch_size=8,
|
|
128
150
|
prefetch_batches=2,
|
|
129
|
-
direct_gpu=True # optional here since we passed it in __init__
|
|
130
151
|
)
|
|
131
|
-
|
|
132
152
|
for batch in stream:
|
|
133
153
|
for key, gpu_tensor in batch:
|
|
134
|
-
# gpu_tensor is already on the GPU
|
|
154
|
+
# gpu_tensor is already on the GPU
|
|
135
155
|
assert gpu_tensor.device.type == "cuda"
|
|
136
|
-
|
|
137
156
|
# ... process gpu_tensor ...
|
|
138
|
-
loader.mark_processed(key)
|
|
157
|
+
loader.mark_processed(key) # releases GPU buffer back to pool
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### Zero-Copy MMAP Loading
|
|
161
|
+
|
|
162
|
+
`use_mmap=True` maps the file into virtual memory via the `uel` native extension. No data is copied into RAM — PyTorch holds a direct pointer into OS page cache.
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
from unifiedefficientloader import UnifiedSafetensorsLoader
|
|
166
|
+
|
|
167
|
+
with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, use_mmap=True) as loader:
|
|
168
|
+
state_dict = loader.load_all()
|
|
169
|
+
# all tensors are zero-copy views into mapped memory
|
|
139
170
|
```
|
|
140
171
|
|
|
172
|
+
Requires the `uel` native extension to be compiled. Falls back silently to standard IO if unavailable. See [docs/mmap.md](docs/mmap.md) and [docs/building.md](docs/building.md).
|
|
173
|
+
|
|
141
174
|
### Tensor/Dict Conversion
|
|
142
175
|
|
|
143
176
|
```python
|
|
@@ -1,25 +1,25 @@
|
|
|
1
1
|
[build-system]
|
|
2
|
-
requires = ["setuptools>=
|
|
2
|
+
requires = ["setuptools>=70.1.0", "wheel"]
|
|
3
3
|
build-backend = "setuptools.build_meta"
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "unifiedefficientloader"
|
|
7
|
-
version = "0.2.3"
|
|
7
|
+
version = "0.4.4"
|
|
8
8
|
description = "A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
authors = [
|
|
11
11
|
{ name="silveroxides" }
|
|
12
12
|
]
|
|
13
|
-
license = { text = "MIT" }
|
|
13
|
+
license = { file = "LICENSE" }
|
|
14
14
|
classifiers = [
|
|
15
15
|
"Development Status :: 4 - Beta",
|
|
16
16
|
"Intended Audience :: Developers",
|
|
17
17
|
"Programming Language :: Python :: 3",
|
|
18
|
-
"License :: OSI Approved :: MIT License",
|
|
19
|
-
"Operating System :: OS Independent",
|
|
18
|
+
"Operating System :: Microsoft :: Windows",
|
|
19
|
+
"Operating System :: POSIX :: Linux",
|
|
20
20
|
]
|
|
21
21
|
requires-python = ">=3.9"
|
|
22
|
-
dependencies = []
|
|
22
|
+
dependencies = ["comfy-aimdo==0.3.0"]
|
|
23
23
|
|
|
24
24
|
[project.optional-dependencies]
|
|
25
25
|
torch = ["torch"]
|
|
@@ -39,4 +39,4 @@ filterwarnings = [
|
|
|
39
39
|
[tool.setuptools.packages.find]
|
|
40
40
|
where = ["."]
|
|
41
41
|
include = ["unifiedefficientloader*"]
|
|
42
|
-
exclude = ["reference"]
|
|
42
|
+
exclude = ["reference"]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from setuptools import setup
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _read_pyproject_version():
|
|
7
|
+
"""Read version from pyproject.toml without importing any build tools."""
|
|
8
|
+
pyproject = Path(__file__).parent / "pyproject.toml"
|
|
9
|
+
text = pyproject.read_text(encoding="utf-8")
|
|
10
|
+
m = re.search(r'^version\s*=\s*"([^"]+)"', text, re.MULTILINE)
|
|
11
|
+
if not m:
|
|
12
|
+
raise RuntimeError("Could not find version in pyproject.toml")
|
|
13
|
+
return m.group(1)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _read_readme():
|
|
17
|
+
readme = Path(__file__).parent / "README.md"
|
|
18
|
+
if readme.exists():
|
|
19
|
+
return readme.read_text(encoding="utf-8")
|
|
20
|
+
return ""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
setup(
|
|
24
|
+
name="unifiedefficientloader",
|
|
25
|
+
version=_read_pyproject_version(),
|
|
26
|
+
long_description=_read_readme(),
|
|
27
|
+
long_description_content_type="text/markdown",
|
|
28
|
+
packages=["unifiedefficientloader", "unifiedefficientloader.uel"],
|
|
29
|
+
)
|
|
@@ -5,91 +5,99 @@ import pytest
|
|
|
5
5
|
try:
|
|
6
6
|
import torch
|
|
7
7
|
from safetensors.torch import save_file
|
|
8
|
+
|
|
8
9
|
HAS_TORCH = True
|
|
9
10
|
except ImportError:
|
|
10
11
|
HAS_TORCH = False
|
|
11
12
|
|
|
12
13
|
from unifiedefficientloader import MemoryEfficientSafeOpen
|
|
13
14
|
|
|
15
|
+
|
|
14
16
|
@pytest.fixture
|
|
15
17
|
def sample_safetensors():
|
|
16
18
|
if not HAS_TORCH:
|
|
17
19
|
pytest.skip("Requires torch and safetensors")
|
|
18
|
-
|
|
20
|
+
|
|
19
21
|
with tempfile.NamedTemporaryFile(suffix=".safetensors", delete=False) as f:
|
|
20
22
|
path = f.name
|
|
21
|
-
|
|
23
|
+
|
|
22
24
|
tensors = {
|
|
23
25
|
"weight1": torch.randn(10, 10),
|
|
24
26
|
"weight2": torch.randn(20, 20),
|
|
25
27
|
"bias": torch.zeros(10),
|
|
26
28
|
}
|
|
27
29
|
save_file(tensors, path)
|
|
28
|
-
|
|
30
|
+
|
|
29
31
|
yield path, tensors
|
|
30
|
-
|
|
32
|
+
|
|
31
33
|
if os.path.exists(path):
|
|
32
34
|
os.remove(path)
|
|
33
35
|
|
|
34
|
-
|
|
36
|
+
|
|
37
|
+
@pytest.mark.skipif(
|
|
38
|
+
not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA"
|
|
39
|
+
)
|
|
35
40
|
def test_direct_gpu_streaming(sample_safetensors):
|
|
36
41
|
path, original_tensors = sample_safetensors
|
|
37
|
-
|
|
42
|
+
|
|
38
43
|
loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
|
|
39
|
-
|
|
44
|
+
|
|
40
45
|
# Test load_all which uses async_stream under the hood
|
|
41
46
|
loaded_tensors = loader.load_all()
|
|
42
|
-
|
|
47
|
+
|
|
43
48
|
for key, orig_tensor in original_tensors.items():
|
|
44
49
|
assert key in loaded_tensors
|
|
45
50
|
loaded_tensor = loaded_tensors[key]
|
|
46
|
-
|
|
51
|
+
|
|
47
52
|
# Verify it's on GPU
|
|
48
53
|
assert loaded_tensor.device.type == "cuda"
|
|
49
|
-
|
|
54
|
+
|
|
50
55
|
# Verify data matches
|
|
51
56
|
torch.testing.assert_close(loaded_tensor.cpu(), orig_tensor)
|
|
52
|
-
|
|
57
|
+
|
|
53
58
|
loader.close()
|
|
54
59
|
|
|
55
|
-
|
|
60
|
+
|
|
61
|
+
@pytest.mark.skipif(
|
|
62
|
+
not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA"
|
|
63
|
+
)
|
|
56
64
|
def test_direct_gpu_async_stream(sample_safetensors):
|
|
57
65
|
path, original_tensors = sample_safetensors
|
|
58
|
-
|
|
66
|
+
|
|
59
67
|
loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
|
|
60
|
-
|
|
68
|
+
|
|
61
69
|
stream = loader.async_stream(
|
|
62
70
|
keys=list(original_tensors.keys()),
|
|
63
71
|
batch_size=2,
|
|
64
72
|
prefetch_batches=1,
|
|
65
|
-
direct_gpu=True
|
|
66
73
|
)
|
|
67
|
-
|
|
74
|
+
|
|
68
75
|
loaded_count = 0
|
|
69
76
|
for batch in stream:
|
|
70
77
|
for key, tensor in batch:
|
|
71
78
|
assert tensor.device.type == "cuda"
|
|
72
79
|
torch.testing.assert_close(tensor.cpu(), original_tensors[key])
|
|
73
80
|
loaded_count += 1
|
|
74
|
-
|
|
81
|
+
|
|
75
82
|
assert loaded_count == len(original_tensors)
|
|
76
83
|
loader.close()
|
|
77
84
|
|
|
85
|
+
|
|
78
86
|
@pytest.mark.skipif(not HAS_TORCH, reason="Requires torch")
|
|
79
87
|
def test_direct_gpu_fallback_no_cuda(sample_safetensors, monkeypatch):
|
|
80
88
|
# Force cuda to be unavailable
|
|
81
89
|
monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
|
|
82
|
-
|
|
90
|
+
|
|
83
91
|
path, original_tensors = sample_safetensors
|
|
84
|
-
|
|
92
|
+
|
|
85
93
|
# Should fallback to CPU silently
|
|
86
94
|
loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
|
|
87
|
-
|
|
95
|
+
|
|
88
96
|
loaded_tensors = loader.load_all()
|
|
89
|
-
|
|
97
|
+
|
|
90
98
|
for key, orig_tensor in original_tensors.items():
|
|
91
99
|
loaded_tensor = loaded_tensors[key]
|
|
92
100
|
assert loaded_tensor.device.type == "cpu"
|
|
93
101
|
torch.testing.assert_close(loaded_tensor, orig_tensor)
|
|
94
|
-
|
|
102
|
+
|
|
95
103
|
loader.close()
|