unifiedefficientloader 0.2.3__tar.gz → 0.4.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/LICENSE +1 -1
  2. {unifiedefficientloader-0.2.3/unifiedefficientloader.egg-info → unifiedefficientloader-0.4.4}/PKG-INFO +103 -22
  3. unifiedefficientloader-0.2.3/PKG-INFO → unifiedefficientloader-0.4.4/README.md +77 -44
  4. {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/pyproject.toml +7 -7
  5. unifiedefficientloader-0.4.4/setup.py +29 -0
  6. {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/tests/test_direct_gpu.py +30 -22
  7. unifiedefficientloader-0.4.4/tests/test_incremental_writer.py +249 -0
  8. unifiedefficientloader-0.4.4/tests/test_mmap.py +205 -0
  9. unifiedefficientloader-0.4.4/tests/test_unified_data_loader.py +57 -0
  10. {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/tests/test_utils.py +24 -24
  11. {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader/__init__.py +7 -1
  12. unifiedefficientloader-0.4.4/unifiedefficientloader/incremental_writer.py +296 -0
  13. {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader/memory_efficient_loader.py +188 -97
  14. {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader/pinned_buffer_pool.py +21 -3
  15. unifiedefficientloader-0.4.4/unifiedefficientloader/tensor_utils.py +126 -0
  16. unifiedefficientloader-0.4.4/unifiedefficientloader/uel/__init__.py +48 -0
  17. unifiedefficientloader-0.4.4/unifiedefficientloader/uel/control.py +57 -0
  18. unifiedefficientloader-0.4.4/unifiedefficientloader/uel/host_buffer.py +4 -0
  19. unifiedefficientloader-0.4.4/unifiedefficientloader/uel/model_mmap.py +62 -0
  20. unifiedefficientloader-0.4.4/unifiedefficientloader/uel/model_vbar.py +4 -0
  21. unifiedefficientloader-0.4.4/unifiedefficientloader/uel/torch.py +12 -0
  22. unifiedefficientloader-0.4.4/unifiedefficientloader/unified_data_loader.py +295 -0
  23. unifiedefficientloader-0.2.3/README.md → unifiedefficientloader-0.4.4/unifiedefficientloader.egg-info/PKG-INFO +126 -19
  24. {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader.egg-info/SOURCES.txt +12 -1
  25. {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader.egg-info/requires.txt +1 -0
  26. unifiedefficientloader-0.2.3/setup.py +0 -6
  27. unifiedefficientloader-0.2.3/unifiedefficientloader/tensor_utils.py +0 -57
  28. {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/setup.cfg +0 -0
  29. {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/tests/test_logging.py +0 -0
  30. {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader/gpu_buffer_pool.py +0 -0
  31. {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader/logging_utils.py +0 -0
  32. {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader/pinned_transfer.py +0 -0
  33. {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader.egg-info/dependency_links.txt +0 -0
  34. {unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/unifiedefficientloader.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2026 Silver
3
+ Copyright (c) 2026 silveroxides
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -1,17 +1,39 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unifiedefficientloader
3
- Version: 0.2.3
3
+ Version: 0.4.4
4
4
  Summary: A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
5
5
  Author: silveroxides
6
- License: MIT
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 silveroxides
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
7
28
  Classifier: Development Status :: 4 - Beta
8
29
  Classifier: Intended Audience :: Developers
9
30
  Classifier: Programming Language :: Python :: 3
10
- Classifier: License :: OSI Approved :: MIT License
11
- Classifier: Operating System :: OS Independent
31
+ Classifier: Operating System :: Microsoft :: Windows
32
+ Classifier: Operating System :: POSIX :: Linux
12
33
  Requires-Python: >=3.9
13
34
  Description-Content-Type: text/markdown
14
35
  License-File: LICENSE
36
+ Requires-Dist: comfy-aimdo==0.3.0
15
37
  Provides-Extra: torch
16
38
  Requires-Dist: torch; extra == "torch"
17
39
  Provides-Extra: safetensors
@@ -28,6 +50,10 @@ Dynamic: license-file
28
50
 
29
51
  A unified interface for loading safetensors, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
30
52
 
53
+ ## Documentation
54
+
55
+ Full API reference and guides in [docs/](docs/index.md).
56
+
31
57
  ## Installation
32
58
 
33
59
  You can install this package via pip. Since it heavily relies on `torch` and `safetensors` but doesn't strictly force them as hard dependencies for package building/installation, make sure you have them installed in your environment:
@@ -56,6 +82,28 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
56
82
  loader.mark_processed(key) # Frees memory
57
83
  ```
58
84
 
85
+ ### Incremental Safetensors Writer
86
+
87
+
88
+ ```python
89
+ from unifiedefficientloader import UnifiedSafetensorsLoader, IncrementalSafetensorsWriter
90
+
91
+ # Initialize Writer
92
+ writer = IncrementalSafetensorsWriter(output_path, metadata=metadata)
93
+ writer.__enter__()
94
+
95
+ # Load model tensors and process them.
96
+ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
97
+ for key in loader.keys():
98
+ tensor = loader.get_tensor(key)
99
+ # Process tensor...
100
+ writer.write(key, tensor)
101
+ del tensor
102
+ loader.mark_processed(key) # Frees memory
103
+
104
+
105
+ ```
106
+
59
107
  ### Loading Specific Tensors Dynamically (Header Analysis)
60
108
 
61
109
  You can analyze the file's header without loading the entire multi-gigabyte safetensors file into memory. This allows you to locate specific data (like embedded JSON dictionaries stored as `uint8` tensors) and load *only* those specific tensors directly from their file offsets.
@@ -70,13 +118,13 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
70
118
  key for key, info in loader._header.items()
71
119
  if isinstance(info, dict) and info.get("dtype") == "U8"
72
120
  ]
73
-
121
+
74
122
  # 2. Load ONLY those specific tensors using their keys
75
123
  for key in uint8_tensor_keys:
76
- # get_tensor dynamically reads only the bytes for this tensor
124
+ # get_tensor dynamically reads only the bytes for this tensor
77
125
  # based on the offsets found in the header
78
126
  loaded_tensor = loader.get_tensor(key)
79
-
127
+
80
128
  # 3. Decode the uint8 tensor back into a Python dictionary
81
129
  extracted_dict = tensor_to_dict(loaded_tensor)
82
130
  print(f"Decoded {key}:", extracted_dict)
@@ -91,26 +139,51 @@ from unifiedefficientloader import UnifiedSafetensorsLoader, transfer_to_gpu_pin
91
139
 
92
140
  with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
93
141
  keys_to_load = loader.keys()
94
-
142
+
95
143
  # Create the continuous streaming generator
96
144
  # prefetch_batches controls how many batches to buffer in memory
97
145
  stream = loader.async_stream(
98
- keys_to_load,
99
- batch_size=8,
100
- prefetch_batches=2,
146
+ keys_to_load,
147
+ batch_size=8,
148
+ prefetch_batches=2,
101
149
  pin_memory=True
102
150
  )
103
-
151
+
104
152
  # Iterate directly over the generator
105
153
  for batch in stream:
106
154
  for key, pinned_tensor in batch:
107
155
  # Transfer directly to GPU via DMA (pinning is already done)
108
156
  gpu_tensor = transfer_to_gpu_pinned(pinned_tensor, device="cuda")
109
-
157
+
110
158
  # ... process gpu_tensor ...
111
159
  loader.mark_processed(key)
112
160
  ```
113
161
 
162
+ ### Unified Data Loader
163
+
164
+ A high-performance, threaded alternative to PyTorch's standard `DataLoader`. It eliminates multiprocessing IPC overhead and features a zero-copy pipeline capable of streaming batches directly from pinned CPU memory to VRAM (`direct_gpu=True`).
165
+
166
+ ```python
167
+ from unifiedefficientloader import UnifiedDataLoader
168
+ from torchvision import datasets, transforms
169
+
170
+ dataset = datasets.FakeData(transform=transforms.ToTensor())
171
+
172
+ # Replaces torch.utils.data.DataLoader
173
+ # Pre-allocates pinned buffer pools and streams directly to GPU
174
+ loader = UnifiedDataLoader(
175
+ dataset,
176
+ batch_size=32,
177
+ shuffle=True,
178
+ num_workers=4,
179
+ direct_gpu=True
180
+ )
181
+
182
+ for batch_image, batch_label in loader:
183
+ # batch is already on the GPU (device="cuda")
184
+ pass
185
+ ```
186
+
114
187
  ### Direct-to-GPU Streaming (Zero-Copy)
115
188
 
116
189
  For the absolute fastest loading times on CUDA devices, use the `direct_gpu=True` flag. This creates a pipeline that pre-allocates pinned memory pools and GPU memory slabs. Tensors are loaded from disk directly into pinned buffers, and immediately asynchronously copied to the GPU using CUDA streams, hiding the PCIe transfer latency completely behind the disk I/O.
@@ -119,25 +192,33 @@ For the absolute fastest loading times on CUDA devices, use the `direct_gpu=True
119
192
  from unifiedefficientloader import UnifiedSafetensorsLoader
120
193
 
121
194
  with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
122
- keys_to_load = loader.keys()
123
-
124
- # async_stream will automatically coordinate disk -> pinned buffer -> GPU slab -> tensor header
125
195
  stream = loader.async_stream(
126
- keys_to_load,
196
+ loader.keys(),
127
197
  batch_size=8,
128
198
  prefetch_batches=2,
129
- direct_gpu=True # optional here since we passed it in __init__
130
199
  )
131
-
132
200
  for batch in stream:
133
201
  for key, gpu_tensor in batch:
134
- # gpu_tensor is already on the GPU!
202
+ # gpu_tensor is already on the GPU
135
203
  assert gpu_tensor.device.type == "cuda"
136
-
137
204
  # ... process gpu_tensor ...
138
- loader.mark_processed(key)
205
+ loader.mark_processed(key) # releases GPU buffer back to pool
139
206
  ```
140
207
 
208
+ ### Zero-Copy MMAP Loading
209
+
210
+ `use_mmap=True` maps the file into virtual memory via the `uel` native extension. No data is copied into RAM — PyTorch holds a direct pointer into OS page cache.
211
+
212
+ ```python
213
+ from unifiedefficientloader import UnifiedSafetensorsLoader
214
+
215
+ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, use_mmap=True) as loader:
216
+ state_dict = loader.load_all()
217
+ # all tensors are zero-copy views into mapped memory
218
+ ```
219
+
220
+ Requires the `uel` native extension to be compiled. Falls back silently to standard IO if unavailable. See [docs/mmap.md](docs/mmap.md) and [docs/building.md](docs/building.md).
221
+
141
222
  ### Tensor/Dict Conversion
142
223
 
143
224
  ```python
@@ -1,33 +1,11 @@
1
- Metadata-Version: 2.4
2
- Name: unifiedefficientloader
3
- Version: 0.2.3
4
- Summary: A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
5
- Author: silveroxides
6
- License: MIT
7
- Classifier: Development Status :: 4 - Beta
8
- Classifier: Intended Audience :: Developers
9
- Classifier: Programming Language :: Python :: 3
10
- Classifier: License :: OSI Approved :: MIT License
11
- Classifier: Operating System :: OS Independent
12
- Requires-Python: >=3.9
13
- Description-Content-Type: text/markdown
14
- License-File: LICENSE
15
- Provides-Extra: torch
16
- Requires-Dist: torch; extra == "torch"
17
- Provides-Extra: safetensors
18
- Requires-Dist: safetensors; extra == "safetensors"
19
- Provides-Extra: tqdm
20
- Requires-Dist: tqdm; extra == "tqdm"
21
- Provides-Extra: all
22
- Requires-Dist: torch; extra == "all"
23
- Requires-Dist: safetensors; extra == "all"
24
- Requires-Dist: tqdm; extra == "all"
25
- Dynamic: license-file
26
-
27
1
  # unifiedefficientloader
28
2
 
29
3
  A unified interface for loading safetensors, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
30
4
 
5
+ ## Documentation
6
+
7
+ Full API reference and guides in [docs/](docs/index.md).
8
+
31
9
  ## Installation
32
10
 
33
11
  You can install this package via pip. Since it heavily relies on `torch` and `safetensors` but doesn't strictly force them as hard dependencies for package building/installation, make sure you have them installed in your environment:
@@ -56,6 +34,28 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
56
34
  loader.mark_processed(key) # Frees memory
57
35
  ```
58
36
 
37
+ ### Incremental Safetensors Writer
38
+
39
+
40
+ ```python
41
+ from unifiedefficientloader import UnifiedSafetensorsLoader, IncrementalSafetensorsWriter
42
+
43
+ # Initialize Writer
44
+ writer = IncrementalSafetensorsWriter(output_path, metadata=metadata)
45
+ writer.__enter__()
46
+
47
+ # Load model tensors and process them.
48
+ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
49
+ for key in loader.keys():
50
+ tensor = loader.get_tensor(key)
51
+ # Process tensor...
52
+ writer.write(key, tensor)
53
+ del tensor
54
+ loader.mark_processed(key) # Frees memory
55
+
56
+
57
+ ```
58
+
59
59
  ### Loading Specific Tensors Dynamically (Header Analysis)
60
60
 
61
61
  You can analyze the file's header without loading the entire multi-gigabyte safetensors file into memory. This allows you to locate specific data (like embedded JSON dictionaries stored as `uint8` tensors) and load *only* those specific tensors directly from their file offsets.
@@ -70,13 +70,13 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
70
70
  key for key, info in loader._header.items()
71
71
  if isinstance(info, dict) and info.get("dtype") == "U8"
72
72
  ]
73
-
73
+
74
74
  # 2. Load ONLY those specific tensors using their keys
75
75
  for key in uint8_tensor_keys:
76
- # get_tensor dynamically reads only the bytes for this tensor
76
+ # get_tensor dynamically reads only the bytes for this tensor
77
77
  # based on the offsets found in the header
78
78
  loaded_tensor = loader.get_tensor(key)
79
-
79
+
80
80
  # 3. Decode the uint8 tensor back into a Python dictionary
81
81
  extracted_dict = tensor_to_dict(loaded_tensor)
82
82
  print(f"Decoded {key}:", extracted_dict)
@@ -91,26 +91,51 @@ from unifiedefficientloader import UnifiedSafetensorsLoader, transfer_to_gpu_pin
91
91
 
92
92
  with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
93
93
  keys_to_load = loader.keys()
94
-
94
+
95
95
  # Create the continuous streaming generator
96
96
  # prefetch_batches controls how many batches to buffer in memory
97
97
  stream = loader.async_stream(
98
- keys_to_load,
99
- batch_size=8,
100
- prefetch_batches=2,
98
+ keys_to_load,
99
+ batch_size=8,
100
+ prefetch_batches=2,
101
101
  pin_memory=True
102
102
  )
103
-
103
+
104
104
  # Iterate directly over the generator
105
105
  for batch in stream:
106
106
  for key, pinned_tensor in batch:
107
107
  # Transfer directly to GPU via DMA (pinning is already done)
108
108
  gpu_tensor = transfer_to_gpu_pinned(pinned_tensor, device="cuda")
109
-
109
+
110
110
  # ... process gpu_tensor ...
111
111
  loader.mark_processed(key)
112
112
  ```
113
113
 
114
+ ### Unified Data Loader
115
+
116
+ A high-performance, threaded alternative to PyTorch's standard `DataLoader`. It eliminates multiprocessing IPC overhead and features a zero-copy pipeline capable of streaming batches directly from pinned CPU memory to VRAM (`direct_gpu=True`).
117
+
118
+ ```python
119
+ from unifiedefficientloader import UnifiedDataLoader
120
+ from torchvision import datasets, transforms
121
+
122
+ dataset = datasets.FakeData(transform=transforms.ToTensor())
123
+
124
+ # Replaces torch.utils.data.DataLoader
125
+ # Pre-allocates pinned buffer pools and streams directly to GPU
126
+ loader = UnifiedDataLoader(
127
+ dataset,
128
+ batch_size=32,
129
+ shuffle=True,
130
+ num_workers=4,
131
+ direct_gpu=True
132
+ )
133
+
134
+ for batch_image, batch_label in loader:
135
+ # batch is already on the GPU (device="cuda")
136
+ pass
137
+ ```
138
+
114
139
  ### Direct-to-GPU Streaming (Zero-Copy)
115
140
 
116
141
  For the absolute fastest loading times on CUDA devices, use the `direct_gpu=True` flag. This creates a pipeline that pre-allocates pinned memory pools and GPU memory slabs. Tensors are loaded from disk directly into pinned buffers, and immediately asynchronously copied to the GPU using CUDA streams, hiding the PCIe transfer latency completely behind the disk I/O.
@@ -119,25 +144,33 @@ For the absolute fastest loading times on CUDA devices, use the `direct_gpu=True
119
144
  from unifiedefficientloader import UnifiedSafetensorsLoader
120
145
 
121
146
  with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
122
- keys_to_load = loader.keys()
123
-
124
- # async_stream will automatically coordinate disk -> pinned buffer -> GPU slab -> tensor header
125
147
  stream = loader.async_stream(
126
- keys_to_load,
148
+ loader.keys(),
127
149
  batch_size=8,
128
150
  prefetch_batches=2,
129
- direct_gpu=True # optional here since we passed it in __init__
130
151
  )
131
-
132
152
  for batch in stream:
133
153
  for key, gpu_tensor in batch:
134
- # gpu_tensor is already on the GPU!
154
+ # gpu_tensor is already on the GPU
135
155
  assert gpu_tensor.device.type == "cuda"
136
-
137
156
  # ... process gpu_tensor ...
138
- loader.mark_processed(key)
157
+ loader.mark_processed(key) # releases GPU buffer back to pool
158
+ ```
159
+
160
+ ### Zero-Copy MMAP Loading
161
+
162
+ `use_mmap=True` maps the file into virtual memory via the `uel` native extension. No data is copied into RAM — PyTorch holds a direct pointer into OS page cache.
163
+
164
+ ```python
165
+ from unifiedefficientloader import UnifiedSafetensorsLoader
166
+
167
+ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, use_mmap=True) as loader:
168
+ state_dict = loader.load_all()
169
+ # all tensors are zero-copy views into mapped memory
139
170
  ```
140
171
 
172
+ Requires the `uel` native extension to be compiled. Falls back silently to standard IO if unavailable. See [docs/mmap.md](docs/mmap.md) and [docs/building.md](docs/building.md).
173
+
141
174
  ### Tensor/Dict Conversion
142
175
 
143
176
  ```python
@@ -1,25 +1,25 @@
1
1
  [build-system]
2
- requires = ["setuptools>=61.0.0", "wheel"]
2
+ requires = ["setuptools>=70.1.0", "wheel"]
3
3
  build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "unifiedefficientloader"
7
- version = "0.2.3"
7
+ version = "0.4.4"
8
8
  description = "A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts."
9
9
  readme = "README.md"
10
10
  authors = [
11
11
  { name="silveroxides" }
12
12
  ]
13
- license = { text="MIT" }
13
+ license = { file = "LICENSE" }
14
14
  classifiers = [
15
15
  "Development Status :: 4 - Beta",
16
16
  "Intended Audience :: Developers",
17
17
  "Programming Language :: Python :: 3",
18
- "License :: OSI Approved :: MIT License",
19
- "Operating System :: OS Independent",
18
+ "Operating System :: Microsoft :: Windows",
19
+ "Operating System :: POSIX :: Linux",
20
20
  ]
21
21
  requires-python = ">=3.9"
22
- dependencies = []
22
+ dependencies = ["comfy-aimdo==0.3.0"]
23
23
 
24
24
  [project.optional-dependencies]
25
25
  torch = ["torch"]
@@ -39,4 +39,4 @@ filterwarnings = [
39
39
  [tool.setuptools.packages.find]
40
40
  where = ["."]
41
41
  include = ["unifiedefficientloader*"]
42
- exclude = ["reference"]
42
+ exclude = ["reference"]
@@ -0,0 +1,29 @@
1
+ import re
2
+ from pathlib import Path
3
+ from setuptools import setup
4
+
5
+
6
+ def _read_pyproject_version():
7
+ """Read version from pyproject.toml without importing any build tools."""
8
+ pyproject = Path(__file__).parent / "pyproject.toml"
9
+ text = pyproject.read_text(encoding="utf-8")
10
+ m = re.search(r'^version\s*=\s*"([^"]+)"', text, re.MULTILINE)
11
+ if not m:
12
+ raise RuntimeError("Could not find version in pyproject.toml")
13
+ return m.group(1)
14
+
15
+
16
+ def _read_readme():
17
+ readme = Path(__file__).parent / "README.md"
18
+ if readme.exists():
19
+ return readme.read_text(encoding="utf-8")
20
+ return ""
21
+
22
+
23
+ setup(
24
+ name="unifiedefficientloader",
25
+ version=_read_pyproject_version(),
26
+ long_description=_read_readme(),
27
+ long_description_content_type="text/markdown",
28
+ packages=["unifiedefficientloader", "unifiedefficientloader.uel"],
29
+ )
@@ -5,91 +5,99 @@ import pytest
5
5
  try:
6
6
  import torch
7
7
  from safetensors.torch import save_file
8
+
8
9
  HAS_TORCH = True
9
10
  except ImportError:
10
11
  HAS_TORCH = False
11
12
 
12
13
  from unifiedefficientloader import MemoryEfficientSafeOpen
13
14
 
15
+
14
16
  @pytest.fixture
15
17
  def sample_safetensors():
16
18
  if not HAS_TORCH:
17
19
  pytest.skip("Requires torch and safetensors")
18
-
20
+
19
21
  with tempfile.NamedTemporaryFile(suffix=".safetensors", delete=False) as f:
20
22
  path = f.name
21
-
23
+
22
24
  tensors = {
23
25
  "weight1": torch.randn(10, 10),
24
26
  "weight2": torch.randn(20, 20),
25
27
  "bias": torch.zeros(10),
26
28
  }
27
29
  save_file(tensors, path)
28
-
30
+
29
31
  yield path, tensors
30
-
32
+
31
33
  if os.path.exists(path):
32
34
  os.remove(path)
33
35
 
34
- @pytest.mark.skipif(not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA")
36
+
37
+ @pytest.mark.skipif(
38
+ not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA"
39
+ )
35
40
  def test_direct_gpu_streaming(sample_safetensors):
36
41
  path, original_tensors = sample_safetensors
37
-
42
+
38
43
  loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
39
-
44
+
40
45
  # Test load_all which uses async_stream under the hood
41
46
  loaded_tensors = loader.load_all()
42
-
47
+
43
48
  for key, orig_tensor in original_tensors.items():
44
49
  assert key in loaded_tensors
45
50
  loaded_tensor = loaded_tensors[key]
46
-
51
+
47
52
  # Verify it's on GPU
48
53
  assert loaded_tensor.device.type == "cuda"
49
-
54
+
50
55
  # Verify data matches
51
56
  torch.testing.assert_close(loaded_tensor.cpu(), orig_tensor)
52
-
57
+
53
58
  loader.close()
54
59
 
55
- @pytest.mark.skipif(not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA")
60
+
61
+ @pytest.mark.skipif(
62
+ not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA"
63
+ )
56
64
  def test_direct_gpu_async_stream(sample_safetensors):
57
65
  path, original_tensors = sample_safetensors
58
-
66
+
59
67
  loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
60
-
68
+
61
69
  stream = loader.async_stream(
62
70
  keys=list(original_tensors.keys()),
63
71
  batch_size=2,
64
72
  prefetch_batches=1,
65
- direct_gpu=True
66
73
  )
67
-
74
+
68
75
  loaded_count = 0
69
76
  for batch in stream:
70
77
  for key, tensor in batch:
71
78
  assert tensor.device.type == "cuda"
72
79
  torch.testing.assert_close(tensor.cpu(), original_tensors[key])
73
80
  loaded_count += 1
74
-
81
+
75
82
  assert loaded_count == len(original_tensors)
76
83
  loader.close()
77
84
 
85
+
78
86
  @pytest.mark.skipif(not HAS_TORCH, reason="Requires torch")
79
87
  def test_direct_gpu_fallback_no_cuda(sample_safetensors, monkeypatch):
80
88
  # Force cuda to be unavailable
81
89
  monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
82
-
90
+
83
91
  path, original_tensors = sample_safetensors
84
-
92
+
85
93
  # Should fallback to CPU silently
86
94
  loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
87
-
95
+
88
96
  loaded_tensors = loader.load_all()
89
-
97
+
90
98
  for key, orig_tensor in original_tensors.items():
91
99
  loaded_tensor = loaded_tensors[key]
92
100
  assert loaded_tensor.device.type == "cpu"
93
101
  torch.testing.assert_close(loaded_tensor, orig_tensor)
94
-
102
+
95
103
  loader.close()