PyPI - unifiedefficientloader - Versions diffs - 0.2.3__tar.gz → 0.4.4__tar.gz - Mend

unifiedefficientloader 0.2.3__tar.gz → 0.4.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

{unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/LICENSE RENAMED Viewed

@@ -1,6 +1,6 @@
 MIT License
-Copyright (c) 2026 Silver
+Copyright (c) 2026 silveroxides
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

{unifiedefficientloader-0.2.3/unifiedefficientloader.egg-info → unifiedefficientloader-0.4.4}/PKG-INFO RENAMED Viewed

@@ -1,17 +1,39 @@
 Metadata-Version: 2.4
 Name: unifiedefficientloader
-Version: 0.2.3
+Version: 0.4.4
 Summary: A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
 Author: silveroxides
-License: MIT
+License: MIT License
+        Copyright (c) 2026 silveroxides
+        Permission is hereby granted, free of charge, to any person obtaining a copy
+        of this software and associated documentation files (the "Software"), to deal
+        in the Software without restriction, including without limitation the rights
+        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+        copies of the Software, and to permit persons to whom the Software is
+        furnished to do so, subject to the following conditions:
+        The above copyright notice and this permission notice shall be included in all
+        copies or substantial portions of the Software.
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: Programming Language :: Python :: 3
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Operating System :: OS Independent
+Classifier: Operating System :: Microsoft :: Windows
+Classifier: Operating System :: POSIX :: Linux
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: comfy-aimdo==0.3.0
 Provides-Extra: torch
 Requires-Dist: torch; extra == "torch"
 Provides-Extra: safetensors
@@ -28,6 +50,10 @@ Dynamic: license-file
 A unified interface for loading safetensors, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
+## Documentation
+Full API reference and guides in [docs/](docs/index.md).
 ## Installation
 You can install this package via pip. Since it heavily relies on `torch` and `safetensors` but doesn't strictly force them as hard dependencies for package building/installation, make sure you have them installed in your environment:
@@ -56,6 +82,28 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
         loader.mark_processed(key) # Frees memory
 ```
+### Incremental Safetensors Writer
+```python
+from unifiedefficientloader import UnifiedSafetensorsLoader, IncrementalSafetensorsWriter
+# Initialize Writer
+writer = IncrementalSafetensorsWriter(output_path, metadata=metadata)
+writer.__enter__()
+# Load model tensors and process them.
+with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
+    for key in loader.keys():
+        tensor = loader.get_tensor(key)
+        # Process tensor...
+        writer.write(key, tensor)
+        del tensor
+        loader.mark_processed(key) # Frees memory
+```
 ### Loading Specific Tensors Dynamically (Header Analysis)
 You can analyze the file's header without loading the entire multi-gigabyte safetensors file into memory. This allows you to locate specific data (like embedded JSON dictionaries stored as `uint8` tensors) and load *only* those specific tensors directly from their file offsets.
@@ -70,13 +118,13 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
         key for key, info in loader._header.items()
         if isinstance(info, dict) and info.get("dtype") == "U8"
     ]
     # 2. Load ONLY those specific tensors using their keys
     for key in uint8_tensor_keys:
-        # get_tensor dynamically reads only the bytes for this tensor
+        # get_tensor dynamically reads only the bytes for this tensor
         # based on the offsets found in the header
         loaded_tensor = loader.get_tensor(key)
         # 3. Decode the uint8 tensor back into a Python dictionary
         extracted_dict = tensor_to_dict(loaded_tensor)
         print(f"Decoded {key}:", extracted_dict)
@@ -91,26 +139,51 @@ from unifiedefficientloader import UnifiedSafetensorsLoader, transfer_to_gpu_pin
 with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
     keys_to_load = loader.keys()
     # Create the continuous streaming generator
     # prefetch_batches controls how many batches to buffer in memory
     stream = loader.async_stream(
-        keys_to_load,
-        batch_size=8,
-        prefetch_batches=2,
+        keys_to_load,
+        batch_size=8,
+        prefetch_batches=2,
         pin_memory=True
     )
     # Iterate directly over the generator
     for batch in stream:
         for key, pinned_tensor in batch:
             # Transfer directly to GPU via DMA (pinning is already done)
             gpu_tensor = transfer_to_gpu_pinned(pinned_tensor, device="cuda")
             # ... process gpu_tensor ...
             loader.mark_processed(key)
 ```
+### Unified Data Loader
+A high-performance, threaded alternative to PyTorch's standard `DataLoader`. It eliminates multiprocessing IPC overhead and features a zero-copy pipeline capable of streaming batches directly from pinned CPU memory to VRAM (`direct_gpu=True`).
+```python
+from unifiedefficientloader import UnifiedDataLoader
+from torchvision import datasets, transforms
+dataset = datasets.FakeData(transform=transforms.ToTensor())
+# Replaces torch.utils.data.DataLoader
+# Pre-allocates pinned buffer pools and streams directly to GPU
+loader = UnifiedDataLoader(
+    dataset,
+    batch_size=32,
+    shuffle=True,
+    num_workers=4,
+    direct_gpu=True
+)
+for batch_image, batch_label in loader:
+    # batch is already on the GPU (device="cuda")
+    pass
+```
 ### Direct-to-GPU Streaming (Zero-Copy)
 For the absolute fastest loading times on CUDA devices, use the `direct_gpu=True` flag. This creates a pipeline that pre-allocates pinned memory pools and GPU memory slabs. Tensors are loaded from disk directly into pinned buffers, and immediately asynchronously copied to the GPU using CUDA streams, hiding the PCIe transfer latency completely behind the disk I/O.
@@ -119,25 +192,33 @@ For the absolute fastest loading times on CUDA devices, use the `direct_gpu=True
 from unifiedefficientloader import UnifiedSafetensorsLoader
 with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
-    keys_to_load = loader.keys()
-    # async_stream will automatically coordinate disk -> pinned buffer -> GPU slab -> tensor header
     stream = loader.async_stream(
-        keys_to_load,
+        loader.keys(),
         batch_size=8,
         prefetch_batches=2,
-        direct_gpu=True # optional here since we passed it in __init__
     )
     for batch in stream:
         for key, gpu_tensor in batch:
-            # gpu_tensor is already on the GPU!
+            # gpu_tensor is already on the GPU
             assert gpu_tensor.device.type == "cuda"
             # ... process gpu_tensor ...
-            loader.mark_processed(key)
+            loader.mark_processed(key)  # releases GPU buffer back to pool
 ```
+### Zero-Copy MMAP Loading
+`use_mmap=True` maps the file into virtual memory via the `uel` native extension. No data is copied into RAM — PyTorch holds a direct pointer into OS page cache.
+```python
+from unifiedefficientloader import UnifiedSafetensorsLoader
+with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, use_mmap=True) as loader:
+    state_dict = loader.load_all()
+    # all tensors are zero-copy views into mapped memory
+```
+Requires the `uel` native extension to be compiled. Falls back silently to standard IO if unavailable. See [docs/mmap.md](docs/mmap.md) and [docs/building.md](docs/building.md).
 ### Tensor/Dict Conversion
 ```python

unifiedefficientloader-0.2.3/PKG-INFO → unifiedefficientloader-0.4.4/README.md RENAMED Viewed

@@ -1,33 +1,11 @@
-Metadata-Version: 2.4
-Name: unifiedefficientloader
-Version: 0.2.3
-Summary: A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
-Author: silveroxides
-License: MIT
-Classifier: Development Status :: 4 - Beta
-Classifier: Intended Audience :: Developers
-Classifier: Programming Language :: Python :: 3
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Operating System :: OS Independent
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Provides-Extra: torch
-Requires-Dist: torch; extra == "torch"
-Provides-Extra: safetensors
-Requires-Dist: safetensors; extra == "safetensors"
-Provides-Extra: tqdm
-Requires-Dist: tqdm; extra == "tqdm"
-Provides-Extra: all
-Requires-Dist: torch; extra == "all"
-Requires-Dist: safetensors; extra == "all"
-Requires-Dist: tqdm; extra == "all"
-Dynamic: license-file
 # unifiedefficientloader
 A unified interface for loading safetensors, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
+## Documentation
+Full API reference and guides in [docs/](docs/index.md).
 ## Installation
 You can install this package via pip. Since it heavily relies on `torch` and `safetensors` but doesn't strictly force them as hard dependencies for package building/installation, make sure you have them installed in your environment:
@@ -56,6 +34,28 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
         loader.mark_processed(key) # Frees memory
 ```
+### Incremental Safetensors Writer
+```python
+from unifiedefficientloader import UnifiedSafetensorsLoader, IncrementalSafetensorsWriter
+# Initialize Writer
+writer = IncrementalSafetensorsWriter(output_path, metadata=metadata)
+writer.__enter__()
+# Load model tensors and process them.
+with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
+    for key in loader.keys():
+        tensor = loader.get_tensor(key)
+        # Process tensor...
+        writer.write(key, tensor)
+        del tensor
+        loader.mark_processed(key) # Frees memory
+```
 ### Loading Specific Tensors Dynamically (Header Analysis)
 You can analyze the file's header without loading the entire multi-gigabyte safetensors file into memory. This allows you to locate specific data (like embedded JSON dictionaries stored as `uint8` tensors) and load *only* those specific tensors directly from their file offsets.
@@ -70,13 +70,13 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
         key for key, info in loader._header.items()
         if isinstance(info, dict) and info.get("dtype") == "U8"
     ]
     # 2. Load ONLY those specific tensors using their keys
     for key in uint8_tensor_keys:
-        # get_tensor dynamically reads only the bytes for this tensor
+        # get_tensor dynamically reads only the bytes for this tensor
         # based on the offsets found in the header
         loaded_tensor = loader.get_tensor(key)
         # 3. Decode the uint8 tensor back into a Python dictionary
         extracted_dict = tensor_to_dict(loaded_tensor)
         print(f"Decoded {key}:", extracted_dict)
@@ -91,26 +91,51 @@ from unifiedefficientloader import UnifiedSafetensorsLoader, transfer_to_gpu_pin
 with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
     keys_to_load = loader.keys()
     # Create the continuous streaming generator
     # prefetch_batches controls how many batches to buffer in memory
     stream = loader.async_stream(
-        keys_to_load,
-        batch_size=8,
-        prefetch_batches=2,
+        keys_to_load,
+        batch_size=8,
+        prefetch_batches=2,
         pin_memory=True
     )
     # Iterate directly over the generator
     for batch in stream:
         for key, pinned_tensor in batch:
             # Transfer directly to GPU via DMA (pinning is already done)
             gpu_tensor = transfer_to_gpu_pinned(pinned_tensor, device="cuda")
             # ... process gpu_tensor ...
             loader.mark_processed(key)
 ```
+### Unified Data Loader
+A high-performance, threaded alternative to PyTorch's standard `DataLoader`. It eliminates multiprocessing IPC overhead and features a zero-copy pipeline capable of streaming batches directly from pinned CPU memory to VRAM (`direct_gpu=True`).
+```python
+from unifiedefficientloader import UnifiedDataLoader
+from torchvision import datasets, transforms
+dataset = datasets.FakeData(transform=transforms.ToTensor())
+# Replaces torch.utils.data.DataLoader
+# Pre-allocates pinned buffer pools and streams directly to GPU
+loader = UnifiedDataLoader(
+    dataset,
+    batch_size=32,
+    shuffle=True,
+    num_workers=4,
+    direct_gpu=True
+)
+for batch_image, batch_label in loader:
+    # batch is already on the GPU (device="cuda")
+    pass
+```
 ### Direct-to-GPU Streaming (Zero-Copy)
 For the absolute fastest loading times on CUDA devices, use the `direct_gpu=True` flag. This creates a pipeline that pre-allocates pinned memory pools and GPU memory slabs. Tensors are loaded from disk directly into pinned buffers, and immediately asynchronously copied to the GPU using CUDA streams, hiding the PCIe transfer latency completely behind the disk I/O.
@@ -119,25 +144,33 @@ For the absolute fastest loading times on CUDA devices, use the `direct_gpu=True
 from unifiedefficientloader import UnifiedSafetensorsLoader
 with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
-    keys_to_load = loader.keys()
-    # async_stream will automatically coordinate disk -> pinned buffer -> GPU slab -> tensor header
     stream = loader.async_stream(
-        keys_to_load,
+        loader.keys(),
         batch_size=8,
         prefetch_batches=2,
-        direct_gpu=True # optional here since we passed it in __init__
     )
     for batch in stream:
         for key, gpu_tensor in batch:
-            # gpu_tensor is already on the GPU!
+            # gpu_tensor is already on the GPU
             assert gpu_tensor.device.type == "cuda"
             # ... process gpu_tensor ...
-            loader.mark_processed(key)
+            loader.mark_processed(key)  # releases GPU buffer back to pool
+```
+### Zero-Copy MMAP Loading
+`use_mmap=True` maps the file into virtual memory via the `uel` native extension. No data is copied into RAM — PyTorch holds a direct pointer into OS page cache.
+```python
+from unifiedefficientloader import UnifiedSafetensorsLoader
+with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, use_mmap=True) as loader:
+    state_dict = loader.load_all()
+    # all tensors are zero-copy views into mapped memory
 ```
+Requires the `uel` native extension to be compiled. Falls back silently to standard IO if unavailable. See [docs/mmap.md](docs/mmap.md) and [docs/building.md](docs/building.md).
 ### Tensor/Dict Conversion
 ```python

{unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/pyproject.toml RENAMED Viewed

@@ -1,25 +1,25 @@
 [build-system]
-requires = ["setuptools>=61.0.0", "wheel"]
+requires = ["setuptools>=70.1.0", "wheel"]
 build-backend = "setuptools.build_meta"
 [project]
 name = "unifiedefficientloader"
-version = "0.2.3"
+version = "0.4.4"
 description = "A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts."
 readme = "README.md"
 authors = [
   { name="silveroxides" }
 ]
-license = { text="MIT" }
+license = { file = "LICENSE" }
 classifiers = [
     "Development Status :: 4 - Beta",
     "Intended Audience :: Developers",
     "Programming Language :: Python :: 3",
-    "License :: OSI Approved :: MIT License",
-    "Operating System :: OS Independent",
+    "Operating System :: Microsoft :: Windows",
+    "Operating System :: POSIX :: Linux",
 ]
 requires-python = ">=3.9"
-dependencies = []
+dependencies = ["comfy-aimdo==0.3.0"]
 [project.optional-dependencies]
 torch = ["torch"]
@@ -39,4 +39,4 @@ filterwarnings = [
 [tool.setuptools.packages.find]
 where = ["."]
 include = ["unifiedefficientloader*"]
-exclude = ["reference"]
+exclude = ["reference"]

unifiedefficientloader-0.4.4/setup.py ADDED Viewed

@@ -0,0 +1,29 @@
+import re
+from pathlib import Path
+from setuptools import setup
+def _read_pyproject_version():
+    """Read version from pyproject.toml without importing any build tools."""
+    pyproject = Path(__file__).parent / "pyproject.toml"
+    text = pyproject.read_text(encoding="utf-8")
+    m = re.search(r'^version\s*=\s*"([^"]+)"', text, re.MULTILINE)
+    if not m:
+        raise RuntimeError("Could not find version in pyproject.toml")
+    return m.group(1)
+def _read_readme():
+    readme = Path(__file__).parent / "README.md"
+    if readme.exists():
+        return readme.read_text(encoding="utf-8")
+    return ""
+setup(
+    name="unifiedefficientloader",
+    version=_read_pyproject_version(),
+    long_description=_read_readme(),
+    long_description_content_type="text/markdown",
+    packages=["unifiedefficientloader", "unifiedefficientloader.uel"],
+)

{unifiedefficientloader-0.2.3 → unifiedefficientloader-0.4.4}/tests/test_direct_gpu.py RENAMED Viewed

@@ -5,91 +5,99 @@ import pytest
 try:
     import torch
     from safetensors.torch import save_file
     HAS_TORCH = True
 except ImportError:
     HAS_TORCH = False
 from unifiedefficientloader import MemoryEfficientSafeOpen
 @pytest.fixture
 def sample_safetensors():
     if not HAS_TORCH:
         pytest.skip("Requires torch and safetensors")
     with tempfile.NamedTemporaryFile(suffix=".safetensors", delete=False) as f:
         path = f.name
     tensors = {
         "weight1": torch.randn(10, 10),
         "weight2": torch.randn(20, 20),
         "bias": torch.zeros(10),
     }
     save_file(tensors, path)
     yield path, tensors
     if os.path.exists(path):
         os.remove(path)
-@pytest.mark.skipif(not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA")
+@pytest.mark.skipif(
+    not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA"
+)
 def test_direct_gpu_streaming(sample_safetensors):
     path, original_tensors = sample_safetensors
     loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
     # Test load_all which uses async_stream under the hood
     loaded_tensors = loader.load_all()
     for key, orig_tensor in original_tensors.items():
         assert key in loaded_tensors
         loaded_tensor = loaded_tensors[key]
         # Verify it's on GPU
         assert loaded_tensor.device.type == "cuda"
         # Verify data matches
         torch.testing.assert_close(loaded_tensor.cpu(), orig_tensor)
     loader.close()
-@pytest.mark.skipif(not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA")
+@pytest.mark.skipif(
+    not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA"
+)
 def test_direct_gpu_async_stream(sample_safetensors):
     path, original_tensors = sample_safetensors
     loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
     stream = loader.async_stream(
         keys=list(original_tensors.keys()),
         batch_size=2,
         prefetch_batches=1,
-        direct_gpu=True
     )
     loaded_count = 0
     for batch in stream:
         for key, tensor in batch:
             assert tensor.device.type == "cuda"
             torch.testing.assert_close(tensor.cpu(), original_tensors[key])
             loaded_count += 1
     assert loaded_count == len(original_tensors)
     loader.close()
 @pytest.mark.skipif(not HAS_TORCH, reason="Requires torch")
 def test_direct_gpu_fallback_no_cuda(sample_safetensors, monkeypatch):
     # Force cuda to be unavailable
     monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
     path, original_tensors = sample_safetensors
     # Should fallback to CPU silently
     loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
     loaded_tensors = loader.load_all()
     for key, orig_tensor in original_tensors.items():
         loaded_tensor = loaded_tensors[key]
         assert loaded_tensor.device.type == "cpu"
         torch.testing.assert_close(loaded_tensor, orig_tensor)
     loader.close()

unifiedefficientloader 0.2.3__tar.gz → 0.4.4__tar.gz

unifiedefficientloader 0.2.3__tar.gz → 0.4.4__tar.gz