unifiedefficientloader 0.2.1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/PKG-INFO +28 -1
- {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/README.md +27 -0
- {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/pyproject.toml +7 -2
- {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/setup.py +1 -0
- unifiedefficientloader-0.2.3/tests/test_direct_gpu.py +95 -0
- unifiedefficientloader-0.2.3/tests/test_logging.py +51 -0
- {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader/__init__.py +32 -2
- unifiedefficientloader-0.2.3/unifiedefficientloader/gpu_buffer_pool.py +40 -0
- unifiedefficientloader-0.2.3/unifiedefficientloader/logging_utils.py +117 -0
- {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader/memory_efficient_loader.py +188 -48
- unifiedefficientloader-0.2.3/unifiedefficientloader/pinned_buffer_pool.py +39 -0
- {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader/pinned_transfer.py +28 -17
- {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader/tensor_utils.py +8 -5
- {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/PKG-INFO +28 -1
- {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/SOURCES.txt +5 -0
- {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/LICENSE +0 -0
- {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/setup.cfg +0 -0
- {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/tests/test_utils.py +0 -0
- {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/dependency_links.txt +0 -0
- {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/requires.txt +0 -0
- {unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: unifiedefficientloader
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
|
|
5
5
|
Author: silveroxides
|
|
6
6
|
License: MIT
|
|
@@ -111,6 +111,33 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
|
|
|
111
111
|
loader.mark_processed(key)
|
|
112
112
|
```
|
|
113
113
|
|
|
114
|
+
### Direct-to-GPU Streaming (Zero-Copy)
|
|
115
|
+
|
|
116
|
+
For the fastest loading times on CUDA devices, pass the `direct_gpu=True` flag. This sets up a pipeline that pre-allocates pools of pinned host buffers and GPU memory slabs. Each tensor is read from disk directly into a pinned buffer and then copied to the GPU asynchronously on a CUDA stream, so the PCIe transfer latency is hidden behind the disk I/O.
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
from unifiedefficientloader import UnifiedSafetensorsLoader
|
|
120
|
+
|
|
121
|
+
with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
|
|
122
|
+
keys_to_load = loader.keys()
|
|
123
|
+
|
|
124
|
+
# async_stream will automatically coordinate disk -> pinned buffer -> GPU slab -> tensor header
|
|
125
|
+
stream = loader.async_stream(
|
|
126
|
+
keys_to_load,
|
|
127
|
+
batch_size=8,
|
|
128
|
+
prefetch_batches=2,
|
|
129
|
+
direct_gpu=True # optional here since we passed it in __init__
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
for batch in stream:
|
|
133
|
+
for key, gpu_tensor in batch:
|
|
134
|
+
# gpu_tensor is already on the GPU!
|
|
135
|
+
assert gpu_tensor.device.type == "cuda"
|
|
136
|
+
|
|
137
|
+
# ... process gpu_tensor ...
|
|
138
|
+
loader.mark_processed(key)
|
|
139
|
+
```
|
|
140
|
+
|
|
114
141
|
### Tensor/Dict Conversion
|
|
115
142
|
|
|
116
143
|
```python
|
|
@@ -85,6 +85,33 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
|
|
|
85
85
|
loader.mark_processed(key)
|
|
86
86
|
```
|
|
87
87
|
|
|
88
|
+
### Direct-to-GPU Streaming (Zero-Copy)
|
|
89
|
+
|
|
90
|
+
For the fastest loading times on CUDA devices, pass the `direct_gpu=True` flag. This sets up a pipeline that pre-allocates pools of pinned host buffers and GPU memory slabs. Each tensor is read from disk directly into a pinned buffer and then copied to the GPU asynchronously on a CUDA stream, so the PCIe transfer latency is hidden behind the disk I/O.
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from unifiedefficientloader import UnifiedSafetensorsLoader
|
|
94
|
+
|
|
95
|
+
with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
|
|
96
|
+
keys_to_load = loader.keys()
|
|
97
|
+
|
|
98
|
+
# async_stream will automatically coordinate disk -> pinned buffer -> GPU slab -> tensor header
|
|
99
|
+
stream = loader.async_stream(
|
|
100
|
+
keys_to_load,
|
|
101
|
+
batch_size=8,
|
|
102
|
+
prefetch_batches=2,
|
|
103
|
+
direct_gpu=True # optional here since we passed it in __init__
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
for batch in stream:
|
|
107
|
+
for key, gpu_tensor in batch:
|
|
108
|
+
# gpu_tensor is already on the GPU!
|
|
109
|
+
assert gpu_tensor.device.type == "cuda"
|
|
110
|
+
|
|
111
|
+
# ... process gpu_tensor ...
|
|
112
|
+
loader.mark_processed(key)
|
|
113
|
+
```
|
|
114
|
+
|
|
88
115
|
### Tensor/Dict Conversion
|
|
89
116
|
|
|
90
117
|
```python
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "unifiedefficientloader"
|
|
7
|
-
version = "0.2.
|
|
7
|
+
version = "0.2.3"
|
|
8
8
|
description = "A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
authors = [
|
|
@@ -34,4 +34,9 @@ log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(linen
|
|
|
34
34
|
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
|
|
35
35
|
filterwarnings = [
|
|
36
36
|
"ignore:.*argument 'device' of Tensor.*:DeprecationWarning"
|
|
37
|
-
]
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[tool.setuptools.packages.find]
|
|
40
|
+
where = ["."]
|
|
41
|
+
include = ["unifiedefficientloader*"]
|
|
42
|
+
exclude = ["reference"]
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tempfile
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
import torch
|
|
7
|
+
from safetensors.torch import save_file
|
|
8
|
+
HAS_TORCH = True
|
|
9
|
+
except ImportError:
|
|
10
|
+
HAS_TORCH = False
|
|
11
|
+
|
|
12
|
+
from unifiedefficientloader import MemoryEfficientSafeOpen
|
|
13
|
+
|
|
14
|
+
@pytest.fixture
def sample_safetensors():
    """Yield ``(path, tensors)`` for a small safetensors file on disk.

    The test is skipped when torch/safetensors could not be imported.
    The file is removed again once the consuming test has finished.
    """
    if not HAS_TORCH:
        pytest.skip("Requires torch and safetensors")

    # Only the generated name is needed; the handle is closed right away and
    # delete=False keeps the path around for save_file to write to.
    with tempfile.NamedTemporaryFile(suffix=".safetensors", delete=False) as tmp:
        file_path = tmp.name

    reference = dict(
        weight1=torch.randn(10, 10),
        weight2=torch.randn(20, 20),
        bias=torch.zeros(10),
    )
    save_file(reference, file_path)

    yield file_path, reference

    # Teardown: drop the temporary file if it is still there.
    if os.path.exists(file_path):
        os.remove(file_path)
|
|
33
|
+
|
|
34
|
+
@pytest.mark.skipif(not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA")
def test_direct_gpu_streaming(sample_safetensors):
    """Check that load_all() with direct_gpu=True yields matching CUDA tensors."""
    path, original_tensors = sample_safetensors

    loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
    try:
        # Test load_all which uses async_stream under the hood
        loaded_tensors = loader.load_all()

        for key, orig_tensor in original_tensors.items():
            assert key in loaded_tensors
            loaded_tensor = loaded_tensors[key]

            # Verify it's on GPU
            assert loaded_tensor.device.type == "cuda"

            # Verify data matches
            torch.testing.assert_close(loaded_tensor.cpu(), orig_tensor)
    finally:
        # Close the loader even when an assertion above fails, so a failing
        # test does not leave it open while the fixture removes the file.
        loader.close()
|
|
54
|
+
|
|
55
|
+
@pytest.mark.skipif(not HAS_TORCH or not torch.cuda.is_available(), reason="Requires CUDA")
def test_direct_gpu_async_stream(sample_safetensors):
    """Check that async_stream() with direct_gpu=True yields every key as a CUDA tensor."""
    path, original_tensors = sample_safetensors

    loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
    try:
        stream = loader.async_stream(
            keys=list(original_tensors.keys()),
            batch_size=2,
            prefetch_batches=1,
            direct_gpu=True,
        )

        loaded_count = 0
        for batch in stream:
            for key, tensor in batch:
                assert tensor.device.type == "cuda"
                torch.testing.assert_close(tensor.cpu(), original_tensors[key])
                loaded_count += 1

        # Every requested key must have been yielded exactly once in total.
        assert loaded_count == len(original_tensors)
    finally:
        # Close the loader even when an assertion above fails, so a failing
        # test does not leave it open while the fixture removes the file.
        loader.close()
|
|
77
|
+
|
|
78
|
+
@pytest.mark.skipif(not HAS_TORCH, reason="Requires torch")
def test_direct_gpu_fallback_no_cuda(sample_safetensors, monkeypatch):
    """Check that direct_gpu=True still loads correct CPU tensors when CUDA is reported unavailable."""
    # Force cuda to be unavailable
    monkeypatch.setattr(torch.cuda, "is_available", lambda: False)

    path, original_tensors = sample_safetensors

    # Should fall back to CPU without raising
    loader = MemoryEfficientSafeOpen(path, low_memory=True, direct_gpu=True)
    try:
        loaded_tensors = loader.load_all()

        for key, orig_tensor in original_tensors.items():
            loaded_tensor = loaded_tensors[key]
            assert loaded_tensor.device.type == "cpu"
            torch.testing.assert_close(loaded_tensor, orig_tensor)
    finally:
        # Close the loader even when an assertion above fails, so a failing
        # test does not leave it open while the fixture removes the file.
        loader.close()
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import torch
|
|
3
|
+
from unifiedefficientloader import (
|
|
4
|
+
UnifiedSafetensorsLoader,
|
|
5
|
+
setup_logging,
|
|
6
|
+
VERBOSE_LEVEL,
|
|
7
|
+
DEBUG_LEVEL
|
|
8
|
+
)
|
|
9
|
+
from safetensors.torch import save_file
|
|
10
|
+
|
|
11
|
+
def test_logging():
    """Smoke-test every verbosity preset by loading tensors under it.

    The safetensors fixture is written into a private temporary directory
    instead of the current working directory, so the test neither depends on
    a writable cwd nor collides with another run using the same file name.
    Takes no arguments because it is also invoked directly under ``__main__``.
    """
    import tempfile  # local import: this module only imports os at the top

    tensors = {
        "weight1": torch.randn(10, 10),
        "weight2": torch.randn(5, 5)
    }

    # (verbosity name, banner, low_memory flag, keys to fetch)
    scenarios = [
        ("NORMAL", "\n--- Testing NORMAL Logging (Default) ---", False, ("weight1",)),
        ("VERBOSE", "\n--- Testing VERBOSE Logging ---", True, ("weight1", "weight2")),
        # DEBUG additionally emits function traces
        ("DEBUG", "\n--- Testing DEBUG Logging ---", True, ("weight1",)),
        ("MINIMAL", "\n--- Testing MINIMAL Logging ---", False, ("weight1",)),
    ]

    # The directory and everything in it is removed when the block exits,
    # including when a scenario raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        dummy_file = os.path.join(tmp_dir, "test_logging.safetensors")
        save_file(tensors, dummy_file)

        for level_name, banner, low_memory, keys in scenarios:
            print(banner)
            setup_logging(level_name)
            with UnifiedSafetensorsLoader(dummy_file, low_memory=low_memory) as loader:
                for key in keys:
                    _ = loader.get_tensor(key)
|
|
49
|
+
|
|
50
|
+
if __name__ == "__main__":
|
|
51
|
+
test_logging()
|
{unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader/__init__.py
RENAMED
|
@@ -9,7 +9,7 @@ def check_dependencies(*packages):
|
|
|
9
9
|
for pkg in packages:
|
|
10
10
|
if importlib.util.find_spec(pkg) is None:
|
|
11
11
|
missing.append(pkg)
|
|
12
|
-
|
|
12
|
+
|
|
13
13
|
if missing:
|
|
14
14
|
missing_str = ", ".join(missing)
|
|
15
15
|
raise ImportError(
|
|
@@ -23,6 +23,22 @@ check_dependencies("torch")
|
|
|
23
23
|
from .memory_efficient_loader import UnifiedSafetensorsLoader, MemoryEfficientSafeOpen
|
|
24
24
|
from .tensor_utils import dict_to_tensor, tensor_to_dict
|
|
25
25
|
from .pinned_transfer import transfer_to_gpu_pinned, set_verbose, get_pinned_transfer_stats, reset_pinned_transfer_stats
|
|
26
|
+
from .gpu_buffer_pool import GpuBufferPool
|
|
27
|
+
from .pinned_buffer_pool import PinnedBufferPool
|
|
28
|
+
from .logging_utils import (
|
|
29
|
+
setup_logging,
|
|
30
|
+
MINIMAL_LEVEL,
|
|
31
|
+
NORMAL_LEVEL,
|
|
32
|
+
VERBOSE_LEVEL,
|
|
33
|
+
DEBUG_LEVEL,
|
|
34
|
+
debug,
|
|
35
|
+
verbose,
|
|
36
|
+
normal,
|
|
37
|
+
info,
|
|
38
|
+
minimal,
|
|
39
|
+
warning,
|
|
40
|
+
error
|
|
41
|
+
)
|
|
26
42
|
|
|
27
43
|
__all__ = [
|
|
28
44
|
"UnifiedSafetensorsLoader",
|
|
@@ -33,4 +49,18 @@ __all__ = [
|
|
|
33
49
|
"set_verbose",
|
|
34
50
|
"get_pinned_transfer_stats",
|
|
35
51
|
"reset_pinned_transfer_stats",
|
|
36
|
-
|
|
52
|
+
"GpuBufferPool",
|
|
53
|
+
"PinnedBufferPool",
|
|
54
|
+
"setup_logging",
|
|
55
|
+
"MINIMAL_LEVEL",
|
|
56
|
+
"NORMAL_LEVEL",
|
|
57
|
+
"VERBOSE_LEVEL",
|
|
58
|
+
"DEBUG_LEVEL",
|
|
59
|
+
"debug",
|
|
60
|
+
"verbose",
|
|
61
|
+
"normal",
|
|
62
|
+
"info",
|
|
63
|
+
"minimal",
|
|
64
|
+
"warning",
|
|
65
|
+
"error",
|
|
66
|
+
]
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""
|
|
2
|
+
GPU memory buffer pool for direct-to-GPU streaming.
|
|
3
|
+
|
|
4
|
+
Maintains a pool of pre-allocated GPU tensors to avoid allocation overhead
|
|
5
|
+
and ensure strictly bounded VRAM usage during streaming.
|
|
6
|
+
"""
|
|
7
|
+
from typing import Tuple, Optional
|
|
8
|
+
import torch
|
|
9
|
+
from . import logging_utils
|
|
10
|
+
|
|
11
|
+
logger = logging_utils.get_logger(__name__)
|
|
12
|
+
|
|
13
|
+
class GpuBufferPool:
    """Manages a pool of fixed-size GPU memory buffers.

    ``num_buffers`` uint8 tensors of ``size_bytes`` elements each are
    allocated up front on ``device``. Free buffers are tracked by index in a
    ``queue.Queue``: ``acquire`` takes an index out, ``release`` puts it back.
    """

    def __init__(self, size_bytes: int, num_buffers: int, device: str = "cuda"):
        """Allocate all buffers and mark every one of them as free.

        Args:
            size_bytes: Size of each buffer in bytes (0 is allowed).
            num_buffers: Number of buffers to allocate; must be at least 1.
            device: Torch device the buffers are allocated on.

        Raises:
            ValueError: If ``size_bytes`` is negative or ``num_buffers`` < 1.
        """
        import queue  # local import: not imported at module level

        if size_bytes < 0:
            raise ValueError(f"size_bytes must be >= 0, got {size_bytes}")
        if num_buffers < 1:
            # An empty pool would make every acquire() block forever.
            raise ValueError(f"num_buffers must be >= 1, got {num_buffers}")

        self.device = device
        self.size_bytes = size_bytes
        self.num_buffers = num_buffers

        logging_utils.verbose(f"Initializing GpuBufferPool: {num_buffers} buffers of {size_bytes / (1024**2):.2f} MB each on {device}.")

        self.buffers = [
            torch.empty(size_bytes, dtype=torch.uint8, device=device)
            for _ in range(num_buffers)
        ]

        # Holds the indices of buffers that are currently free.
        self.free_queue = queue.Queue()
        for i in range(num_buffers):
            self.free_queue.put(i)

    def acquire(self) -> Tuple[int, 'torch.Tensor']:
        """Acquire a free buffer. Blocks while no buffer is free.

        Returns:
            ``(index, buffer)``; pass ``index`` to ``release`` when done.
        """
        idx = self.free_queue.get()
        return idx, self.buffers[idx]

    def release(self, idx: int):
        """Release buffer ``idx`` back to the pool.

        Raises:
            ValueError: If ``idx`` is not a valid buffer index. Failing here
                avoids a later IndexError inside ``acquire``.
        """
        if not isinstance(idx, int) or not 0 <= idx < self.num_buffers:
            raise ValueError(f"Invalid buffer index {idx!r} for pool of {self.num_buffers} buffers")
        self.free_queue.put(idx)
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import sys
|
|
3
|
+
import functools
|
|
4
|
+
|
|
5
|
+
# Custom Levels
|
|
6
|
+
# MINIMAL (30): WARNING+ (Reduced)
|
|
7
|
+
# NORMAL (20): INFO+ (Default)
|
|
8
|
+
# VERBOSE (15): Custom+ (Increased)
|
|
9
|
+
# DEBUG (10): DEBUG+ (Every function call)
|
|
10
|
+
|
|
11
|
+
MINIMAL_LEVEL = 30 # Use logging.WARNING
|
|
12
|
+
NORMAL_LEVEL = 20 # Use logging.INFO
|
|
13
|
+
VERBOSE_LEVEL = 15 # Custom level between INFO and DEBUG
|
|
14
|
+
DEBUG_LEVEL = 10 # logging.DEBUG
|
|
15
|
+
|
|
16
|
+
logging.addLevelName(VERBOSE_LEVEL, "VERBOSE")
|
|
17
|
+
logging.addLevelName(MINIMAL_LEVEL, "MINIMAL")
|
|
18
|
+
|
|
19
|
+
class CustomFormatter(logging.Formatter):
    """Formatter that chooses its layout from the level of each record.

    Layouts:
        level <= DEBUG_LEVEL:   "[LEVEL] logger:lineno - message"
        level <= VERBOSE_LEVEL: "[LEVEL] message"
        level <= NORMAL_LEVEL:  "message"
        anything higher:        "[LEVEL] message"

    Each layout is handled by its own ``logging.Formatter`` created once in
    ``__init__``. Formatting never mutates shared state, so it does not
    depend on the private ``_style._fmt`` attribute and stays correct when
    one instance is attached to several handlers.
    """

    _TRACE_FMT = "[%(levelname)s] %(name)s:%(lineno)d - %(message)s"
    _TAGGED_FMT = "[%(levelname)s] %(message)s"
    _PLAIN_FMT = "%(message)s"

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as ``logging.Formatter``."""
        super().__init__(*args, **kwargs)
        # Reuse the configured date format so the delegates match this instance.
        self._trace_formatter = logging.Formatter(self._TRACE_FMT, self.datefmt)
        self._tagged_formatter = logging.Formatter(self._TAGGED_FMT, self.datefmt)
        self._plain_formatter = logging.Formatter(self._PLAIN_FMT, self.datefmt)

    def format(self, record):
        """Return ``record`` rendered with the layout for its level."""
        if record.levelno <= DEBUG_LEVEL:
            # Debug: Full trace info
            delegate = self._trace_formatter
        elif record.levelno <= VERBOSE_LEVEL:
            # Verbose: Detail
            delegate = self._tagged_formatter
        elif record.levelno <= NORMAL_LEVEL:
            # Normal: Standard output
            delegate = self._plain_formatter
        else:
            # Minimal/Warning and above
            delegate = self._tagged_formatter
        return delegate.format(record)
|
|
42
|
+
|
|
43
|
+
def setup_logging(verbose_arg: str = "NORMAL"):
    """Configure the package logger for the requested verbosity.

    Args:
        verbose_arg: Verbosity name ("DEBUG", "VERBOSE", "NORMAL" or
            "MINIMAL", case-insensitive) or a numeric logging level such as
            ``VERBOSE_LEVEL``. Unknown names and ``None`` fall back to
            NORMAL.

    Returns:
        The "unifiedefficientloader" logger, with its level set and exactly
        one stdout handler using ``CustomFormatter``.
    """
    level_map = {
        "DEBUG": DEBUG_LEVEL,
        "VERBOSE": VERBOSE_LEVEL,
        "NORMAL": NORMAL_LEVEL,
        "MINIMAL": MINIMAL_LEVEL
    }

    if isinstance(verbose_arg, str):
        level = level_map.get(verbose_arg.upper(), NORMAL_LEVEL)
    elif isinstance(verbose_arg, int) and not isinstance(verbose_arg, bool):
        # Allow callers to pass one of the *_LEVEL constants directly.
        level = verbose_arg
    else:
        level = NORMAL_LEVEL

    logger = logging.getLogger("unifiedefficientloader")
    logger.setLevel(level)

    # Clear existing handlers to prevent duplicates on repeated calls
    for existing in list(logger.handlers):
        logger.removeHandler(existing)

    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(CustomFormatter())
    logger.addHandler(handler)

    return logger
|
|
68
|
+
|
|
69
|
+
def get_logger(name=None):
    """Return a logger inside the "unifiedefficientloader" hierarchy.

    Args:
        name: Optional logger name. A name that is not the package logger
            itself or one of its dotted children is prefixed with
            "unifiedefficientloader." so it inherits the package logger's
            level and handlers.

    Returns:
        The package logger when ``name`` is empty or ``None``, otherwise the
        (possibly prefixed) named logger.
    """
    root_name = "unifiedefficientloader"
    if not name:
        return logging.getLogger(root_name)

    # Compare against "<root>." rather than the bare prefix: a look-alike
    # name such as "unifiedefficientloader_extras" is not a child logger.
    if name != root_name and not name.startswith(root_name + "."):
        name = f"{root_name}.{name}"
    return logging.getLogger(name)
|
|
75
|
+
|
|
76
|
+
# Decorator for DEBUG level tracing
|
|
77
|
+
def log_debug(func):
    """Decorator to log function entry/exit with args (DEBUG level only).

    On entry a "CALL name(args)" record is emitted with the repr of every
    positional and keyword argument; on normal return a "RET name -> type"
    record is emitted. Nothing is logged if the wrapped function raises.
    The argument string is only built when DEBUG is enabled.
    """
    # Resolve the logger once at decoration time instead of on every call.
    # logging.getLogger returns the same object for a given name, so level
    # changes made later are still seen by the isEnabledFor checks below.
    logger = get_logger(func.__module__.split('.')[-1])

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if logger.isEnabledFor(DEBUG_LEVEL):
            arg_str = ", ".join([repr(a) for a in args])
            kw_str = ", ".join([f"{k}={v!r}" for k, v in kwargs.items()])
            all_args = ", ".join(filter(None, [arg_str, kw_str]))
            logger.log(DEBUG_LEVEL, "CALL %s(%s)", func.__name__, all_args)

        result = func(*args, **kwargs)

        # Checked again: the level may have changed while func was running.
        if logger.isEnabledFor(DEBUG_LEVEL):
            logger.log(DEBUG_LEVEL, "RET %s -> %s", func.__name__, type(result))
        return result
    return wrapper
|
|
95
|
+
|
|
96
|
+
# Convenience wrappers
|
|
97
|
+
def debug(msg, *args, **kwargs):
    """Log ``msg`` on the package logger at DEBUG_LEVEL."""
    # stacklevel=2 (Python 3.8+) attributes the record to the caller of this
    # helper, so %(lineno)d in the DEBUG layout points at the real call site.
    kwargs.setdefault("stacklevel", 2)
    get_logger().log(DEBUG_LEVEL, msg, *args, **kwargs)


def verbose(msg, *args, **kwargs):
    """Log ``msg`` on the package logger at VERBOSE_LEVEL."""
    kwargs.setdefault("stacklevel", 2)
    get_logger().log(VERBOSE_LEVEL, msg, *args, **kwargs)


def normal(msg, *args, **kwargs):
    """Log ``msg`` on the package logger at NORMAL_LEVEL."""
    kwargs.setdefault("stacklevel", 2)
    get_logger().log(NORMAL_LEVEL, msg, *args, **kwargs)


def info(msg, *args, **kwargs):
    """Alias for normal/INFO level logging."""
    # Logs directly instead of going through normal() so the extra call
    # frame does not shift the reported call site.
    kwargs.setdefault("stacklevel", 2)
    get_logger().log(NORMAL_LEVEL, msg, *args, **kwargs)


def minimal(msg, *args, **kwargs):
    """Log ``msg`` on the package logger at MINIMAL_LEVEL."""
    kwargs.setdefault("stacklevel", 2)
    get_logger().log(MINIMAL_LEVEL, msg, *args, **kwargs)


def warning(msg, *args, **kwargs):
    """Log ``msg`` on the package logger at WARNING level."""
    kwargs.setdefault("stacklevel", 2)
    get_logger().warning(msg, *args, **kwargs)


def error(msg, *args, **kwargs):
    """Log ``msg`` on the package logger at ERROR level."""
    kwargs.setdefault("stacklevel", 2)
    get_logger().error(msg, *args, **kwargs)
|
|
@@ -7,10 +7,11 @@ Requires `torch`, `safetensors`, and optionally `tqdm`.
|
|
|
7
7
|
import gc
|
|
8
8
|
import json
|
|
9
9
|
import struct
|
|
10
|
-
import logging
|
|
11
10
|
from typing import Dict, Optional, Tuple
|
|
12
11
|
|
|
13
|
-
|
|
12
|
+
from . import logging_utils
|
|
13
|
+
|
|
14
|
+
logger = logging_utils.get_logger(__name__)
|
|
14
15
|
|
|
15
16
|
def _ensure_torch():
|
|
16
17
|
try:
|
|
@@ -51,46 +52,57 @@ class UnifiedSafetensorsLoader:
|
|
|
51
52
|
loader.mark_processed(key) # Frees memory in low_memory mode
|
|
52
53
|
"""
|
|
53
54
|
|
|
54
|
-
|
|
55
|
+
@logging_utils.log_debug
|
|
56
|
+
def __init__(self, filename: str, low_memory: bool = False, direct_gpu: bool = False):
|
|
55
57
|
"""Initialize the loader.
|
|
56
58
|
|
|
57
59
|
Args:
|
|
58
60
|
filename: Path to safetensors file
|
|
59
61
|
low_memory: If True, use streaming mode; if False, preload all tensors
|
|
62
|
+
direct_gpu: If True, stream directly to GPU pinned/slab memory (requires low_memory=True)
|
|
60
63
|
"""
|
|
61
64
|
torch = _ensure_torch()
|
|
62
65
|
safe_open = _ensure_safetensors()
|
|
63
|
-
|
|
66
|
+
|
|
64
67
|
self.filename = filename
|
|
65
68
|
self.low_memory = low_memory
|
|
69
|
+
self.direct_gpu = direct_gpu
|
|
70
|
+
|
|
71
|
+
if self.direct_gpu and not self.low_memory:
|
|
72
|
+
logging_utils.warning("direct_gpu=True requires low_memory=True. Forcing low_memory=True.")
|
|
73
|
+
self.low_memory = True
|
|
74
|
+
|
|
66
75
|
self._tensors: Dict[str, 'torch.Tensor'] = {}
|
|
76
|
+
self._gpu_buffer_indices: Dict[str, int] = {}
|
|
77
|
+
self._gpu_pool = None
|
|
78
|
+
|
|
67
79
|
self._all_keys = []
|
|
68
80
|
self._file = None
|
|
69
81
|
self._header = None
|
|
70
82
|
self._header_size = None
|
|
71
83
|
self._metadata: Dict[str, str] = {}
|
|
72
84
|
|
|
73
|
-
if low_memory:
|
|
85
|
+
if self.low_memory:
|
|
74
86
|
# Streaming mode: read header only
|
|
75
87
|
self._header, self._header_size = self._read_header()
|
|
76
88
|
self._file = None # Opened lazily to support multiprocessing DataLoader
|
|
77
89
|
self._all_keys = [k for k in self._header.keys() if k != "__metadata__"]
|
|
78
90
|
# Extract metadata from header (safetensors stores it under __metadata__ key)
|
|
79
91
|
self._metadata = self._header.get("__metadata__", {})
|
|
80
|
-
|
|
81
|
-
|
|
92
|
+
logging_utils.verbose(f"Initialized Low-memory mode: parsed header of size {self._header_size} bytes.")
|
|
93
|
+
logging_utils.verbose(f"Found {len(self._all_keys)} tensors (streaming mode)")
|
|
82
94
|
else:
|
|
83
95
|
# Standard mode: preload all tensors
|
|
84
|
-
with safe_open(filename, framework="pt", device="cpu") as f:
|
|
96
|
+
with safe_open(self.filename, framework="pt", device="cpu") as f:
|
|
85
97
|
self._metadata = f.metadata() or {}
|
|
86
98
|
self._all_keys = list(f.keys())
|
|
87
|
-
|
|
99
|
+
logging_utils.normal(f"Loading {len(self._all_keys)} tensors from source file...")
|
|
88
100
|
try:
|
|
89
101
|
from tqdm import tqdm
|
|
90
|
-
iterator = tqdm(self._all_keys, desc="Loading tensors")
|
|
102
|
+
iterator = tqdm(self._all_keys, desc="Loading tensors", disable=not logger.isEnabledFor(logging_utils.NORMAL_LEVEL))
|
|
91
103
|
except ImportError:
|
|
92
104
|
iterator = self._all_keys
|
|
93
|
-
|
|
105
|
+
|
|
94
106
|
for key in iterator:
|
|
95
107
|
self._tensors[key] = f.get_tensor(key)
|
|
96
108
|
|
|
@@ -141,6 +153,7 @@ class UnifiedSafetensorsLoader:
|
|
|
141
153
|
"""Get tensor ndim without loading tensor data."""
|
|
142
154
|
return len(self.get_shape(key))
|
|
143
155
|
|
|
156
|
+
@logging_utils.log_debug
|
|
144
157
|
def get_tensor(self, key: str) -> 'torch.Tensor':
|
|
145
158
|
"""Get a tensor by key.
|
|
146
159
|
|
|
@@ -162,7 +175,7 @@ class UnifiedSafetensorsLoader:
|
|
|
162
175
|
offset_start, offset_end = metadata["data_offsets"]
|
|
163
176
|
|
|
164
177
|
if offset_start != offset_end:
|
|
165
|
-
|
|
178
|
+
logging_utils.debug(f"Loading tensor '{key}' from offset {offset_start} to {offset_end} ({(offset_end - offset_start)} bytes)")
|
|
166
179
|
self._file.seek(self._header_size + 8 + offset_start)
|
|
167
180
|
# Use bytearray to create a writable buffer, avoiding PyTorch warning
|
|
168
181
|
# about non-writable tensors from read-only bytes.
|
|
@@ -177,12 +190,17 @@ class UnifiedSafetensorsLoader:
|
|
|
177
190
|
"""Mark a tensor as processed, freeing memory if in low-memory mode.
|
|
178
191
|
|
|
179
192
|
In standard mode, optionally deletes from cache.
|
|
180
|
-
In low-memory mode,
|
|
193
|
+
In low-memory mode, frees GPU buffer back to pool if direct_gpu.
|
|
181
194
|
"""
|
|
182
195
|
if not self.low_memory and key in self._tensors:
|
|
183
196
|
del self._tensors[key]
|
|
184
197
|
gc.collect()
|
|
185
198
|
|
|
199
|
+
if self.direct_gpu and key in self._gpu_buffer_indices:
|
|
200
|
+
idx = self._gpu_buffer_indices.pop(key)
|
|
201
|
+
if self._gpu_pool:
|
|
202
|
+
self._gpu_pool.release(idx)
|
|
203
|
+
|
|
186
204
|
def _read_header(self):
|
|
187
205
|
"""Read and parse the safetensors header."""
|
|
188
206
|
with open(self.filename, "rb") as f:
|
|
@@ -270,61 +288,152 @@ class UnifiedSafetensorsLoader:
|
|
|
270
288
|
|
|
271
289
|
def async_stream(self, keys: list, batch_size: int = 1, prefetch_batches: int = 2, pin_memory: bool = False):
|
|
272
290
|
"""Asynchronously stream tensors from disk.
|
|
273
|
-
|
|
291
|
+
|
|
274
292
|
Args:
|
|
275
293
|
keys: List of tensor keys to load
|
|
276
294
|
batch_size: Number of tensors to yield in each batch
|
|
277
295
|
prefetch_batches: Number of batches to pre-fetch in background
|
|
278
296
|
pin_memory: If True, tensors will be pinned in CPU memory (sequentially in main thread)
|
|
279
|
-
|
|
297
|
+
direct_gpu: Stream via pinned buffer directly to GPU
|
|
298
|
+
|
|
280
299
|
Yields:
|
|
281
300
|
List of (key, tensor) tuples
|
|
282
301
|
"""
|
|
283
302
|
import threading
|
|
284
303
|
import queue
|
|
285
304
|
from concurrent.futures import ThreadPoolExecutor
|
|
305
|
+
import os
|
|
286
306
|
|
|
287
307
|
torch = _ensure_torch()
|
|
288
308
|
thread_local = threading.local()
|
|
289
309
|
|
|
310
|
+
# Initialize GPU slab and Pinned Buffer Pool if direct_gpu
|
|
311
|
+
pinned_pool = None
|
|
312
|
+
cuda_stream = None
|
|
313
|
+
|
|
314
|
+
if self.direct_gpu and torch.cuda.is_available():
|
|
315
|
+
try:
|
|
316
|
+
from .gpu_buffer_pool import GpuBufferPool
|
|
317
|
+
from .pinned_buffer_pool import PinnedBufferPool
|
|
318
|
+
|
|
319
|
+
# Pre-calculate required slab size
|
|
320
|
+
max_tensor_bytes = 0
|
|
321
|
+
for k in keys:
|
|
322
|
+
meta = self._header[k]
|
|
323
|
+
start, end = meta["data_offsets"]
|
|
324
|
+
sz = end - start
|
|
325
|
+
max_tensor_bytes = max(max_tensor_bytes, sz)
|
|
326
|
+
|
|
327
|
+
# Initialize pools (size of largest tensor)
|
|
328
|
+
# We need a larger pool to allow the GPU to lag behind the CPU without stalling
|
|
329
|
+
max_workers = min(16, max(4, batch_size))
|
|
330
|
+
max_in_flight = max(max_workers, prefetch_batches * batch_size)
|
|
331
|
+
|
|
332
|
+
# Double the buffers for a smooth pipeline
|
|
333
|
+
num_buffers = (max_in_flight + max_workers) * 2 + 2
|
|
334
|
+
|
|
335
|
+
# Assign pool to instance to survive the generator lifetime
|
|
336
|
+
if not getattr(self, '_gpu_pool', None):
|
|
337
|
+
self._gpu_pool = GpuBufferPool(max_tensor_bytes, num_buffers)
|
|
338
|
+
|
|
339
|
+
pinned_pool = PinnedBufferPool(max_tensor_bytes, num_buffers)
|
|
340
|
+
cuda_stream = torch.cuda.Stream()
|
|
341
|
+
|
|
342
|
+
logging_utils.normal(f"Direct GPU pipeline initialized: {num_buffers} buffers, max {max_tensor_bytes / (1024**2):.1f}MB each (Total VRAM: {(num_buffers*max_tensor_bytes)/(1024**2):.1f}MB)")
|
|
343
|
+
|
|
344
|
+
except Exception as e:
|
|
345
|
+
logging_utils.warning(f"Failed to initialize direct GPU pipeline: {e}. Falling back.")
|
|
346
|
+
self.direct_gpu = False
|
|
347
|
+
pinned_pool = None
|
|
348
|
+
elif self.direct_gpu:
|
|
349
|
+
logging_utils.warning("direct_gpu=True requested but CUDA is not available. Falling back to CPU.")
|
|
350
|
+
self.direct_gpu = False
|
|
351
|
+
|
|
290
352
|
def get_file_handle():
|
|
291
353
|
if not hasattr(thread_local, 'file'):
|
|
292
354
|
thread_local.file = open(self.filename, "rb")
|
|
293
355
|
return thread_local.file
|
|
294
356
|
|
|
295
357
|
def _worker_load(key):
|
|
358
|
+
buf_idx = None
|
|
359
|
+
gpu_idx = None
|
|
296
360
|
try:
|
|
297
|
-
# Direct thread-safe read
|
|
298
361
|
metadata = self._header[key]
|
|
299
362
|
offset_start, offset_end = metadata["data_offsets"]
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
else:
|
|
306
|
-
tensor_bytes = None
|
|
363
|
+
sz = offset_end - offset_start
|
|
364
|
+
|
|
365
|
+
if self.direct_gpu and sz > 0:
|
|
366
|
+
# Direct GPU Pipeline Path
|
|
367
|
+
buf_idx, pinned_buf = pinned_pool.acquire()
|
|
307
368
|
|
|
308
|
-
|
|
309
|
-
|
|
369
|
+
try:
|
|
370
|
+
# Schedule GPU transfer
|
|
371
|
+
gpu_idx, gpu_buf = self._gpu_pool.acquire()
|
|
372
|
+
|
|
373
|
+
try:
|
|
374
|
+
# Read into pinned memory directly (Zero-Copy CPU path)
|
|
375
|
+
import ctypes
|
|
376
|
+
view = pinned_buf[:sz]
|
|
377
|
+
|
|
378
|
+
# Create a ctypes c_uint8 array spanning the pinned buffer memory
|
|
379
|
+
# This allows f.readinto() to write bytes directly to the torch tensor memory
|
|
380
|
+
c_uint8_array = (ctypes.c_uint8 * sz).from_address(view.data_ptr())
|
|
381
|
+
|
|
382
|
+
f = get_file_handle()
|
|
383
|
+
f.seek(self._header_size + 8 + offset_start)
|
|
384
|
+
f.readinto(c_uint8_array)
|
|
385
|
+
|
|
386
|
+
gpu_view = gpu_buf[:sz]
|
|
387
|
+
|
|
388
|
+
with torch.cuda.stream(cuda_stream):
|
|
389
|
+
gpu_view.copy_(view, non_blocking=True)
|
|
390
|
+
|
|
391
|
+
# Create event to track when copy finishes
|
|
392
|
+
event = torch.cuda.Event()
|
|
393
|
+
event.record()
|
|
394
|
+
|
|
395
|
+
# Critical: wait for stream before allowing worker to finish
|
|
396
|
+
# If worker finishes, buffer might be overwritten by next worker
|
|
397
|
+
# if pool sizing is tight.
|
|
398
|
+
# In direct_gpu, the tensor is the gpu_view.
|
|
399
|
+
return key, gpu_view, metadata, buf_idx, gpu_idx, event
|
|
400
|
+
|
|
401
|
+
except Exception as e:
|
|
402
|
+
# If reading or copying fails, release GPU buffer
|
|
403
|
+
self._gpu_pool.release(gpu_idx)
|
|
404
|
+
raise e
|
|
405
|
+
|
|
406
|
+
except Exception as e:
|
|
407
|
+
# If acquiring GPU buffer fails, release pinned buffer
|
|
408
|
+
pinned_pool.release(buf_idx)
|
|
409
|
+
raise e
|
|
410
|
+
else:
|
|
411
|
+
# Standard CPU Path
|
|
412
|
+
if offset_start != offset_end:
|
|
413
|
+
f = get_file_handle()
|
|
414
|
+
f.seek(self._header_size + 8 + offset_start)
|
|
415
|
+
tensor_bytes = bytearray(offset_end - offset_start)
|
|
416
|
+
f.readinto(tensor_bytes)
|
|
417
|
+
else:
|
|
418
|
+
tensor_bytes = None
|
|
419
|
+
|
|
420
|
+
tensor = self._deserialize_tensor(tensor_bytes, metadata)
|
|
421
|
+
return key, tensor, None, None, None, None
|
|
310
422
|
except Exception as e:
|
|
311
|
-
|
|
312
|
-
|
|
423
|
+
return key, None, e, None, None, None
|
|
424
|
+
|
|
425
|
+
max_workers = min(16, max(4, batch_size))
|
|
426
|
+
max_in_flight = max(max_workers, prefetch_batches * batch_size)
|
|
313
427
|
|
|
314
428
|
# Queue for individual (key, tensor) pairs
|
|
315
|
-
# Size it to hold enough for prefetch_batches
|
|
316
|
-
q = queue.Queue(maxsize=
|
|
317
|
-
|
|
429
|
+
# Size it to hold enough for prefetch_batches PLUS max_workers to prevent stalling
|
|
430
|
+
q = queue.Queue(maxsize=max_in_flight + max_workers)
|
|
431
|
+
|
|
318
432
|
def _producer():
|
|
319
|
-
# Use a reasonable number of workers for I/O bound tasks
|
|
320
|
-
max_workers = min(16, max(4, batch_size))
|
|
321
|
-
# Limit task submission to maintain backpressure on memory
|
|
322
|
-
max_in_flight = max(max_workers, prefetch_batches * batch_size)
|
|
323
|
-
|
|
324
433
|
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
325
434
|
futures = []
|
|
326
435
|
key_iter = iter(keys)
|
|
327
|
-
|
|
436
|
+
|
|
328
437
|
# Fill the pipeline
|
|
329
438
|
for _ in range(max_in_flight):
|
|
330
439
|
try:
|
|
@@ -332,50 +441,81 @@ class UnifiedSafetensorsLoader:
|
|
|
332
441
|
futures.append(executor.submit(_worker_load, k))
|
|
333
442
|
except StopIteration:
|
|
334
443
|
break
|
|
335
|
-
|
|
444
|
+
|
|
336
445
|
while futures:
|
|
337
446
|
# Maintain order by taking the first future
|
|
338
447
|
f = futures.pop(0)
|
|
339
448
|
result = f.result() # Blocks until this specific tensor is loaded
|
|
340
449
|
q.put(result) # Blocks if the consumption queue is full
|
|
341
|
-
|
|
450
|
+
|
|
342
451
|
# Submit next task if available
|
|
343
452
|
try:
|
|
344
453
|
k = next(key_iter)
|
|
345
454
|
futures.append(executor.submit(_worker_load, k))
|
|
346
455
|
except StopIteration:
|
|
347
456
|
pass
|
|
348
|
-
|
|
457
|
+
|
|
349
458
|
q.put(None) # Sentinel
|
|
350
459
|
|
|
460
|
+
producer_thread = threading.local()
|
|
351
461
|
producer_thread = threading.Thread(target=_producer, daemon=True)
|
|
352
462
|
producer_thread.start()
|
|
353
463
|
|
|
354
464
|
batch = []
|
|
465
|
+
pending_pinned = [] # Track (event, buf_idx) to release later
|
|
466
|
+
|
|
355
467
|
while True:
|
|
356
468
|
res = q.get()
|
|
357
469
|
if res is None:
|
|
470
|
+
# Synchronize and cleanup any remaining buffers on exit
|
|
471
|
+
for ev, idx in pending_pinned:
|
|
472
|
+
ev.synchronize()
|
|
473
|
+
pinned_pool.release(idx)
|
|
358
474
|
if batch:
|
|
359
475
|
yield batch
|
|
360
476
|
break
|
|
361
|
-
|
|
362
|
-
k, t, err = res
|
|
363
|
-
if err is not None:
|
|
364
|
-
|
|
477
|
+
|
|
478
|
+
k, t, err, buf_idx, gpu_idx, event = res
|
|
479
|
+
if err is not None and not isinstance(err, dict):
|
|
480
|
+
logging_utils.warning(f"Async load failed for {k}, falling back to sync: {err}")
|
|
365
481
|
# Fallback synchronous load
|
|
366
482
|
try:
|
|
367
483
|
t = self.get_tensor(k)
|
|
368
484
|
except Exception as sync_err:
|
|
369
|
-
|
|
485
|
+
logging_utils.error(f"Sync fallback also failed for {k}: {sync_err}")
|
|
370
486
|
raise sync_err
|
|
371
|
-
|
|
487
|
+
|
|
488
|
+
if buf_idx is not None and event is not None:
|
|
489
|
+
# Don't block here! Yield the tensor with its event.
|
|
490
|
+
# Only release the PREVIOUS batch's buffers.
|
|
491
|
+
# This creates a sliding window of safety.
|
|
492
|
+
while len(pending_pinned) >= (max_in_flight + 1):
|
|
493
|
+
ev, idx = pending_pinned.pop(0)
|
|
494
|
+
ev.synchronize() # Wait only if we MUST reuse a buffer
|
|
495
|
+
pinned_pool.release(idx)
|
|
496
|
+
|
|
497
|
+
pending_pinned.append((event, buf_idx))
|
|
498
|
+
|
|
499
|
+
# Register GPU index for cleanup
|
|
500
|
+
self._gpu_buffer_indices[k] = gpu_idx
|
|
501
|
+
|
|
502
|
+
# Reshape GPU view to tensor
|
|
503
|
+
meta = err # we reused err for metadata in direct_gpu path
|
|
504
|
+
dtype = self._get_torch_dtype(meta["dtype"])
|
|
505
|
+
shape = meta["shape"]
|
|
506
|
+
|
|
507
|
+
if meta["dtype"] in ["F8_E5M2", "F8_E4M3"]:
|
|
508
|
+
t = self._convert_float8(t, meta["dtype"], shape)
|
|
509
|
+
else:
|
|
510
|
+
t = t.view(dtype).reshape(shape)
|
|
511
|
+
|
|
372
512
|
# Pin memory sequentially in the main thread to avoid OS-level lock contention
|
|
373
|
-
|
|
513
|
+
elif pin_memory and t.device.type == 'cpu':
|
|
374
514
|
try:
|
|
375
515
|
t = t.pin_memory()
|
|
376
516
|
except Exception as e:
|
|
377
|
-
|
|
378
|
-
|
|
517
|
+
logging_utils.warning(f"Failed to pin memory for {k}: {e}")
|
|
518
|
+
|
|
379
519
|
batch.append((k, t))
|
|
380
520
|
if len(batch) == batch_size:
|
|
381
521
|
yield batch
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pinned memory pool manager.
|
|
3
|
+
|
|
4
|
+
Provides pre-allocated pinned buffers to avoid per-tensor allocation overhead.
|
|
5
|
+
"""
|
|
6
|
+
from typing import Optional
|
|
7
|
+
import torch
|
|
8
|
+
from . import logging_utils
|
|
9
|
+
|
|
10
|
+
logger = logging_utils.get_logger(__name__)
|
|
11
|
+
|
|
12
|
+
class PinnedBufferPool:
    """Manages a pool of pinned memory buffers for fast disk-to-GPU transfer.

    The constructor pre-allocates ``num_buffers`` uint8 tensors of
    ``size_bytes`` bytes each with ``pin_memory=True``. Buffers are handed
    out by index through a ``queue.Queue`` holding the indices that are
    currently free.

    Note: releasing the same index twice is NOT detected; the caller must
    release every acquired index exactly once.
    """

    def __init__(self, size_bytes: int, num_buffers: int):
        """Allocate the buffers and mark all of them as free.

        Args:
            size_bytes: Size of every buffer in bytes; must not be negative.
            num_buffers: Number of buffers to allocate; must be at least 1.

        Raises:
            ValueError: If ``size_bytes`` is negative or ``num_buffers`` is
                smaller than 1.
        """
        # Function-scope import, as in the original constructor.
        import queue

        # Validate before allocating anything: a pool without buffers would
        # make every acquire() block forever.
        if size_bytes < 0:
            raise ValueError(f"size_bytes must be >= 0, got {size_bytes}")
        if num_buffers < 1:
            raise ValueError(f"num_buffers must be >= 1, got {num_buffers}")

        self.size_bytes = size_bytes
        self.num_buffers = num_buffers

        logging_utils.verbose(f"Initializing PinnedBufferPool: {num_buffers} buffers of {size_bytes / (1024**2):.2f} MB each.")

        # One flat uint8 tensor per slot, allocated with pin_memory=True.
        self.buffers = [
            torch.empty(size_bytes, dtype=torch.uint8, pin_memory=True)
            for _ in range(num_buffers)
        ]

        # Indices of the buffers that are currently free; initially all.
        self.free_queue = queue.Queue()
        for i in range(num_buffers):
            self.free_queue.put(i)

    def acquire(self, timeout: Optional[float] = None) -> tuple[int, 'torch.Tensor']:
        """Acquire a free buffer. Blocks if empty.

        Args:
            timeout: Maximum number of seconds to wait for a free buffer.
                ``None`` (the default) waits indefinitely.

        Returns:
            ``(idx, buffer)``, where ``idx`` must later be passed to
            :meth:`release`.

        Raises:
            queue.Empty: If ``timeout`` elapses before a buffer is free.
        """
        idx = self.free_queue.get(timeout=timeout)
        return idx, self.buffers[idx]

    def release(self, idx: int):
        """Release buffer back to pool.

        Args:
            idx: Index previously returned by :meth:`acquire`.

        Raises:
            ValueError: If ``idx`` is not a valid buffer index.
        """
        # Reject bad indices here: a queued negative index would make a later
        # acquire() return buffers[idx] counted from the end (aliasing another
        # slot), and a too-large one would fail far away from the real bug.
        if not 0 <= idx < len(self.buffers):
            raise ValueError(
                f"Invalid buffer index {idx}; expected 0 <= idx < {len(self.buffers)}"
            )
        self.free_queue.put(idx)
|
|
39
|
+
|
|
@@ -4,10 +4,11 @@ Pinned memory utilities for faster CPU→GPU tensor transfers.
|
|
|
4
4
|
Pinned (page-locked) memory enables faster DMA transfers to GPU.
|
|
5
5
|
Uses PyTorch's native pin_memory() with non_blocking transfers.
|
|
6
6
|
"""
|
|
7
|
-
import logging
|
|
8
7
|
from typing import Optional
|
|
9
8
|
|
|
10
|
-
|
|
9
|
+
from . import logging_utils
|
|
10
|
+
|
|
11
|
+
logger = logging_utils.get_logger(__name__)
|
|
11
12
|
|
|
12
13
|
def _ensure_torch():
|
|
13
14
|
try:
|
|
@@ -21,9 +22,14 @@ _verbose = False
|
|
|
21
22
|
_pinned_transfer_stats = {"pinned": 0, "fallback": 0}
|
|
22
23
|
|
|
23
24
|
def set_verbose(enabled: bool):
    """
    Toggle verbose output for pinned transfers.

    Stores ``enabled`` in the module-level ``_verbose`` flag. When ``enabled``
    is truthy, ``logging_utils.setup_logging("VERBOSE")`` is called as well;
    a falsy value leaves the logging setup untouched.
    """
    global _verbose
    _verbose = enabled
    if not enabled:
        # Nothing else to do when switching verbose output off.
        return
    logging_utils.setup_logging("VERBOSE")
|
|
27
33
|
|
|
28
34
|
def get_pinned_transfer_stats():
|
|
29
35
|
"""Return pinned transfer statistics for verification."""
|
|
@@ -34,10 +40,12 @@ def reset_pinned_transfer_stats():
|
|
|
34
40
|
global _pinned_transfer_stats
|
|
35
41
|
_pinned_transfer_stats = {"pinned": 0, "fallback": 0}
|
|
36
42
|
|
|
43
|
+
@logging_utils.log_debug
|
|
37
44
|
def transfer_to_gpu_pinned(
|
|
38
45
|
tensor,
|
|
39
46
|
device: str = 'cuda',
|
|
40
|
-
dtype = None
|
|
47
|
+
dtype = None,
|
|
48
|
+
non_blocking: bool = True
|
|
41
49
|
):
|
|
42
50
|
"""Transfer tensor to GPU using pinned memory for faster transfer."""
|
|
43
51
|
torch = _ensure_torch()
|
|
@@ -46,44 +54,47 @@ def transfer_to_gpu_pinned(
|
|
|
46
54
|
# Skip if not a CPU tensor or CUDA unavailable
|
|
47
55
|
if tensor.device.type != 'cpu' or not torch.cuda.is_available():
|
|
48
56
|
if dtype is not None:
|
|
49
|
-
return tensor.to(device=device, dtype=dtype)
|
|
50
|
-
return tensor.to(device=device)
|
|
57
|
+
return tensor.to(device=device, dtype=dtype, non_blocking=non_blocking)
|
|
58
|
+
return tensor.to(device=device, non_blocking=non_blocking)
|
|
51
59
|
|
|
52
60
|
# Skip if target is not CUDA
|
|
53
61
|
if not str(device).startswith('cuda'):
|
|
54
62
|
if dtype is not None:
|
|
55
|
-
return tensor.to(device=device, dtype=dtype)
|
|
56
|
-
return tensor.to(device=device)
|
|
63
|
+
return tensor.to(device=device, dtype=dtype, non_blocking=non_blocking)
|
|
64
|
+
return tensor.to(device=device, non_blocking=non_blocking)
|
|
57
65
|
|
|
58
66
|
try:
|
|
59
67
|
pinned = tensor.pin_memory()
|
|
60
68
|
|
|
61
69
|
if dtype is not None:
|
|
62
|
-
result = pinned.to(device=device, dtype=dtype, non_blocking=
|
|
70
|
+
result = pinned.to(device=device, dtype=dtype, non_blocking=non_blocking)
|
|
63
71
|
else:
|
|
64
|
-
result = pinned.to(device=device, non_blocking=
|
|
72
|
+
result = pinned.to(device=device, non_blocking=non_blocking)
|
|
65
73
|
|
|
66
74
|
torch.cuda.current_stream().synchronize()
|
|
67
75
|
|
|
68
76
|
# One-time confirmation on first success
|
|
69
77
|
if _pinned_transfer_stats["pinned"] == 0:
|
|
70
|
-
|
|
78
|
+
logging_utils.verbose("[pinned_transfer] Pinned memory active - faster GPU transfers enabled")
|
|
71
79
|
|
|
72
80
|
_pinned_transfer_stats["pinned"] += 1
|
|
81
|
+
|
|
82
|
+
msg = f"[pinned_transfer] Pinned: {tensor.shape} ({tensor.numel() * tensor.element_size() / 1024:.1f} KB)"
|
|
73
83
|
if _verbose:
|
|
74
|
-
|
|
84
|
+
logging_utils.normal(msg)
|
|
75
85
|
else:
|
|
76
|
-
|
|
86
|
+
logging_utils.verbose(msg)
|
|
77
87
|
|
|
78
88
|
return result
|
|
79
89
|
|
|
80
90
|
except Exception as e:
|
|
81
91
|
_pinned_transfer_stats["fallback"] += 1
|
|
92
|
+
msg = f"[pinned_transfer] Fallback transfer to {device} due to error: {e}"
|
|
82
93
|
if _verbose:
|
|
83
|
-
|
|
94
|
+
logging_utils.warning(msg)
|
|
84
95
|
else:
|
|
85
|
-
|
|
96
|
+
logging_utils.verbose(msg)
|
|
86
97
|
|
|
87
98
|
if dtype is not None:
|
|
88
|
-
return tensor.to(device=device, dtype=dtype)
|
|
89
|
-
return tensor.to(device=device)
|
|
99
|
+
return tensor.to(device=device, dtype=dtype, non_blocking=non_blocking)
|
|
100
|
+
return tensor.to(device=device, non_blocking=non_blocking)
|
{unifiedefficientloader-0.2.1 → unifiedefficientloader-0.2.3}/unifiedefficientloader/tensor_utils.py
RENAMED
|
@@ -5,10 +5,11 @@ Provides serialization helpers for dictionary/tensor conversion.
|
|
|
5
5
|
Requires `torch`.
|
|
6
6
|
"""
|
|
7
7
|
import json
|
|
8
|
-
import logging
|
|
9
8
|
from typing import Dict, Tuple
|
|
10
9
|
|
|
11
|
-
|
|
10
|
+
from . import logging_utils
|
|
11
|
+
|
|
12
|
+
logger = logging_utils.get_logger(__name__)
|
|
12
13
|
|
|
13
14
|
def _ensure_torch():
|
|
14
15
|
try:
|
|
@@ -18,6 +19,7 @@ def _ensure_torch():
|
|
|
18
19
|
raise ImportError("The 'torch' package is required but not installed. Please install it.")
|
|
19
20
|
|
|
20
21
|
|
|
22
|
+
@logging_utils.log_debug
|
|
21
23
|
def dict_to_tensor(data_dict: dict):
|
|
22
24
|
"""
|
|
23
25
|
Convert a dictionary to a torch.uint8 tensor containing JSON bytes.
|
|
@@ -32,9 +34,10 @@ def dict_to_tensor(data_dict: dict):
|
|
|
32
34
|
json_str = json.dumps(data_dict)
|
|
33
35
|
byte_data = json_str.encode("utf-8")
|
|
34
36
|
tensor_data = torch.tensor(list(byte_data), dtype=torch.uint8)
|
|
35
|
-
|
|
37
|
+
logging_utils.debug(f"dict_to_tensor: serialized dict to uint8 tensor of shape {tensor_data.shape}")
|
|
36
38
|
return tensor_data
|
|
37
39
|
|
|
40
|
+
@logging_utils.log_debug
|
|
38
41
|
def tensor_to_dict(tensor_data) -> dict:
|
|
39
42
|
"""
|
|
40
43
|
Convert a torch.uint8 tensor containing JSON bytes to a dictionary.
|
|
@@ -50,5 +53,5 @@ def tensor_to_dict(tensor_data) -> dict:
|
|
|
50
53
|
byte_data = bytes(tensor_data.tolist())
|
|
51
54
|
json_str = byte_data.decode("utf-8")
|
|
52
55
|
data_dict = json.loads(json_str)
|
|
53
|
-
|
|
54
|
-
return data_dict
|
|
56
|
+
logging_utils.debug(f"tensor_to_dict: deserialized tensor of shape {tensor_data.shape} to dict with keys: {list(data_dict.keys())}")
|
|
57
|
+
return data_dict
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: unifiedefficientloader
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: A unified interface for memory efficient per tensor loading of safetensors files as raw bytes from offset, handling CPU/GPU pinned transfers, and converting between tensors and dicts.
|
|
5
5
|
Author: silveroxides
|
|
6
6
|
License: MIT
|
|
@@ -111,6 +111,33 @@ with UnifiedSafetensorsLoader("model.safetensors", low_memory=True) as loader:
|
|
|
111
111
|
loader.mark_processed(key)
|
|
112
112
|
```
|
|
113
113
|
|
|
114
|
+
### Direct-to-GPU Streaming (Zero-Copy)
|
|
115
|
+
|
|
116
|
+
For the fastest loading times on CUDA devices, use the `direct_gpu=True` flag. This creates a pipeline that pre-allocates pinned memory pools and GPU memory slabs. Each tensor is read from disk directly into a pinned buffer and then copied to the GPU asynchronously on a CUDA stream, so the PCIe transfer latency is hidden behind the disk I/O.
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
from unifiedefficientloader import UnifiedSafetensorsLoader
|
|
120
|
+
|
|
121
|
+
with UnifiedSafetensorsLoader("model.safetensors", low_memory=True, direct_gpu=True) as loader:
|
|
122
|
+
keys_to_load = loader.keys()
|
|
123
|
+
|
|
124
|
+
# async_stream will automatically coordinate disk -> pinned buffer -> GPU slab -> tensor header
|
|
125
|
+
stream = loader.async_stream(
|
|
126
|
+
keys_to_load,
|
|
127
|
+
batch_size=8,
|
|
128
|
+
prefetch_batches=2,
|
|
129
|
+
direct_gpu=True # optional here since we passed it in __init__
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
for batch in stream:
|
|
133
|
+
for key, gpu_tensor in batch:
|
|
134
|
+
# gpu_tensor is already on the GPU!
|
|
135
|
+
assert gpu_tensor.device.type == "cuda"
|
|
136
|
+
|
|
137
|
+
# ... process gpu_tensor ...
|
|
138
|
+
loader.mark_processed(key)
|
|
139
|
+
```
|
|
140
|
+
|
|
114
141
|
### Tensor/Dict Conversion
|
|
115
142
|
|
|
116
143
|
```python
|
|
@@ -2,9 +2,14 @@ LICENSE
|
|
|
2
2
|
README.md
|
|
3
3
|
pyproject.toml
|
|
4
4
|
setup.py
|
|
5
|
+
tests/test_direct_gpu.py
|
|
6
|
+
tests/test_logging.py
|
|
5
7
|
tests/test_utils.py
|
|
6
8
|
unifiedefficientloader/__init__.py
|
|
9
|
+
unifiedefficientloader/gpu_buffer_pool.py
|
|
10
|
+
unifiedefficientloader/logging_utils.py
|
|
7
11
|
unifiedefficientloader/memory_efficient_loader.py
|
|
12
|
+
unifiedefficientloader/pinned_buffer_pool.py
|
|
8
13
|
unifiedefficientloader/pinned_transfer.py
|
|
9
14
|
unifiedefficientloader/tensor_utils.py
|
|
10
15
|
unifiedefficientloader.egg-info/PKG-INFO
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|