PyPI - mmgp - Versions diffs - 3.0.3__py3-none-any.whl → 3.1.0__py3-none-any.whl - Mend

mmgp 3.0.3py3-none-any.whl → 3.1.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mmgp might be problematic. Click here for more details.

Files changed (8) hide show

mmgp/offload.py +503 -394
mmgp/safetensors2.py +85 -32
{mmgp-3.0.3.dist-info → mmgp-3.1.0.dist-info}/METADATA +14 -10
mmgp-3.1.0.dist-info/RECORD +9 -0
{mmgp-3.0.3.dist-info → mmgp-3.1.0.dist-info}/WHEEL +1 -1
mmgp-3.0.3.dist-info/RECORD +0 -9
{mmgp-3.0.3.dist-info → mmgp-3.1.0.dist-info}/LICENSE.md +0 -0
{mmgp-3.0.3.dist-info → mmgp-3.1.0.dist-info}/top_level.txt +0 -0

mmgp/safetensors2.py CHANGED Viewed

@@ -155,20 +155,33 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
            torch.bool : 'BOOL' ,  torch.float64 : 'F64' , torch.float32 : 'F32' , torch.float16 : 'F16', torch.float8_e5m2 : "F8_E5M2", torch.float8_e4m3fn: "F8_E4M3" }
     pos = 0
     i = 0
-    mx = 1000000
+    mx = 100000
+    metadata = dict()
     for k , t  in sd.items():
-        entry = {}
-        dtypestr= map[t.dtype]
-        entry["dtype"] = dtypestr
-        entry["shape"] = list(t.shape)
-        size = torch.numel(t) * t.element_size()
-        entry["data_offsets"] = [pos, pos + size]
-        pos += size
-        sf_sd[k] = entry
+        if torch.is_tensor(t):
+            entry = {}
+            dtypestr= map[t.dtype]
+            entry["dtype"] = dtypestr
+            entry["shape"] = list(t.shape)
+            size = torch.numel(t) * t.element_size()
+            if size == 0:
+                pass
+            entry["data_offsets"] = [pos, pos + size]
+            pos += size
+            sf_sd[k] = entry
+        else:
+            if isinstance(t, str):
+                metadata[k] = t
+            else:
+                try:
+                    b64 = base64.b64encode(json.dumps(t, ensure_ascii=False).encode('utf8')).decode('utf8')
+                    metadata[k + "_base64"] = b64
+                except:
+                    pass
         i+=1
         if i==mx:
             break
-    metadata = dict()
     if not quantization_map is None:
         metadata["quantization_format"] = "quanto"
         metadata["quantization_map_base64"] =  base64.b64encode(json.dumps(quantization_map, ensure_ascii=False).encode('utf8')).decode('utf8')
@@ -186,21 +199,24 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
     length_of_header_bytes = struct.pack('<Q', size_header)
-    empty_tensor = b'\x80\x3f'
     with open(file_path, "wb") as writer:
         bytes_written = writer.write(length_of_header_bytes)
         bytes_written = writer.write(header_bytes)
         i = 0
         for k , t  in sd.items():
-            size = torch.numel(t) * t.element_size()
-            if len(t.shape) == 0:
-                bytes_written = writer.write(empty_tensor)
-            else:
-                buffer = t.view(torch.uint8).numpy().tobytes()
-                bytes_written = writer.write(buffer)
-            assert bytes_written == size
+            if torch.is_tensor(t):
+                size = torch.numel(t) * t.element_size()
+                if size != 0:
+                    dtype = t.dtype
+                    # convert in a friendly format, scalars types not supported by numpy
+                    if  dtype ==  torch.bfloat16:
+                        t = t.view(torch.uint16)
+                    elif  dtype ==  torch.float8_e5m2 or dtype ==  torch.float8_e4m3fn:
+                        t = t.view(torch.uint8)
+                    buffer = t.numpy().tobytes()
+                    bytes_written = writer.write(buffer)
+                    assert bytes_written == size
             i+=1
             if i==mx:
                 break
@@ -208,7 +224,7 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
 class SafeTensorFile:
     """Main class for accessing safetensors files that provides memory-efficient access"""
-    def __init__(self, file_path, metadata, catalog, skip_bytes):
+    def __init__(self, file_path, metadata, catalog, skip_bytes, lazy_loading = True):
         self._file_path = file_path
         self._metadata = metadata
         self._catalog = catalog
@@ -216,20 +232,30 @@ class SafeTensorFile:
         self._keys = None
         self.sd = None
         self.mtracker = None
+        self.lazy_loading = lazy_loading
     @classmethod
-    def load_metadata(cls, file_path):
+    def load_metadata(cls, file_path, lazy_loading = True):
         with open(file_path, 'rb') as f:
             catalog, metadata, skip_bytes = _read_safetensors_header(file_path, f)
-        return cls(file_path, metadata, catalog, skip_bytes)
+        return cls(file_path, metadata, catalog, skip_bytes, lazy_loading)
-    def init_tensors(self):
+    def init_tensors(self, lazyTensors = True):
         if self.sd is None:
-            self.sd = self.create_tensors()
+            self.lazy_loading = lazyTensors
+            if lazyTensors:
+                self.sd = self.create_tensors_with_mmap()
+            else:
+                self.sd = self.create_tensors_without_mmap()
+        # else:
+        #     if not self.lazy_loading and lazyTensors:
+        #         raise Exception("Every tensor should be either lazy loaded or not lazy loaded")
         return self.sd
-    def create_tensors(self):
+    def create_tensors_with_mmap(self):
         self.mtracker = MmapTracker(self._file_path)
         import mmap
@@ -281,8 +307,12 @@ class SafeTensorFile:
                 length = data_offsets[1]-data_offsets[0]
                 map_idx = next(iter_tensor_no)
                 offset = current_pos - maps[map_idx][1]
-                if len(shape) == 0:
-                    t = torch.ones((), dtype=dtype, device="cpu")
+                if length == 0:
+                    t = torch.empty(shape, dtype=dtype)
+                elif len(shape) == 0:
+                    # don't waste a memory view for a scalar
+                    t = torch.frombuffer(bytearray(maps[map_idx][0][offset:offset + length]), dtype=torch.uint8)
+                    t = t.view(dtype)
                 else:
                     mv = memoryview(maps[map_idx][0])[offset:offset + length]
                     t = torch.frombuffer(mv, dtype=dtype)
@@ -293,8 +323,33 @@ class SafeTensorFile:
         return sd
+    def create_tensors_without_mmap(self):
+        sd = OrderedDict()
+        with open(self._file_path, 'rb') as f:
+            f.seek(self._skip_bytes, 0)
+            for k,v in self._catalog.items():
+                dtypestr =  v["dtype"]
+                dtype= _map_to_dtype[dtypestr]
+                shape = v["shape"]
+                data_offsets = v["data_offsets"]
+                length = data_offsets[1]-data_offsets[0]
+                buffer = f.read(length)
+                if len(shape) == 0:
+                    if length == 0:
+                        t = torch.empty(0, dtype=dtype)
+                    else:
+                        t = torch.frombuffer(bytearray(buffer), dtype=torch.uint8)
+                        t = t.view(dtype)
+                else:
+                    t = torch.frombuffer(bytearray(buffer), dtype=dtype)
+                    t = torch.reshape(t, shape)
+                sd[k] = t
+        return sd
     def get_tensor(self, name: str) -> torch.tensor:
         """Get a tensor by name"""
+        # To do : switch to a JIT tensor creation per tensor
         self.init_tensors()
         return self.sd[name]
@@ -310,7 +365,7 @@ class SafeTensorFile:
     def tensors(self) -> Dict[str, torch.tensor]:
         """Get dictionary of all tensors"""
-        self.init_tensors()
+        self.init_tensors(self.lazy_loading)
         return self.sd
     def metadata(self) -> Optional[Dict[str, str]]:
@@ -319,7 +374,7 @@ class SafeTensorFile:
     def __len__(self) -> int:
         """Get number of tensors"""
-        self.init_tensors()
+        self.init_tensors(self.lazy_loading)
         return len(self.keys())
     def __contains__(self, key: str) -> bool:
@@ -337,10 +392,9 @@ class SafeTensorFile:
 class _SafeTensorLoader:
     """Context manager for loading SafeTensorFile"""
-    def __init__(self, filename: str):
+    def __init__(self, filename: str ):
         self.filename = Path(filename)
         self.sft = None
         if not self.filename.exists():
             raise FileNotFoundError(f"File not found: {filename}")
@@ -367,7 +421,6 @@ class _SafeTensorLoader:
 def safe_open(filename: str, framework: str = "pt",device = "cpu") -> _SafeTensorLoader:
     if device != "cpu" or framework !="pt":
-        pass
         return _old_safe_open(filename =filename, framework=framework, device=device)
     return _SafeTensorLoader(filename)

{mmgp-3.0.3.dist-info → mmgp-3.1.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: mmgp
-Version: 3.0.3
+Version: 3.1.0
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License:                     GNU GENERAL PUBLIC LICENSE
@@ -13,10 +13,11 @@ Requires-Dist: optimum-quanto
 Requires-Dist: accelerate
 Requires-Dist: safetensors
 Requires-Dist: psutil
+Requires-Dist: peft
 <p align="center">
-  <H2>Memory Management 3.0 for the GPU Poor by DeepBeepMeep</H2>
+  <H2>Memory Management 3.1.0 for the GPU Poor by DeepBeepMeep</H2>
 </p>
@@ -38,8 +39,9 @@ Each profile may use a combination of the following:
 - Ability to pin models to reserved RAM to accelerate transfers to VRAM
 - Async transfers to VRAM to avoid a pause when loading a new slice of a model
 - Automated on the fly quantization or ability to load pre quantized models
-- support for pytorch compilation on Linux and WSL (not supported so far on pure Windows).
+- Pretrained Lora support with low RAM requirements
+- Support for pytorch compilation on Linux and WSL (supported on pure Windows but requires a complex Triton Installation).
+-
 ## Installation
 First you need to install the module in your current project with:
 ```shell
@@ -98,27 +100,29 @@ For example:
 The smaller this number, the more VRAM left for image data / longer video but also the slower because there will be lots of loading / unloading between the RAM and the VRAM. If a model is too big to fit in a budget, it will be broken down into multiple parts that will be unloaded / loaded consecutively. The speed of low budget can be increased (up to 2 times) by turning on the options pinnedMemory and asyncTransfers.
 - asyncTransfers: boolean, load to the GPU the next model part while the current part is being processed. This requires twice the budget if any is defined. This may increase speed by 20% (mostly visible on fast modern GPUs).
 - verboseLevel: number between 0 and 2 (1 by default), provides various level of feedback of the different processes
-- compile: list of model ids to compile, may accelerate up x2 depending on the type of GPU. As of 01/01/2025 it will work only on Linux or WSL since compilation relies on Triton which is not yet supported on Windows
+- compile: list of model ids to compile, may accelerate up to x2 depending on the type of GPU. It makes sense to compile only the model that is frequently used such as the "transformer" model in the case of video or image generation. As of 01/01/2025 it will work only on Linux or WSL since compilation relies on Triton which is not yet supported on Windows
 If you are short on RAM and plan to work with quantized models, it is recommended to load pre-quantized models directly rather than using on the fly quantization, it will be faster and consume slightly less RAM.
 ##  Going further
 The module includes several tools to package a light version of your favorite video / image generator:
-- *save_model(model, file_path, do_quantize = False, quantization_type = qint8 )*\
+- *save_model(model, file_path, do_quantize = False, quantizationType = qint8 )*\
 Save tensors of a model already loaded in memory in a safetensor format (much faster to reload). You can save it in a quantized format (default qint8 quantization recommended).
 The resulting safetensor file will contain extra fields in its metadata such as the quantization map and its configuration, so you will be able to move the file around without files such as *config.json* or *file_map.json*.
 You will need *load_model_data* or *fast_load_transformers_model* to read the file again. You may also load it using the default *safetensor* library; however, you will need to provide in the same directory any complementary files that are usually requested (for instance *config.json*).
-- *load_model_data(model, file_path: str, do_quantize = False, quantization_type = qint8, pinToRAM = False, partialPin = False)*\
+- *load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToRAM = False, partialPin = False)*\
 Load into RAM the tensor data of a model that has already been initialized with no data. Detect and handle quantized models saved previously with *save_model*. A model can also be quantized on the fly while being loaded. The model can be pinned to RAM while it is being loaded; this is more RAM efficient than pinning tensors later using *offload.all* or *offload.profile*.
-- *fast_load_transformers_model(model_path: str, do_quantize = False, quantization_type = qint8, pinToRAM = False, partialPin = False)*\
+- *fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToRAM = False, partialPin = False)*\
 Initialize (build the model hierarchy in memory) and fast load the corresponding tensors of a 'transformers' or 'diffusers' library model.
 The advantages over the original *from_pretrained* method are that a full model can fit into a single file with a filename of your choosing (therefore you can have multiple 'transformers' versions of the same model in the same directory) and that prequantized models are processed in a transparent way.
 Last but not least, you can also on the fly pin to RAM the whole model or the most important part of it (partialPin = True) in a more efficient way (faster and requires less RAM) than if you did through *offload.all* or *offload.profile*.
+- *load_loras_into_model(model, lora_path, lora_multi)*\
+Load into a model a list of Loras described by a list of paths *lora_path* and a list of weight coefficients *lora_multi*.
+The Lora file must be in the *diffusers* format. This function works also on non diffusers models. However if there is already an official Lora support for a model it is recommended to use the official diffusers functions.
 The typical workflow will be:
 1) temporarily insert the *save_model* function just after a model has been fully loaded to save a copy of the model / quantized model.

mmgp-3.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=VDau0VCAWHnS40swGuqxn7LIyZJdI0qYI58iGCRyw3Y,67352
+mmgp/safetensors2.py,sha256=mTXL-rZ2lZwYKRujNAc8lUJoqQjq6lpD2XrkuZjA_2Y,16138
+mmgp-3.1.0.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+mmgp-3.1.0.dist-info/METADATA,sha256=A5Tvc-FGxjk3FuzNHlQ6g6ztJg7hqIwPKvL5EK1pXTc,12708
+mmgp-3.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+mmgp-3.1.0.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.1.0.dist-info/RECORD,,

{mmgp-3.0.3.dist-info → mmgp-3.1.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.7.0)
+Generator: setuptools (75.8.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

mmgp-3.0.3.dist-info/RECORD DELETED Viewed

@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=N_n12QJmZlPRbZiYl6BQVfmJaqxxIbiCKkT6w-2CVo4,61781
-mmgp/safetensors2.py,sha256=CSv8HdrjURUzBazpaBDU1WNwUL1lhzpCyzG0GWygbGE,13602
-mmgp-3.0.3.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.0.3.dist-info/METADATA,sha256=0dw13_XUzNPCV6VL-e5FAjvMIUDDT1ffFf7rLG_34zc,12079
-mmgp-3.0.3.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
-mmgp-3.0.3.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.0.3.dist-info/RECORD,,

{mmgp-3.0.3.dist-info → mmgp-3.1.0.dist-info}/LICENSE.md RENAMED Viewed

File without changes

{mmgp-3.0.3.dist-info → mmgp-3.1.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

mmgp 3.0.3__py3-none-any.whl → 3.1.0__py3-none-any.whl

Potentially problematic release.

mmgp 3.0.3py3-none-any.whl → 3.1.0py3-none-any.whl