mmgp 3.0.3__py3-none-any.whl → 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mmgp might be problematic. Click here for more details.

mmgp/safetensors2.py CHANGED
@@ -155,20 +155,33 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
155
155
  torch.bool : 'BOOL' , torch.float64 : 'F64' , torch.float32 : 'F32' , torch.float16 : 'F16', torch.float8_e5m2 : "F8_E5M2", torch.float8_e4m3fn: "F8_E4M3" }
156
156
  pos = 0
157
157
  i = 0
158
- mx = 1000000
158
+ mx = 100000
159
+ metadata = dict()
159
160
  for k , t in sd.items():
160
- entry = {}
161
- dtypestr= map[t.dtype]
162
- entry["dtype"] = dtypestr
163
- entry["shape"] = list(t.shape)
164
- size = torch.numel(t) * t.element_size()
165
- entry["data_offsets"] = [pos, pos + size]
166
- pos += size
167
- sf_sd[k] = entry
161
+ if torch.is_tensor(t):
162
+ entry = {}
163
+ dtypestr= map[t.dtype]
164
+ entry["dtype"] = dtypestr
165
+ entry["shape"] = list(t.shape)
166
+ size = torch.numel(t) * t.element_size()
167
+ if size == 0:
168
+ pass
169
+ entry["data_offsets"] = [pos, pos + size]
170
+ pos += size
171
+ sf_sd[k] = entry
172
+ else:
173
+ if isinstance(t, str):
174
+ metadata[k] = t
175
+ else:
176
+ try:
177
+ b64 = base64.b64encode(json.dumps(t, ensure_ascii=False).encode('utf8')).decode('utf8')
178
+ metadata[k + "_base64"] = b64
179
+ except:
180
+ pass
181
+
168
182
  i+=1
169
183
  if i==mx:
170
184
  break
171
- metadata = dict()
172
185
  if not quantization_map is None:
173
186
  metadata["quantization_format"] = "quanto"
174
187
  metadata["quantization_map_base64"] = base64.b64encode(json.dumps(quantization_map, ensure_ascii=False).encode('utf8')).decode('utf8')
@@ -186,21 +199,24 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
186
199
 
187
200
  length_of_header_bytes = struct.pack('<Q', size_header)
188
201
 
189
- empty_tensor = b'\x80\x3f'
190
-
191
202
  with open(file_path, "wb") as writer:
192
203
  bytes_written = writer.write(length_of_header_bytes)
193
204
  bytes_written = writer.write(header_bytes)
194
205
 
195
206
  i = 0
196
207
  for k , t in sd.items():
197
- size = torch.numel(t) * t.element_size()
198
- if len(t.shape) == 0:
199
- bytes_written = writer.write(empty_tensor)
200
- else:
201
- buffer = t.view(torch.uint8).numpy().tobytes()
202
- bytes_written = writer.write(buffer)
203
- assert bytes_written == size
208
+ if torch.is_tensor(t):
209
+ size = torch.numel(t) * t.element_size()
210
+ if size != 0:
211
+ dtype = t.dtype
212
+ # convert in a friendly format, scalars types not supported by numpy
213
+ if dtype == torch.bfloat16:
214
+ t = t.view(torch.uint16)
215
+ elif dtype == torch.float8_e5m2 or dtype == torch.float8_e4m3fn:
216
+ t = t.view(torch.uint8)
217
+ buffer = t.numpy().tobytes()
218
+ bytes_written = writer.write(buffer)
219
+ assert bytes_written == size
204
220
  i+=1
205
221
  if i==mx:
206
222
  break
@@ -208,7 +224,7 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
208
224
  class SafeTensorFile:
209
225
  """Main class for accessing safetensors files that provides memory-efficient access"""
210
226
 
211
- def __init__(self, file_path, metadata, catalog, skip_bytes):
227
+ def __init__(self, file_path, metadata, catalog, skip_bytes, lazy_loading = True):
212
228
  self._file_path = file_path
213
229
  self._metadata = metadata
214
230
  self._catalog = catalog
@@ -216,20 +232,30 @@ class SafeTensorFile:
216
232
  self._keys = None
217
233
  self.sd = None
218
234
  self.mtracker = None
235
+ self.lazy_loading = lazy_loading
219
236
 
220
237
  @classmethod
221
- def load_metadata(cls, file_path):
238
+ def load_metadata(cls, file_path, lazy_loading = True):
222
239
  with open(file_path, 'rb') as f:
223
240
  catalog, metadata, skip_bytes = _read_safetensors_header(file_path, f)
224
241
 
225
- return cls(file_path, metadata, catalog, skip_bytes)
242
+ return cls(file_path, metadata, catalog, skip_bytes, lazy_loading)
226
243
 
227
- def init_tensors(self):
244
+ def init_tensors(self, lazyTensors = True):
228
245
  if self.sd is None:
229
- self.sd = self.create_tensors()
246
+ self.lazy_loading = lazyTensors
247
+ if lazyTensors:
248
+ self.sd = self.create_tensors_with_mmap()
249
+ else:
250
+ self.sd = self.create_tensors_without_mmap()
251
+ # else:
252
+ # if not self.lazy_loading and lazyTensors:
253
+ # raise Exception("Every tensor should be either lazy loaded or not lazy loaded")
254
+
230
255
  return self.sd
231
256
 
232
- def create_tensors(self):
257
+
258
+ def create_tensors_with_mmap(self):
233
259
 
234
260
  self.mtracker = MmapTracker(self._file_path)
235
261
  import mmap
@@ -281,8 +307,12 @@ class SafeTensorFile:
281
307
  length = data_offsets[1]-data_offsets[0]
282
308
  map_idx = next(iter_tensor_no)
283
309
  offset = current_pos - maps[map_idx][1]
284
- if len(shape) == 0:
285
- t = torch.ones((), dtype=dtype, device="cpu")
310
+ if length == 0:
311
+ t = torch.empty(shape, dtype=dtype)
312
+ elif len(shape) == 0:
313
+ # don't waste a memory view for a scalar
314
+ t = torch.frombuffer(bytearray(maps[map_idx][0][offset:offset + length]), dtype=torch.uint8)
315
+ t = t.view(dtype)
286
316
  else:
287
317
  mv = memoryview(maps[map_idx][0])[offset:offset + length]
288
318
  t = torch.frombuffer(mv, dtype=dtype)
@@ -293,8 +323,33 @@ class SafeTensorFile:
293
323
 
294
324
  return sd
295
325
 
326
+ def create_tensors_without_mmap(self):
327
+ sd = OrderedDict()
328
+
329
+ with open(self._file_path, 'rb') as f:
330
+ f.seek(self._skip_bytes, 0)
331
+ for k,v in self._catalog.items():
332
+ dtypestr = v["dtype"]
333
+ dtype= _map_to_dtype[dtypestr]
334
+ shape = v["shape"]
335
+ data_offsets = v["data_offsets"]
336
+ length = data_offsets[1]-data_offsets[0]
337
+ buffer = f.read(length)
338
+ if len(shape) == 0:
339
+ if length == 0:
340
+ t = torch.empty(0, dtype=dtype)
341
+ else:
342
+ t = torch.frombuffer(bytearray(buffer), dtype=torch.uint8)
343
+ t = t.view(dtype)
344
+ else:
345
+ t = torch.frombuffer(bytearray(buffer), dtype=dtype)
346
+ t = torch.reshape(t, shape)
347
+ sd[k] = t
348
+ return sd
349
+
296
350
  def get_tensor(self, name: str) -> torch.tensor:
297
351
  """Get a tensor by name"""
352
+ # To do : switch to a JIT tensor creation per tensor
298
353
  self.init_tensors()
299
354
  return self.sd[name]
300
355
 
@@ -310,7 +365,7 @@ class SafeTensorFile:
310
365
 
311
366
  def tensors(self) -> Dict[str, torch.tensor]:
312
367
  """Get dictionary of all tensors"""
313
- self.init_tensors()
368
+ self.init_tensors(self.lazy_loading)
314
369
  return self.sd
315
370
 
316
371
  def metadata(self) -> Optional[Dict[str, str]]:
@@ -319,7 +374,7 @@ class SafeTensorFile:
319
374
 
320
375
  def __len__(self) -> int:
321
376
  """Get number of tensors"""
322
- self.init_tensors()
377
+ self.init_tensors(self.lazy_loading)
323
378
  return len(self.keys())
324
379
 
325
380
  def __contains__(self, key: str) -> bool:
@@ -337,10 +392,9 @@ class SafeTensorFile:
337
392
  class _SafeTensorLoader:
338
393
  """Context manager for loading SafeTensorFile"""
339
394
 
340
- def __init__(self, filename: str):
395
+ def __init__(self, filename: str ):
341
396
  self.filename = Path(filename)
342
397
  self.sft = None
343
-
344
398
  if not self.filename.exists():
345
399
  raise FileNotFoundError(f"File not found: {filename}")
346
400
 
@@ -367,7 +421,6 @@ class _SafeTensorLoader:
367
421
 
368
422
  def safe_open(filename: str, framework: str = "pt",device = "cpu") -> _SafeTensorLoader:
369
423
  if device != "cpu" or framework !="pt":
370
- pass
371
424
  return _old_safe_open(filename =filename, framework=framework, device=device)
372
425
  return _SafeTensorLoader(filename)
373
426
 
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: mmgp
3
- Version: 3.0.3
3
+ Version: 3.1.0
4
4
  Summary: Memory Management for the GPU Poor
5
5
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
6
6
  License: GNU GENERAL PUBLIC LICENSE
@@ -13,10 +13,11 @@ Requires-Dist: optimum-quanto
13
13
  Requires-Dist: accelerate
14
14
  Requires-Dist: safetensors
15
15
  Requires-Dist: psutil
16
+ Requires-Dist: peft
16
17
 
17
18
 
18
19
  <p align="center">
19
- <H2>Memory Management 3.0 for the GPU Poor by DeepBeepMeep</H2>
20
+ <H2>Memory Management 3.1.0 for the GPU Poor by DeepBeepMeep</H2>
20
21
  </p>
21
22
 
22
23
 
@@ -38,8 +39,9 @@ Each profile may use a combination of the following:
38
39
  - Ability to pin models to reserved RAM to accelerate transfers to VRAM
39
40
  - Async transfers to VRAM to avoid a pause when loading a new slice of a model
40
41
  - Automated on the fly quantization or ability to load pre quantized models
41
- - support for pytorch compilation on Linux and WSL (not supported so far on pure Windows).
42
-
42
+ - Pretrained Lora support with low RAM requirements
43
+ - Support for pytorch compilation on Linux and WSL (supported on pure Windows but requires a complex Triton Installation).
44
+ -
43
45
  ## Installation
44
46
  First you need to install the module in your current project with:
45
47
  ```shell
@@ -98,27 +100,29 @@ For example:
98
100
  The smaller this number, the more VRAM is left for image data / longer videos, but also the slower the processing, because there will be lots of loading / unloading between the RAM and the VRAM. If a model is too big to fit in a budget, it will be broken down into multiple parts that will be unloaded / loaded consequently. The speed with a low budget can be increased (up to 2 times) by turning on the options pinnedMemory and asyncTransfers.
99
101
  - asyncTransfers: boolean, load to the GPU the next model part while the current part is being processed. This requires twice the budget if any is defined. This may increase speed by 20% (mostly visible on fast modern GPUs).
100
102
  - verboseLevel: number between 0 and 2 (1 by default), provides various level of feedback of the different processes
101
- - compile: list of model ids to compile, may accelerate up x2 depending on the type of GPU. As of 01/01/2025 it will work only on Linux or WSL since compilation relies on Triton which is not yet supported on Windows
103
+ - compile: list of model ids to compile, may accelerate up to x2 depending on the type of GPU. It makes sense to compile only the model that is frequently used, such as the "transformer" model in the case of video or image generation. As of 01/01/2025 it will work only on Linux or WSL since compilation relies on Triton which is not yet supported on Windows
102
104
 
103
105
  If you are short on RAM and plan to work with quantized models, it is recommended to load pre-quantized models directly rather than using on the fly quantization; it will be faster and consume slightly less RAM.
104
106
 
105
107
  ## Going further
106
108
 
107
109
  The module includes several tools to package a light version of your favorite video / image generator:
108
- - *save_model(model, file_path, do_quantize = False, quantization_type = qint8 )*\
110
+ - *save_model(model, file_path, do_quantize = False, quantizationType = qint8 )*\
109
111
  Save tensors of a model already loaded in memory in a safetensor format (much faster to reload). You can save it in a quantized format (default qint8 quantization recommended).
110
112
  The resulting safetensor file will contain extra fields in its metadata such as the quantization map and its configuration, so you will be able to move the file around without files such as *config.json* or *file_map.json*.
111
113
  You will need *load_model_data* or *fast_load_transformers_model* to read the file again. You may also load it using the default *safetensors* library; however, you will need to provide in the same directory any complementary files that are usually requested (for instance *config.json*).
112
114
 
113
- - *load_model_data(model, file_path: str, do_quantize = False, quantization_type = qint8, pinToRAM = False, partialPin = False)*\
115
+ - *load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToRAM = False, partialPin = False)*\
114
116
  Load into RAM the tensor data of a model already initialized with no data. Detect and handle quantized models saved previously with *save_model*. A model can also be quantized on the fly while being loaded. The model which is loaded can be pinned to RAM while it is loaded; this is more RAM efficient than pinning tensors later using *offload.all* or *offload.profile*
115
117
 
116
- - *fast_load_transformers_model(model_path: str, do_quantize = False, quantization_type = qint8, pinToRAM = False, partialPin = False)*\
118
+ - *fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToRAM = False, partialPin = False)*\
117
119
  Initialize (build the model hierarchy in memory) and fast load the corresponding tensors of a 'transformers' or 'diffusers' library model.
118
120
  The advantage over the original *from_pretrained* method is that a full model can fit into a single file with a filename of your choosing (therefore you can have multiple 'transformers' versions of the same model in the same directory) and prequantized models are processed in a transparent way.
119
121
  Last but not least, you can also on the fly pin to RAM the whole model or the most important part of it (partialPin = True) in a more efficient way (faster and requires less RAM) than if you did through *offload.all* or *offload.profile*.
120
122
 
121
-
123
+ - *load_loras_into_model(model, lora_path, lora_multi)*\
124
+ Load into a model a list of Loras described by a list of paths *lora_path* and a list of weight coefficients *lora_multi*.
125
+ The Lora file must be in the *diffusers* format. This function also works on non-diffusers models. However, if there is already official Lora support for a model, it is recommended to use the official diffusers functions.
122
126
 
123
127
  The typical workflow will be:
124
128
  1) temporarily insert the *save_model* function just after a model has been fully loaded to save a copy of the model / quantized model.
@@ -0,0 +1,9 @@
1
+ __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
3
+ mmgp/offload.py,sha256=VDau0VCAWHnS40swGuqxn7LIyZJdI0qYI58iGCRyw3Y,67352
4
+ mmgp/safetensors2.py,sha256=mTXL-rZ2lZwYKRujNAc8lUJoqQjq6lpD2XrkuZjA_2Y,16138
5
+ mmgp-3.1.0.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
6
+ mmgp-3.1.0.dist-info/METADATA,sha256=A5Tvc-FGxjk3FuzNHlQ6g6ztJg7hqIwPKvL5EK1pXTc,12708
7
+ mmgp-3.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
8
+ mmgp-3.1.0.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
9
+ mmgp-3.1.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.7.0)
2
+ Generator: setuptools (75.8.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,9 +0,0 @@
1
- __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
3
- mmgp/offload.py,sha256=N_n12QJmZlPRbZiYl6BQVfmJaqxxIbiCKkT6w-2CVo4,61781
4
- mmgp/safetensors2.py,sha256=CSv8HdrjURUzBazpaBDU1WNwUL1lhzpCyzG0GWygbGE,13602
5
- mmgp-3.0.3.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
6
- mmgp-3.0.3.dist-info/METADATA,sha256=0dw13_XUzNPCV6VL-e5FAjvMIUDDT1ffFf7rLG_34zc,12079
7
- mmgp-3.0.3.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
8
- mmgp-3.0.3.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
9
- mmgp-3.0.3.dist-info/RECORD,,