mmgp 3.0.1__py3-none-any.whl → 3.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


mmgp/offload.py CHANGED
@@ -260,6 +260,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
         if include:
             params_list = params_list + list(sub_module.buffers(recurse=False)) + list(sub_module.parameters(recurse=False))
 
+    # print(f"num params to pin {model_id}: {len(params_list)}")
     for p in params_list:
         if isinstance(p, QTensor):
             length = torch.numel(p._data) * p._data.element_size() + torch.numel(p._scale) * p._scale.element_size()
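For readers unfamiliar with memory pinning: `_pin_to_memory` collects a module's parameters and buffers and measures their byte size (for a quantized `QTensor`, the `_data` and `_scale` components are counted separately, as the context lines above show). A minimal standalone sketch of the underlying PyTorch mechanism, using only plain `torch` calls and hypothetical names, not mmgp's actual bookkeeping:

```python
import torch

def pin_params_sketch(model: torch.nn.Module) -> int:
    """Page-lock every parameter and buffer so that later host-to-GPU
    copies can be issued asynchronously. Returns total bytes pinned."""
    total = 0
    for t in list(model.parameters()) + list(model.buffers()):
        total += torch.numel(t) * t.element_size()  # byte length, as in the hunk above
        t.data = t.data.pin_memory()                # replace storage with a page-locked copy
    return total

if torch.cuda.is_available():  # pin_memory requires a CUDA-enabled build
    print(f"pinned {pin_params_sketch(torch.nn.Linear(8, 8))} bytes")
```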
@@ -605,18 +606,18 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
 
     # force read non quantized parameters so that their lazy tensors and corresponding mmap are released
     # otherwise we may end up to keep in memory both the quantized and the non quantize model
-
-
-    for name, m in model_to_quantize.named_modules():
+    for m in model_to_quantize.modules():
         # do not read quantized weights (detected them directly or behind an adapter)
-        if isinstance(m, QModuleMixin) or hasattr(m, "base_layer") and isinstance(m.base_layer, QModuleMixin):
-            pass
+        if isinstance(m, QModuleMixin) or hasattr(m, "base_layer") and isinstance(m.base_layer, QModuleMixin):
+            if hasattr(m, "bias") and m.bias is not None:
+                m.bias.data = m.bias.data + 0
         else:
-            if hasattr(m, "weight") and m.weight is not None:
-                m.weight.data = m.weight.data + 0
+            for n, p in m.named_parameters(recurse = False):
+                data = getattr(m, n)
+                setattr(m,n, torch.nn.Parameter(data + 0 ) )
 
-            if hasattr(m, "bias") and m.bias is not None:
-                m.bias.data = m.bias.data + 0
+            for b in m.buffers(recurse = False):
+                b.data = b.data + 0
 
 
     freeze(model_to_quantize)
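The comments in this hunk explain the intent: parameters that stay unquantized must be read once so that lazy, mmap-backed tensors are materialized and the mapped file can be released; otherwise both the quantized and the original weights stay resident. The `+ 0` forces a real copy. A standalone illustration of the same trick, assuming plain mmap-backed tensors (e.g. loaded via safetensors) rather than mmgp's lazy tensor class:

```python
import torch

def detach_from_mmap(module: torch.nn.Module) -> None:
    """Force-materialize every parameter and buffer of `module`.
    `t + 0` allocates a fresh tensor, so rebinding the attribute drops
    the last reference to the file-backed storage and lets the mmap close."""
    for name, p in module.named_parameters(recurse=False):
        setattr(module, name, torch.nn.Parameter(p.data + 0))
    for b in module.buffers(recurse=False):
        b.data = b.data + 0
```

Rebinding via `setattr`, as the new code above does, covers all parameters generically instead of only the hard-coded `weight` and `bias` attributes of the old code.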
@@ -974,6 +975,8 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat
     quick version of .LoadfromPretrained of the transformers library
     used to build a model and load the corresponding weights (quantized or not)
     """
+
+
     import os.path
     from accelerate import init_empty_weights
 
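This hunk only adds whitespace inside `fast_load_transformers_model`, described by its docstring as a quick replacement for the transformers `from_pretrained` path. A hedged usage sketch based solely on the signature visible in the hunk header (`model_path`, `do_quantize`); the checkpoint path is hypothetical:

```python
from mmgp import offload

# Hypothetical checkpoint path; only `model_path` and `do_quantize`
# are confirmed by the signature shown above.
text_encoder = offload.fast_load_transformers_model(
    "ckpts/text_encoder", do_quantize=True
)
```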
@@ -1313,6 +1316,13 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
                 submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode="default" ) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
                 #dynamic=True,
 
+        if pinAllModels or model_id in modelsToPin:
+            if hasattr(current_model,"_already_pinned"):
+                if self.verboseLevel >=1:
+                    print(f"Model '{model_id}' already pinned to reserved memory")
+            else:
+                _pin_to_memory(current_model, model_id, partialPinning= partialPinning, perc_reserved_mem_max=perc_reserved_mem_max, verboseLevel=verboseLevel)
+
         for submodule_name, submodule in current_model.named_modules():
             # create a fake 'accelerate' parameter so that the _execution_device property returns always "cuda"
             # (it is queried in many pipelines even if offloading is not properly implemented)
@@ -1358,12 +1368,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
 
             current_size = self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name)
 
-            if pinAllModels or model_id in modelsToPin:
-                if hasattr(current_model,"_already_pinned"):
-                    if self.verboseLevel >=1:
-                        print(f"Model '{model_id}' already pinned to reserved memory")
-                else:
-                    _pin_to_memory(current_model, model_id, partialPinning= partialPinning, perc_reserved_mem_max=perc_reserved_mem_max, verboseLevel=verboseLevel)
 
 
 
@@ -1422,6 +1426,7 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
 
     default_budgets = { "transformer" : 600 , "text_encoder": 3000, "text_encoder_2": 3000 }
     extraModelsToQuantize = None
+    asyncTransfers = True
 
     if profile_no == profile_type.HighRAM_HighVRAM:
         pinnedMemory= True
@@ -1439,7 +1444,6 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
         pinnedMemory= "transformer"
         extraModelsToQuantize = default_extraModelsToQuantize
         budgets=default_budgets
-        asyncTransfers = True
         info = "You have chosen a profile that requires at least 32 GB of RAM and 12 GB of VRAM. Some RAM is consumed to reduce VRAM consumption. "
     elif profile_no == profile_type.VerylowRAM_LowVRAM:
         pinnedMemory= False
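The two `profile` hunks above flip `asyncTransfers` from a per-profile setting (previously enabled only in the 32 GB RAM / 12 GB VRAM branch) to a default of `True` for every profile. For context, an asynchronous host-to-GPU transfer in plain PyTorch looks roughly like this sketch (illustrative only, not mmgp's implementation; true overlap requires the source tensor to be pinned):

```python
import torch

if torch.cuda.is_available():
    copy_stream = torch.cuda.Stream()
    weights_cpu = torch.randn(1024, 1024).pin_memory()  # pinned source enables a true async copy
    with torch.cuda.stream(copy_stream):
        # non_blocking=True lets the copy overlap compute on the default stream
        weights_gpu = weights_cpu.to("cuda", non_blocking=True)
    torch.cuda.current_stream().wait_stream(copy_stream)  # synchronize before first use
```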
{mmgp-3.0.1.dist-info → mmgp-3.0.3.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mmgp
-Version: 3.0.1
+Version: 3.0.3
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -69,7 +69,8 @@ You can choose between 5 profiles depending on your hardware:
 - VerylowRAM_LowVRAM (5): at least 24 GB of RAM and 10 GB of VRAM : if you don't have much it won't be fast but maybe it will work
 
 Profile 2 (High RAM) and 4 (Low RAM) are the most recommended profiles since they are versatile (support for long videos for a slight performance cost).\
-However, a safe approach is to start from profile 5 (default profile) and then go down progressively to profile 4 and then to profile 2 as long as the app remains responsive or doesn't trigger any out of memory error.
+If you use a Flux-derived application, profiles 1 and 3 will offer much faster generation times.
+In any case, a safe approach is to start from profile 5 (default profile) and then go down progressively to profile 4 and then to profile 2 as long as the app remains responsive or doesn't trigger any out of memory error.
 
 By default the 'transformer' will be quantized to 8 bits for all profiles. If you don't want that you may specify the optional parameter *quantizeTransformer = False*.
 
@@ -80,6 +81,9 @@ Every parameter set automatically by a profile can be overridden with one or mul
 ```
 If you want to know which parameters are set by one specific profile you can use the parameter *verboseLevel=2*
 
+**It is highly recommended to put the *from mmgp import offload, profile_type* import at the top of your main Python file (that is, as the first import) so that all existing safetensors calls are redirected to mmgp.**
+
+
 ## Alternatively you may want to create your own profile with specific parameters:
 
 For example:
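Tying the two README hunks above together, here is an editor's illustrative sketch (not the example the README itself goes on to give): the mmgp import comes first so safetensors calls are redirected, and profile settings can then be overridden at the `profile` call. `VerylowRAM_LowVRAM` and `quantizeTransformer` appear in the hunks above; the pipeline construction is hypothetical:

```python
# First import of the program, as recommended above.
from mmgp import offload, profile_type

from diffusers import DiffusionPipeline  # hypothetical pipeline class

pipe = DiffusionPipeline.from_pretrained("some/model-id")  # hypothetical model id
# Apply profile 5 but override one of its settings.
offload.profile(pipe, profile_no=profile_type.VerylowRAM_LowVRAM,
                quantizeTransformer=False)
```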
mmgp-3.0.3.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=N_n12QJmZlPRbZiYl6BQVfmJaqxxIbiCKkT6w-2CVo4,61781
+mmgp/safetensors2.py,sha256=CSv8HdrjURUzBazpaBDU1WNwUL1lhzpCyzG0GWygbGE,13602
+mmgp-3.0.3.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+mmgp-3.0.3.dist-info/METADATA,sha256=0dw13_XUzNPCV6VL-e5FAjvMIUDDT1ffFf7rLG_34zc,12079
+mmgp-3.0.3.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
+mmgp-3.0.3.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.0.3.dist-info/RECORD,,
{mmgp-3.0.1.dist-info → mmgp-3.0.3.dist-info}/WHEEL RENAMED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.6.0)
+Generator: setuptools (75.7.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
mmgp-3.0.1.dist-info/RECORD DELETED
@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=T9RBAibAyAnKV-8AiYmop_UOGl_N1l5EJo5ucCZfxK8,61611
-mmgp/safetensors2.py,sha256=CSv8HdrjURUzBazpaBDU1WNwUL1lhzpCyzG0GWygbGE,13602
-mmgp-3.0.1.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.0.1.dist-info/METADATA,sha256=uSsBc5pBaYBL4Ek3TR99J9hP7AQQlwnnUM_JQlkNwbE,11765
-mmgp-3.0.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-mmgp-3.0.1.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.0.1.dist-info/RECORD,,