mmgp 3.0.1__py3-none-any.whl → 3.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mmgp/offload.py +20 -16
- {mmgp-3.0.1.dist-info → mmgp-3.0.3.dist-info}/METADATA +6 -2
- mmgp-3.0.3.dist-info/RECORD +9 -0
- {mmgp-3.0.1.dist-info → mmgp-3.0.3.dist-info}/WHEEL +1 -1
- mmgp-3.0.1.dist-info/RECORD +0 -9
- {mmgp-3.0.1.dist-info → mmgp-3.0.3.dist-info}/LICENSE.md +0 -0
- {mmgp-3.0.1.dist-info → mmgp-3.0.3.dist-info}/top_level.txt +0 -0
mmgp/offload.py
CHANGED
@@ -260,6 +260,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_max
             if include:
                 params_list = params_list + list(sub_module.buffers(recurse=False)) + list(sub_module.parameters(recurse=False))
 
+    # print(f"num params to pin {model_id}: {len(params_list)}")
    for p in params_list:
        if isinstance(p, QTensor):
            length = torch.numel(p._data) * p._data.element_size() + torch.numel(p._scale) * p._scale.element_size()
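For context on the size computation this hunk sits in: a quantized parameter is not a flat tensor, so its pinned size is the sum of its payload and scale tensors. A minimal sketch of that bookkeeping, duck-typing the QTensor check so the snippet stays self-contained (the helper name is illustrative):

```python
import torch

def tensor_nbytes(p) -> int:
    # Quanto-style quantized tensors keep their integer payload in `_data`
    # and their scales in `_scale`; both contribute to the pinned size,
    # exactly as in the `length = ...` line above.
    if hasattr(p, "_data") and hasattr(p, "_scale"):
        return (torch.numel(p._data) * p._data.element_size()
                + torch.numel(p._scale) * p._scale.element_size())
    # Plain tensors: element count times element width.
    return torch.numel(p) * p.element_size()
```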
@@ -605,18 +606,18 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
 
     # force read non quantized parameters so that their lazy tensors and corresponding mmap are released
     # otherwise we may end up to keep in memory both the quantized and the non quantize model
-
-
-    for name, m in model_to_quantize.named_modules():
+    for m in model_to_quantize.modules():
         # do not read quantized weights (detected them directly or behind an adapter)
-        if isinstance(m, QModuleMixin) or hasattr(m, "base_layer") and isinstance(m.base_layer, QModuleMixin):
-
+        if isinstance(m, QModuleMixin) or hasattr(m, "base_layer") and isinstance(m.base_layer, QModuleMixin):
+            if hasattr(m, "bias") and m.bias is not None:
+                m.bias.data = m.bias.data + 0
         else:
-
-
+            for n, p in m.named_parameters(recurse = False):
+                data = getattr(m, n)
+                setattr(m,n, torch.nn.Parameter(data + 0 ) )
 
-
-
+        for b in m.buffers(recurse = False):
+            b.data = b.data + 0
 
 
     freeze(model_to_quantize)
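The comment in this hunk states the intent: weights loaded through mmap-backed safetensors are lazy, and both the quantized and the original tensors stay resident until the originals are actually read. Adding 0 forces a fresh, materialized copy that no longer references the file mapping. A condensed sketch of the per-module pattern the new code applies (the function name is illustrative):

```python
import torch

def materialize_module_(m: torch.nn.Module) -> None:
    # `t + 0` allocates new storage and copies the data, so a tensor that was
    # lazily mapped from a safetensors file stops referencing the mmap and
    # the mapping can be released.
    for n, _ in m.named_parameters(recurse=False):
        setattr(m, n, torch.nn.Parameter(getattr(m, n) + 0))
    for b in m.buffers(recurse=False):
        b.data = b.data + 0
```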
@@ -974,6 +975,8 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat
     quick version of .LoadfromPretrained of the transformers library
     used to build a model and load the corresponding weights (quantized or not)
     """
+
+
     import os.path
     from accelerate import init_empty_weights
 
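Per its docstring, `fast_load_transformers_model` is a quick stand-in for the transformers `from_pretrained` path. A hedged usage sketch; only `model_path` and `do_quantize` are visible in the (truncated) signature above, and the checkpoint path is hypothetical:

```python
from mmgp import offload

# Hypothetical checkpoint path; `do_quantize` appears in the signature above.
model = offload.fast_load_transformers_model(
    "ckpts/text_encoder.safetensors",
    do_quantize=True,
)
```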
@@ -1313,6 +1316,13 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True
                 submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode="default" ) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
                 #dynamic=True,
 
+        if pinAllModels or model_id in modelsToPin:
+            if hasattr(current_model,"_already_pinned"):
+                if self.verboseLevel >=1:
+                    print(f"Model '{model_id}' already pinned to reserved memory")
+            else:
+                _pin_to_memory(current_model, model_id, partialPinning= partialPinning, perc_reserved_mem_max=perc_reserved_mem_max, verboseLevel=verboseLevel)
+
         for submodule_name, submodule in current_model.named_modules():
             # create a fake 'accelerate' parameter so that the _execution_device property returns always "cuda"
             # (it is queried in many pipelines even if offloading is not properly implemented)
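This hunk moves the pinning step so it runs before the submodule scan; the matching removal is in the next hunk. `_pin_to_memory` tags the model with `_already_pinned`, which is what makes the repeated-call guard above work. A simplified sketch of that guard, using plain `Tensor.pin_memory()` rather than mmgp's reserved-memory logic:

```python
import torch

def pin_model_once(model: torch.nn.Module) -> None:
    if getattr(model, "_already_pinned", False):
        return  # a second call is a no-op, mirroring the guard in the diff
    for p in model.parameters():
        p.data = p.data.pin_memory()  # page-locked copy enables fast DMA
    model._already_pinned = True      # same marker attribute as above
```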
@@ -1358,12 +1368,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True
 
             current_size = self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name)
 
-        if pinAllModels or model_id in modelsToPin:
-            if hasattr(current_model,"_already_pinned"):
-                if self.verboseLevel >=1:
-                    print(f"Model '{model_id}' already pinned to reserved memory")
-            else:
-                _pin_to_memory(current_model, model_id, partialPinning= partialPinning, perc_reserved_mem_max=perc_reserved_mem_max, verboseLevel=verboseLevel)
 
 
 
@@ -1422,6 +1426,7 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.VerylowRAM_LowVRAM
 
     default_budgets = { "transformer" : 600 , "text_encoder": 3000, "text_encoder_2": 3000 }
     extraModelsToQuantize = None
+    asyncTransfers = True
 
     if profile_no == profile_type.HighRAM_HighVRAM:
         pinnedMemory= True
@@ -1439,7 +1444,6 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.VerylowRAM_LowVRAM
         pinnedMemory= "transformer"
         extraModelsToQuantize = default_extraModelsToQuantize
         budgets=default_budgets
-        asyncTransfers = True
         info = "You have chosen a profile that requires at least 32 GB of RAM and 12 GB of VRAM. Some RAM is consumed to reduce VRAM consumption. "
     elif profile_no == profile_type.VerylowRAM_LowVRAM:
         pinnedMemory= False
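Taken together, the last two hunks promote `asyncTransfers` from a profile-specific setting to the default for every profile. Asynchronous transfers are where pinned memory pays off: a `non_blocking` copy from page-locked host memory can overlap with GPU compute. A minimal sketch of the mechanism, independent of mmgp:

```python
import torch

stream = torch.cuda.Stream()                      # side stream for uploads
host = torch.empty(1024, 1024, pin_memory=True)   # page-locked staging buffer
dev = torch.empty(1024, 1024, device="cuda")
with torch.cuda.stream(stream):
    dev.copy_(host, non_blocking=True)            # returns without waiting
torch.cuda.current_stream().wait_stream(stream)   # sync before reading `dev`
```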
{mmgp-3.0.1.dist-info → mmgp-3.0.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mmgp
-Version: 3.0.1
+Version: 3.0.3
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -69,7 +69,8 @@ You can choose between 5 profiles depending on your hardware:
 - VerylowRAM_LowVRAM (5): at least 24 GB of RAM and 10 GB of VRAM : if you don't have much it won't be fast but maybe it will work
 
 Profile 2 (High RAM) and 4 (Low RAM) are the most recommended profiles since they are versatile (support for long videos for a slight performance cost).\
-
+If you use a Flux-derived application, profiles 1 and 3 will offer much faster generation times.
+In any case, a safe approach is to start from profile 5 (the default profile) and then go down progressively to profile 4 and then to profile 2, as long as the app remains responsive and doesn't trigger any out-of-memory error.
 
 By default the 'transformer' will be quantized to 8 bits for all profiles. If you don't want that you may specify the optional parameter *quantizeTransformer = False*.
 
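Following the README's walk-down advice, profile selection is a single call to `offload.profile`, whose signature appears in the offload.py hunks above. A sketch; the two-module dict stands in for a real pipeline, and only enum members shown elsewhere in this diff are used:

```python
import torch
from mmgp import offload, profile_type

# profile() accepts a pipe or a dict of modules; these tiny modules are
# placeholders for a real transformer / text encoder.
modules = {"transformer": torch.nn.Linear(8, 8),
           "text_encoder": torch.nn.Linear(8, 8)}
offload.profile(modules, profile_type.VerylowRAM_LowVRAM)  # profile 5, the default
```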
@@ -80,6 +81,9 @@ Every parameter set automatically by a profile can be overridden with one or mul
 ```
 If you want to know which parameters are set by one specific profile you can use the parameter *verboseLevel=2*
 
+**It is highly recommended to put the *from mmgp import offload, profile_type* at the top of your main Python file (that is, as the first import) so that all the existing safetensors calls are redirected to mmgp.**
+
+
 ## Alternatively you may want to create your own profile with specific parameters:
 
 For example:
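The new bolded note is about import order: mmgp redirects safetensors loading (see mmgp/safetensors2.py in the RECORD below), so it must be imported before any library that binds those calls first. A sketch of an entry script honoring that ordering; the diffusers import is an illustrative downstream consumer:

```python
# First import of the entry script, before torch/transformers/diffusers,
# so mmgp can hook the safetensors loading calls (per the README note above).
from mmgp import offload, profile_type

import torch
from diffusers import DiffusionPipeline  # illustrative downstream import
```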
mmgp-3.0.3.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=N_n12QJmZlPRbZiYl6BQVfmJaqxxIbiCKkT6w-2CVo4,61781
+mmgp/safetensors2.py,sha256=CSv8HdrjURUzBazpaBDU1WNwUL1lhzpCyzG0GWygbGE,13602
+mmgp-3.0.3.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+mmgp-3.0.3.dist-info/METADATA,sha256=0dw13_XUzNPCV6VL-e5FAjvMIUDDT1ffFf7rLG_34zc,12079
+mmgp-3.0.3.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
+mmgp-3.0.3.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.0.3.dist-info/RECORD,,
mmgp-3.0.1.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=T9RBAibAyAnKV-8AiYmop_UOGl_N1l5EJo5ucCZfxK8,61611
-mmgp/safetensors2.py,sha256=CSv8HdrjURUzBazpaBDU1WNwUL1lhzpCyzG0GWygbGE,13602
-mmgp-3.0.1.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.0.1.dist-info/METADATA,sha256=uSsBc5pBaYBL4Ek3TR99J9hP7AQQlwnnUM_JQlkNwbE,11765
-mmgp-3.0.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-mmgp-3.0.1.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.0.1.dist-info/RECORD,,
{mmgp-3.0.1.dist-info → mmgp-3.0.3.dist-info}/LICENSE.md
File without changes

{mmgp-3.0.1.dist-info → mmgp-3.0.3.dist-info}/top_level.txt
File without changes