mmgp 3.1.3__py3-none-any.whl → 3.1.4.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mmgp might be problematic.
- mmgp/offload.py +36 -71
- {mmgp-3.1.3.dist-info → mmgp-3.1.4.post1.dist-info}/METADATA +35 -13
- mmgp-3.1.4.post1.dist-info/RECORD +9 -0
- mmgp-3.1.3.dist-info/RECORD +0 -9
- {mmgp-3.1.3.dist-info → mmgp-3.1.4.post1.dist-info}/LICENSE.md +0 -0
- {mmgp-3.1.3.dist-info → mmgp-3.1.4.post1.dist-info}/WHEEL +0 -0
- {mmgp-3.1.3.dist-info → mmgp-3.1.4.post1.dist-info}/top_level.txt +0 -0
mmgp/offload.py
CHANGED
@@ -149,12 +149,12 @@ def _compute_verbose_level(level):
     safetensors2.verboseLevel = level
     return level
 
-def
+def _get_perc_reserved_mem_max(perc_reserved_mem_max):
     if perc_reserved_mem_max<=0:
         perc_reserved_mem_max = 0.40 if os.name == 'nt' else 0.5
-    return perc_reserved_mem_max
+    return perc_reserved_mem_max
 
-def _detect_main_towers(model, min_floors = 5, verboseLevel=1):
+def _detect_main_towers(model, min_floors = 5):
     cur_blocks_prefix = None
     towers_modules= []
     towers_names= []
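For context, the patched `_get_perc_reserved_mem_max` helper above falls back to reserving at most 40% of system RAM on Windows ('nt') and 50% elsewhere. The sketch below shows how that fraction turns into a byte budget for pinning, mirroring the `perc_reserved_mem_max * physical_memory` computation that appears later in this diff; reading total RAM through `psutil` is an assumption made only for the illustration.

```python
import os

import psutil  # assumption: used only to read total RAM for this illustration


def get_perc_reserved_mem_max(perc_reserved_mem_max: float) -> float:
    # Mirrors the patched helper: fall back to an OS-dependent default
    # when the caller passes 0 or a negative value.
    if perc_reserved_mem_max <= 0:
        perc_reserved_mem_max = 0.40 if os.name == "nt" else 0.5
    return perc_reserved_mem_max


if __name__ == "__main__":
    physical_memory = psutil.virtual_memory().total       # total RAM in bytes
    frac = get_perc_reserved_mem_max(0)                   # 0.40 on Windows, 0.5 elsewhere
    max_reservable_memory = frac * physical_memory        # cap used when pinning models
    print(f"Reservable RAM cap: {max_reservable_memory / 2**20:.0f} MB")
```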
@@ -191,39 +191,16 @@ def _detect_main_towers(model, min_floors = 5, verboseLevel=1):
         pre , num = _extract_num_from_str(submodule_name)
         if isinstance(submodule, (torch.nn.ModuleList)):
             cur_blocks_prefix, cur_blocks_seq = pre + ".", -1
-            tower_name = submodule_name
+            tower_name = submodule_name #+ ".*"
         elif num >=0:
             cur_blocks_prefix, cur_blocks_seq = pre, num
-            tower_name = submodule_name[ :-1]
+            tower_name = submodule_name[ :-1] #+ "*"
         floors_modules.append(submodule)
 
         if len(floors_modules) >= min_floors:
             towers_modules += floors_modules
             towers_names.append(tower_name)
 
-    # for submodule_name, submodule in model.named_modules():
-    #     if submodule_name=='':
-    #         continue
-
-    #     if isinstance(submodule, torch.nn.ModuleList):
-    #         newList =False
-    #         if cur_blocks_prefix == None:
-    #             cur_blocks_prefix = submodule_name + "."
-    #             newList = True
-    #         else:
-    #             if not submodule_name.startswith(cur_blocks_prefix):
-    #                 cur_blocks_prefix = submodule_name + "."
-    #                 newList = True
-
-    #         if newList and len(submodule)>=5:
-    #             towers_names.append(submodule_name)
-    #             towers_modules.append(submodule)
-
-    #     else:
-    #         if cur_blocks_prefix is not None:
-    #             if not submodule_name.startswith(cur_blocks_prefix):
-    #                 cur_blocks_prefix = None
-
     return towers_names, towers_modules
 
 
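The `_detect_main_towers` hunk above keeps the `torch.nn.ModuleList` scan and only tweaks the tower-name bookkeeping while deleting the old commented-out variant. As a rough, hedged illustration of the underlying idea (treat a long ModuleList of repeated blocks as a "tower" with at least `min_floors` floors), here is a simplified standalone sketch; it is not the library's exact algorithm.

```python
import torch


def detect_towers(model: torch.nn.Module, min_floors: int = 5):
    """Simplified sketch: treat any nn.ModuleList with at least `min_floors`
    children as a 'tower' of repeated blocks (e.g. transformer layers)."""
    towers_names, towers_modules = [], []
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.ModuleList) and len(module) >= min_floors:
            towers_names.append(name)
            towers_modules.extend(module)
    return towers_names, towers_modules


if __name__ == "__main__":
    toy = torch.nn.Module()
    toy.blocks = torch.nn.ModuleList([torch.nn.Linear(8, 8) for _ in range(12)])
    toy.head = torch.nn.Linear(8, 2)
    names, modules = detect_towers(toy)
    print(names)          # ['blocks']
    print(len(modules))   # 12
```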
@@ -261,30 +238,7 @@ def _remove_model_wrapper(model):
             return sub_module
     return model
 
-
-    # c = torch.nn.Parameter(t + 0)
-    # torch.utils.swap_tensors(t, c)
-    # del c
-
-
-    # for n,m in model_to_quantize.named_modules():
-    #     # do not read quantized weights (detected them directly or behind an adapter)
-    #     if isinstance(m, QModuleMixin) or hasattr(m, "base_layer") and isinstance(m.base_layer, QModuleMixin):
-    #         if hasattr(m, "bias") and m.bias is not None:
-    #             force_load_tensor(m.bias.data)
-    #             # m.bias.data = m.bias.data + 0
-    #     else:
-    #         for n, p in m.named_parameters(recurse = False):
-    #             data = getattr(m, n)
-    #             force_load_tensor(data)
-    #             # setattr(m,n, torch.nn.Parameter(data + 0 ) )
-
-    #         for b in m.buffers(recurse = False):
-    #             # b.data = b.data + 0
-    #             b.data = torch.nn.Buffer(b.data + 0)
-    #             force_load_tensor(b.data)
-
-
+
 
 def _move_to_pinned_tensor(source_tensor, big_tensor, offset, length):
     dtype= source_tensor.dtype
@@ -324,17 +278,11 @@ def _force_load_parameter(p):
     torch.utils.swap_tensors(p, q)
     del q
 
-def _pin_to_memory(model, model_id, partialPinning = False,
-
-    if partialPinning:
-        print(f"Partial pinning of data of '{model_id}' to reserved RAM")
-    else:
-        print(f"Pinning data of '{model_id}' to reserved RAM")
+def _pin_to_memory(model, model_id, partialPinning = False, verboseLevel = 1):
+
 
-    max_reservable_memory = _get_max_reservable_memory(perc_reserved_mem_max)
     if partialPinning:
         towers_names, _ = _detect_main_towers(model)
-        towers_names = [n +"." for n in towers_names]
 
 
     BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
@@ -353,6 +301,20 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
         params_list = params_list + [ (k + '.' + n, p, False) for n, p in sub_module.named_parameters(recurse=False)] + [ (k + '.' + n, p, True) for n, p in sub_module.named_buffers(recurse=False)]
 
 
+    if verboseLevel>=1 :
+        if partialPinning:
+            if len(params_list) == 0:
+                print(f"Unable to apply Partial of '{model_id}' as no isolated main structures were found")
+            else:
+                print(f"Partial pinning of data of '{model_id}' to reserved RAM")
+        else:
+            print(f"Pinning data of '{model_id}' to reserved RAM")
+
+    if partialPinning and len(params_list) == 0:
+        return
+
+
+
     for n, p, _ in params_list:
         if isinstance(p, QTensor):
             if p._qtype == qint4:
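The `_pin_to_memory` hunks above thread the new `verboseLevel` argument through and bail out early when partial pinning finds no isolated main structure to pin. For readers unfamiliar with why weights are pinned at all (the 256 MB `BIG_TENSOR_MAX_SIZE` buckets exist for this), the sketch below shows the general idea of copying a weight into page-locked RAM so the later host-to-GPU transfer can be asynchronous; it is a conceptual illustration, not mmgp's implementation.

```python
import torch


def pin_and_upload(cpu_weight: torch.Tensor, device: str = "cuda") -> torch.Tensor:
    # Tensor.pin_memory() returns a copy of the tensor in page-locked host RAM.
    # Pinned pages cannot be swapped out, which lets CUDA issue async DMA copies.
    pinned = cpu_weight.pin_memory()
    # non_blocking=True only overlaps with other work because the source is pinned.
    return pinned.to(device, non_blocking=True)


if __name__ == "__main__":
    w = torch.randn(1024, 1024)  # stand-in for a model weight living on the CPU
    if torch.cuda.is_available():
        gpu_w = pin_and_upload(w)
        torch.cuda.synchronize()  # wait for the async copy before using gpu_w
        print(gpu_w.device)
```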
@@ -442,10 +404,10 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
     gc.collect()
 
     if verboseLevel >=1:
-        if
-            print(f"The
+        if partialPinning:
+            print(f"The model was partially pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
         else:
-            print(f"
+            print(f"The whole model was pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
 
     model._already_pinned = True
 
@@ -461,13 +423,14 @@ def _welcome():
     print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.1) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def _extract_num_from_str(num_in_str):
-
+    size = len(num_in_str)
+    for i in range(size):
         if not num_in_str[-i-1:].isnumeric():
             if i == 0:
                 return num_in_str, -1
             else:
                 return num_in_str[: -i], int(num_in_str[-i:])
-    return "", int(num_in_str)
+    return "", -1 if size == 0 else int(num_in_str)
 
 def _quantize_dirty_hack(model):
     # dirty hack: add a hook on state_dict() to return a fake non quantized state_dict if called by Lora Diffusers initialization functions
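The `_extract_num_from_str` change above guards the empty-string case: the old final `return "", int(num_in_str)` would raise `ValueError` on `""`, while the new code returns `-1`. A standalone copy of the patched helper, reproduced from the diff for illustration, behaves as follows:

```python
def extract_num_from_str(num_in_str: str):
    """Split a trailing integer off a module name, e.g. 'blocks12' -> ('blocks', 12)."""
    size = len(num_in_str)
    for i in range(size):
        if not num_in_str[-i - 1:].isnumeric():
            if i == 0:
                return num_in_str, -1          # no trailing digits at all
            return num_in_str[:-i], int(num_in_str[-i:])
    # Loop fell through: the whole string is numeric, or it is empty.
    return "", -1 if size == 0 else int(num_in_str)


assert extract_num_from_str("blocks12") == ("blocks", 12)
assert extract_num_from_str("blocks") == ("blocks", -1)
assert extract_num_from_str("42") == ("", 42)
assert extract_num_from_str("") == ("", -1)   # the case fixed in 3.1.4
```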
@@ -1425,7 +1388,9 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
         # torch._logging.set_logs(recompiles=True)
         # torch._inductor.config.realize_opcount_threshold = 100 # workaround bug "AssertionError: increase TRITON_MAX_BLOCK['X'] to 4096."
 
-
+
+        perc_reserved_mem_max = _get_perc_reserved_mem_max(perc_reserved_mem_max)
+        max_reservable_memory = perc_reserved_mem_max * physical_memory
 
         estimatesBytesToPin = 0
         for model_id in models:
@@ -1486,7 +1451,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
 
         if estimatesBytesToPin > 0 and estimatesBytesToPin >= (max_reservable_memory - total_pinned_bytes):
             if self.verboseLevel >=1:
-                print(f"Switching to partial pinning since full requirements for pinned models is {estimatesBytesToPin/ONE_MB:0.1f} MB while estimated reservable RAM is {max_reservable_memory/ONE_MB:0.1f} MB" )
+                print(f"Switching to partial pinning since full requirements for pinned models is {estimatesBytesToPin/ONE_MB:0.1f} MB while estimated reservable RAM is {max_reservable_memory/ONE_MB:0.1f} MB. You may increase the value of parameter 'perc_reserved_mem_max' to a value higher than {perc_reserved_mem_max:0.2f} to force full pinnning." )
             partialPinning = True
 
         # Hook forward methods of modules
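Together with the earlier hunk that computes `max_reservable_memory = perc_reserved_mem_max * physical_memory`, the updated warning above spells out the fallback rule: when the bytes estimated for pinning do not fit under the reservable-RAM cap (minus what is already pinned), `offload.all` switches to partial pinning. A small arithmetic sketch of that decision, with made-up numbers:

```python
ONE_MB = 2**20

# Illustrative values, not measurements.
physical_memory = 32 * 1024 * ONE_MB      # 32 GB of RAM
perc_reserved_mem_max = 0.5               # Linux default from _get_perc_reserved_mem_max
total_pinned_bytes = 4 * 1024 * ONE_MB    # 4 GB already pinned by other models
estimatesBytesToPin = 14 * 1024 * ONE_MB  # this model would need 14 GB

max_reservable_memory = perc_reserved_mem_max * physical_memory  # 16 GB cap

if estimatesBytesToPin > 0 and estimatesBytesToPin >= (max_reservable_memory - total_pinned_bytes):
    # 14 GB >= 16 GB - 4 GB, so full pinning would overshoot the cap:
    # the library switches to partial pinning (or perc_reserved_mem_max can be raised).
    partialPinning = True
```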
@@ -1498,7 +1463,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
             if compilationInThisOne:
                 if self.verboseLevel>=1:
                     if len(towers_modules)>0:
-                        print(f"Pytorch compilation of '{model_id}' is scheduled for these modules : {towers_names}
+                        print(f"Pytorch compilation of '{model_id}' is scheduled for these modules : {towers_names}*.")
                     else:
                         print(f"Pytorch compilation of model '{model_id}' is not yet supported.")
 
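The message change above only clarifies which submodules (the detected "towers") are scheduled for compilation. As a hedged illustration of the general technique of compiling repeated blocks individually with `torch.compile` (not mmgp's actual hook mechanism):

```python
import torch

# A stand-in model made of repeated blocks, similar to a transformer "tower".
model = torch.nn.Sequential(*[torch.nn.Linear(64, 64) for _ in range(8)])

# Compile each repeated block on its own instead of the whole model; compilation
# is triggered lazily on the first forward call and needs a working backend
# (Triton on GPU, a C++ toolchain on CPU).
compiled_blocks = torch.nn.ModuleList(torch.compile(block) for block in model)

x = torch.randn(4, 64)
for block in compiled_blocks:
    x = block(x)
print(x.shape)
```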
@@ -1511,7 +1476,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
                 if self.verboseLevel >=1:
                     print(f"Model '{model_id}' already pinned to reserved memory")
             else:
-                _pin_to_memory(current_model, model_id, partialPinning= partialPinning,
+                _pin_to_memory(current_model, model_id, partialPinning= partialPinning, verboseLevel=verboseLevel)
 
         current_budget = model_budgets[model_id]
         current_size = 0
@@ -1538,7 +1503,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
                     if num != cur_blocks_seq and (cur_blocks_seq == -1 or current_size > current_budget):
                         prev_blocks_name = cur_blocks_name
                         cur_blocks_name = cur_blocks_prefix + str(num)
-                        print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
+                        # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
                         cur_blocks_seq = num
                 else:
                     cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq = None, None, None, -1
@@ -1550,7 +1515,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
                 elif num >=0:
                     cur_blocks_prefix, prev_blocks_name, cur_blocks_seq = pre, None, num
                     cur_blocks_name = submodule_name
-                    print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
+                    # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
 
 
             if hasattr(submodule, "forward"):
{mmgp-3.1.3.dist-info → mmgp-3.1.4.post1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mmgp
-Version: 3.1.3
+Version: 3.1.4.post1
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Requires-Dist: peft
 
 
 <p align="center">
-<H2>Memory Management 3.1.3 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.1.4 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
@@ -26,10 +26,10 @@ This a replacement for the accelerate library that should in theory manage offlo
 times in a pipe (eg VAE).
 
 Requirements:
-- VRAM: minimum
+- VRAM: minimum 6 GB, recommended 24 GB (RTX 3090/ RTX 4090)
 - RAM: minimum 24 GB, recommended 48 GB
 
-This module features 5 profiles in order to able to run the model at a decent speed on a low end consumer config (
+This module features 5 profiles in order to able to run the model at a decent speed on a low end consumer config (24 GB of RAM and 6 VRAM) and to run it at a very good speed (if not the best) on a high end consumer config (48 GB of RAM and 24 GB of VRAM).\
 These RAM requirements are for Linux systems. Due to different memory management Windows will require an extra 16 GB of RAM to run the corresponding profile.
 
 Each profile may use a combination of the following:
@@ -41,7 +41,25 @@ Each profile may use a combination of the following:
 - Automated on the fly quantization or ability to load pre quantized models
 - Pretrained Lora support with low RAM requirements
 - Support for pytorch compilation on Linux and WSL (supported on pure Windows but requires a complex Triton Installation).
-
+
+## Sample applications that use mmgp
+It is recommended to have a look at these applications to see how mmgp was implemented in each of them:
+- Hunyuan3D-2GP: https://github.com/deepbeepmeep/Hunyuan3D-2GP\
+A great image to 3D and text to 3D tool by the Tencent team. Thanks to mmgp it can run with less than 6 GB of VRAM
+
+- HuanyuanVideoGP: https://github.com/deepbeepmeep/HunyuanVideoGP\
+One of the best open source Text to Video generator
+
+- FluxFillGP: https://github.com/deepbeepmeep/FluxFillGP\
+One of the best inpainting / outpainting tools based on Flux that can run with less than 12 GB of VRAM.
+
+- Cosmos1GP: https://github.com/deepbeepmeep/Cosmos1GP\
+This application include two models: a text to world generator and a image / video to world (probably the best open source image to video generator).
+
+- OminiControlGP: https://github.com/deepbeepmeep/OminiControlGP\
+A Flux derived application very powerful that can be used to transfer an object of your choice in a prompted scene. With mmgp you can run it with only 6 GB of VRAM.
+
+
 ## Installation
 First you need to install the module in your current project with:
 ```shell
@@ -74,7 +92,7 @@ Profile 2 (High RAM) and 4 (Low RAM)are the most recommended profiles since they
 If you use Flux derived applciation profile 1 and 3 will offer much faster generation times.
 In any case, a safe approach is to start from profile 5 (default profile) and then go down progressively to profile 4 and then to profile 2 as long as the app remains responsive or doesn't trigger any out of memory error.
 
-By default the 'transformer' will be quantized to 8 bits for all profiles. If you don't want that you may specify the optional parameter *quantizeTransformer = False*.
+By default the model named 'transformer' will be quantized to 8 bits for all profiles. If you don't want that you may specify the optional parameter *quantizeTransformer = False*.
 
 Every parameter set automatically by a profile can be overridden with one or multiple parameters accepted by *offload.all* (see below):
 ```
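The installation command and profile constants are truncated in this diff, so the snippet below is only a sketch of what overriding profile defaults could look like. The parameter names (`pinnedMemory`, `quantizeTransformer`, `verboseLevel`, `compile`, `perc_reserved_mem_max`) come from the diffed README and code; the import path and the pipeline object are placeholders.

```python
from mmgp import offload  # assumption: the helpers live in the mmgp.offload module

# Placeholder: a diffusers-style pipeline or a dict of models such as
# {"text_encoder": ..., "transformer": ..., "vae": ...}, loaded elsewhere.
pipe = build_my_pipeline()  # hypothetical helper, not part of mmgp

offload.all(
    pipe,
    pinnedMemory=True,           # pin model data to reserved RAM
    quantizeTransformer=False,   # skip the default 8-bit quantization of 'transformer'
    verboseLevel=1,              # 0..2, as documented above
    compile=["transformer"],     # model ids to compile (requires Triton)
    perc_reserved_mem_max=0.6,   # raise the reservable-RAM cap above the 0.4/0.5 default
)
```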
@@ -100,13 +118,20 @@ For example:
 The smaller this number, the more VRAM left for image data / longer video but also the slower because there will be lots of loading / unloading between the RAM and the VRAM. If model is too big to fit in a budget, it will be broken down in multiples parts that will be unloaded / loaded consequently. The speed of low budget can be increased (up to 2 times) by turning on the options pinnedMemory and asyncTransfers.
 - asyncTransfers: boolean, load to the GPU the next model part while the current part is being processed. This requires twice the budget if any is defined. This may increase speed by 20% (mostly visible on fast modern GPUs).
 - verboseLevel: number between 0 and 2 (1 by default), provides various level of feedback of the different processes
-- compile: list of model ids to compile, may accelerate up x2 depending on the type of GPU. It makes
+- compile: list of model ids to compile, may accelerate up x2 depending on the type of GPU. It makes sense to compile only the model that is frequently used such as the "transformer" model in the case of video or image generation. Compilation requires Triton to be installed. Triton is available out of the box on Linux or WSL but requires to be installed with Windows: https://github.com/woct0rdho/triton-windows
 
 If you are short on RAM and plan to work with quantized models, it is recommended to load pre-quantized models direclty rather than using on the fly quantization, it will be faster and consume slightly less RAM.
 
 ## Going further
 
 The module includes several tools to package a light version of your favorite video / image generator:
+- *extract_models(string prefix, obj to explore)*\
+This tool will try to detect for you models that are embedded in a pipeline or in some custom class. It will save you time by building a pipe dictionary required par *offload.all* or "offload.profile*. The prefix correponds to the text that will appear before the name of each model in the dictionary.
+
+- *load_loras_into_model(model, lora_path, lora_multi)*\
+Load in a model a list of Lora described by a list of path *lora_path* and a list of *weights coefficients*.
+The Lora file must be in the *diffusers* format. This function works also on non diffusers models. However if there is already an official Lora support for a model it is recommended to use the official diffusers functions.
+
 - *save_model(model, file_path, do_quantize = False, quantizationType = qint8 )*\
 Save tensors of a model already loaded in memory in a safetensor format (much faster to reload). You can save it in a quantized format (default qint8 quantization recommended).
 The resulting safetensor file will contain extra fields in its metadata such as the quantization map and its configuration, so you will be able to move the file around without files such as *config.json* or *file_map.json*.
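The two helpers newly documented above (`extract_models` and `load_loras_into_model`) could be used roughly as sketched below; the signatures follow the README text, while the import location, the explored object and the Lora paths are placeholders.

```python
from mmgp import offload  # assumption: the tools are exposed on the offload module

# Build the pipe dictionary automatically from an object that holds the models;
# `app` is a placeholder for the application object being explored.
pipe = offload.extract_models("pipe", app)

# Load two diffusers-format Loras into the main model with their weight coefficients.
offload.load_loras_into_model(
    pipe["pipe_transformer"],   # hypothetical key derived from the "pipe" prefix
    ["loras/style_a.safetensors", "loras/style_b.safetensors"],
    [1.0, 0.8],
)
```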
@@ -120,16 +145,13 @@ Initialize (build the model hierarchy in memory) and fast load the corresponding
 The advantages over the original *from_pretrained* method is that a full model can fit into a single file with a filename of your choosing (thefore you can have multiple 'transformers' versions of the same model in the same directory) and prequantized models are processed in a transparent way.
 Last but not least, you can also on the fly pin to RAM the whole model or the most important part of it (partialPin = True) in a more efficient way (faster and requires less RAM) than if you did through *offload.all* or *offload.profile*.
 
-- *load_loras_into_model(model, lora_path, lora_multi)
-Load in a model a list of Lora described by a list of path *lora_path* and a list of *weights coefficients*.
-The Lora file must be in the *diffusers* format. This function works also on non diffusers models. However if there is already an official Lora support for a model it is recommended to use the official diffusers functions.
 
 The typical workflow wil be:
 1) temporarly insert the *save_model* function just after a model has been fully loaded to save a copy of the model / quantized model.
 2) replace the full initalizing / loading logic with *fast_load_transformers_model* (if there is a *from_pretrained* call to a transformers object) or only the tensor loading functions (*torch.load_model_file* and *torch.load_state_dict*) with *load_model_data after* the initializing logic.
 
 ## Special cases
-Sometime there isn't an explicit pipe object as each submodel is loaded separately in the main app. If this is the case, you
+Sometime there isn't an explicit pipe object as each submodel is loaded separately in the main app. If this is the case, you may try to use *extract_models* or create a dictionary that manually maps all the models.\
 For instance :
 
 
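The save-then-fast-load workflow described above could look roughly like this; `save_model`'s signature is given in the README, while the `fast_load_transformers_model` call, the file name and the `transformer` variable are placeholders.

```python
from mmgp import offload  # assumption: both helpers live in the offload module

# Step 1 (run once): after `transformer` has been fully loaded the normal way,
# save a quantized single-file copy (qint8 is the documented default type).
offload.save_model(transformer, "transformer_quanto_int8.safetensors", do_quantize=True)

# Step 2 (later runs): replace the original initializing / loading logic with the
# fast loader; its exact signature is not shown in this diff, so treat this call
# as illustrative only.
transformer = offload.fast_load_transformers_model("transformer_quanto_int8.safetensors")
```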
@@ -143,9 +165,9 @@ pipe = { "text_encoder": self.text_encoder, "transformer": self.dit, "vae":self.
 ```
 
 
-Please note
+Please note it is recommended to have always one model whose Id is 'transformer' so that you can leverage predefined profiles. The 'transformer' corresponds to the main image / video model which usually needs to be quantized (this is done on the fly by default when loading the model).
 
-
+Be careful, lots of models use the T5 XXL as a text encoder. However, quite often their corresponding pipeline configurations point at the official Google T5 XXL repository
 where there is a huge 40GB model to download and load. It is cumbersorme as it is a 32 bits model and contains the decoder part of T5 that is not used.
 I suggest you use instead one of the 16 bits encoder only version available around, for instance:
 ```
mmgp-3.1.4.post1.dist-info/RECORD
ADDED

@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=W74N_5-UPC3VXjmrpnRsrXxeB3xF17xqy4D0bZywfzI,69497
+mmgp/safetensors2.py,sha256=OkJAvENfWeb-PL0FcxS1-eYeHLbemTaNXYvNxURrzIs,16154
+mmgp-3.1.4.post1.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+mmgp-3.1.4.post1.dist-info/METADATA,sha256=hNYMSq_iwLiuk3oJD4WL_41K4lESTCYU_AeQU0VDB8w,14277
+mmgp-3.1.4.post1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+mmgp-3.1.4.post1.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.1.4.post1.dist-info/RECORD,,
mmgp-3.1.3.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=JB40Ky84Njhuf2BauLvNhH_-IS_27lhfYuLqVVhmJtA,71080
-mmgp/safetensors2.py,sha256=OkJAvENfWeb-PL0FcxS1-eYeHLbemTaNXYvNxURrzIs,16154
-mmgp-3.1.3.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.1.3.dist-info/METADATA,sha256=pfkzWdQKY-7wNEMN66pwUPxfmXDGZSjJpBwvYolUDb4,12708
-mmgp-3.1.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-mmgp-3.1.3.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.1.3.dist-info/RECORD,,
{mmgp-3.1.3.dist-info → mmgp-3.1.4.post1.dist-info}/LICENSE.md
File without changes

{mmgp-3.1.3.dist-info → mmgp-3.1.4.post1.dist-info}/WHEEL
File without changes

{mmgp-3.1.3.dist-info → mmgp-3.1.4.post1.dist-info}/top_level.txt
File without changes