mmgp 3.5.0__py3-none-any.whl → 3.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mmgp might be problematic.
- mmgp/offload.py +162 -46
- {mmgp-3.5.0.dist-info → mmgp-3.5.3.dist-info}/METADATA +2 -2
- mmgp-3.5.3.dist-info/RECORD +9 -0
- {mmgp-3.5.0.dist-info → mmgp-3.5.3.dist-info}/licenses/LICENSE.md +1 -1
- mmgp-3.5.0.dist-info/RECORD +0 -9
- {mmgp-3.5.0.dist-info → mmgp-3.5.3.dist-info}/WHEEL +0 -0
- {mmgp-3.5.0.dist-info → mmgp-3.5.3.dist-info}/top_level.txt +0 -0
mmgp/offload.py
CHANGED
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.5.
+# ------------------ Memory Management 3.5.3 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -253,17 +253,17 @@ def _remove_model_wrapper(model):
 def _move_to_pinned_tensor(source_tensor, big_tensor, offset, length):
     dtype= source_tensor.dtype
     shape = source_tensor.shape
-    if len(shape)
-        return source_tensor
-    else:
+    if len(shape) > 0 :
         t = source_tensor.view(torch.uint8)
         t = torch.reshape(t, (length,))
-
-
-
-
-
-
+    else:
+        t = source_tensor
+    # magic swap !
+    big_tensor[offset: offset + length] = t
+    t = big_tensor[offset: offset + length]
+    t = t.view(dtype)
+    t = torch.reshape(t, shape)
+    assert t.is_pinned()
     return t
 
 def _safetensors_load_file(file_path, writable_tensors = True):
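The rewritten `_move_to_pinned_tensor` copies a tensor's bytes into a slice of one large page-locked buffer and then keeps that pinned slice, viewed back with the original dtype and shape, as the tensor's new storage. Below is a minimal standalone sketch of the same staging idea; the buffer size, helper name and example tensor are hypothetical and not mmgp's API.

```python
import torch

def move_to_pinned(source, pinned_buffer, offset):
    """Copy `source`'s bytes into `pinned_buffer` and return a pinned view of them."""
    dtype, shape = source.dtype, source.shape
    length = source.numel() * source.element_size()
    raw = source.contiguous().view(torch.uint8).reshape(length)   # reinterpret as raw bytes
    pinned_buffer[offset: offset + length] = raw                  # stage the bytes in pinned RAM
    pinned_view = pinned_buffer[offset: offset + length].view(dtype).reshape(shape)
    assert pinned_view.is_pinned()                                # a slice of a pinned buffer stays pinned
    return pinned_view, offset + length

buf = torch.empty(1 << 20, dtype=torch.uint8, pin_memory=True)    # hypothetical 1 MB staging buffer
w = torch.randn(64, 64)
w_pinned, next_offset = move_to_pinned(w, buf, 0)
print(w_pinned.is_pinned(), torch.equal(w, w_pinned))             # True True
# w_pinned.to("cuda", non_blocking=True) can now use a fast DMA transfer.
```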
@@ -336,9 +336,8 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE
     names_list = sd_name if isinstance(sd, list) else [sd_name]
 
     if max_pinnable_bytes > 0 and total_pinned_bytes >= max_pinnable_bytes:
-
         if verboseLevel>=1 :
-            print(f"Unable pin data of '{','.join(names_list)}' to reserved RAM as there is no reserved RAM left")
+            print(f"Unable to pin data of '{','.join(names_list)}' to reserved RAM as there is no reserved RAM left. Transfer speed from RAM to VRAM will may be slower.")
         return
 
 
@@ -404,7 +403,7 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE
             big_tensors.append(current_big_tensor)
         except:
             incomplete_pinning = True
-            print(f"Unable to pin more tensors for '{sd_name}' as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
+            print(f"Unable to pin more tensors for '{sd_name}' as the maximum reservable memory has been reached ({total/ONE_MB:.2f}). Transfer speed from RAM to VRAM may be slower.")
             break
 
         last_big_tensor += 1
@@ -442,12 +441,12 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE
 
     if verboseLevel >=1:
         if incomplete_pinning :
-            if len(names_list) >
+            if len(names_list) > 1:
                 print(f"'{','.join(names_list)}' were partially pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
             else:
                 print(f"'{','.join(names_list)}' was partially pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
         else:
-            if len(names_list) >
+            if len(names_list) > 1:
                 print(f"'{','.join(names_list)}' were pinned entirely to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
             else:
                 print(f"'{','.join(names_list)}' was pinned entirely to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
@@ -462,7 +461,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
     if max_pinnable_bytes > 0 and total_pinned_bytes >= max_pinnable_bytes:
 
         if verboseLevel>=1 :
-            print(f"Unable pin data of '{model_id}' to reserved RAM as there is no reserved RAM left")
+            print(f"Unable to pin data of '{model_id}' to reserved RAM as there is no reserved RAM left. Transfer speed from RAM to VRAM may be slower.")
         return
 
     if partialPinning:
@@ -499,7 +498,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
     else:
         print(f"Pinning data of '{model_id}' to reserved RAM")
 
-    if
+    if len(params_dict) == 0:
         return
 
     ref_cache = {}
@@ -521,13 +520,22 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
         else:
             if isinstance(p, QTensor):
                 if p._qtype == qint4:
+                    if p._data._data.is_pinned():
+                        params_dict[n] = (None, False)
+                        continue
                     if hasattr(p,"_scale_shift"):
                         length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale_shift) * p._scale_shift.element_size()
                     else:
                         length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale) * p._scale.element_size() + torch.numel(p._shift) * p._shift.element_size()
                 else:
                     length = torch.numel(p._data) * p._data.element_size() + torch.numel(p._scale) * p._scale.element_size()
+                    if p._data.is_pinned():
+                        params_dict[n] = (None, False)
+                        continue
             else:
+                if p.data.is_pinned():
+                    params_dict[n] = (None, False)
+                    continue
                 length = torch.numel(p.data) * p.data.element_size()
 
         ref_cache[ref] = (n, length)
@@ -544,7 +552,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
         current_big_tensor_size += length
 
         total_tensor_bytes += length
-
+        p = None
     if verboseLevel >=1 and tied_weights_count > 0:
         if tied_weights_count == 1:
             print(f"Tied weights of {tied_weights_total/ONE_MB:0.2f} MB detected: {tied_weights_last}")
@@ -570,6 +578,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
     tensor_no = 0
     # prev_big_tensor = 0
     for n, (p, is_buffer) in params_dict.items():
+        if p is None: continue
         q_name = tied_weights.get(n,None)
         if q_name != None:
             q , _ = params_dict[q_name]
@@ -658,7 +667,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.3) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def change_dtype(model, new_dtype, exclude_buffers = False):
     for submodule_name, submodule in model.named_modules():
@@ -1220,9 +1229,29 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
     activate_loras(model, loras_nos, loras_multi)
     return new_lora_path
 
+
+def merge_dicts(A, B):
+    for key, value in A.items():
+        if isinstance(value, dict):
+            if key not in B or not isinstance(B[key], dict):
+                B[key] = value  # Copy entire dict reference from A
+            else:
+                merge_dicts(value, B[key])  # Recurse into both dicts
+        else:
+            B[key] = value  # Copy non-dict value from A to B
+
+
+def sync_models_loras(model, model2):
+    merge_dicts(model._loras_model_shortcuts , model2._loras_model_shortcuts)
+    model2._loras_active_adapters = model._loras_active_adapters
+    model2._loras_adapters = model._loras_adapters
+    model2._loras_scaling = model._loras_scaling
+
 def unload_loras_from_model(model):
     for _, v in model._loras_model_data.items():
         v.clear()
+    for _, v in model._loras_model_shortcuts.items():
+        v.clear()
 
     model._loras_active_adapters = []
     model._loras_scaling = dict()
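A hypothetical illustration of how the new `merge_dicts` helper behaves (the example dictionaries are made up): nested dicts from `A` are merged into `B` in place, and non-dict values from `A` overwrite `B`'s entries. `sync_models_loras` relies on this so that a second model reuses the first model's LoRA shortcut dicts, adapters and scaling.

```python
A = {"blocks.0.attn": {"style_GPU": ("A1", "B1")}, "alpha": 8}
B = {"blocks.0.attn": {"detail_GPU": ("A2", "B2")}, "alpha": 4}

merge_dicts(A, B)   # the helper added in the diff above

# Nested dicts are merged, scalar values are overwritten by A's value.
assert B == {
    "blocks.0.attn": {"style_GPU": ("A1", "B1"), "detail_GPU": ("A2", "B2")},
    "alpha": 8,
}
```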
@@ -1262,7 +1291,7 @@ def move_loras_to_device(model, device="cpu" ):
         if ".lora_" in k:
             m.to(device)
 
-def fast_load_transformers_model(model_path: str,
+def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, defaultConfigPath = None, modelClass=None, modelPrefix = None, writable_tensors = True, verboseLevel = -1, modules = None, return_shared_modules = None, configKwargs ={}):
     """
     quick version of .LoadfromPretrained of the transformers library
     used to build a model and load the corresponding weights (quantized or not)
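A hedged usage sketch of the extended `fast_load_transformers_model` signature; the checkpoint paths and the idea of sharing a shard between two models are assumptions based on the new `modules` and `return_shared_modules` parameters, not documented mmgp usage.

```python
from mmgp import offload

shared = {}

# First load: a main checkpoint plus an extra shard whose tensors get recorded in `shared`.
model_a = offload.fast_load_transformers_model(
    "ckpts/model_a.safetensors",                 # hypothetical main checkpoint
    modules="ckpts/shared_blocks.safetensors",   # hypothetical extra shard, appended to the file list
    return_shared_modules=shared,                # filled with the shard's state dict / quantization map / parameters
    pinToMemory=True,
)

# Second load: reuse the recorded shard so both models tie those parameters instead of duplicating them.
model_b = offload.fast_load_transformers_model(
    "ckpts/model_b.safetensors",                 # hypothetical second checkpoint
    modules=shared,
)
```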
@@ -1331,42 +1360,36 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat
         model = transfomer_class(config_obj)
 
 
-
-        class_name = transformer_config["_class_name"]
-
+    else:
         if modelClass !=None:
             transfomer_class = modelClass
-
+        elif "_class_name" in transformer_config:
+            class_name = 'Transformer3DModel'
             module = __import__("diffusers")
             transfomer_class = getattr(module, class_name)
+        else:
+            raise Exception("class not defined")
 
         with init_empty_weights():
             model = transfomer_class.from_config(transformer_config )
 
 
     torch.set_default_device('cpu')
+    model.eval().requires_grad_(False)
 
     model._config = transformer_config
 
-    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, writable_tensors =writable_tensors ,verboseLevel=verboseLevel )
+    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, writable_tensors =writable_tensors, modules = modules, return_shared_modules = return_shared_modules, verboseLevel=verboseLevel )
 
     return model
 
 
 
-def load_model_data(model, file_path
+def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, modules = None, return_shared_modules = None, verboseLevel = -1):
     """
     Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
     """
-    if not isinstance(file_path, list):
-        file_path = [file_path]
-
-    file_path = [ _get_model(file) for file in file_path]
-    if any( file == None for file in file_path):
-        raise Exception("Unable to find file")
-    verboseLevel = _compute_verbose_level(verboseLevel)
 
-    model = _remove_model_wrapper(model)
 
     def filter_state_dict(state_dict, base_model_prefix):
         new_state_dict= {}
@@ -1387,10 +1410,34 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
             new_state_dict[k[ start:]] = v
         return new_state_dict
 
+
+
+    if not isinstance(file_path, list):
+        file_path = [file_path]
+
+    file_count = len(file_path)
+    if isinstance(modules, (list,str)):
+        if isinstance(modules, str): modules = [modules]
+        file_path += modules
+        modules = None
+
+    file_path = [ _get_model(file) for file in file_path]
+    if any( file == None for file in file_path):
+        raise Exception("Unable to find file")
+    verboseLevel = _compute_verbose_level(verboseLevel)
+
+    model = _remove_model_wrapper(model)
+
+    if return_shared_modules is not None:
+        return_state_dict ={}
+        return_quantization_map ={}
+        return_shared_modules["state_dict"] = return_state_dict
+        return_shared_modules["quantization_map"] = return_quantization_map
+
     full_quantization_map = {}
     full_tied_weights_map = {}
     full_state_dict = {}
-    for file in file_path:
+    for no, file in enumerate(file_path):
         quantization_map = None
         tied_weights_map = None
         if not (".safetensors" in file or ".sft" in file):
@@ -1443,6 +1490,13 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
             full_quantization_map.update(quantization_map)
         if tied_weights_map != None:
             full_tied_weights_map.update(tied_weights_map)
+        if return_shared_modules is not None and no >= file_count:
+            return_state_dict.update(state_dict)
+            if quantization_map is not None: return_quantization_map.update(quantization_map)
+
+    if isinstance(modules, dict) :
+        full_state_dict.update(modules["state_dict"])
+        full_quantization_map.update(modules["quantization_map"])
 
     state_dict, quantization_map, tied_weights_map = full_state_dict, full_quantization_map, full_tied_weights_map
     full_state_dict, full_quantization_map, full_tied_weights_map = None, None, None
@@ -1463,7 +1517,7 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
 
 
     missing_keys , unexpected_keys = model.load_state_dict(state_dict, False, assign = True )
-    if len(missing_keys) > 0
+    if len(missing_keys) > 0 :
         # if there is a key mismatch maybe we forgot to remove some prefix
         base_model_prefix = None
         for k,v in state_dict.items():
@@ -1474,18 +1528,53 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
             raise Exception(f"Missing keys: {missing_keys}")
         state_dict = filter_state_dict(state_dict, base_model_prefix)
         missing_keys , unexpected_keys = model.load_state_dict(state_dict, False, assign = True )
+
     del state_dict
+
     if len(unexpected_keys) > 0 and verboseLevel >=2:
         print(f"Unexpected keys while loading '{file_path}': {unexpected_keys}")
 
     for k,p in model.named_parameters():
-        if p.is_meta:
+        if p.is_meta :
             txt = f"Incompatible State Dictionary or 'Init_Empty_Weights' not set since parameter '{k}' has no data"
             raise Exception(txt)
     for k,b in model.named_buffers():
-        if b.is_meta:
+        if b.is_meta :
             txt = f"Incompatible State Dictionary or 'Init_Empty_Weights' not set since buffer '{k}' has no data"
             raise Exception(txt)
+
+    if return_shared_modules is not None:
+        mods = { k : v for k,v in model.named_modules()}
+        return_parameters = {}
+        return_shared_modules["parameters"] = return_parameters
+        for k in return_state_dict:
+            if k.endswith("._data"):
+                k = k[:-6]
+            pos = k.rfind(".")
+            mod_name = k[:pos]
+            param_name = k[pos +1:]
+            mod = mods.get(mod_name, None)
+            if mod is not None:
+                p = mod._parameters.get(param_name, None)
+                if p is None: p = mod._buffers.get(param_name, None)
+                if p is not None:
+                    return_parameters[k] = p
+        del mods
+
+    if isinstance(modules, dict) :
+        mods = { k : v for k,v in model.named_modules()}
+        # replace Parameter outer shell so that both models parameters are tied
+        for k, rep_p in modules["parameters"].items():
+            pos = k.rfind(".")
+            mod_name = k[:pos]
+            param_name = k[pos +1:]
+            mod = mods.get(mod_name, None)
+            if mod is not None:
+                setattr(mod, param_name, rep_p)
+        del mods
+        modules["parameters"].clear()
+        modules["state_dict"].clear()
+    rep_p = p = None
 
     if do_quantize:
         if quantization_map != None and len(quantization_map) > 0 :
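The block that consumes `modules["parameters"]` replaces each matching parameter's `nn.Parameter` "outer shell" on the second model, so both models end up pointing at the same storage. A standalone sketch of that tying trick, using assumed toy modules rather than mmgp's models:

```python
import torch

a = torch.nn.Linear(4, 4, bias=False)
b = torch.nn.Linear(4, 4, bias=False)

# Assigning a's Parameter object onto b re-registers it in b._parameters,
# so both modules share one underlying tensor.
setattr(b, "weight", a.weight)

assert b.weight is a.weight
with torch.no_grad():
    a.weight.zero_()
assert bool((b.weight == 0).all())   # the change is visible through both modules
```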
@@ -1500,7 +1589,7 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
 
     return
 
-def save_model(model, file_path, do_quantize = False, quantizationType = qint8, verboseLevel = -1, config_file_path = None ):
+def save_model(model, file_path, do_quantize = False, quantizationType = qint8, verboseLevel = -1, config_file_path = None, filter_sd =None ):
     """save the weights of a model and quantize them if requested
     These weights can be loaded again using 'load_model_data'
     """
@@ -1541,6 +1630,24 @@ def save_model(model, file_path, do_quantize = False, quantizationType = qint8,
     cache_ref = {}
     tied_weights_map = {}
     sd = model.state_dict()
+    if filter_sd != None:
+        new_sd = {}
+        new_quantization_map = {}
+        for k_k in filter_sd:
+            for s in [".weight", ".bias", ".weight._data", ".weight._scale"]:
+                if k_k.endswith(s):
+                    k_k= k_k[:-len(s)]
+                    break
+            for k,v in sd.items():
+                if k.startswith(k_k):
+                    new_sd[k] = v
+            if quantization_map != None:
+                for k,v in quantization_map.items():
+                    if k.startswith(k_k):
+                        new_quantization_map[k] = v
+        sd = new_sd
+        if quantization_map != None: quantization_map = new_quantization_map
+
     out_sd = OrderedDict()
 
 
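A hedged sketch of the new `filter_sd` argument to `save_model`: passing a collection of state-dict keys (for instance the shared shard recorded via `return_shared_modules`) writes out only the matching sub-modules. The output path and the source of the keys are assumptions carried over from the earlier hypothetical example.

```python
from mmgp import offload

# `model_a` and `shared` as in the earlier fast_load_transformers_model sketch (hypothetical).
offload.save_model(
    model_a,
    "ckpts/shared_blocks_quanto_int8.safetensors",   # hypothetical output file
    do_quantize=True,
    filter_sd=shared["state_dict"],                  # keep only the weights whose keys match these entries
)
```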
@@ -1755,6 +1862,12 @@ class offload:
                 if tied_p.is_cuda:
                     setattr(parent_module, n , tied_p)
                     continue
+            # if hasattr(p,'_data'):
+            #     if not p._data.is_pinned() or not p._scale.is_pinned():
+            #         pass
+            # else:
+            #     if not p.data.is_pinned():
+            #         pass
 
             q = p.to("cuda", non_blocking=True)
             if is_buffer:
@@ -2014,6 +2127,8 @@ class offload:
             weight = weight.clone()
             for active_adapter in active_adapters:
                 data = loras_data.get(active_adapter + '_GPU', None)
+                if data == None:
+                    continue
                 lora_A_weight, lora_B_weight, diff_b, alpha = data
                 scaling = self._get_lora_scaling(loras_scaling, model, active_adapter) * alpha
                 if lora_A_weight != None:
@@ -2028,7 +2143,6 @@ class offload:
                         original_bias = False
                         bias.add_(diff_b, alpha=scaling)
                 # base_weight += scaling * lora_B_weight @ lora_A_weight
-                break
             if training:
                 pass
                 # result = torch.nn.functional.linear(dropout(x), base_weight, bias=submodule.bias)
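With the early `break` removed, the adapter loop above now lets every active adapter contribute its low-rank update, in the spirit of the commented line `base_weight += scaling * lora_B_weight @ lora_A_weight`. A minimal sketch with made-up shapes and adapter names:

```python
import torch

out_f, in_f, rank = 64, 32, 4
weight = torch.randn(out_f, in_f)
adapters = {
    "style":  (torch.randn(rank, in_f), torch.randn(out_f, rank), 1.0),   # (lora_A, lora_B, scaling)
    "detail": (torch.randn(rank, in_f), torch.randn(out_f, rank), 0.5),
}

merged = weight.clone()
for name, (lora_A, lora_B, scaling) in adapters.items():
    merged += scaling * (lora_B @ lora_A)   # every adapter is applied; none is skipped by an early break
```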
@@ -2066,10 +2180,12 @@ class offload:
             return result
 
 
-    def hook_lora(self, submodule, current_model, model_id, loras_model_data, submodule_name):
+    def hook_lora(self, submodule, current_model, model_id, loras_model_data, loras_model_shortcuts, submodule_name):
         old_forward = submodule.forward
 
         loras_data = {}
+        assert submodule_name not in loras_model_shortcuts
+        loras_model_shortcuts[submodule_name] = loras_data
         loras_model_data[submodule] = loras_data
 
         if isinstance(submodule, torch.nn.Linear):
@@ -2187,7 +2303,6 @@ class offload:
 
         if current_budget == 0 or towers_names is None or len(towers_names) == 0 or not self.async_transfers:
             return
-        # current_budget = 5000 * ONE_MB
         base_size = self.blocks_of_modules_sizes[model_id]
         current_budget -= base_size
         current_budget = max(0, current_budget)
@@ -2500,14 +2615,14 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                 print(f"Model '{model_id}' already pinned to reserved memory")
             else:
                 _pin_to_memory(current_model, model_id, partialPinning= partialPinning, pinnedPEFTLora = pinnedPEFTLora, perc_reserved_mem_max = perc_reserved_mem_max, verboseLevel=verboseLevel)
-
         current_budget = model_budgets[model_id]
         cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
         self.loaded_blocks[model_id] = None
         any_lora = loras !=None and model_id in loras
         if any_lora:
-            loras_model_data = {}
+            loras_model_data, loras_model_shortcuts = {}, {}
             current_model._loras_model_data = loras_model_data
+            current_model._loras_model_shortcuts = loras_model_shortcuts
         for submodule_name, submodule in current_model.named_modules():
             # create a fake 'accelerate' parameter so that the _execution_device property returns always "cuda"
             # (it is queried in many pipelines even if offloading is not properly implemented)
@@ -2541,7 +2656,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
             if hasattr(submodule, "forward"):
                 # if any_lora and isinstance(submodule, ( torch.nn.Linear, torch.nn.Conv3d, torch.nn.LayerNorm)):
                 if any_lora and hasattr(submodule,"weight"):
-                    submodule_method = self.hook_lora(submodule, current_model, model_id, loras_model_data, submodule_name)
+                    submodule_method = self.hook_lora(submodule, current_model, model_id, loras_model_data, loras_model_shortcuts, submodule_name)
                 else:
                     submodule_method = getattr(submodule, "forward")
                 if callable(submodule_method):
@@ -2551,11 +2666,12 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                         self.hook_preload_blocks_for_compilation(submodule, model_id, cur_blocks_name, context = submodule_name )
                     else:
                         self.hook_check_empty_cache_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
-
+
                 self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)
 
 
         self.tune_preloading(model_id, current_budget, towers_names)
+        self.parameters_ref = {}
 
 
         if self.verboseLevel >=2:
{mmgp-3.5.0.dist-info → mmgp-3.5.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.
+Version: 3.5.3
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.5.
+<H2>Memory Management 3.5.3 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
mmgp-3.5.3.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=owsKU30CiOVioGExG28B9F93R09mTvIoe4RwuHv_f7s,125986
+mmgp/safetensors2.py,sha256=4nKV13qCMabnNEB1TA_ueFbfGYYmiQ9racR_C6SsGug,18693
+mmgp-3.5.3.dist-info/licenses/LICENSE.md,sha256=DD-WIS0BkPoWJ_8hQO3J8hMP9K_1-dyrYv1YCbkxcDU,94
+mmgp-3.5.3.dist-info/METADATA,sha256=lPWpMmbWiXt-ZOV5dRyrBnInGBjBiVkfbFYu19aeOkw,16309
+mmgp-3.5.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mmgp-3.5.3.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.5.3.dist-info/RECORD,,
{mmgp-3.5.0.dist-info → mmgp-3.5.3.dist-info}/licenses/LICENSE.md
CHANGED
@@ -1,2 +1,2 @@
-GNU GENERAL PUBLIC LICENSE
+GNU GENERAL PUBLIC LICENSE
 Version 3, 29 June 2007
mmgp-3.5.0.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=wKWX4eq8FnD2JZuTNHQu3EsiqqsHQ89T9XDNS-dpUqk,120897
-mmgp/safetensors2.py,sha256=4nKV13qCMabnNEB1TA_ueFbfGYYmiQ9racR_C6SsGug,18693
-mmgp-3.5.0.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.5.0.dist-info/METADATA,sha256=TunzrXOHz79NN8TRdE57wEE3NThzo-Uy1DXY3pSzcpQ,16309
-mmgp-3.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mmgp-3.5.0.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.5.0.dist-info/RECORD,,
{mmgp-3.5.0.dist-info → mmgp-3.5.3.dist-info}/WHEEL
File without changes
{mmgp-3.5.0.dist-info → mmgp-3.5.3.dist-info}/top_level.txt
File without changes