mmgp 3.5.12__tar.gz → 3.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mmgp might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.12
+Version: 3.6.1
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file


 <p align="center">
-<H2>Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.6.1 for the GPU Poor by DeepBeepMeep</H2>
 </p>

@@ -1,6 +1,6 @@

 <p align="center">
-<H2>Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.6.1 for the GPU Poor by DeepBeepMeep</H2>
 </p>

@@ -1,6 +1,6 @@
 [project]
 name = "mmgp"
-version = "3.5.12"
+version = "3.6.1"
 authors = [
   { name = "deepbeepmeep", email = "deepbeepmeep@yahoo.com" },
 ]
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.6.1 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -688,7 +688,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.12) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.1) by DeepBeepMeep ************{ENDC}{UNBOLD}")

 def change_dtype(model, new_dtype, exclude_buffers = False):
     for submodule_name, submodule in model.named_modules():
@@ -1097,7 +1097,9 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora

     invalid_keys = []
     unexpected_keys = []
-    for k, v in state_dict.items():
+    new_state_dict = {}
+    for k in list(state_dict.keys()):
+        v = state_dict.pop(k)
         lora_A = lora_B = diff_b = diff = lora_key = None
         if k.endswith(".diff"):
             diff = v
@@ -1141,6 +1143,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
                 error_msg = append(error_msg, msg)
                 fail = True
                 break
+            v = lora_A = lora_A.to(module.weight.dtype)
         elif lora_B != None:
             rank = lora_B.shape[1]
             if module_shape[0] != v.shape[0]:
@@ -1151,6 +1154,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
                 error_msg = append(error_msg, msg)
                 fail = True
                 break
+            v = lora_B = lora_B.to(module.weight.dtype)
         elif diff != None:
             lora_B = diff
             if module_shape != v.shape:
@@ -1161,6 +1165,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
                 error_msg = append(error_msg, msg)
                 fail = True
                 break
+            v = lora_B = lora_B.to(module.weight.dtype)
         elif diff_b != None:
             rank = diff_b.shape[0]
             if not hasattr(module, "bias"):
@@ -1179,8 +1184,11 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
                 error_msg = append(error_msg, msg)
                 fail = True
                 break
+            v = diff_b = diff_b.to(module.weight.dtype)

         if not check_only:
+            new_state_dict[k] = v
+            v = None
             loras_module_data = loras_model_data.get(module, None)
             assert loras_module_data != None
             loras_adapter_data = loras_module_data.get(adapter_name, None)
@@ -1188,11 +1196,11 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
                 loras_adapter_data = [None, None, None, 1.]
                 loras_module_data[adapter_name] = loras_adapter_data
             if lora_A != None:
-                loras_adapter_data[0] = lora_A.to(module.weight.dtype)
+                loras_adapter_data[0] = lora_A
             elif lora_B != None:
-                loras_adapter_data[1] = lora_B.to(module.weight.dtype)
+                loras_adapter_data[1] = lora_B
             else:
-                loras_adapter_data[2] = diff_b.to(module.weight.dtype)
+                loras_adapter_data[2] = diff_b
             if rank != None and lora_key is not None and "lora" in lora_key:
                 alpha_key = k[:-len(lora_key)] + "alpha"
                 alpha = lora_alphas.get(alpha_key, None)
@@ -1220,7 +1228,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
     if not check_only:
         # model._loras_tied_weights[adapter_name] = tied_weights
         if pinnedLora:
-            pinned_sd_list.append(state_dict)
+            pinned_sd_list.append(new_state_dict)
             pinned_names_list.append(path)
         # _pin_sd_to_memory(state_dict, path)

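Note on the load_loras_into_model hunks above: the new code drains state_dict key by key, casts each tensor to the owning module's weight dtype at validation time, and collects the converted tensors into new_state_dict, which is what gets pinned later. A minimal sketch of that pop-and-rebuild pattern (convert_state_dict and the toy tensors are hypothetical, not mmgp code):

import torch

def convert_state_dict(state_dict, target_dtype):
    # Drain the source dict while building the converted copy, so the original
    # and converted versions of every tensor never coexist for the whole dict.
    new_state_dict = {}
    for k in list(state_dict.keys()):   # list() because the dict is mutated below
        v = state_dict.pop(k)           # release the source dict's reference
        new_state_dict[k] = v.to(target_dtype)
    return new_state_dict

sd = {"lora_A": torch.randn(16, 8), "lora_B": torch.randn(8, 16)}
sd = convert_state_dict(sd, torch.bfloat16)   # toy usage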
@@ -2287,9 +2295,10 @@ class offload:
         src = f"""
 def {fname}(module, *args, **kwargs):
     _ = __TYPE_CONST # anchor type as a constant to make code object unique per class
+    nada = "{fname}"
     mgr = module._mm_manager
     mgr._pre_check(module)
-    return module._mm_forward(*args, **kwargs)
+    return module._mm_forward(*args, **kwargs) #{fname}
 """
         ns = {"__TYPE_CONST": mod_cls}
         exec(src, ns) # compile a new function object/code object for this class
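For context on the generated-wrapper hunk above: the source string is exec'd into a fresh namespace so that every module class gets its own function object and, as the inline comment says, its own code object. A stripped-down sketch of the same exec pattern (make_wrapper and wrapped_forward_* are hypothetical names, not mmgp's):

import torch

def make_wrapper(mod_cls):
    # Build a distinct function object per module class by exec'ing generated source.
    fname = f"wrapped_forward_{mod_cls.__name__}"
    src = f"""
def {fname}(module, *args, **kwargs):
    _ = __TYPE_CONST   # reference the class so the generated code is tied to it
    return module._mm_forward(*args, **kwargs)
"""
    ns = {"__TYPE_CONST": mod_cls}
    exec(src, ns)                       # compiles a new function object into ns
    return ns[fname]

lin = torch.nn.Linear(3, 3)
lin._mm_forward = lin.forward           # keep the original forward around
wrapper = make_wrapper(torch.nn.Linear)
print(wrapper.__name__)                          # wrapped_forward_Linear
print(wrapper(lin, torch.randn(2, 3)).shape)     # torch.Size([2, 3])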
@@ -2310,7 +2319,8 @@ def {fname}(module, *args, **kwargs):
         wrapper_fn = self._get_wrapper_for_type(type(target_module))

         # bind as a bound method (no partial/closures)
-        target_module.forward = types.MethodType(wrapper_fn, target_module)
+        # target_module.forward = types.MethodType(wrapper_fn, target_module)
+        target_module.forward = functools.update_wrapper(functools.partial(wrapper_fn, target_module), previous_method)

     def hook_check_load_into_GPU_if_needed_default(self, target_module, model, model_id, blocks_name, previous_method, context):

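The binding change above replaces types.MethodType with functools.partial wrapped by functools.update_wrapper: partial pre-fills the module argument, and update_wrapper copies the original forward's metadata (__name__, __doc__, __wrapped__, ...) onto the partial object. A simplified illustration of the two styles (wrapper_fn is a stand-in, not the generated mmgp wrapper):

import functools
import types
import torch

def wrapper_fn(module, *args, **kwargs):
    # stand-in for the generated per-class wrapper: pre-checks would run here
    return module._mm_forward(*args, **kwargs)

m = torch.nn.Linear(4, 4)
m._mm_forward = m.forward                      # stash the original forward

# Style 1: bind as a bound method
m.forward = types.MethodType(wrapper_fn, m)

# Style 2: bind as a partial that keeps the original forward's metadata
m.forward = functools.update_wrapper(functools.partial(wrapper_fn, m), m._mm_forward)

print(m.forward(torch.randn(2, 4)).shape)      # torch.Size([2, 4])
print(m.forward.__name__)                      # 'forward', copied by update_wrapper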
@@ -2345,12 +2355,12 @@ def {fname}(module, *args, **kwargs):
         if isinstance(target_module, torch.nn.Linear):
             def check_load_into_GPU_needed_linear(module, *args, **kwargs):
                 check_load_into_GPU_needed()
-                return previous_method(*args, **kwargs)
+                return previous_method(*args, **kwargs) # linear
             check_load_into_GPU_needed_module = check_load_into_GPU_needed_linear
         else:
             def check_load_into_GPU_needed_other(module, *args, **kwargs):
                 check_load_into_GPU_needed()
-                return previous_method(*args, **kwargs)
+                return previous_method(*args, **kwargs) # other
             check_load_into_GPU_needed_module = check_load_into_GPU_needed_other

         setattr(target_module, "_mm_id", model_id)
@@ -2498,7 +2508,7 @@ def {fname}(module, *args, **kwargs):



-    def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, vram_safety_coefficient = 0.8, verboseLevel = -1):
+    def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, vram_safety_coefficient = 0.8, compile_mode ="default", verboseLevel = -1):
         """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
         pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
         quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -2771,8 +2781,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                 elif compilationInThisOne and submodule in towers_modules:
                     self.hook_preload_blocks_for_compilation(submodule, model_id, cur_blocks_name, context = submodule_name )
                 else:
-                    if compilationInThisOne and False:
-                        self.hook_check_load_into_GPU_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
+                    if compilationInThisOne: #and False
+                        self.hook_check_load_into_GPU_if_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
                     else:
                         self.hook_check_load_into_GPU_if_needed_default(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )

@@ -2789,7 +2799,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                 print(f"Pytorch compilation of model '{model_id}' is not yet supported.")

             for submodel in towers_modules:
-                submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode="default" ) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
+                submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode= compile_mode) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
                 #dynamic=True,

             self.tune_preloading(model_id, current_budget, towers_names)
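Taken together, the last three hunks thread a new compile_mode argument from offload.all() down to torch.compile(). Based only on the signature visible in this diff, a call opting into the new argument might look roughly like the sketch below (the dict-of-modules pipe is a toy stand-in, the mode string is just one of the modes named in the inline comment, and a real run would need a CUDA-capable setup):

from mmgp import offload
import torch

# Toy stand-in: the docstring above says a dict of modules is accepted in place
# of a pipeline object; a real call would pass an actual video/image pipeline.
pipe = {"transformer": torch.nn.Linear(64, 64)}

offload.all(
    pipe,
    quantizeTransformer=False,
    compile=True,
    compile_mode="max-autotune-no-cudagraphs",  # forwarded to torch.compile(mode=...)
    verboseLevel=1,
)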
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.12
+Version: 3.6.1
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file


 <p align="center">
-<H2>Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.6.1 for the GPU Poor by DeepBeepMeep</H2>
 </p>

5 files without changes