mmgp 3.5.7-py3-none-any.whl → 3.5.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mmgp might be problematic.

mmgp/offload.py CHANGED
@@ -1,4 +1,4 @@
- # ------------------ Memory Management 3.5.7 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+ # ------------------ Memory Management 3.5.9 for the GPU Poor by DeepBeepMeep (mmgp)------------------
  #
  # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
  # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -64,6 +64,11 @@ import psutil
  import builtins
  from accelerate import init_empty_weights

+ import functools
+ import types
+ from functools import lru_cache
+ import torch
+

  from mmgp import safetensors2
  from mmgp import profile_type
@@ -666,7 +671,7 @@ def _welcome():
  if welcome_displayed:
  return
  welcome_displayed = True
- print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.7) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+ print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.9) by DeepBeepMeep ************{ENDC}{UNBOLD}")

  def change_dtype(model, new_dtype, exclude_buffers = False):
  for submodule_name, submodule in model.named_modules():
@@ -1174,11 +1179,10 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
  loras_adapter_data[1] = lora_B.to(module.weight.dtype)
  else:
  loras_adapter_data[2] = diff_b.to(module.weight.dtype)
- if rank != None:
- alpha_key = k[:-len("lora_X.weight")] + "alpha"
+ if rank != None and "lora" in lora_key:
+ alpha_key = k[:-len(lora_key)] + "alpha"
  alpha = lora_alphas.get(alpha_key, None)
- alpha = 1. if alpha == None else alpha / rank
- loras_adapter_data[3] = alpha
+ if alpha is not None: loras_adapter_data[3] = alpha / rank
  lora_A = lora_B = diff = diff_b = v = loras_module_data = loras_adapter_data = lora_alphas = None

  if len(invalid_keys) > 0:
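
Note on the alpha handling changed above: LoRA checkpoints store an alpha value per layer, and the update applied to a weight is scaled by alpha / rank, which is the value the new line keeps in loras_adapter_data[3]. A minimal sketch of that arithmetic, with made-up shapes and values (W, A, B and the numbers are illustrative, not taken from mmgp):

import torch

rank, alpha = 16, 8.0
W = torch.zeros(32, 64)          # base weight
A = torch.randn(rank, 64)        # lora_A
B = torch.randn(32, rank)        # lora_B
scaling = alpha / rank           # 8 / 16 = 0.5, the per-adapter scale kept by the loader
W_effective = W + scaling * (B @ A)
print(W_effective.shape)         # torch.Size([32, 64])
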
@@ -1757,6 +1761,7 @@ class offload:
  global last_offload_obj
  last_offload_obj = self

+ self._type_wrappers = {}

  def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name, submodule_name):

@@ -2173,6 +2178,7 @@ class offload:
  scaling = self._get_lora_scaling(loras_scaling, model, active_adapter) * alpha
  if scaling == 0:
  continue
+
  if lora_A == None:
  result.add_(diff_b, alpha=scaling)
  else:
@@ -2204,7 +2210,7 @@ class offload:
  if len(loras_data) == 0:
  return old_forward(*args, **kwargs)
  else:
- submodule.aaa = submodule_name
+ #submodule.aaa = submodule_name # just for debugging if uncommented will cause pytorch recompilation
  return self._lora_linear_forward(current_model, submodule, loras_data, *args, **kwargs)
  target_fn = lora_linear_forward
  else:
@@ -2236,10 +2242,63 @@ class offload:

  # need to be registered before the forward not to be break the efficiency of the compilation chain
  # it should be at the top of the compilation as this type of hook in the middle of a chain seems to break memory performance
- target_module.register_forward_pre_hook(preload_blocks_for_compile)
+ target_module.register_forward_pre_hook(preload_blocks_for_compile)
+
+


- def hook_check_empty_cache_needed(self, target_module, model, model_id, blocks_name, previous_method, context):
+ @torch._dynamo.disable
+ def _pre_check(self, module):
+ model_id = getattr(module, "_mm_model_id", None)
+ blocks_name = getattr(module, "_mm_blocks_name", None)
+
+ self.ensure_model_loaded(model_id)
+ if blocks_name is None:
+ if self.ready_to_check_mem():
+ self.empty_cache_if_needed()
+ elif blocks_name != self.loaded_blocks[model_id] and \
+ blocks_name not in self.preloaded_blocks_per_model[model_id]:
+ self.gpu_load_blocks(model_id, blocks_name)
+
+ def _get_wrapper_for_type(self, mod_cls):
+ fn = self._type_wrappers.get(mod_cls)
+ if fn is not None:
+ return fn
+
+ # Unique function name per class -> unique compiled code object
+ fname = f"_mm_wrap_{mod_cls.__module__.replace('.', '_')}_{mod_cls.__name__}"
+
+ # Keep body minimal; all heavy/offload logic runs out-of-graph in _pre_check
+ # Include __TYPE_CONST in the code so the bytecode/consts differ per class.
+ src = f"""
+ def {fname}(module, *args, **kwargs):
+ _ = __TYPE_CONST # anchor type as a constant to make code object unique per class
+ mgr = module._mm_manager
+ mgr._pre_check(module)
+ return module._mm_forward(*args, **kwargs)
+ """
+ ns = {"__TYPE_CONST": mod_cls}
+ exec(src, ns) # compile a new function object/code object for this class
+ fn = ns[fname]
+ self._type_wrappers[mod_cls] = fn
+ return fn
+
+ def hook_check_load_into_GPU_if_needed(
+ self, target_module, model, model_id, blocks_name, previous_method, context
+ ):
+ # store instance data on the module (not captured by the wrapper)
+ target_module._mm_manager = self
+ target_module._mm_model_id = model_id
+ target_module._mm_blocks_name = blocks_name
+ target_module._mm_forward = previous_method
+
+ # per-TYPE wrapper (unique bytecode per class, reused across instances of that class)
+ wrapper_fn = self._get_wrapper_for_type(type(target_module))
+
+ # bind as a bound method (no partial/closures)
+ target_module.forward = types.MethodType(wrapper_fn, target_module)
+
+ def hook_check_load_into_GPU_if_needed_default(self, target_module, model, model_id, blocks_name, previous_method, context):

  dtype = model._dtype
  qint4quantization = isinstance(target_module, QModuleMixin) and target_module.weight!= None and target_module.weight.qtype == qint4
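
The methods added above replace a shared closure-based hook with one generated wrapper per module class, so torch.compile / dynamo sees a distinct code object for each class rather than a single wrapper reused everywhere. Below is a minimal standalone sketch of that pattern; Manager, hook() and the empty _pre_check body are illustrative stand-ins, not the mmgp API, and the wrapper name is simplified (the real code also folds the class's module path into the name):

import types
import torch

class Manager:
    def __init__(self):
        self._type_wrappers = {}

    @torch._dynamo.disable
    def _pre_check(self, module):
        pass  # stand-in for the out-of-graph offload bookkeeping

    def _get_wrapper_for_type(self, mod_cls):
        fn = self._type_wrappers.get(mod_cls)
        if fn is None:
            fname = f"_wrap_{mod_cls.__name__}"  # simplified naming, per class
            src = (
                f"def {fname}(module, *args, **kwargs):\n"
                f"    _ = __TYPE_CONST  # per-class constant keeps the code object unique\n"
                f"    module._mm_manager._pre_check(module)\n"
                f"    return module._mm_forward(*args, **kwargs)\n"
            )
            ns = {"__TYPE_CONST": mod_cls}
            exec(src, ns)  # builds a new function (and code object) for this class
            fn = self._type_wrappers[mod_cls] = ns[fname]
        return fn

    def hook(self, module):
        module._mm_manager = self
        module._mm_forward = module.forward
        module.forward = types.MethodType(self._get_wrapper_for_type(type(module)), module)

mgr = Manager()
lin = torch.nn.Linear(4, 4)
mgr.hook(lin)
print(lin(torch.randn(2, 4)).shape)  # the pre-check runs, then the original forward
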
@@ -2259,22 +2318,32 @@ class offload:
  target_module.forward = target_module._mm_forward
  return

- def check_empty_cuda_cache(module, *args, **kwargs):
+ def check_load_into_GPU_needed():
  self.ensure_model_loaded(model_id)
  if blocks_name == None:
  if self.ready_to_check_mem():
  self.empty_cache_if_needed()
  elif blocks_name != self.loaded_blocks[model_id] and blocks_name not in self.preloaded_blocks_per_model[model_id]:
  self.gpu_load_blocks(model_id, blocks_name)
- if qint4quantization and dtype !=None:
- args, kwargs = self.move_args_to_gpu(dtype, *args, **kwargs)
-
- return previous_method(*args, **kwargs)
+ # if qint4quantization and dtype !=None:
+ # args, kwargs = self.move_args_to_gpu(dtype, *args, **kwargs)
+
+ if isinstance(target_module, torch.nn.Linear):
+ def check_load_into_GPU_needed_linear(module, *args, **kwargs):
+ check_load_into_GPU_needed()
+ return previous_method(*args, **kwargs)
+ check_load_into_GPU_needed_module = check_load_into_GPU_needed_linear
+ else:
+ def check_load_into_GPU_needed_other(module, *args, **kwargs):
+ check_load_into_GPU_needed()
+ return previous_method(*args, **kwargs)
+ check_load_into_GPU_needed_module = check_load_into_GPU_needed_other

  setattr(target_module, "_mm_id", model_id)
  setattr(target_module, "_mm_forward", previous_method)

- setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_empty_cuda_cache, target_module), previous_method) )
+ setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_load_into_GPU_needed_module, target_module), previous_method) )
+ # target_module.register_forward_pre_hook(check_empty_cuda_cache)


  def hook_change_module(self, target_module, model, model_id, module_id, previous_method, previous_method_name ):
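
The default hook above keeps the earlier approach for non-compiled modules: module.forward is swapped for a functools.partial wrapper that runs the load/cache check and then calls the saved original forward. A minimal sketch of that patching pattern with a do-nothing check (check_then_forward and its body are illustrative, not mmgp code):

import functools
import torch

lin = torch.nn.Linear(4, 4)
previous_method = lin.forward  # keep the original bound forward

def check_then_forward(module, *args, **kwargs):
    # stand-in for check_load_into_GPU_needed(): load blocks / trim the cache here
    return previous_method(*args, **kwargs)

# patch the instance's forward; update_wrapper copies the original's metadata
lin.forward = functools.update_wrapper(
    functools.partial(check_then_forward, lin), previous_method
)
print(lin(torch.randn(2, 4)).shape)  # the check runs before every forward call
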
@@ -2300,7 +2369,7 @@ class offload:
  if not self.verboseLevel >=1:
  return

- if module_id == None or module_id =='':
+ if previous_method_name =="forward" and (module_id == None or module_id ==''):
  model_name = model._get_name()
  print(f"Hooked to model '{model_id}' ({model_name})")

@@ -2607,19 +2676,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
  for model_id in models:
  current_model: torch.nn.Module = models[model_id]
  towers_names, towers_modules = _detect_main_towers(current_model)
- # compile main iterative modules stacks ("towers")
  compilationInThisOne = compileAllModels or model_id in modelsToCompile
- if compilationInThisOne:
- if self.verboseLevel>=1:
- if len(towers_modules)>0:
- formated_tower_names = [name + '*' for name in towers_names]
- print(f"Pytorch compilation of '{model_id}' is scheduled for these modules : {formated_tower_names}.")
- else:
- print(f"Pytorch compilation of model '{model_id}' is not yet supported.")
-
- for submodel in towers_modules:
- submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode="default" ) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
- #dynamic=True,

  if pinAllModels or model_id in modelsToPin:
  if hasattr(current_model,"_already_pinned"):
@@ -2665,8 +2722,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
  # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
  top_submodule = len(submodule_name.split("."))==1
  offload_hooks = submodule._offload_hooks if hasattr(submodule, "_offload_hooks") else []
- if len(offload_hooks) > 0:
- pass
  assert top_submodule or len(offload_hooks) == 0, "custom offload hooks can only be set at the of the module"
  submodule_method_names = ["forward"] + offload_hooks
  for submodule_method_name in submodule_method_names:
@@ -2681,11 +2736,27 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
  elif compilationInThisOne and submodule in towers_modules:
  self.hook_preload_blocks_for_compilation(submodule, model_id, cur_blocks_name, context = submodule_name )
  else:
- self.hook_check_empty_cache_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
-
+ if compilationInThisOne and False:
+ self.hook_check_load_into_GPU_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
+ else:
+ self.hook_check_load_into_GPU_if_needed_default(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
+
  self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)


+ # compile main iterative modules stacks ("towers")
+ if compilationInThisOne:
+ if self.verboseLevel>=1:
+ if len(towers_modules)>0:
+ formated_tower_names = [name + '*' for name in towers_names]
+ print(f"Pytorch compilation of '{model_id}' is scheduled for these modules : {formated_tower_names}.")
+ else:
+ print(f"Pytorch compilation of model '{model_id}' is not yet supported.")
+
+ for submodel in towers_modules:
+ submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode="default" ) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
+ #dynamic=True,
+
  self.tune_preloading(model_id, current_budget, towers_names)
  self.parameters_ref = {}

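The relocated block above now applies torch.compile to the tower modules only after every submodule has had its offload hook attached, in line with the earlier comment that such hooks must sit outside the compiled region. A minimal sketch of that ordering with the same torch.compile settings; the Block module and the no-op pre-hook are made up, and running it assumes a working inductor backend:

import torch

class Block(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(8, 8)

    def forward(self, x):
        return torch.relu(self.proj(x))

def preload_hook(module, args):
    return None  # stand-in for gpu_load_blocks(): runs eagerly, outside the compiled graph

block = Block()
block.register_forward_pre_hook(preload_hook)  # 1) install the hook first
block.forward = torch.compile(block.forward, backend="inductor", mode="default")  # 2) then compile
print(block(torch.randn(2, 8)).shape)
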
mmgp-3.5.7.dist-info/METADATA → mmgp-3.5.9.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mmgp
- Version: 3.5.7
+ Version: 3.5.9
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file


  <p align="center">
- <H2>Memory Management 3.5.7 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.5.9 for the GPU Poor by DeepBeepMeep</H2>
  </p>


mmgp-3.5.9.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+ __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+ mmgp/offload.py,sha256=AViVBdUYDN42SnICeeTFa3K3JQ7a8rXB-eC2qPIY2yM,130347
+ mmgp/safetensors2.py,sha256=4nKV13qCMabnNEB1TA_ueFbfGYYmiQ9racR_C6SsGug,18693
+ mmgp-3.5.9.dist-info/licenses/LICENSE.md,sha256=DD-WIS0BkPoWJ_8hQO3J8hMP9K_1-dyrYv1YCbkxcDU,94
+ mmgp-3.5.9.dist-info/METADATA,sha256=PXpq_dDRmAQED1dTW8NKUUB_FcYb54VRqlpjqOY771Y,16309
+ mmgp-3.5.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ mmgp-3.5.9.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+ mmgp-3.5.9.dist-info/RECORD,,
mmgp-3.5.7.dist-info/RECORD DELETED
@@ -1,9 +0,0 @@
- __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
- mmgp/offload.py,sha256=SKt-EunQrH6omBFI7aNLe82GIoXBKW9y1i0HMPFrKLY,127089
- mmgp/safetensors2.py,sha256=4nKV13qCMabnNEB1TA_ueFbfGYYmiQ9racR_C6SsGug,18693
- mmgp-3.5.7.dist-info/licenses/LICENSE.md,sha256=DD-WIS0BkPoWJ_8hQO3J8hMP9K_1-dyrYv1YCbkxcDU,94
- mmgp-3.5.7.dist-info/METADATA,sha256=s420bK-WQuSZM2RpVwYjzXY-QmtIHkRbIiL9hAyV7sA,16309
- mmgp-3.5.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- mmgp-3.5.7.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
- mmgp-3.5.7.dist-info/RECORD,,