mmgp 3.5.6.tar.gz → 3.5.8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mmgp might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.6
+Version: 3.5.8
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.5.6 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.8 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
@@ -1,6 +1,6 @@
 
 <p align="center">
-<H2>Memory Management 3.5.6 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.8 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
@@ -1,6 +1,6 @@
 [project]
 name = "mmgp"
-version = "3.5.6"
+version = "3.5.8"
 authors = [
   { name = "deepbeepmeep", email = "deepbeepmeep@yahoo.com" },
 ]
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.5.6 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.5.8 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -64,6 +64,11 @@ import psutil
 import builtins
 from accelerate import init_empty_weights
 
+import functools
+import types
+from functools import lru_cache
+import torch
+
 
 from mmgp import safetensors2
 from mmgp import profile_type
@@ -122,8 +127,6 @@ class clock:
     def format_time_gap(self):
         return f"{self.stop_time - self.start_time:.2f}s"
 
-
-
 # useful functions to move a group of tensors (to design custom offload patches)
 def move_tensors(obj, device):
     if torch.is_tensor(obj):
@@ -668,7 +671,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.6) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.8) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def change_dtype(model, new_dtype, exclude_buffers = False):
     for submodule_name, submodule in model.named_modules():
@@ -1295,7 +1298,7 @@ def move_loras_to_device(model, device="cpu" ):
         if ".lora_" in k:
             m.to(device)
 
-def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, defaultConfigPath = None, modelClass=None, modelPrefix = None, writable_tensors = True, verboseLevel = -1, modules = None, return_shared_modules = None, configKwargs ={}):
+def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, defaultConfigPath = None, modelClass=None, modelPrefix = None, writable_tensors = True, verboseLevel = -1, preprocess_sd = None, modules = None, return_shared_modules = None, configKwargs ={}):
     """
     quick version of .LoadfromPretrained of the transformers library
     used to build a model and load the corresponding weights (quantized or not)
@@ -1383,13 +1386,13 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantiza
 
     model._config = transformer_config
 
-    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, writable_tensors =writable_tensors, modules = modules, return_shared_modules = return_shared_modules, verboseLevel=verboseLevel )
+    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, writable_tensors =writable_tensors, preprocess_sd = preprocess_sd , modules = modules, return_shared_modules = return_shared_modules, verboseLevel=verboseLevel )
 
     return model
 
 
 
-def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, modules = None, return_shared_modules = None, verboseLevel = -1):
+def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, preprocess_sd = None, modules = None, return_shared_modules = None, verboseLevel = -1):
     """
     Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
     """
@@ -1506,6 +1509,9 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
         full_state_dict, full_quantization_map, full_tied_weights_map = None, None, None
 
     # deal if we are trying to load just a sub part of a larger model
+    if preprocess_sd != None:
+        state_dict, quantization_map = preprocess_sd(state_dict, quantization_map)
+
     if modelPrefix != None:
         base_model_prefix = modelPrefix + "."
         state_dict = filter_state_dict(state_dict,base_model_prefix)
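
The hunk above is where the new preprocess_sd argument (threaded through fast_load_transformers_model and load_model_data in the previous hunks) takes effect: if a callable is supplied, it receives the checkpoint's state dict and its quanto quantization map and returns the pair, before any prefix filtering or quantization setup runs. Below is a minimal sketch of such a callback; the "model." prefix rename and the commented call site are illustrative assumptions, only the (state_dict, quantization_map) in/out contract comes from the diff.

# Hypothetical preprocess_sd callback: strip a legacy "model." key prefix before
# the weights are filtered and loaded. Per this release, load_model_data() calls it as
# state_dict, quantization_map = preprocess_sd(state_dict, quantization_map).
def strip_legacy_prefix(state_dict, quantization_map):
    state_dict = {k.removeprefix("model."): v for k, v in state_dict.items()}
    if quantization_map is not None:
        quantization_map = {k.removeprefix("model."): v for k, v in quantization_map.items()}
    return state_dict, quantization_map

# Standalone check with plain dicts (real use passes tensors and a quanto map):
sd = {"model.blocks.0.weight": 0, "model.blocks.0.bias": 1}
print(strip_legacy_prefix(sd, None))
# Intended call site (argument added in 3.5.8), for example:
# offload.fast_load_transformers_model(model_path, preprocess_sd=strip_legacy_prefix)
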
@@ -1756,6 +1762,7 @@ class offload:
         global last_offload_obj
         last_offload_obj = self
 
+        self._type_wrappers = {}
 
     def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name, submodule_name):
 
@@ -2203,7 +2210,7 @@ class offload:
            if len(loras_data) == 0:
                return old_forward(*args, **kwargs)
            else:
-                submodule.aaa = submodule_name
+                #submodule.aaa = submodule_name # just for debugging if uncommented will cause pytorch recompilation
                return self._lora_linear_forward(current_model, submodule, loras_data, *args, **kwargs)
        target_fn = lora_linear_forward
    else:
@@ -2235,10 +2242,63 @@
 
        # need to be registered before the forward not to be break the efficiency of the compilation chain
        # it should be at the top of the compilation as this type of hook in the middle of a chain seems to break memory performance
-        target_module.register_forward_pre_hook(preload_blocks_for_compile)
+        target_module.register_forward_pre_hook(preload_blocks_for_compile)
 
 
-    def hook_check_empty_cache_needed(self, target_module, model, model_id, blocks_name, previous_method, context):
+
+
+    @torch._dynamo.disable
+    def _pre_check(self, module):
+        model_id = getattr(module, "_mm_model_id", None)
+        blocks_name = getattr(module, "_mm_blocks_name", None)
+
+        self.ensure_model_loaded(model_id)
+        if blocks_name is None:
+            if self.ready_to_check_mem():
+                self.empty_cache_if_needed()
+        elif blocks_name != self.loaded_blocks[model_id] and \
+             blocks_name not in self.preloaded_blocks_per_model[model_id]:
+            self.gpu_load_blocks(model_id, blocks_name)
+
+    def _get_wrapper_for_type(self, mod_cls):
+        fn = self._type_wrappers.get(mod_cls)
+        if fn is not None:
+            return fn
+
+        # Unique function name per class -> unique compiled code object
+        fname = f"_mm_wrap_{mod_cls.__module__.replace('.', '_')}_{mod_cls.__name__}"
+
+        # Keep body minimal; all heavy/offload logic runs out-of-graph in _pre_check
+        # Include __TYPE_CONST in the code so the bytecode/consts differ per class.
+        src = f"""
+def {fname}(module, *args, **kwargs):
+    _ = __TYPE_CONST  # anchor type as a constant to make code object unique per class
+    mgr = module._mm_manager
+    mgr._pre_check(module)
+    return module._mm_forward(*args, **kwargs)
+"""
+        ns = {"__TYPE_CONST": mod_cls}
+        exec(src, ns)  # compile a new function object/code object for this class
+        fn = ns[fname]
+        self._type_wrappers[mod_cls] = fn
+        return fn
+
+    def hook_check_load_into_GPU_if_needed(
+        self, target_module, model, model_id, blocks_name, previous_method, context
+    ):
+        # store instance data on the module (not captured by the wrapper)
+        target_module._mm_manager = self
+        target_module._mm_model_id = model_id
+        target_module._mm_blocks_name = blocks_name
+        target_module._mm_forward = previous_method
+
+        # per-TYPE wrapper (unique bytecode per class, reused across instances of that class)
+        wrapper_fn = self._get_wrapper_for_type(type(target_module))
+
+        # bind as a bound method (no partial/closures)
+        target_module.forward = types.MethodType(wrapper_fn, target_module)
+
+    def hook_check_load_into_GPU_if_needed_default(self, target_module, model, model_id, blocks_name, previous_method, context):
 
        dtype = model._dtype
        qint4quantization = isinstance(target_module, QModuleMixin) and target_module.weight!= None and target_module.weight.qtype == qint4
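
The new _pre_check / _get_wrapper_for_type pair is arranged so that torch.compile sees a distinct, minimal code object per module class: the offload bookkeeping runs out-of-graph behind torch._dynamo.disable, while the exec-generated wrapper is cached once per class and bound to each instance with types.MethodType. Here is a standalone sketch of that caching-and-binding pattern, with illustrative names rather than mmgp's own.

import types

_wrappers = {}  # one generated wrapper per class, mirroring self._type_wrappers

def get_wrapper_for_type(cls):
    fn = _wrappers.get(cls)
    if fn is None:
        fname = f"_wrap_{cls.__name__}"
        src = (
            f"def {fname}(module, *args, **kwargs):\n"
            f"    _ = __TYPE_CONST  # per-class constant -> distinct code object\n"
            f"    return module._inner(*args, **kwargs)\n"
        )
        ns = {"__TYPE_CONST": cls}
        exec(src, ns)                 # new function/code object for this class only
        fn = _wrappers[cls] = ns[fname]
    return fn

class Linearish:
    def _inner(self, x):
        return x * 2

m = Linearish()
# bound method, no closures or functools.partial involved
m.forward = types.MethodType(get_wrapper_for_type(Linearish), m)
print(m.forward(21))                  # 42; every Linearish instance reuses this wrapper
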
@@ -2258,25 +2318,35 @@
            target_module.forward = target_module._mm_forward
            return
 
-        def check_empty_cuda_cache(module, *args, **kwargs):
+        def check_load_into_GPU_needed():
            self.ensure_model_loaded(model_id)
            if blocks_name == None:
                if self.ready_to_check_mem():
                    self.empty_cache_if_needed()
            elif blocks_name != self.loaded_blocks[model_id] and blocks_name not in self.preloaded_blocks_per_model[model_id]:
                self.gpu_load_blocks(model_id, blocks_name)
-            if qint4quantization and dtype !=None:
-                args, kwargs = self.move_args_to_gpu(dtype, *args, **kwargs)
-
-            return previous_method(*args, **kwargs)
+            # if qint4quantization and dtype !=None:
+            #     args, kwargs = self.move_args_to_gpu(dtype, *args, **kwargs)
+
+        if isinstance(target_module, torch.nn.Linear):
+            def check_load_into_GPU_needed_linear(module, *args, **kwargs):
+                check_load_into_GPU_needed()
+                return previous_method(*args, **kwargs)
+            check_load_into_GPU_needed_module = check_load_into_GPU_needed_linear
+        else:
+            def check_load_into_GPU_needed_other(module, *args, **kwargs):
+                check_load_into_GPU_needed()
+                return previous_method(*args, **kwargs)
+            check_load_into_GPU_needed_module = check_load_into_GPU_needed_other
 
        setattr(target_module, "_mm_id", model_id)
        setattr(target_module, "_mm_forward", previous_method)
 
-        setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_empty_cuda_cache, target_module), previous_method) )
+        setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_load_into_GPU_needed_module, target_module), previous_method) )
+        # target_module.register_forward_pre_hook(check_empty_cuda_cache)
 
 
-    def hook_change_module(self, target_module, model, model_id, module_id, previous_method):
+    def hook_change_module(self, target_module, model, model_id, module_id, previous_method, previous_method_name ):
        if hasattr(target_module, "_lock_dtype"):
            dtype = target_module._lock_dtype
        else:
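
For modules that are not routed through the compiled path, the renamed default hook keeps the earlier mechanism: the replacement forward is a functools.partial that carries the module as its first argument, and functools.update_wrapper copies the original forward's metadata onto it. A compact sketch of that wrapping pattern on a bare nn.Linear, assuming nothing beyond the standard library and PyTorch (the comment stands in for the real block-loading logic):

import functools
import torch

lin = torch.nn.Linear(4, 4)
original_forward = lin.forward          # bound method kept for the wrapper to call

def wrapped_forward(module, *args, **kwargs):
    # real hook: make sure the module's weights / block are on the GPU first
    return original_forward(*args, **kwargs)

# same shape as the setattr(...) calls above: partial binds the module,
# update_wrapper makes the override look like the original forward
lin.forward = functools.update_wrapper(functools.partial(wrapped_forward, lin), original_forward)
print(lin(torch.randn(1, 4)).shape)     # torch.Size([1, 4])
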
@@ -2289,16 +2359,17 @@
            args, kwargs = self.move_args_to_gpu(dtype, *args, **kwargs)
            return previous_method(*args, **kwargs)
 
-        if hasattr(target_module, "_mm_id"):
+        if hasattr(target_module, "_mm_" + previous_method_name):
            return
-        setattr(target_module, "_mm_id", model_id)
+        setattr(target_module, "_mm_Id", model_id)
+        setattr(target_module, "_mm_" + previous_method_name, previous_method)
 
-        setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_change_module, target_module), previous_method) )
+        setattr(target_module, previous_method_name, functools.update_wrapper(functools.partial(check_change_module, target_module), previous_method) )
 
        if not self.verboseLevel >=1:
            return
 
-        if module_id == None or module_id =='':
+        if previous_method_name =="forward" and (module_id == None or module_id ==''):
            model_name = model._get_name()
            print(f"Hooked to model '{model_id}' ({model_name})")
 
@@ -2605,19 +2676,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
    for model_id in models:
        current_model: torch.nn.Module = models[model_id]
        towers_names, towers_modules = _detect_main_towers(current_model)
-        # compile main iterative modules stacks ("towers")
        compilationInThisOne = compileAllModels or model_id in modelsToCompile
-        if compilationInThisOne:
-            if self.verboseLevel>=1:
-                if len(towers_modules)>0:
-                    formated_tower_names = [name + '*' for name in towers_names]
-                    print(f"Pytorch compilation of '{model_id}' is scheduled for these modules : {formated_tower_names}.")
-                else:
-                    print(f"Pytorch compilation of model '{model_id}' is not yet supported.")
-
-            for submodel in towers_modules:
-                submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode="default" ) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
-                #dynamic=True,
 
        if pinAllModels or model_id in modelsToPin:
            if hasattr(current_model,"_already_pinned"):
@@ -2661,24 +2720,42 @@
                cur_blocks_prefix, prev_blocks_name, cur_blocks_seq, is_mod_seq = pre, None, num, False
                cur_blocks_name = submodule_name
                # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
-
-
-            if hasattr(submodule, "forward"):
-                # if any_lora and isinstance(submodule, ( torch.nn.Linear, torch.nn.Conv3d, torch.nn.LayerNorm)):
-                if any_lora and hasattr(submodule,"weight"):
+            top_submodule = len(submodule_name.split("."))==1
+            offload_hooks = submodule._offload_hooks if hasattr(submodule, "_offload_hooks") else []
+            assert top_submodule or len(offload_hooks) == 0, "custom offload hooks can only be set at the of the module"
+            submodule_method_names = ["forward"] + offload_hooks
+            for submodule_method_name in submodule_method_names:
+                if not hasattr(submodule, submodule_method_name ): continue
+                if submodule_method_name == "forward" and any_lora and hasattr(submodule,"weight"):
                    submodule_method = self.hook_lora(submodule, current_model, model_id, loras_model_data, loras_model_shortcuts, submodule_name)
                else:
-                    submodule_method = getattr(submodule, "forward")
-                if callable(submodule_method):
-                    if len(submodule_name.split("."))==1:
-                        self.hook_change_module(submodule, current_model, model_id, submodule_name, submodule_method)
+                    submodule_method = getattr(submodule, submodule_method_name)
+                if callable(submodule_method):
+                    if top_submodule and cur_blocks_name is None:
+                        self.hook_change_module(submodule, current_model, model_id, submodule_name, submodule_method, submodule_method_name)
                    elif compilationInThisOne and submodule in towers_modules:
                        self.hook_preload_blocks_for_compilation(submodule, model_id, cur_blocks_name, context = submodule_name )
                    else:
-                        self.hook_check_empty_cache_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
-
-            self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)
+                        if compilationInThisOne and False:
+                            self.hook_check_load_into_GPU_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
+                        else:
+                            self.hook_check_load_into_GPU_if_needed_default(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
 
+            self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)
+
+
+        # compile main iterative modules stacks ("towers")
+        if compilationInThisOne:
+            if self.verboseLevel>=1:
+                if len(towers_modules)>0:
+                    formated_tower_names = [name + '*' for name in towers_names]
+                    print(f"Pytorch compilation of '{model_id}' is scheduled for these modules : {formated_tower_names}.")
+                else:
+                    print(f"Pytorch compilation of model '{model_id}' is not yet supported.")
+
+            for submodel in towers_modules:
+                submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode="default" ) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
+                #dynamic=True,
 
        self.tune_preloading(model_id, current_budget, towers_names)
        self.parameters_ref = {}
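
Two things stand out in the rewritten hook-up loop above: the tower compilation block from the previous hunk is not removed but moved to run after all hooks are registered, and an opt-in _offload_hooks attribute is introduced, letting a top-level submodule list extra method names (beyond forward) that offload.all() should wrap, with the assert restricting this to top-level modules. The sketch below is a reading of that mechanism, hedged because only the attribute name and the ["forward"] + offload_hooks expansion appear in the diff; the encode method is an illustrative example, not an mmgp API.

import torch

class VAELike(torch.nn.Module):
    _offload_hooks = ["encode"]        # extra method to hook in addition to forward

    def forward(self, x):
        return x

    def encode(self, x):
        return 0.5 * x

m = VAELike()
offload_hooks = m._offload_hooks if hasattr(m, "_offload_hooks") else []
print(["forward"] + offload_hooks)     # ['forward', 'encode'] -> both get wrapped
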
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.6
+Version: 3.5.8
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.5.6 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.8 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes