mmgp 1.0.2.tar.gz → 1.0.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mmgp might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mmgp
-Version: 1.0.2
+Version: 1.0.3
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: Apache License
@@ -70,5 +70,5 @@ sort_first = [
 [tool.setuptools_scm]
 write_to = "src/_version.py"
 parentdir_prefix_version = "mmgp-"
-fallback_version = "1.0.2"
+fallback_version = "1.0.3"
 version_scheme = "post-release"
@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '1.0.2'
-__version_tuple__ = version_tuple = (1, 0, 2)
+__version__ = version = '1.0.3'
+__version_tuple__ = version_tuple = (1, 0, 3)
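
For reference, the bumped version can be confirmed at runtime once the release is installed. This is a minimal standard-library sketch, not code from the package itself:

    from importlib.metadata import version

    # Reads the installed distribution metadata; expected to report 1.0.3
    # after upgrading (e.g. pip install -U mmgp).
    print(version("mmgp"))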
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mmgp
-Version: 1.0.2
+Version: 1.0.3
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: Apache License
@@ -135,7 +135,7 @@ class offload:
             else:
                 p.data = p.data.cuda(non_blocking=True) #
         # torch.cuda.current_stream().synchronize()
-
+    @torch.compiler.disable()
     def unload_all(self):
         for model, model_id in zip(self.active_models, self.active_models_ids):
             if not self.pinInRAM:
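
This hunk moves the @torch.compiler.disable() decorator onto unload_all itself, and the following hunks drop it from the inner hook closures. As a rough illustration of what the decorator does, here is a standalone sketch assuming PyTorch 2.x; the function names are made up and are not part of mmgp:

    import torch

    @torch.compiler.disable()
    def move_back_to_cpu(t: torch.Tensor) -> torch.Tensor:
        # Device shuffling tends to trigger graph breaks and recompiles, so it
        # is excluded from tracing and always runs eagerly.
        return t.to("cpu")

    @torch.compile
    def scaled_sum(t: torch.Tensor) -> torch.Tensor:
        y = t * 2.0
        # Calling a disabled function from compiled code inserts a graph break.
        return move_back_to_cpu(y).sum()

    print(scaled_sum(torch.randn(4)))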
@@ -198,7 +198,6 @@ class offload:
         # print(f"New cached memory after purge is {torch.cuda.memory_reserved()/1024000:0f} MB) ")

     def hook_me_light(self, target_module, forceMemoryCheck, previous_method):
-        # @torch.compiler.disable()
         def check_empty_cache(module, *args, **kwargs):
             if self.ready_to_check_mem(forceMemoryCheck):
                 self.empty_cache_if_needed()
@@ -208,7 +207,6 @@ class offload:


     def hook_me(self, target_module, model, model_id, module_id, previous_method):
-        @torch.compiler.disable()
         def check_change_module(module, *args, **kwargs):
             performEmptyCacheTest = False
             if not model_id in self.active_models_ids:
@@ -240,7 +238,7 @@ class offload:

         if module_id == None or module_id =='':
             model_name = model._get_name()
-            print(f"Hooked in model {model_name} ({model_id})")
+            print(f"Hooked in model '{model_id}' ({model_name})")


     # Not implemented yet, but why would one want to get rid of these features ?
@@ -258,26 +256,27 @@ class offload:


     @classmethod
-    def all(cls, pipe_or_dict_of_modules, quantizeTransformer = True, pinInRAM = True, compile= True, verbose = True):
+    def all(cls, pipe_or_dict_of_modules, quantizeTransformer = True, pinInRAM = True, verbose = True):
         self = cls()
         self.verbose = verbose
         self.pinned_modules_data = {}

+        # compile not working yet or slower
+        compile = False
         self.pinInRAM = pinInRAM
-
+        pipe = None
         preloadInRAM = True
         torch.set_default_device('cuda')
         if hasattr(pipe_or_dict_of_modules, "components"):
             pipe_or_dict_of_modules.to("cpu") #XXXX
             # create a fake Accelerate parameter so that lora loading doesn't change the device
             pipe_or_dict_of_modules.hf_device_map = torch.device("cuda")
+            pipe = pipe_or_dict_of_modules
             pipe_or_dict_of_modules= pipe_or_dict_of_modules.components


-
         models = {k: v for k, v in pipe_or_dict_of_modules.items() if isinstance(v, torch.nn.Module)}

-
         if quantizeTransformer:
             self.models_to_quantize = ["transformer"]
             # del models["transformer"] # to test everything but the transformer that has a much longer loading
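
Since this release removes the compile keyword from offload.all() and forces it off internally (per the "compile not working yet or slower" comment), callers that still pass compile= should now get a TypeError. A hedged usage sketch follows, assuming the class is importable as from mmgp import offload; the dict of modules is a made-up stand-in for a real pipeline, and a CUDA device is required because all() calls torch.set_default_device('cuda'):

    import torch
    from mmgp import offload  # the class shown in the hunks above

    # Tiny stand-in for pipe.components; a real diffusers-style pipeline works too.
    modules = {"transformer": torch.nn.Linear(64, 64)}

    # 1.0.2 accepted an extra keyword:
    #   offload.all(modules, quantizeTransformer=False, pinInRAM=True, compile=True)
    # 1.0.3 drops it, so the call becomes:
    offloader = offload.all(modules, quantizeTransformer=False, pinInRAM=True, verbose=True)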
@@ -389,10 +388,14 @@ class offload:
             if verbose:
                 print("Torch compilation started")
             torch._dynamo.config.cache_size_limit = 10000
+            # if pipe != None and hasattr(pipe, "__call__"):
+            #     pipe.__call__= torch.compile(pipe.__call__, mode= "max-autotune")
+
             for model_id in models:
                 current_model: torch.nn.Module = models[model_id]
-                current_model.compile()
-                #models["transformer"].compile()
+                current_model.compile(mode= "max-autotune")
+                #models["transformer"].compile()
+
             if verbose:
                 print("Torch compilation done")

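The compilation loop now passes mode="max-autotune" to nn.Module.compile(), which compiles the module in place with the same options as torch.compile. A minimal sketch outside of mmgp, assuming PyTorch 2.2+ where Module.compile() is available; the model here is illustrative only:

    import torch

    # In the hunk above this would be each pipeline module in `models`.
    model = torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.GELU())

    # In-place compilation; "max-autotune" trades longer warmup for faster
    # kernels (it benefits most from a GPU with Triton, but also runs on CPU).
    model.compile(mode="max-autotune")

    out = model(torch.randn(2, 32))
    print(out.shape)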
5 files without changes