mmgp 3.1.0__tar.gz → 3.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mmgp might be problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: mmgp
- Version: 3.1.0
+ Version: 3.1.1
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  License: GNU GENERAL PUBLIC LICENSE
@@ -1,6 +1,6 @@
  [project]
  name = "mmgp"
- version = "3.1.0"
+ version = "3.1.1"
  authors = [
  { name = "deepbeepmeep", email = "deepbeepmeep@yahoo.com" },
  ]
@@ -261,6 +261,29 @@ def _remove_model_wrapper(model):
  return sub_module
  return model
 
+ # def force_load_tensor(t):
+ # c = torch.nn.Parameter(t + 0)
+ # torch.utils.swap_tensors(t, c)
+ # del c
+
+
+ # for n,m in model_to_quantize.named_modules():
+ # # do not read quantized weights (detected them directly or behind an adapter)
+ # if isinstance(m, QModuleMixin) or hasattr(m, "base_layer") and isinstance(m.base_layer, QModuleMixin):
+ # if hasattr(m, "bias") and m.bias is not None:
+ # force_load_tensor(m.bias.data)
+ # # m.bias.data = m.bias.data + 0
+ # else:
+ # for n, p in m.named_parameters(recurse = False):
+ # data = getattr(m, n)
+ # force_load_tensor(data)
+ # # setattr(m,n, torch.nn.Parameter(data + 0 ) )
+
+ # for b in m.buffers(recurse = False):
+ # # b.data = b.data + 0
+ # b.data = torch.nn.Buffer(b.data + 0)
+ # force_load_tensor(b.data)
+
 
 
  def _move_to_pinned_tensor(source_tensor, big_tensor, offset, length):
@@ -290,6 +313,17 @@ def _safetensors_load_file(file_path):
 
  return sd, metadata
 
+ def _force_load_buffer(p):
+ # To do : check if buffer was persistent and transfer state, or maybe swap keep already this property ?
+ q = torch.nn.Buffer(p + 0)
+ torch.utils.swap_tensors(p, q)
+ del q
+
+ def _force_load_parameter(p):
+ q = torch.nn.Parameter(p + 0)
+ torch.utils.swap_tensors(p, q)
+ del q
+
  def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_max = 0, verboseLevel = 1):
  if verboseLevel>=1 :
  if partialPinning:
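The two new helpers above materialize lazily loaded tensors in place: adding 0 forces a real read (and a fresh allocation) of data that may still be backed by a safetensors mmap, and torch.utils.swap_tensors puts the fresh copy behind the existing Parameter or Buffer object, so references held elsewhere stay valid while the old lazy storage can be freed. A minimal standalone sketch of the same pattern, assuming a recent PyTorch (torch.utils.swap_tensors, torch.nn.Buffer); the names below are illustrative, not mmgp code:

import torch

def force_materialize(module: torch.nn.Module) -> None:
    # Give every parameter and buffer of this module freshly allocated storage,
    # so any mmap-backed lazy storage behind them can be released.
    with torch.no_grad():
        for p in module.parameters(recurse=False):
            fresh = torch.nn.Parameter(p + 0, requires_grad=p.requires_grad)  # p + 0 copies the data
            torch.utils.swap_tensors(p, fresh)  # swap in place: existing references to p stay valid
        for b in module.buffers(recurse=False):
            fresh = torch.nn.Buffer(b + 0)
            torch.utils.swap_tensors(b, fresh)

layer = torch.nn.Linear(4, 4)
force_materialize(layer)  # layer.weight and layer.bias now own their storage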
@@ -302,6 +336,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
  towers_names, _ = _detect_main_towers(model)
  towers_names = [n +"." for n in towers_names]
 
+
  BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
  current_big_tensor_size = 0
  big_tensor_no = 0
@@ -315,10 +350,10 @@
  if partialPinning:
  include = any(k.startswith(pre) for pre in towers_names) if partialPinning else True
  if include:
- params_list = params_list + list(sub_module.buffers(recurse=False)) + list(sub_module.parameters(recurse=False))
+ params_list = params_list + [ (k + '.' + n, p, False) for n, p in sub_module.named_parameters(recurse=False)] + [ (k + '.' + n, p, True) for n, p in sub_module.named_buffers(recurse=False)]
 
- # print(f"num params to pin {model_id}: {len(params_list)}")
- for p in params_list:
+
+ for n, p, _ in params_list:
  if isinstance(p, QTensor):
  if p._qtype == qint4:
  if hasattr(p,"_scale_shift"):
@@ -330,10 +365,16 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
  else:
  length = torch.numel(p.data) * p.data.element_size()
 
+
  if current_big_tensor_size + length > BIG_TENSOR_MAX_SIZE:
  big_tensors_sizes.append(current_big_tensor_size)
  current_big_tensor_size = 0
  big_tensor_no += 1
+
+
+ itemsize = p.data.dtype.itemsize
+ if current_big_tensor_size % itemsize:
+ current_big_tensor_size += itemsize - current_big_tensor_size % itemsize
  tensor_map_indexes.append((big_tensor_no, current_big_tensor_size, length ))
  current_big_tensor_size += length
 
@@ -362,12 +403,18 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
 
  gc.collect()
 
+
  tensor_no = 0
- for p in params_list:
+ # prev_big_tensor = 0
+ for n, p, is_buffer in params_list:
  big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
-
+ # if big_tensor_no != prev_big_tensor:
+ # gc.collect()
+ # prev_big_tensor = big_tensor_no
  if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
  current_big_tensor = big_tensors[big_tensor_no]
+ if is_buffer :
+ _force_load_buffer(p) # otherwise potential memory leak
  if isinstance(p, QTensor):
  if p._qtype == qint4:
  length1 = torch.numel(p._data._data) * p._data._data.element_size()
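The pinning hunks above change how tensors are packed into the shared pinned blocks: each entry now carries its qualified name and an is_buffer flag, offsets are rounded up to the element size so a typed view at that offset stays aligned, and buffers are force-loaded before the copy so their original (possibly mmap-backed) storage can be freed. The padding also explains why the completeness check in the next hunk becomes total_tensor_bytes <= total. _move_to_pinned_tensor itself is not shown in this diff; the sketch below only illustrates the general idea under those assumptions (names are illustrative):

import torch

def align_up(offset: int, itemsize: int) -> int:
    # Round offset up to a multiple of itemsize, mirroring the new padding logic.
    remainder = offset % itemsize
    return offset if remainder == 0 else offset + (itemsize - remainder)

def copy_into_pinned_block(t: torch.Tensor, block: torch.Tensor, offset: int) -> torch.Tensor:
    # Copy t's bytes into a slice of one big pinned uint8 block and view the
    # slice back with t's dtype/shape. The offset must be a multiple of
    # t.element_size() for the typed view to be legal, hence align_up().
    offset = align_up(offset, t.element_size())
    length = t.numel() * t.element_size()
    block[offset:offset + length].copy_(t.contiguous().reshape(-1).view(torch.uint8))
    return block[offset:offset + length].view(t.dtype).reshape(t.shape)

# Pinned host memory allows asynchronous .to("cuda", non_blocking=True) copies.
block = torch.empty(1 << 20, dtype=torch.uint8, pin_memory=torch.cuda.is_available())
x = torch.randn(16, 16)
x_view = copy_into_pinned_block(x, block, offset=0)
assert torch.equal(x, x_view)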
@@ -395,7 +442,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
  gc.collect()
 
  if verboseLevel >=1:
- if total_tensor_bytes == total:
+ if total_tensor_bytes <= total:
  print(f"The whole model was pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
  else:
  print(f"{total/ONE_MB:.2f} MB were pinned to reserved RAM out of {total_tensor_bytes/ONE_MB:.2f} MB")
@@ -529,55 +576,56 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
  if hasattr(model_to_quantize, "_quanto_map"):
  for k, entry in model_to_quantize._quanto_map.items():
  weights = entry["weights"]
- print(f"Model '{model_id}' is already quantized to format '{weights}'")
+ print(f"Model '{model_id}' is already quantized in format '{weights}'")
  return False
  print(f"Model '{model_id}' is already quantized")
  return False
 
  print(f"Quantization of model '{model_id}' started to format '{weights}'")
 
+ tower_names ,_ = _detect_main_towers(model_to_quantize)
+ tower_names = [ n[:-1] for n in tower_names]
+
  for submodule_name, submodule in model_to_quantize.named_modules():
  if isinstance(submodule, QModuleMixin):
  if verboseLevel>=1:
  print("No quantization to do as model is already quantized")
  return False
 
-
  if submodule_name=='':
  continue
 
-
- flush = False
- if isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
- if cur_blocks_prefix == None:
- cur_blocks_prefix = submodule_name + "."
- flush = True
- else:
- #if cur_blocks_prefix != submodule_name[:len(cur_blocks_prefix)]:
- if not submodule_name.startswith(cur_blocks_prefix):
+ size = compute_submodule_size(submodule)
+ if not any(submodule_name.startswith(pre) for pre in tower_names):
+ flush = False
+ if isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
+ if cur_blocks_prefix == None:
  cur_blocks_prefix = submodule_name + "."
  flush = True
- else:
- if cur_blocks_prefix is not None:
- #if not cur_blocks_prefix == submodule_name[0:len(cur_blocks_prefix)]:
- if not submodule_name.startswith(cur_blocks_prefix):
- cur_blocks_prefix = None
- flush = True
-
- if flush:
- if submodule_size <= threshold:
- exclude_list += submodule_names
- if verboseLevel >=2:
- print(f"Excluded size {submodule_size/ONE_MB:.1f} MB: {prev_blocks_prefix} : {submodule_names}")
- total_excluded += submodule_size
-
- submodule_size = 0
- submodule_names = []
- prev_blocks_prefix = cur_blocks_prefix
- size = compute_submodule_size(submodule)
- submodule_size += size
+ else:
+ if not submodule_name.startswith(cur_blocks_prefix):
+ cur_blocks_prefix = submodule_name + "."
+ flush = True
+ else:
+ if cur_blocks_prefix is not None:
+ #if not cur_blocks_prefix == submodule_name[0:len(cur_blocks_prefix)]:
+ if not submodule_name.startswith(cur_blocks_prefix):
+ cur_blocks_prefix = None
+ flush = True
+
+ if flush :
+ if submodule_size <= threshold :
+ exclude_list += submodule_names
+ if verboseLevel >=2:
+ print(f"Excluded size {submodule_size/ONE_MB:.1f} MB: {prev_blocks_prefix} : {submodule_names}")
+ total_excluded += submodule_size
+
+ submodule_size = 0
+ submodule_names = []
+ prev_blocks_prefix = cur_blocks_prefix
+ submodule_size += size
+ submodule_names.append(submodule_name)
  total_size += size
- submodule_names.append(submodule_name)
 
  if submodule_size > 0 and submodule_size <= threshold:
  exclude_list += submodule_names
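In the rewritten loop above, submodules inside the detected iterative "towers" are always quantized, while the remaining modules are grouped by their enclosing ModuleList/Sequential prefix and a whole group is excluded when its cumulative size stays at or below the threshold. A much-simplified sketch of that idea, grouping by top-level prefix only; illustrative, not mmgp's actual traversal:

import torch
from collections import defaultdict

def small_prefix_groups(model: torch.nn.Module, threshold_bytes: int) -> list:
    # Accumulate parameter bytes per top-level prefix and report the groups
    # small enough to be left un-quantized.
    sizes = defaultdict(int)
    for name, module in model.named_modules():
        if name == "":
            continue
        prefix = name.split(".")[0]
        for p in module.parameters(recurse=False):
            sizes[prefix] += p.numel() * p.element_size()
    return [prefix for prefix, size in sizes.items() if size <= threshold_bytes]

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 2048))
print(small_prefix_groups(model, threshold_bytes=4096))  # -> ['0']: only the small layer is excluded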
@@ -593,28 +641,29 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
  exclude_list = None
 
 
- #quantize(model_to_quantize,weights, exclude= exclude_list)
+ quantize(model_to_quantize,weights, exclude= exclude_list)
+ # quantize(model_to_quantize,weights, include= [ "*1.block.attn.to_out*"]) #"
+
+ # for name, m in model_to_quantize.named_modules():
+ # if exclude_list is None or not any( name == module_name for module_name in exclude_list):
+ # _quantize_submodule(model_to_quantize, name, m, weights=weights, activations=None, optimizer=None)
 
- for name, m in model_to_quantize.named_modules():
- if exclude_list is None or not any( name == module_name for module_name in exclude_list):
- _quantize_submodule(model_to_quantize, name, m, weights=weights, activations=None, optimizer=None)
 
  # force to read non quantized parameters so that their lazy tensors and corresponding mmap are released
  # otherwise we may end up keeping in memory both the quantized and the non quantize model
- for m in model_to_quantize.modules():
+ for n,m in model_to_quantize.named_modules():
  # do not read quantized weights (detected them directly or behind an adapter)
  if isinstance(m, QModuleMixin) or hasattr(m, "base_layer") and isinstance(m.base_layer, QModuleMixin):
  if hasattr(m, "bias") and m.bias is not None:
- m.bias.data = m.bias.data + 0
+ _force_load_parameter(m.bias)
  else:
- for n, p in m.named_parameters(recurse = False):
- data = getattr(m, n)
- setattr(m,n, torch.nn.Parameter(data + 0 ) )
+ for p in m.parameters(recurse = False):
+ _force_load_parameter(p)
 
  for b in m.buffers(recurse = False):
- b.data = b.data + 0
+ _force_load_buffer(b)
+
 
-
 
  freeze(model_to_quantize)
  torch.cuda.empty_cache()
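The hunk above drops the manual per-module _quantize_submodule loop in favour of the library-level quantize() call with an exclude list, and still finishes with freeze(). Assuming these entry points come from optimum-quanto (which also provides the QModuleMixin and qint8 symbols used throughout this file), the standard flow looks roughly like this:

import torch
from optimum.quanto import freeze, qint8, quantize

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.Linear(64, 8))

# Quantize weights to int8 everywhere except the excluded module names,
# then freeze to replace the float weights with their quantized versions.
quantize(model, weights=qint8, exclude=["1"])  # keep the small head in full precision
freeze(model)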
@@ -723,6 +772,15 @@ def load_loras_into_model(model, lora_path, lora_multi = None, verboseLevel = -1
  print(f"Lora '{path}' was loaded in model '{_get_module_name(model)}'")
  set_weights_and_activate_adapters(model,[ str(i) for i in range(len(lora_multi))], lora_multi)
 
+ def move_loras_to_device(model, device="cpu" ):
+ if hasattr( model, "_lora_loadable_modules"):
+ for k in model._lora_loadable_modules:
+ move_loras_to_device(getattr(model,k), device)
+ return
+
+ for k, m in model.named_modules():
+ if ".lora_" in k:
+ m.to(device)
 
  def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, verboseLevel = -1):
  """
@@ -812,9 +870,6 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
 
  model = _remove_model_wrapper(model)
 
- # if pinToMemory and do_quantize:
- # raise Exception("Pinning and Quantization can not be used at the same time")
-
  if not (".safetensors" in file_path or ".sft" in file_path):
  if pinToMemory:
  raise Exception("Pinning to memory while loading only supported for safe tensors files")
@@ -933,7 +988,7 @@ class offload:
 
  for k,p in submodule.named_parameters(recurse=False):
  if isinstance(p, QTensor):
- blocks_params.append( (submodule, k, p ) )
+ blocks_params.append( (submodule, k, p, False ) )
 
  if p._qtype == qint4:
  if hasattr(p,"_scale_shift"):
@@ -947,11 +1002,11 @@ class offload:
  blocks_params_size += torch.numel(p._scale) * p._scale.element_size()
  blocks_params_size += torch.numel(p._data) * p._data.element_size()
  else:
- blocks_params.append( (submodule, k, p ) )
+ blocks_params.append( (submodule, k, p, False) )
  blocks_params_size += torch.numel(p.data) * p.data.element_size()
 
  for k, p in submodule.named_buffers(recurse=False):
- blocks_params.append( (submodule, k, p) )
+ blocks_params.append( (submodule, k, p, True) )
  blocks_params_size += p.data.nbytes
 
 
@@ -981,9 +1036,12 @@ class offload:
  def cpu_to_gpu(stream_to_use, blocks_params): #, record_for_stream = None
  with torch.cuda.stream(stream_to_use):
  for param in blocks_params:
- parent_module, n, p = param
+ parent_module, n, p, is_buffer = param
  q = p.to("cuda", non_blocking=True)
- q = torch.nn.Parameter(q , requires_grad=False)
+ if is_buffer:
+ q = torch.nn.Buffer(q)
+ else:
+ q = torch.nn.Parameter(q , requires_grad=False)
  setattr(parent_module, n , q)
  # if record_for_stream != None:
  # if isinstance(p, QTensor):
@@ -1030,8 +1088,11 @@ class offload:
 
  blocks_params = self.blocks_of_modules[blocks_name]
  for param in blocks_params:
- parent_module, n, p = param
- q = torch.nn.Parameter(p , requires_grad=False)
+ parent_module, n, p, is_buffer = param
+ if is_buffer:
+ q = torch.nn.Buffer(p)
+ else:
+ q = torch.nn.Parameter(p , requires_grad=False)
  setattr(parent_module, n , q)
  # cl.stop()
  # print(f"unload time: {cl.format_time_gap()}")
@@ -1403,19 +1464,16 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
  for model_id in models:
  current_model: torch.nn.Module = models[model_id]
  towers_names, towers_modules = _detect_main_towers(current_model)
- if self.verboseLevel>=2 and len(towers_names)>0:
- print(f"Potential iterative blocks found in model '{model_id}':{towers_names}")
  # compile main iterative modules stacks ("towers")
  compilationInThisOne = compileAllModels or model_id in modelsToCompile
  if compilationInThisOne:
  if self.verboseLevel>=1:
  if len(towers_modules)>0:
- print(f"Pytorch compilation of model '{model_id}' is scheduled.")
+ print(f"Pytorch compilation of '{model_id}' is scheduled for these modules : {towers_names}.")
  else:
  print(f"Pytorch compilation of model '{model_id}' is not yet supported.")
 
  for submodel in towers_modules:
- # for submodel in tower:
  submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode="default" ) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
  #dynamic=True,
 
@@ -262,6 +262,7 @@ class SafeTensorFile:
 
  PAGE_SIZE = mmap.ALLOCATIONGRANULARITY
  MMAP_SIZE = 1024 * 1024 * 1024 # 1GB
+ # MMAP_SIZE = 256 * 1024 * 1024 # 1GB
 
  # First pass: find optimal aligned map boundaries
  skip_bytes = self._skip_bytes
@@ -322,6 +323,7 @@ class SafeTensorFile:
  current_pos += length
 
  return sd
+
 
  def create_tensors_without_mmap(self):
  sd = OrderedDict()
@@ -335,12 +337,11 @@ class SafeTensorFile:
  data_offsets = v["data_offsets"]
  length = data_offsets[1]-data_offsets[0]
  buffer = f.read(length)
- if len(shape) == 0:
- if length == 0:
- t = torch.empty(0, dtype=dtype)
- else:
- t = torch.frombuffer(bytearray(buffer), dtype=torch.uint8)
- t = t.view(dtype)
+ if length == 0:
+ t = torch.empty(0, dtype=dtype)
+ elif len(shape) == 0:
+ t = torch.frombuffer(bytearray(buffer), dtype=torch.uint8)
+ t = t.view(dtype)
  else:
  t = torch.frombuffer(bytearray(buffer), dtype=dtype)
  t = torch.reshape(t, shape)
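The reordered branches above fix the non-mmap loading path for edge cases: a zero-length payload is now handled first (previously only the 0-d case checked for it, so an empty tensor with a non-scalar shape would fall through to torch.frombuffer on an empty buffer), while 0-d scalars keep the byte-reinterpretation path. A standalone sketch of the corrected decode order, with dtype and shape assumed to come from the safetensors header:

import struct
import torch

def tensor_from_bytes(buffer: bytes, dtype: torch.dtype, shape: tuple) -> torch.Tensor:
    if len(buffer) == 0:
        # Empty payload: return an empty tensor of the requested dtype.
        return torch.empty(0, dtype=dtype)
    if len(shape) == 0:
        # 0-d scalar: reinterpret the raw bytes as a single element.
        return torch.frombuffer(bytearray(buffer), dtype=torch.uint8).view(dtype)
    t = torch.frombuffer(bytearray(buffer), dtype=dtype)
    return torch.reshape(t, shape)

print(tensor_from_bytes(struct.pack("<f", 3.5), torch.float32, ()))             # tensor([3.5000])
print(tensor_from_bytes(struct.pack("<4f", 1, 2, 3, 4), torch.float32, (2, 2)))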
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: mmgp
- Version: 3.1.0
+ Version: 3.1.1
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  License: GNU GENERAL PUBLIC LICENSE
Six additional files in the package are unchanged between 3.1.0 and 3.1.1.