mmgp 3.1.4.post1__py3-none-any.whl → 3.1.4.post15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mmgp might be problematic.

mmgp/offload.py CHANGED
@@ -1,4 +1,4 @@
- # ------------------ Memory Management 3.1 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+ # ------------------ Memory Management 3.1.4 for the GPU Poor by DeepBeepMeep (mmgp)------------------
  #
  # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
  # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
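The hunk above only bumps the version string in the module banner; the module itself hooks a pipeline (or a dict of named modules) so that weights stay in RAM and are streamed to the GPU on demand. As a rough illustration of how the `offload.all(...)` entry point shown further down in this diff is typically called, here is a minimal sketch. The toy modules, the 5000 budget value (assumed to be MB) and the chosen keyword values are illustrative assumptions, not part of this diff, and a CUDA device is required.

```python
# Minimal usage sketch (illustrative, not from the package): hook a dict of
# named modules so mmgp offloads and streams their weights block by block.
import torch
from mmgp import offload

transformer = torch.nn.Sequential(*[torch.nn.Linear(256, 256) for _ in range(8)])
text_encoder = torch.nn.Linear(256, 256)

offload.all(
    {"transformer": transformer, "text_encoder": text_encoder},
    pinnedMemory=True,        # pin RAM copies to speed up async transfers
    quantizeTransformer=False,
    budgets=5000,             # assumed per-model VRAM budget in MB
    asyncTransfers=True,      # the prefetching path reworked in this release
    verboseLevel=2,
)
```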
@@ -191,10 +191,10 @@ def _detect_main_towers(model, min_floors = 5):
  pre , num = _extract_num_from_str(submodule_name)
  if isinstance(submodule, (torch.nn.ModuleList)):
  cur_blocks_prefix, cur_blocks_seq = pre + ".", -1
- tower_name = submodule_name #+ ".*"
+ tower_name = submodule_name + "."
  elif num >=0:
  cur_blocks_prefix, cur_blocks_seq = pre, num
- tower_name = submodule_name[ :-1] #+ "*"
+ tower_name = submodule_name[ :-1]
  floors_modules.append(submodule)

  if len(floors_modules) >= min_floors:
@@ -420,7 +420,7 @@ def _welcome():
  if welcome_displayed:
  return
  welcome_displayed = True
- print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.1) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+ print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.1.4-15) by DeepBeepMeep ************{ENDC}{UNBOLD}")

  def _extract_num_from_str(num_in_str):
  size = len(num_in_str)
@@ -598,9 +598,13 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10

  perc_excluded =total_excluded/ total_size if total_size >0 else 1
  if verboseLevel >=2:
- print(f"Total Excluded {total_excluded/ONE_MB:.1f} MB oF {total_size/ONE_MB:.1f} that is {perc_excluded*100:.2f}%")
+ if total_excluded == 0:
+ print(f"Can't find any module to exclude from quantization, full model ({total_size/ONE_MB:.1f} MB) will be quantized")
+ else:
+ print(f"Total Excluded {total_excluded/ONE_MB:.1f} MB of {total_size/ONE_MB:.1f} that is {perc_excluded*100:.2f}%")
  if perc_excluded >= 0.10:
- print(f"Too many modules are excluded, there is something wrong with the selection, switch back to full quantization.")
+ if verboseLevel >=2:
+ print(f"Too many modules are excluded, there is something wrong with the selection, switch back to full quantization.")
  exclude_list = None
@@ -905,6 +909,69 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
  return

+ def save_model(model, file_path, do_quantize = False, quantizationType = qint8, verboseLevel = -1, config_file_path = None ):
+ """save the weights of a model and quantize them if requested
+ These weights can be loaded again using 'load_model_data'
+ """
+
+ config = None
+ verboseLevel = _compute_verbose_level(verboseLevel)
+ if config_file_path !=None:
+ with open(config_file_path, "r", encoding="utf-8") as reader:
+ text = reader.read()
+ config= json.loads(text)
+ elif hasattr(model, "_config"):
+ config = model._config
+ elif hasattr(model, "config"):
+ config_fullpath = None
+ config_obj = getattr(model,"config")
+ config_path = getattr(config_obj,"_name_or_path", None)
+ if config_path != None:
+ config_fullpath = os.path.join(config_path, "config.json")
+ if not os.path.isfile(config_fullpath):
+ config_fullpath = None
+ if config_fullpath is None:
+ config_fullpath = os.path.join(os.path.dirname(file_path), "config.json")
+ if os.path.isfile(config_fullpath):
+ with open(config_fullpath, "r", encoding="utf-8") as reader:
+ text = reader.read()
+ config= json.loads(text)
+
+ if do_quantize:
+ _quantize(model, weights=quantizationType, model_id=file_path)
+
+ quantization_map = getattr(model, "_quanto_map", None)
+
+ if verboseLevel >=1:
+ print(f"Saving file '{file_path}")
+ safetensors2.torch_write_file(model.state_dict(), file_path , quantization_map = quantization_map, config = config)
+ if verboseLevel >=1:
+ print(f"File '{file_path}' saved")
+
+
+ def extract_models(prefix, obj):
+ pipe = {}
+ for name in dir(obj):
+ element = getattr(obj,name)
+ if name in ("pipeline", "pipe"):
+ pipeline = element
+ if hasattr(pipeline , "components") and isinstance(pipeline.components, dict):
+ for k, model in pipeline.components.items():
+ if model != None:
+ pipe[prefix + "/" + k ] = model
+ elif isinstance(element, torch.nn.Module):
+ if prefix + "/" + name in pipe:
+ pipe[prefix + "/_" + name ] = element
+ else:
+ pipe[prefix + "/" + name ] = element
+ elif isinstance(element, dict):
+ for k, element in element.items():
+ if hasattr(element , "pipeline"):
+ pipe.update( extract_models(prefix + "/" + k,element ))
+
+
+ return pipe
+
  def get_model_name(model):
  return model.name

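The newly placed `save_model` mirrors the `load_model_data` function named in the hunk header, so a quantize-on-save followed by a reload looks roughly like the sketch below. The tiny model class and file name are made up for illustration, and the exact reload behaviour (for example how the stored quantization map is reapplied) should be verified against the package itself.

```python
# Hedged round-trip sketch: save quantized weights with save_model, then load
# them back with load_model_data. Model class and path are illustrative only.
import torch
from mmgp.offload import save_model, load_model_data

class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(128, 128)

model = TinyModel()

# Quantizes to qint8 by default and stores the quantization map alongside
# the weights so they can be restored later.
save_model(model, "tiny_model_int8.safetensors", do_quantize=True)

# Elsewhere: rebuild the architecture and pour the saved weights back in.
restored = TinyModel()
load_model_data(restored, "tiny_model_int8.safetensors")
```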
@@ -931,6 +998,7 @@ class offload:
  self.loaded_blocks = {}
  self.prev_blocks_names = {}
  self.next_blocks_names = {}
+ self.preloaded_blocks_per_model = {}
  self.default_stream = torch.cuda.default_stream(torch.device("cuda")) # torch.cuda.current_stream()
  self.transfer_stream = torch.cuda.Stream()
  self.async_transfers = False
@@ -940,6 +1008,8 @@

  def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name):

+ if blocks_name is None:
+ pass
  entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name
  if entry_name in self.blocks_of_modules:
  blocks_params = self.blocks_of_modules[entry_name]
@@ -994,11 +1064,9 @@
  return True

  @torch.compiler.disable()
- def gpu_load_blocks(self, model_id, blocks_name):
+ def gpu_load_blocks(self, model_id, blocks_name, preload = False):
  # cl = clock.start()

- if blocks_name != None:
- self.loaded_blocks[model_id] = blocks_name

  entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name

@@ -1019,26 +1087,50 @@
  # else:
  # p.data.record_stream(record_for_stream)

+ any_past_block = False
+
+ loaded_block = self.loaded_blocks[model_id]
+ if not preload and loaded_block != None:
+ any_past_block = True
+ self.gpu_unload_blocks(model_id, loaded_block)
+ if self.ready_to_check_mem():
+ self.empty_cache_if_needed()
+

  if self.verboseLevel >=2:
  model = self.models[model_id]
  model_name = model._get_name()
- print(f"Loading model {entry_name} ({model_name}) in GPU")
-
+ # if not preload:
+ # print(f"Request to load model {entry_name} ({model_name}) in GPU")
+

  if self.async_transfers and blocks_name != None:
- first = self.prev_blocks_names[entry_name] == None
+ first = self.prev_blocks_names[entry_name] == None or not any_past_block
  next_blocks_entry = self.next_blocks_names[entry_name] if entry_name in self.next_blocks_names else None
  if first:
  cpu_to_gpu(torch.cuda.current_stream(), self.blocks_of_modules[entry_name])
+ if self.verboseLevel >=2:
+ if preload:
+ print(f"Preloading model {entry_name} ({model_name}) in GPU")
+ else:
+ print(f"Loading model {entry_name} ({model_name}) in GPU")
+
  torch.cuda.synchronize()

  if next_blocks_entry != None:
  cpu_to_gpu(self.transfer_stream, self.blocks_of_modules[next_blocks_entry]) #, self.default_stream
+ if self.verboseLevel >=2:
+ print(f"Prefetching model {next_blocks_entry} ({model_name}) in GPU")

  else:
  cpu_to_gpu(self.default_stream, self.blocks_of_modules[entry_name])
+ if self.verboseLevel >=2:
+ print(f"Loading model {entry_name} ({model_name}) in GPU")
  torch.cuda.synchronize()
+
+ if not preload:
+ self.loaded_blocks[model_id] = blocks_name
+
  # cl.stop()
  # print(f"load time: {cl.format_time_gap()}")

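The reworked `gpu_load_blocks` above unloads the previously resident block, loads the requested one on the current stream, and prefetches the next block on `self.transfer_stream`. The standalone snippet below illustrates that general double-buffering pattern with plain PyTorch streams; it is a sketch of the technique, not mmgp's implementation, and it needs a CUDA device.

```python
# Generic double-buffering sketch (not mmgp code): compute with block N while
# block N+1 is copied host-to-device on a separate CUDA stream.
import torch

assert torch.cuda.is_available(), "this sketch requires a CUDA device"

compute_stream = torch.cuda.current_stream()
transfer_stream = torch.cuda.Stream()

# Two pinned host tensors standing in for consecutive blocks of weights.
blocks_cpu = [torch.randn(1024, 1024, pin_memory=True) for _ in range(2)]
x = torch.randn(1024, 1024, device="cuda")

current = blocks_cpu[0].to("cuda", non_blocking=True)   # load block N
with torch.cuda.stream(transfer_stream):
    nxt = blocks_cpu[1].to("cuda", non_blocking=True)    # prefetch block N+1

y = x @ current                              # overlaps with the prefetch
compute_stream.wait_stream(transfer_stream)  # block N+1 is now safe to use
z = y @ nxt
torch.cuda.synchronize()
print(z.shape)
```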
@@ -1072,13 +1164,19 @@
  self.active_models.append(model)
  self.active_models_ids.append(model_id)

- self.gpu_load_blocks(model_id, None)
+ self.gpu_load_blocks(model_id, None, True)
+ for block_name in self.preloaded_blocks_per_model[model_id]:
+ self.gpu_load_blocks(model_id, block_name, True)
+

  # torch.cuda.current_stream().synchronize()

  def unload_all(self):
  for model_id in self.active_models_ids:
  self.gpu_unload_blocks(model_id, None)
+ for block_name in self.preloaded_blocks_per_model[model_id]:
+ self.gpu_unload_blocks(model_id, block_name)
+
  loaded_block = self.loaded_blocks[model_id]
  if loaded_block != None:
  self.gpu_unload_blocks(model_id, loaded_block)
@@ -1152,19 +1250,10 @@

  # @torch.compiler.disable()
  def preload_blocks_for_compile(module, *args, **kwargs):
- some_context = context #for debugging
- if blocks_name == None:
- if self.ready_to_check_mem():
- self.empty_cache_if_needed()
- else:
- loaded_block = self.loaded_blocks[model_id]
- if (loaded_block == None or loaded_block != blocks_name) :
- if loaded_block != None:
- self.gpu_unload_blocks(model_id, loaded_block)
- if self.ready_to_check_mem():
- self.empty_cache_if_needed()
- self.loaded_blocks[model_id] = blocks_name
- self.gpu_load_blocks(model_id, blocks_name)
+ # some_context = context #for debugging
+ if blocks_name != None and blocks_name != self.loaded_blocks[model_id] and blocks_name not in self.preloaded_blocks_per_model[model_id]:
+ self.gpu_load_blocks(model_id, blocks_name)
+
  # need to be registered before the forward not to be break the efficiency of the compilation chain
  # it should be at the top of the compilation as this type of hook in the middle of a chain seems to break memory performance
  target_module.register_forward_pre_hook(preload_blocks_for_compile)
@@ -1179,18 +1268,12 @@
  def check_empty_cuda_cache(module, *args, **kwargs):
  # if self.ready_to_check_mem():
  # self.empty_cache_if_needed()
+
  if blocks_name == None:
  if self.ready_to_check_mem():
  self.empty_cache_if_needed()
- else:
- loaded_block = self.loaded_blocks[model_id]
- if (loaded_block == None or loaded_block != blocks_name) :
- if loaded_block != None:
- self.gpu_unload_blocks(model_id, loaded_block)
- if self.ready_to_check_mem():
- self.empty_cache_if_needed()
- self.loaded_blocks[model_id] = blocks_name
- self.gpu_load_blocks(model_id, blocks_name)
+ elif blocks_name != self.loaded_blocks[model_id] and blocks_name not in self.preloaded_blocks_per_model[model_id]:
+ self.gpu_load_blocks(model_id, blocks_name)
  if qint4quantization:
  args, kwargs = self.move_args_to_gpu(*args, **kwargs)

@@ -1240,69 +1323,82 @@
  print(f"Hooked to model '{model_id}' ({model_name})")


- def save_model(model, file_path, do_quantize = False, quantizationType = qint8, verboseLevel = -1, config_file_path = None ):
- """save the weights of a model and quantize them if requested
- These weights can be loaded again using 'load_model_data'
- """
-
- config = None
- verboseLevel = _compute_verbose_level(verboseLevel)
- if config_file_path !=None:
- with open(config_file_path, "r", encoding="utf-8") as reader:
- text = reader.read()
- config= json.loads(text)
- elif hasattr(model, "_config"):
- config = model._config
- elif hasattr(model, "config"):
- config_fullpath = None
- config_obj = getattr(model,"config")
- config_path = getattr(config_obj,"_name_or_path", None)
- if config_path != None:
- config_fullpath = os.path.join(config_path, "config.json")
- if not os.path.isfile(config_fullpath):
- config_fullpath = None
- if config_fullpath is None:
- config_fullpath = os.path.join(os.path.dirname(file_path), "config.json")
- if os.path.isfile(config_fullpath):
- with open(config_fullpath, "r", encoding="utf-8") as reader:
- text = reader.read()
- config= json.loads(text)

- if do_quantize:
- _quantize(model, weights=quantizationType, model_id=file_path)
-
- quantization_map = getattr(model, "_quanto_map", None)
-
- if verboseLevel >=1:
- print(f"Saving file '{file_path}")
- safetensors2.torch_write_file(model.state_dict(), file_path , quantization_map = quantization_map, config = config)
- if verboseLevel >=1:
- print(f"File '{file_path}' saved")
+ def tune_preloading(self, model_id, current_budget, towers_names):
+ preloaded_blocks = {}
+ preload_total = 0
+ max_blocks_fetch = 0

+ self.preloaded_blocks_per_model[model_id] = preloaded_blocks

- def extract_models(prefix, obj):
- pipe = {}
- for name in dir(obj):
- element = getattr(obj,name)
- if name in ("pipeline", "pipe"):
- pipeline = element
- if hasattr(pipeline , "components") and isinstance(pipeline.components, dict):
- for k, model in pipeline.components.items():
- if model != None:
- pipe[prefix + "/" + k ] = model
- elif isinstance(element, torch.nn.Module):
- if prefix + "/" + name in pipe:
- pipe[prefix + "/_" + name ] = element
+ if current_budget == 0 or towers_names is None or len(towers_names) == 0 or not self.async_transfers:
+ return
+ # current_budget = 5000 * ONE_MB
+ base_size = self.blocks_of_modules_sizes[model_id]
+ current_budget -= base_size
+ if current_budget <= 0:
+ return
+
+ towers = []
+ total_size = 0
+ for tower_name in towers_names:
+ max_floor_size = 0
+ tower_size = 0
+ floors = []
+ prefix = model_id + "/" + tower_name
+ for name, size in self.blocks_of_modules_sizes.items():
+ if name.startswith(prefix):
+ tower_size += size
+ floor_no = int( name[len(prefix): ] )
+ floors.append( (name, floor_no, size))
+ max_floor_size = max(max_floor_size, size)
+
+ towers.append( (floors, max_floor_size, tower_size) )
+ total_size += tower_size
+ current_budget -= 2 * max_floor_size
+ if current_budget <= 0:
+ return
+
+
+ for floors, max_floor_size, tower_size in towers:
+ tower_budget = tower_size / total_size * current_budget
+ preload_blocks_count = int( tower_budget / max_floor_size)
+ preload_total += preload_blocks_count * max_floor_size
+ max_blocks_fetch = max(max_floor_size, max_blocks_fetch)
+ if preload_blocks_count <= 0:
+ return
+
+ nb_blocks= len(floors)
+ space_between = (nb_blocks - preload_blocks_count) / preload_blocks_count
+ cursor = space_between
+ first_non_preloaded = None
+ prev_non_preloaded = None
+ for block in floors:
+ name, i, size = block
+ if i < cursor:
+ if prev_non_preloaded == None:
+ first_non_preloaded = name
+ else:
+ self.next_blocks_names[prev_non_preloaded] = name
+ self.prev_blocks_names[name] = prev_non_preloaded
+ prev_non_preloaded = name
+ else:
+ self.next_blocks_names[name] = None
+ self.prev_blocks_names[name] = None
+ preloaded_blocks[name[ len(model_id) + 1 : ] ] = size
+ cursor += 1 + space_between
+
+ if prev_non_preloaded != None and len(towers) == 1 :
+ self.next_blocks_names[prev_non_preloaded] = first_non_preloaded
+ self.prev_blocks_names[first_non_preloaded] = prev_non_preloaded
  else:
- pipe[prefix + "/" + name ] = element
- elif isinstance(element, dict):
- for k, element in element.items():
- if hasattr(element , "pipeline"):
- pipe.update( extract_models(prefix + "/" + k,element ))
+ self.next_blocks_names[prev_non_preloaded] = None

+ self.preloaded_blocks_per_model[model_id] = preloaded_blocks
+
+ if self.verboseLevel >=2:
+ print(f"Async loading plan for model '{model_id}' : {preload_total/ONE_MB:0.2f} MB will be preloaded ({preload_total/total_size*100:0.1f}% of recurrent layers data) with a {max_blocks_fetch/ONE_MB:0.2f} MB async shuttle")

- return pipe
-

  def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, verboseLevel = -1):
  """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
@@ -1382,8 +1478,9 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru

  self.anyCompiledModule = compileAllModels or len(modelsToCompile)>0
  if self.anyCompiledModule:
- torch._dynamo.config.cache_size_limit = 10000
  torch.compiler.reset()
+ torch._dynamo.config.cache_size_limit = 10000
+ #dynamic=True

  # torch._logging.set_logs(recompiles=True)
  # torch._inductor.config.realize_opcount_threshold = 100 # workaround bug "AssertionError: increase TRITON_MAX_BLOCK['X'] to 4096."
@@ -1463,7 +1560,8 @@
  if compilationInThisOne:
  if self.verboseLevel>=1:
  if len(towers_modules)>0:
- print(f"Pytorch compilation of '{model_id}' is scheduled for these modules : {towers_names}*.")
+ formated_tower_names = [name + '*' for name in towers_names]
+ print(f"Pytorch compilation of '{model_id}' is scheduled for these modules : {formated_tower_names}.")
  else:
  print(f"Pytorch compilation of model '{model_id}' is not yet supported.")

@@ -1479,7 +1577,6 @@
  _pin_to_memory(current_model, model_id, partialPinning= partialPinning, verboseLevel=verboseLevel)

  current_budget = model_budgets[model_id]
- current_size = 0
  cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq = None, None, None, -1
  self.loaded_blocks[model_id] = None

@@ -1489,10 +1586,6 @@
  if not hasattr(submodule, "_hf_hook"):
  setattr(submodule, "_hf_hook", HfHook())

- # if submodule_name=='':
- # continue
-
-
  if current_budget > 0 and len(submodule_name) > 0:
  if cur_blocks_prefix != None:
  if submodule_name.startswith(cur_blocks_prefix):
@@ -1500,7 +1593,7 @@
  depth_name = submodule_name.split(".")
  level = depth_name[len(depth_prefix)-1]
  pre , num = _extract_num_from_str(level)
- if num != cur_blocks_seq and (cur_blocks_seq == -1 or current_size > current_budget):
+ if num != cur_blocks_seq: #and (cur_blocks_seq == -1 or current_size > current_budget)
  prev_blocks_name = cur_blocks_name
  cur_blocks_name = cur_blocks_prefix + str(num)
  # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
@@ -1528,13 +1621,34 @@
  else:
  self.hook_check_empty_cache_needed(submodule, model_id, cur_blocks_name, submodule_method, context = submodule_name )

- current_size = self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name)
+ self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name)
+
+ self.tune_preloading(model_id, current_budget, towers_names)


  if self.verboseLevel >=2:
- for n,b in self.blocks_of_modules_sizes.items():
- print(f"Size of submodel '{n}': {b/ONE_MB:.1f} MB")
+ start_num, prev_num, prev_pre, prev_size = -1, -1, None, -1
+
+ def print_size_range(n,start_num,prev_num, prev_size ):
+ if prev_num < 0:
+ print(f"Size of submodel '{n}': {prev_size/ONE_MB:.1f} MB")
+ elif prev_num - start_num <=1:
+ print(f"Size of submodel '{n+ str(start_num)}': {prev_size/ONE_MB:.1f} MB")
+ else:
+ print(f"Size of submodel '{n+ str(start_num) +'-'+ str(prev_num)}': {prev_size/ONE_MB:.1f} MB")
+
+ for n, size in self.blocks_of_modules_sizes.items():
+ pre, num = _extract_num_from_str(n) if "/" in n else (n, -1)
+ if prev_pre == None :
+ start_num = num
+ elif prev_pre != pre or prev_pre == pre and size != prev_size:
+ print_size_range(prev_pre,start_num,prev_num, prev_size )
+ start_num = num
+ prev_num, prev_pre, prev_size = num, pre, size
+ if prev_pre != None:
+ print_size_range(prev_pre,start_num,prev_num, prev_size )

+
  torch.set_default_device('cuda')
  torch.cuda.empty_cache()
  gc.collect()
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: mmgp
- Version: 3.1.4.post1
+ Version: 3.1.4.post15
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  License: GNU GENERAL PUBLIC LICENSE
@@ -0,0 +1,9 @@
+ __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+ mmgp/offload.py,sha256=DEGTt5RPoLx9JK-d7Ld_B_rIuQrmhblQJw3V5CL9Lo8,74519
+ mmgp/safetensors2.py,sha256=OkJAvENfWeb-PL0FcxS1-eYeHLbemTaNXYvNxURrzIs,16154
+ mmgp-3.1.4.post15.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+ mmgp-3.1.4.post15.dist-info/METADATA,sha256=IMmhK6xAu0A96mLlpby9V2H-K8RYIqRpORaBngvtC0U,14278
+ mmgp-3.1.4.post15.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ mmgp-3.1.4.post15.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+ mmgp-3.1.4.post15.dist-info/RECORD,,
@@ -1,9 +0,0 @@
- __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
- mmgp/offload.py,sha256=W74N_5-UPC3VXjmrpnRsrXxeB3xF17xqy4D0bZywfzI,69497
- mmgp/safetensors2.py,sha256=OkJAvENfWeb-PL0FcxS1-eYeHLbemTaNXYvNxURrzIs,16154
- mmgp-3.1.4.post1.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
- mmgp-3.1.4.post1.dist-info/METADATA,sha256=hNYMSq_iwLiuk3oJD4WL_41K4lESTCYU_AeQU0VDB8w,14277
- mmgp-3.1.4.post1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- mmgp-3.1.4.post1.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
- mmgp-3.1.4.post1.dist-info/RECORD,,