mmgp 3.1.4.post1.tar.gz → 3.1.4.post15.tar.gz
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Potentially problematic release: this version of mmgp might be problematic.
- {mmgp-3.1.4.post1/src/mmgp.egg-info → mmgp-3.1.4.post15}/PKG-INFO +1 -1
- {mmgp-3.1.4.post1 → mmgp-3.1.4.post15}/pyproject.toml +1 -1
- {mmgp-3.1.4.post1 → mmgp-3.1.4.post15}/src/mmgp/offload.py +217 -103
- {mmgp-3.1.4.post1 → mmgp-3.1.4.post15/src/mmgp.egg-info}/PKG-INFO +1 -1
- {mmgp-3.1.4.post1 → mmgp-3.1.4.post15}/LICENSE.md +0 -0
- {mmgp-3.1.4.post1 → mmgp-3.1.4.post15}/README.md +0 -0
- {mmgp-3.1.4.post1 → mmgp-3.1.4.post15}/setup.cfg +0 -0
- {mmgp-3.1.4.post1 → mmgp-3.1.4.post15}/src/__init__.py +0 -0
- {mmgp-3.1.4.post1 → mmgp-3.1.4.post15}/src/mmgp/__init__.py +0 -0
- {mmgp-3.1.4.post1 → mmgp-3.1.4.post15}/src/mmgp/safetensors2.py +0 -0
- {mmgp-3.1.4.post1 → mmgp-3.1.4.post15}/src/mmgp.egg-info/SOURCES.txt +0 -0
- {mmgp-3.1.4.post1 → mmgp-3.1.4.post15}/src/mmgp.egg-info/dependency_links.txt +0 -0
- {mmgp-3.1.4.post1 → mmgp-3.1.4.post15}/src/mmgp.egg-info/requires.txt +0 -0
- {mmgp-3.1.4.post1 → mmgp-3.1.4.post15}/src/mmgp.egg-info/top_level.txt +0 -0
{mmgp-3.1.4.post1 → mmgp-3.1.4.post15}/src/mmgp/offload.py

@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.1 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.1.4 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -191,10 +191,10 @@ def _detect_main_towers(model, min_floors = 5):
             pre , num = _extract_num_from_str(submodule_name)
             if isinstance(submodule, (torch.nn.ModuleList)):
                 cur_blocks_prefix, cur_blocks_seq = pre + ".", -1
-                tower_name = submodule_name
+                tower_name = submodule_name + "."
             elif num >=0:
                 cur_blocks_prefix, cur_blocks_seq = pre, num
-                tower_name = submodule_name[ :-1]
+                tower_name = submodule_name[ :-1]
             floors_modules.append(submodule)

         if len(floors_modules) >= min_floors:
@@ -420,7 +420,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.1) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.1.4-15) by DeepBeepMeep ************{ENDC}{UNBOLD}")

 def _extract_num_from_str(num_in_str):
     size = len(num_in_str)
@@ -598,9 +598,13 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10

     perc_excluded =total_excluded/ total_size if total_size >0 else 1
     if verboseLevel >=2:
-
+        if total_excluded == 0:
+            print(f"Can't find any module to exclude from quantization, full model ({total_size/ONE_MB:.1f} MB) will be quantized")
+        else:
+            print(f"Total Excluded {total_excluded/ONE_MB:.1f} MB of {total_size/ONE_MB:.1f} that is {perc_excluded*100:.2f}%")
     if perc_excluded >= 0.10:
-
+        if verboseLevel >=2:
+            print(f"Too many modules are excluded, there is something wrong with the selection, switch back to full quantization.")
         exclude_list = None


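The new messages above encode a simple sanity rule: modules excluded from quantization are tolerated only while they stay under roughly 10% of the total model size, otherwise the exclusion list is discarded and the whole model is quantized. A standalone sketch of that rule (an illustrative helper, not part of mmgp):

```python
ONE_MB = 1024 * 1024

def keep_exclusion_list(total_excluded: int, total_size: int, verbose: int = 2) -> bool:
    """Return True to keep the exclusion list, False to fall back to full quantization."""
    perc_excluded = total_excluded / total_size if total_size > 0 else 1
    if verbose >= 2:
        if total_excluded == 0:
            print(f"Nothing to exclude, full model ({total_size/ONE_MB:.1f} MB) will be quantized")
        else:
            print(f"Excluded {total_excluded/ONE_MB:.1f} MB of {total_size/ONE_MB:.1f} MB ({perc_excluded*100:.2f}%)")
    # more than ~10% excluded means the selection went wrong: quantize everything
    return perc_excluded < 0.10

# illustrative numbers: 300 MB excluded out of a 12 GB model -> exclusion list is kept
print(keep_exclusion_list(300 * ONE_MB, 12_000 * ONE_MB))
```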
@@ -905,6 +909,69 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType

     return

+def save_model(model, file_path, do_quantize = False, quantizationType = qint8, verboseLevel = -1, config_file_path = None ):
+    """save the weights of a model and quantize them if requested
+    These weights can be loaded again using 'load_model_data'
+    """
+
+    config = None
+    verboseLevel = _compute_verbose_level(verboseLevel)
+    if config_file_path !=None:
+        with open(config_file_path, "r", encoding="utf-8") as reader:
+            text = reader.read()
+        config= json.loads(text)
+    elif hasattr(model, "_config"):
+        config = model._config
+    elif hasattr(model, "config"):
+        config_fullpath = None
+        config_obj = getattr(model,"config")
+        config_path = getattr(config_obj,"_name_or_path", None)
+        if config_path != None:
+            config_fullpath = os.path.join(config_path, "config.json")
+            if not os.path.isfile(config_fullpath):
+                config_fullpath = None
+        if config_fullpath is None:
+            config_fullpath = os.path.join(os.path.dirname(file_path), "config.json")
+        if os.path.isfile(config_fullpath):
+            with open(config_fullpath, "r", encoding="utf-8") as reader:
+                text = reader.read()
+            config= json.loads(text)
+
+    if do_quantize:
+        _quantize(model, weights=quantizationType, model_id=file_path)
+
+    quantization_map = getattr(model, "_quanto_map", None)
+
+    if verboseLevel >=1:
+        print(f"Saving file '{file_path}")
+    safetensors2.torch_write_file(model.state_dict(), file_path , quantization_map = quantization_map, config = config)
+    if verboseLevel >=1:
+        print(f"File '{file_path}' saved")
+
+
+def extract_models(prefix, obj):
+    pipe = {}
+    for name in dir(obj):
+        element = getattr(obj,name)
+        if name in ("pipeline", "pipe"):
+            pipeline = element
+            if hasattr(pipeline , "components") and isinstance(pipeline.components, dict):
+                for k, model in pipeline.components.items():
+                    if model != None:
+                        pipe[prefix + "/" + k ] = model
+        elif isinstance(element, torch.nn.Module):
+            if prefix + "/" + name in pipe:
+                pipe[prefix + "/_" + name ] = element
+            else:
+                pipe[prefix + "/" + name ] = element
+        elif isinstance(element, dict):
+            for k, element in element.items():
+                if hasattr(element , "pipeline"):
+                    pipe.update( extract_models(prefix + "/" + k,element ))
+
+
+    return pipe
+
 def get_model_name(model):
     return model.name

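`save_model` moves to module level here with an unchanged signature, next to the new `extract_models` helper. A minimal usage sketch based only on the signatures visible in this diff; the tiny model and file name are illustrative, and the import style follows the project README:

```python
import torch
from mmgp import offload

# A tiny stand-in model; any torch.nn.Module (a Flux or Hunyuan transformer, a text encoder, ...)
# is handled the same way.
model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 8))

# Save the weights; pass do_quantize=True / quantizationType=qint8 to quantize on the fly,
# in which case the quantization map is written alongside the tensors.
offload.save_model(model, "tiny-model.safetensors")

# Rebuild the same architecture elsewhere and restore the saved weights.
model2 = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 8))
offload.load_model_data(model2, "tiny-model.safetensors")
```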
@@ -931,6 +998,7 @@ class offload:
         self.loaded_blocks = {}
         self.prev_blocks_names = {}
         self.next_blocks_names = {}
+        self.preloaded_blocks_per_model = {}
         self.default_stream = torch.cuda.default_stream(torch.device("cuda")) # torch.cuda.current_stream()
         self.transfer_stream = torch.cuda.Stream()
         self.async_transfers = False
@@ -940,6 +1008,8 @@ class offload:

     def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name):

+        if blocks_name is None:
+            pass
         entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name
         if entry_name in self.blocks_of_modules:
             blocks_params = self.blocks_of_modules[entry_name]
@@ -994,11 +1064,9 @@ class offload:
         return True

     @torch.compiler.disable()
-    def gpu_load_blocks(self, model_id, blocks_name):
+    def gpu_load_blocks(self, model_id, blocks_name, preload = False):
         # cl = clock.start()

-        if blocks_name != None:
-            self.loaded_blocks[model_id] = blocks_name

         entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name

@@ -1019,26 +1087,50 @@ class offload:
         # else:
         #     p.data.record_stream(record_for_stream)

+        any_past_block = False
+
+        loaded_block = self.loaded_blocks[model_id]
+        if not preload and loaded_block != None:
+            any_past_block = True
+            self.gpu_unload_blocks(model_id, loaded_block)
+            if self.ready_to_check_mem():
+                self.empty_cache_if_needed()
+

         if self.verboseLevel >=2:
             model = self.models[model_id]
             model_name = model._get_name()
-
-
+            # if not preload:
+            #     print(f"Request to load model {entry_name} ({model_name}) in GPU")
+

         if self.async_transfers and blocks_name != None:
-            first = self.prev_blocks_names[entry_name] == None
+            first = self.prev_blocks_names[entry_name] == None or not any_past_block
             next_blocks_entry = self.next_blocks_names[entry_name] if entry_name in self.next_blocks_names else None
             if first:
                 cpu_to_gpu(torch.cuda.current_stream(), self.blocks_of_modules[entry_name])
+                if self.verboseLevel >=2:
+                    if preload:
+                        print(f"Preloading model {entry_name} ({model_name}) in GPU")
+                    else:
+                        print(f"Loading model {entry_name} ({model_name}) in GPU")
+
             torch.cuda.synchronize()

             if next_blocks_entry != None:
                 cpu_to_gpu(self.transfer_stream, self.blocks_of_modules[next_blocks_entry]) #, self.default_stream
+                if self.verboseLevel >=2:
+                    print(f"Prefetching model {next_blocks_entry} ({model_name}) in GPU")

         else:
             cpu_to_gpu(self.default_stream, self.blocks_of_modules[entry_name])
+            if self.verboseLevel >=2:
+                print(f"Loading model {entry_name} ({model_name}) in GPU")
             torch.cuda.synchronize()
+
+        if not preload:
+            self.loaded_blocks[model_id] = blocks_name
+
         # cl.stop()
         # print(f"load time: {cl.format_time_gap()}")

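The new `preload` flag and prefetch messages sit on top of a classic overlap pattern: while the GPU computes block N, the weights of block N+1 are copied host-to-device on a side CUDA stream. The snippet below is a generic illustration of that pattern; it uses plain PyTorch calls rather than mmgp's internal `cpu_to_gpu` helper, and assumes pinned CPU tensors so the copies can actually run asynchronously:

```python
import torch

def prefetch_params(module: torch.nn.Module, stream: torch.cuda.Stream, keep_alive: list) -> None:
    """Enqueue async host-to-device copies of a module's parameters on `stream`."""
    with torch.cuda.stream(stream):
        for p in module.parameters():
            keep_alive.append(p.data)                      # keep the pinned source alive during the copy
            p.data = p.data.to("cuda", non_blocking=True)  # only overlaps if the source is pinned

if torch.cuda.is_available():
    compute = torch.cuda.current_stream()
    transfer = torch.cuda.Stream()
    keep_alive = []

    # four toy "transformer blocks" kept in pinned CPU RAM
    blocks = [torch.nn.Linear(1024, 1024) for _ in range(4)]
    for b in blocks:
        for p in b.parameters():
            p.data = p.data.pin_memory()

    x = torch.randn(8, 1024, device="cuda")
    prefetch_params(blocks[0], transfer, keep_alive)             # nothing to overlap with yet
    for i, block in enumerate(blocks):
        compute.wait_stream(transfer)                            # block i's weights must have arrived
        if i + 1 < len(blocks):
            prefetch_params(blocks[i + 1], transfer, keep_alive) # upload block i+1 while block i computes
        x = block(x)                                             # runs on the compute stream
    torch.cuda.synchronize()
    print(x.shape, blocks[-1].weight.device)
```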
@@ -1072,13 +1164,19 @@ class offload:
         self.active_models.append(model)
         self.active_models_ids.append(model_id)

-        self.gpu_load_blocks(model_id, None)
+        self.gpu_load_blocks(model_id, None, True)
+        for block_name in self.preloaded_blocks_per_model[model_id]:
+            self.gpu_load_blocks(model_id, block_name, True)
+

         # torch.cuda.current_stream().synchronize()

     def unload_all(self):
         for model_id in self.active_models_ids:
             self.gpu_unload_blocks(model_id, None)
+            for block_name in self.preloaded_blocks_per_model[model_id]:
+                self.gpu_unload_blocks(model_id, block_name)
+
             loaded_block = self.loaded_blocks[model_id]
             if loaded_block != None:
                 self.gpu_unload_blocks(model_id, loaded_block)
@@ -1152,19 +1250,10 @@ class offload:

         # @torch.compiler.disable()
         def preload_blocks_for_compile(module, *args, **kwargs):
-            some_context = context #for debugging
-            if blocks_name
-
-
-            else:
-                loaded_block = self.loaded_blocks[model_id]
-                if (loaded_block == None or loaded_block != blocks_name) :
-                    if loaded_block != None:
-                        self.gpu_unload_blocks(model_id, loaded_block)
-                        if self.ready_to_check_mem():
-                            self.empty_cache_if_needed()
-                    self.loaded_blocks[model_id] = blocks_name
-                    self.gpu_load_blocks(model_id, blocks_name)
+            # some_context = context #for debugging
+            if blocks_name != None and blocks_name != self.loaded_blocks[model_id] and blocks_name not in self.preloaded_blocks_per_model[model_id]:
+                self.gpu_load_blocks(model_id, blocks_name)
+
         # need to be registered before the forward not to be break the efficiency of the compilation chain
         # it should be at the top of the compilation as this type of hook in the middle of a chain seems to break memory performance
         target_module.register_forward_pre_hook(preload_blocks_for_compile)
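`preload_blocks_for_compile` is registered with `register_forward_pre_hook`, a standard PyTorch mechanism: the hook runs right before `forward` and can make sure a block's weights are resident. A minimal, mmgp-independent sketch of that mechanism (the `LazyToGpu` helper is hypothetical):

```python
import torch

class LazyToGpu:
    """Move a module's weights to CUDA the first time it is about to run."""
    def __init__(self):
        self.loaded = set()

    def __call__(self, module, args):
        # called by PyTorch just before module.forward(*args)
        if id(module) not in self.loaded:
            module.to("cuda")
            self.loaded.add(id(module))

block = torch.nn.Linear(16, 16)            # starts on CPU
block.register_forward_pre_hook(LazyToGpu())

if torch.cuda.is_available():
    y = block(torch.randn(2, 16, device="cuda"))   # hook uploads the weights just in time
    print(y.shape, block.weight.device)
```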
@@ -1179,18 +1268,12 @@ class offload:
         def check_empty_cuda_cache(module, *args, **kwargs):
             # if self.ready_to_check_mem():
             #     self.empty_cache_if_needed()
+
             if blocks_name == None:
                 if self.ready_to_check_mem():
                     self.empty_cache_if_needed()
-
-
-            if (loaded_block == None or loaded_block != blocks_name) :
-                if loaded_block != None:
-                    self.gpu_unload_blocks(model_id, loaded_block)
-                    if self.ready_to_check_mem():
-                        self.empty_cache_if_needed()
-                self.loaded_blocks[model_id] = blocks_name
-                self.gpu_load_blocks(model_id, blocks_name)
+            elif blocks_name != self.loaded_blocks[model_id] and blocks_name not in self.preloaded_blocks_per_model[model_id]:
+                self.gpu_load_blocks(model_id, blocks_name)
             if qint4quantization:
                 args, kwargs = self.move_args_to_gpu(*args, **kwargs)

@@ -1240,69 +1323,82 @@ class offload:
         print(f"Hooked to model '{model_id}' ({model_name})")


-def save_model(model, file_path, do_quantize = False, quantizationType = qint8, verboseLevel = -1, config_file_path = None ):
-    """save the weights of a model and quantize them if requested
-    These weights can be loaded again using 'load_model_data'
-    """
-
-    config = None
-    verboseLevel = _compute_verbose_level(verboseLevel)
-    if config_file_path !=None:
-        with open(config_file_path, "r", encoding="utf-8") as reader:
-            text = reader.read()
-        config= json.loads(text)
-    elif hasattr(model, "_config"):
-        config = model._config
-    elif hasattr(model, "config"):
-        config_fullpath = None
-        config_obj = getattr(model,"config")
-        config_path = getattr(config_obj,"_name_or_path", None)
-        if config_path != None:
-            config_fullpath = os.path.join(config_path, "config.json")
-            if not os.path.isfile(config_fullpath):
-                config_fullpath = None
-        if config_fullpath is None:
-            config_fullpath = os.path.join(os.path.dirname(file_path), "config.json")
-        if os.path.isfile(config_fullpath):
-            with open(config_fullpath, "r", encoding="utf-8") as reader:
-                text = reader.read()
-            config= json.loads(text)

-
-
-
-
-
-    if verboseLevel >=1:
-        print(f"Saving file '{file_path}")
-    safetensors2.torch_write_file(model.state_dict(), file_path , quantization_map = quantization_map, config = config)
-    if verboseLevel >=1:
-        print(f"File '{file_path}' saved")
+    def tune_preloading(self, model_id, current_budget, towers_names):
+        preloaded_blocks = {}
+        preload_total = 0
+        max_blocks_fetch = 0

+        self.preloaded_blocks_per_model[model_id] = preloaded_blocks

-
-
-
-
-
-
-
-
-
-
-
-
-
+        if current_budget == 0 or towers_names is None or len(towers_names) == 0 or not self.async_transfers:
+            return
+        # current_budget = 5000 * ONE_MB
+        base_size = self.blocks_of_modules_sizes[model_id]
+        current_budget -= base_size
+        if current_budget <= 0:
+            return
+
+        towers = []
+        total_size = 0
+        for tower_name in towers_names:
+            max_floor_size = 0
+            tower_size = 0
+            floors = []
+            prefix = model_id + "/" + tower_name
+            for name, size in self.blocks_of_modules_sizes.items():
+                if name.startswith(prefix):
+                    tower_size += size
+                    floor_no = int( name[len(prefix): ] )
+                    floors.append( (name, floor_no, size))
+                    max_floor_size = max(max_floor_size, size)
+
+            towers.append( (floors, max_floor_size, tower_size) )
+            total_size += tower_size
+            current_budget -= 2 * max_floor_size
+            if current_budget <= 0:
+                return
+
+
+        for floors, max_floor_size, tower_size in towers:
+            tower_budget = tower_size / total_size * current_budget
+            preload_blocks_count = int( tower_budget / max_floor_size)
+            preload_total += preload_blocks_count * max_floor_size
+            max_blocks_fetch = max(max_floor_size, max_blocks_fetch)
+            if preload_blocks_count <= 0:
+                return
+
+            nb_blocks= len(floors)
+            space_between = (nb_blocks - preload_blocks_count) / preload_blocks_count
+            cursor = space_between
+            first_non_preloaded = None
+            prev_non_preloaded = None
+            for block in floors:
+                name, i, size = block
+                if i < cursor:
+                    if prev_non_preloaded == None:
+                        first_non_preloaded = name
+                    else:
+                        self.next_blocks_names[prev_non_preloaded] = name
+                        self.prev_blocks_names[name] = prev_non_preloaded
+                    prev_non_preloaded = name
+                else:
+                    self.next_blocks_names[name] = None
+                    self.prev_blocks_names[name] = None
+                    preloaded_blocks[name[ len(model_id) + 1 : ] ] = size
+                    cursor += 1 + space_between
+
+            if prev_non_preloaded != None and len(towers) == 1 :
+                self.next_blocks_names[prev_non_preloaded] = first_non_preloaded
+                self.prev_blocks_names[first_non_preloaded] = prev_non_preloaded
             else:
-
-        elif isinstance(element, dict):
-            for k, element in element.items():
-                if hasattr(element , "pipeline"):
-                    pipe.update( extract_models(prefix + "/" + k,element ))
+                self.next_blocks_names[prev_non_preloaded] = None

+        self.preloaded_blocks_per_model[model_id] = preloaded_blocks
+
+        if self.verboseLevel >=2:
+            print(f"Async loading plan for model '{model_id}' : {preload_total/ONE_MB:0.2f} MB will be preloaded ({preload_total/total_size*100:0.1f}% of recurrent layers data) with a {max_blocks_fetch/ONE_MB:0.2f} MB async shuttle")

-    return pipe
-

 def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, verboseLevel = -1):
     """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
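The core of `tune_preloading` is a spacing rule: given a tower of `nb_blocks` floors and a VRAM budget worth `preload_blocks_count` floors, it keeps evenly spaced floors resident in VRAM and chains the remaining ones into prev/next pairs for async shuttling. A standalone sketch of just that spacing rule (illustrative function name, mirroring the cursor arithmetic above):

```python
def plan_preloading(nb_blocks: int, preload_blocks_count: int):
    """Split block indices into 'resident' (preloaded once, stays in VRAM) and
    'shuttled' (streamed on demand), spreading the resident ones evenly."""
    resident, shuttled = [], []
    space_between = (nb_blocks - preload_blocks_count) / preload_blocks_count
    cursor = space_between
    for i in range(nb_blocks):
        if i < cursor:
            shuttled.append(i)               # will be linked to neighbours for prefetching
        else:
            resident.append(i)               # kept in VRAM for the whole run
            cursor += 1 + space_between
    return resident, shuttled

# e.g. 40 transformer blocks, budget for 10 of them to stay in VRAM
resident, shuttled = plan_preloading(40, 10)
print(resident)   # roughly every 4th block is kept resident
```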
@@ -1382,8 +1478,9 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru

     self.anyCompiledModule = compileAllModels or len(modelsToCompile)>0
     if self.anyCompiledModule:
-        torch._dynamo.config.cache_size_limit = 10000
         torch.compiler.reset()
+        torch._dynamo.config.cache_size_limit = 10000
+        #dynamic=True

     # torch._logging.set_logs(recompiles=True)
     # torch._inductor.config.realize_opcount_threshold = 100 # workaround bug "AssertionError: increase TRITON_MAX_BLOCK['X'] to 4096."
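The reordering above raises the Dynamo recompilation cache limit after resetting the compiler state; both calls are public PyTorch APIs. A minimal sketch of that sequence (the limit value mirrors the diff, the compiled function is illustrative):

```python
import torch

# Drop any previously compiled graphs, then allow a large number of cached
# recompilations (useful when many differently shaped blocks get compiled).
torch.compiler.reset()
torch._dynamo.config.cache_size_limit = 10000

@torch.compile
def f(x):
    return torch.nn.functional.gelu(x) * 2

print(f(torch.randn(4)))
```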
@@ -1463,7 +1560,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
         if compilationInThisOne:
             if self.verboseLevel>=1:
                 if len(towers_modules)>0:
-
+                    formated_tower_names = [name + '*' for name in towers_names]
+                    print(f"Pytorch compilation of '{model_id}' is scheduled for these modules : {formated_tower_names}.")
                 else:
                     print(f"Pytorch compilation of model '{model_id}' is not yet supported.")

@@ -1479,7 +1577,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
             _pin_to_memory(current_model, model_id, partialPinning= partialPinning, verboseLevel=verboseLevel)

         current_budget = model_budgets[model_id]
-        current_size = 0
         cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq = None, None, None, -1
         self.loaded_blocks[model_id] = None

@@ -1489,10 +1586,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
             if not hasattr(submodule, "_hf_hook"):
                 setattr(submodule, "_hf_hook", HfHook())

-            # if submodule_name=='':
-            #     continue
-
-
             if current_budget > 0 and len(submodule_name) > 0:
                 if cur_blocks_prefix != None:
                     if submodule_name.startswith(cur_blocks_prefix):
@@ -1500,7 +1593,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
                         depth_name = submodule_name.split(".")
                         level = depth_name[len(depth_prefix)-1]
                         pre , num = _extract_num_from_str(level)
-                        if num != cur_blocks_seq and (cur_blocks_seq == -1 or current_size > current_budget)
+                        if num != cur_blocks_seq: #and (cur_blocks_seq == -1 or current_size > current_budget)
                             prev_blocks_name = cur_blocks_name
                             cur_blocks_name = cur_blocks_prefix + str(num)
                             # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
@@ -1528,13 +1621,34 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
                 else:
                     self.hook_check_empty_cache_needed(submodule, model_id, cur_blocks_name, submodule_method, context = submodule_name )

-
+            self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name)
+
+        self.tune_preloading(model_id, current_budget, towers_names)


     if self.verboseLevel >=2:
-
-
+        start_num, prev_num, prev_pre, prev_size = -1, -1, None, -1
+
+        def print_size_range(n,start_num,prev_num, prev_size ):
+            if prev_num < 0:
+                print(f"Size of submodel '{n}': {prev_size/ONE_MB:.1f} MB")
+            elif prev_num - start_num <=1:
+                print(f"Size of submodel '{n+ str(start_num)}': {prev_size/ONE_MB:.1f} MB")
+            else:
+                print(f"Size of submodel '{n+ str(start_num) +'-'+ str(prev_num)}': {prev_size/ONE_MB:.1f} MB")
+
+        for n, size in self.blocks_of_modules_sizes.items():
+            pre, num = _extract_num_from_str(n) if "/" in n else (n, -1)
+            if prev_pre == None :
+                start_num = num
+            elif prev_pre != pre or prev_pre == pre and size != prev_size:
+                print_size_range(prev_pre,start_num,prev_num, prev_size )
+                start_num = num
+            prev_num, prev_pre, prev_size = num, pre, size
+        if prev_pre != None:
+            print_size_range(prev_pre,start_num,prev_num, prev_size )

+
     torch.set_default_device('cuda')
     torch.cuda.empty_cache()
     gc.collect()
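The `all()` entry point whose signature appears in these hunks is how the preloading, budgeting and compilation options are normally enabled. A hedged usage sketch using only parameters visible in this diff; the diffusers pipeline, model id and budget value are illustrative:

```python
import torch
from diffusers import FluxPipeline
from mmgp import offload

pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)

# Hook every model of the pipeline: quantize the transformer, keep async transfers on,
# and cap the amount of weights resident in VRAM (budget value is illustrative, 0 = no budget).
offload.all(pipe, quantizeTransformer=True, budgets=3000, asyncTransfers=True, compile=False)

image = pipe("a photo of a cat riding a bicycle", num_inference_steps=20).images[0]
image.save("cat.png")
```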
The ten remaining files listed above with +0 -0 are unchanged.