mmgp 3.3.1__py3-none-any.whl → 3.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mmgp/offload.py +57 -64
- mmgp/safetensors2.py +27 -19
- {mmgp-3.3.1.dist-info → mmgp-3.3.3.dist-info}/METADATA +2 -2
- mmgp-3.3.3.dist-info/RECORD +9 -0
- {mmgp-3.3.1.dist-info → mmgp-3.3.3.dist-info}/WHEEL +1 -1
- mmgp-3.3.1.dist-info/RECORD +0 -9
- {mmgp-3.3.1.dist-info → mmgp-3.3.3.dist-info}/licenses/LICENSE.md +0 -0
- {mmgp-3.3.1.dist-info → mmgp-3.3.3.dist-info}/top_level.txt +0 -0
mmgp/offload.py CHANGED

@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.3.1 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.3.3 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several

@@ -258,11 +258,11 @@ def _move_to_pinned_tensor(source_tensor, big_tensor, offset, length):
     assert t.is_pinned()
     return t

-def _safetensors_load_file(file_path):
+def _safetensors_load_file(file_path, writable_tensors = True):
     from collections import OrderedDict
     sd = OrderedDict()

-    with safetensors2.safe_open(file_path, framework="pt", device="cpu") as f:
+    with safetensors2.safe_open(file_path, framework="pt", device="cpu", writable_tensors =writable_tensors) as f:
         for k in f.keys():
             sd[k] = f.get_tensor(k)
     metadata = f.metadata()
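The writable_tensors flag introduced here is threaded through every safetensors load path in this release. A minimal usage sketch (the checkpoint name is illustrative, not part of the package):

    from mmgp import safetensors2

    # Open a checkpoint read-only: tensors are served from a read-only mapping
    # (mmap.ACCESS_READ) rather than a copy-on-write one (mmap.ACCESS_COPY).
    with safetensors2.safe_open("model.safetensors", framework="pt", device="cpu",
                                writable_tensors=False) as f:
        tensors = {k: f.get_tensor(k) for k in f.keys()}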
@@ -401,7 +401,7 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE
     return


-def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True,
+def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True, big_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):

     global max_pinnable_bytes, total_pinned_bytes
     if max_pinnable_bytes > 0 and max_pinnable_bytes >= max_pinnable_bytes:

@@ -474,7 +474,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
            length = torch.numel(p.data) * p.data.element_size()

        ref_cache[ref] = (n, length)
-        if current_big_tensor_size + length >
+        if current_big_tensor_size + length > big_tensor_size and current_big_tensor_size !=0 :
            big_tensors_sizes.append(current_big_tensor_size)
            current_big_tensor_size = 0
            big_tensor_no += 1

@@ -498,31 +498,14 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
    big_tensors_sizes.append(current_big_tensor_size)

    big_tensors = []
-    last_big_tensor = 0
    total = 0


    failed_planned_allocation = False

-    for size in big_tensors_sizes:
-        try:
-            # if total > 7000 * ONE_MB:
-            #     raise Exception ("test no more reserved RAM")
-            current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
-            big_tensors.append(current_big_tensor)
-        except:
-            print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
-            max_pinnable_bytes = total + total_pinned_bytes
-            failed_planned_allocation = True
-            break
-
-        last_big_tensor += 1
-        total += size
-
-
    gc.collect()

-
+    last_allocated_big_tensor = -1
    tensor_no = 0
    # prev_big_tensor = 0
    for n, (p, is_buffer) in params_dict.items():

@@ -543,37 +526,47 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
                assert p.data.is_pinned()
                q = None
        else:
+
            big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
-
-
-
-
-
+            if last_allocated_big_tensor < big_tensor_no:
+                last_allocated_big_tensor += 1
+                size = big_tensors_sizes[last_allocated_big_tensor]
+                try:
+                    # if total > 7000 * ONE_MB:
+                    #     raise Exception ("test no more reserved RAM")
+                    current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
+                    big_tensors.append(current_big_tensor)
+                except:
+                    print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
+                    max_pinnable_bytes = total + total_pinned_bytes
+                    failed_planned_allocation = True
+                    break

-
-
-
-
-
-
-
-
-
-
-
-
-                length2 = torch.numel(p._scale) * p._scale.element_size()
-                p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
-                length3 = torch.numel(p._shift) * p._shift.element_size()
-                p._shift = _move_to_pinned_tensor(p._shift, current_big_tensor, offset + length1 + length2, length3)
+                total += size
+
+            current_big_tensor = big_tensors[big_tensor_no]
+            if is_buffer :
+                _force_load_buffer(p) # otherwise potential memory leak
+            if isinstance(p, QTensor):
+                if p._qtype == qint4:
+                    length1 = torch.numel(p._data._data) * p._data._data.element_size()
+                    p._data._data = _move_to_pinned_tensor(p._data._data, current_big_tensor, offset, length1)
+                    if hasattr(p,"_scale_shift"):
+                        length2 = torch.numel(p._scale_shift) * p._scale_shift.element_size()
+                        p._scale_shift = _move_to_pinned_tensor(p._scale_shift, current_big_tensor, offset + length1, length2)
                    else:
-                        length1 = torch.numel(p._data) * p._data.element_size()
-                        p._data = _move_to_pinned_tensor(p._data, current_big_tensor, offset, length1)
                        length2 = torch.numel(p._scale) * p._scale.element_size()
                        p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
+                        length3 = torch.numel(p._shift) * p._shift.element_size()
+                        p._shift = _move_to_pinned_tensor(p._shift, current_big_tensor, offset + length1 + length2, length3)
                else:
-
-                    p.
+                    length1 = torch.numel(p._data) * p._data.element_size()
+                    p._data = _move_to_pinned_tensor(p._data, current_big_tensor, offset, length1)
+                    length2 = torch.numel(p._scale) * p._scale.element_size()
+                    p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
+            else:
+                length = torch.numel(p.data) * p.data.element_size()
+                p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
            tensor_no += 1
            del p
    model._pinned_bytes = total
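The change above replaces the up-front pass that pinned every planned block with a lazy scheme: a "big tensor" block is only reserved once a tensor in the loop actually needs it. A stripped-down sketch of the same idea (standalone code, not mmgp's internal API; the plan data is illustrative and pinned allocations assume a CUDA-capable PyTorch install):

    import torch

    ONE_MB = 1024 * 1024
    # planned block sizes and a per-tensor plan of (block index, offset, length)
    block_sizes = [4 * ONE_MB, 2 * ONE_MB]
    tensor_plan = [(0, 0, ONE_MB), (0, ONE_MB, ONE_MB), (1, 0, ONE_MB)]

    blocks, total, last_allocated = [], 0, -1
    for block_no, offset, length in tensor_plan:
        if last_allocated < block_no:                  # allocate lazily, on first use
            last_allocated += 1
            try:
                blocks.append(torch.empty(block_sizes[last_allocated], dtype=torch.uint8,
                                          pin_memory=True, device="cpu"))  # page-locked RAM
                total += block_sizes[last_allocated]
            except RuntimeError:                       # OS refused more pinned memory
                break
        view = blocks[block_no][offset:offset + length]  # carve a tensor out of the block
    print(f"pinned {total / ONE_MB:.0f} MB in {len(blocks)} blocks")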
@@ -583,9 +576,9 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru

    if verboseLevel >=1:
        if partialPinning or failed_planned_allocation:
-            print(f"The model was partially pinned to reserved RAM: {
+            print(f"The model was partially pinned to reserved RAM: {last_allocated_big_tensor + 1} large blocks spread across {total/ONE_MB:.2f} MB")
        else:
-            print(f"The whole model was pinned to reserved RAM: {
+            print(f"The whole model was pinned to reserved RAM: {last_allocated_big_tensor + 1} large blocks spread across {total/ONE_MB:.2f} MB")

    model._already_pinned = True


@@ -598,7 +591,7 @@ def _welcome():
    if welcome_displayed:
        return
    welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.3.1) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.3.3) by DeepBeepMeep ************{ENDC}{UNBOLD}")

def _extract_num_from_str(num_in_str):
    size = len(num_in_str)

@@ -932,7 +925,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
            continue
        fail = False
        skip = False
-        state_dict = safetensors2.torch_load_file(path)
+        state_dict = safetensors2.torch_load_file(path, writable_tensors= False)



@@ -1151,7 +1144,7 @@ def move_loras_to_device(model, device="cpu" ):
        if ".lora_" in k:
            m.to(device)

-def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, modelClass=None, modelPrefix = None, verboseLevel = -1):
+def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, modelClass=None, modelPrefix = None, writable_tensors = True, verboseLevel = -1):
    """
    quick version of .LoadfromPretrained of the transformers library
    used to build a model and load the corresponding weights (quantized or not)

@@ -1167,7 +1160,7 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat

    verboseLevel = _compute_verbose_level(verboseLevel)

-    with safetensors2.safe_open(model_path) as f:
+    with safetensors2.safe_open(model_path, writable_tensors =writable_tensors) as f:
        metadata = f.metadata()

    if metadata is None:

@@ -1231,13 +1224,13 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat

    model._config = transformer_config

-    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, verboseLevel=verboseLevel )
+    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, writable_tensors =writable_tensors ,verboseLevel=verboseLevel )

    return model



-def load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, verboseLevel = -1):
+def load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, verboseLevel = -1):
    """
    Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
    """

@@ -1275,7 +1268,7 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
        if "module" in state_dict:
            state_dict = state_dict["module"]
        else:
-            state_dict, metadata = _safetensors_load_file(file_path)
+            state_dict, metadata = _safetensors_load_file(file_path, writable_tensors =writable_tensors)

        if metadata is None:
            quantization_map = None
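With writable_tensors now exposed on the public loaders, a caller that only reads weights can opt out of copy-on-write mappings end to end. A hedged usage sketch; the checkpoint path is illustrative and only a subset of the available arguments is shown:

    from mmgp import offload

    # Build a transformers-style model and load its weights straight from a
    # safetensors file; writable_tensors=False keeps the tensors on a read-only mmap.
    text_encoder = offload.fast_load_transformers_model(
        "ckpts/text_encoder.safetensors",
        do_quantize=False,
        pinToMemory=False,
        writable_tensors=False,
    )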
@@ -1470,7 +1463,6 @@ class offload:
    def __init__(self):
        self.active_models = []
        self.active_models_ids = []
-        self.active_subcaches = {}
        self.models = {}
        self.cotenants_map = {
            "text_encoder": ["vae", "text_encoder_2"],

@@ -1732,7 +1724,6 @@ class offload:

        self.active_models = []
        self.active_models_ids = []
-        self.active_subcaches = []
        torch.cuda.empty_cache()
        gc.collect()
        self.last_reserved_mem_check = time.time()

@@ -2051,20 +2042,23 @@ class offload:
        last_offload_obj = None

        self.unload_all()
-        self.
+        self.active_models = None
+        self.default_stream = None
+        self.transfer_stream = None
+        self.parameters_ref = None
        keys= [k for k in self.blocks_of_modules.keys()]
        for k in keys:
            del self.blocks_of_modules[k]

        self.blocks_of_modules = None

-
        for model_id, model in self.models.items():
            move_loras_to_device(model, "cpu")
            if hasattr(model, "_pinned_bytes"):
                total_pinned_bytes -= model._pinned_bytes
            if hasattr(model, "_loras_model_data"):
                unload_loras_from_model(model)
+            model = None

        self.models = None

@@ -2074,7 +2068,7 @@ class offload:



-def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
+def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertFloatToBfloat16 = True, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
    """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
    pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
    quantizeTransformer: set True by default will quantize on the fly the video / image model

@@ -2181,7 +2175,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
        modelPinned = (pinAllModels or model_id in modelsToPin) and not hasattr(current_model,"_already_pinned")

        current_model_size = 0
-
        for n, p in current_model.named_parameters():
            p.requires_grad = False
            if isinstance(p, QTensor):

@@ -2201,7 +2194,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                current_model_size += torch.numel(p._data) * p._data.element_size()

            else:
-                if p.data.dtype == torch.float32:
+                if convertFloatToBfloat16 and p.data.dtype == torch.float32:
                    # convert any left overs float32 weight to bloat16 to divide by 2 the model memory footprint
                    p.data = p.data.to(torch.bfloat16)
                current_model_size += torch.numel(p.data) * p.data.element_size()
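The new convertFloatToBfloat16 flag lets a caller keep genuine float32 weights instead of the default on-the-fly downcast. A hedged usage sketch; pipe stands in for whatever diffusers-style pipeline is being hooked and only a subset of the arguments is shown:

    from mmgp import offload

    # pipe is a pipeline object built elsewhere (illustrative).
    offload.all(
        pipe,
        pinnedMemory=True,             # pin model weights in reserved RAM
        quantizeTransformer=True,      # quantize the main transformer on the fly
        convertFloatToBfloat16=False,  # new in 3.3.3: keep float32 weights as float32
        asyncTransfers=True,
    )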
mmgp/safetensors2.py CHANGED

@@ -1,4 +1,4 @@
-# ------------------ Safetensors2 1.
+# ------------------ Safetensors2 1.1 by DeepBeepMeep (mmgp)------------------
 #
 # This module entirely written in Python is a replacement for the safetensor library which requires much less RAM to load models.
 # It can be conveniently used to keep a low RAM consumption when handling transit data (for instance when quantizing or transferring tensors to reserver RAM)

@@ -16,12 +16,14 @@ import safetensors
 import accelerate
 import os
 from collections import OrderedDict
+import warnings

+warnings.filterwarnings("ignore", ".*The given buffer is not writable, and PyTorch does not support non-writable tensors*")

 _old_torch_load_file = None
 _old_safe_open = None

-
+all_tensors_are_read_only = False

 mmm = {}
 verboseLevel = 1
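The suppressed warning is the one PyTorch emits whenever a tensor is built on top of a buffer that cannot be written to, which is exactly what a read-only mapping produces. A standalone illustration of the behaviour being silenced (not mmgp code, assuming a recent PyTorch):

    import warnings
    import torch

    data = bytes(16)   # an immutable, therefore non-writable, buffer
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        t = torch.frombuffer(data, dtype=torch.uint8)
    print(caught[0].message)   # "The given buffer is not writable, ..."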
@@ -232,7 +234,7 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None, extr
 class SafeTensorFile:
     """Main class for accessing safetensors files that provides memory-efficient access"""

-    def __init__(self, file_path, metadata, catalog, skip_bytes, lazy_loading = True):
+    def __init__(self, file_path, metadata, catalog, skip_bytes, lazy_loading = True, writable_tensors = True):
         self._file_path = file_path
         self._metadata = metadata
         self._catalog = catalog

@@ -241,19 +243,20 @@ class SafeTensorFile:
         self.sd = None
         self.mtracker = None
         self.lazy_loading = lazy_loading
+        self.writable_tensors = writable_tensors

     @classmethod
-    def load_metadata(cls, file_path, lazy_loading = True):
+    def load_metadata(cls, file_path, lazy_loading = True, writable_tensors = True):
         with open(file_path, 'rb') as f:
             catalog, metadata, skip_bytes = _read_safetensors_header(file_path, f)

-        return cls(file_path, metadata, catalog, skip_bytes, lazy_loading)
+        return cls(file_path, metadata, catalog, skip_bytes, lazy_loading, writable_tensors )

-    def init_tensors(self, lazyTensors = True):
+    def init_tensors(self, lazyTensors = True, writable_tensors = True):
         if self.sd is None:
             self.lazy_loading = lazyTensors
             if lazyTensors:
-                self.sd = self.create_tensors_with_mmap()
+                self.sd = self.create_tensors_with_mmap(writable_tensors)
             else:
                 self.sd = self.create_tensors_without_mmap()
         # else:

@@ -263,7 +266,7 @@ class SafeTensorFile:
         return self.sd


-    def create_tensors_with_mmap(self):
+    def create_tensors_with_mmap(self, writable_tensors = True):

         self.mtracker = MmapTracker(self._file_path)
         import mmap

@@ -302,7 +305,7 @@ class SafeTensorFile:
         with open(self._file_path, 'rb') as f:
             i = 0
             for map_start, map_size in maps_info:
-                mm = mmap.mmap(f.fileno(), map_size, offset=map_start, access=mmap.ACCESS_COPY)
+                mm = mmap.mmap(f.fileno(), map_size, offset=map_start, access= mmap.ACCESS_COPY if writable_tensors else mmap.ACCESS_READ)
                 maps.append((mm, map_start, map_size))
                 self.mtracker.register(mm, i, map_start, map_size)
                 i = i+ 1
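ACCESS_COPY gives copy-on-write pages (a write lands in private memory, never in the file), while ACCESS_READ rules out writes entirely and so never needs private pages. A small standalone sketch of the difference, assuming some existing non-empty file weights.bin (illustrative name):

    import mmap

    with open("weights.bin", "rb") as f:
        # Copy-on-write: writable view, modifications stay private to this process.
        cow = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_COPY)
        cow[0:1] = b"\x00"     # fine: the touched page is copied privately

        # Read-only: writing raises TypeError and no private pages are ever created.
        ro = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        # ro[0:1] = b"\x00"    # would raise: mmap can't modify a readonly memory map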
@@ -359,7 +362,7 @@ class SafeTensorFile:
     def get_tensor(self, name: str) -> torch.tensor:
         """Get a tensor by name"""
         # To do : switch to a JIT tensor creation per tensor
-        self.init_tensors()
+        self.init_tensors(self.lazy_loading, writable_tensors= self.writable_tensors)
         return self.sd[name]

     def keys(self) -> List[str]:

@@ -374,7 +377,7 @@ class SafeTensorFile:

     def tensors(self) -> Dict[str, torch.tensor]:
         """Get dictionary of all tensors"""
-        self.init_tensors(self.lazy_loading)
+        self.init_tensors(self.lazy_loading, writable_tensors= self.writable_tensors)
         return self.sd

     def metadata(self) -> Optional[Dict[str, str]]:

@@ -383,7 +386,7 @@ class SafeTensorFile:

     def __len__(self) -> int:
         """Get number of tensors"""
-        self.init_tensors(self.lazy_loading)
+        self.init_tensors(self.lazy_loading, writable_tensors= self.writable_tensors)
         return len(self.keys())

     def __contains__(self, key: str) -> bool:

@@ -401,17 +404,22 @@ class SafeTensorFile:
 class _SafeTensorLoader:
     """Context manager for loading SafeTensorFile"""

-    def __init__(self, filename: str ):
+    def __init__(self, filename: str, writable_tensors = True ):
         self.filename = Path(filename)
+        self.writable_tensors = writable_tensors
         self.sft = None
         if not self.filename.exists():
             raise FileNotFoundError(f"File not found: {filename}")

     def __enter__(self) -> SafeTensorFile:
         """Open file and return SafeTensorFile instance"""
-
+        writable_tensors = self.writable_tensors
+
+        if all_tensors_are_read_only:
+            writable_tensors = False
+
         try:
-            self.sft = SafeTensorFile.load_metadata(self.filename)
+            self.sft = SafeTensorFile.load_metadata(self.filename, writable_tensors= writable_tensors)
             return self.sft

         except Exception as e:

@@ -428,14 +436,14 @@ class _SafeTensorLoader:
             pass


-def safe_open(filename: str, framework: str = "pt",device = "cpu") -> _SafeTensorLoader:
+def safe_open(filename: str, framework: str = "pt",device = "cpu", writable_tensors = True) -> _SafeTensorLoader:
     if device != "cpu" or framework !="pt":
         return _old_safe_open(filename =filename, framework=framework, device=device)
-    return _SafeTensorLoader(filename)
+    return _SafeTensorLoader(filename, writable_tensors = writable_tensors)

-def torch_load_file( filename, device = 'cpu' ) -> Dict[str, torch.Tensor]:
+def torch_load_file( filename, device = 'cpu', writable_tensors = True) -> Dict[str, torch.Tensor]:
     sd = {}
-    with safe_open(filename, framework="pt", device = device ) as f:
+    with safe_open(filename, framework="pt", device = device, writable_tensors =writable_tensors ) as f:
         for k in f.keys():
             sd[k] = f.get_tensor(k)
     return sd
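Besides the per-call flag, the new module-level all_tensors_are_read_only switch forces every subsequent safe_open into read-only mode. A hedged sketch of both entry points (the file name is illustrative):

    from mmgp import safetensors2

    # Per-call: load a LoRA state dict without copy-on-write mappings.
    lora_sd = safetensors2.torch_load_file("lora.safetensors", writable_tensors=False)

    # Global: make every later safe_open behave as if writable_tensors=False.
    safetensors2.all_tensors_are_read_only = True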
{mmgp-3.3.1.dist-info → mmgp-3.3.3.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.3.1
+Version: 3.3.3
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE

@@ -17,7 +17,7 @@ Dynamic: license-file


 <p align="center">
-<H2>Memory Management 3.3.1 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.3.3 for the GPU Poor by DeepBeepMeep</H2>
 </p>


mmgp-3.3.3.dist-info/RECORD ADDED

@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=R0UbOXEGAFKd_6090o8v5CkVmJiWmHDQsww7A3-LZEU,106550
+mmgp/safetensors2.py,sha256=rmUbBmK3Dra5prUTTRSVi6-XUFAa9Mj6B5CNPgzt9To,17333
+mmgp-3.3.3.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+mmgp-3.3.3.dist-info/METADATA,sha256=xcODp7uhIfvy7Il1xEp8ed2VYmH1Eln-EnLy3MM4VGM,16153
+mmgp-3.3.3.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
+mmgp-3.3.3.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.3.3.dist-info/RECORD,,

mmgp-3.3.1.dist-info/RECORD DELETED

@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=hG-gMFeHsRkjaPan_lwiTsQOctkXylJMiWhyL3KvGQA,106337
-mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
-mmgp-3.3.1.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.3.1.dist-info/METADATA,sha256=SF0kLwi8zGHF1F53ZxFDZq5bDCWE39l-A24tYeyyhHo,16153
-mmgp-3.3.1.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
-mmgp-3.3.1.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.3.1.dist-info/RECORD,,

{mmgp-3.3.1.dist-info → mmgp-3.3.3.dist-info}/licenses/LICENSE.md: file without changes
{mmgp-3.3.1.dist-info → mmgp-3.3.3.dist-info}/top_level.txt: file without changes