mmgp 3.3.1.tar.gz → 3.3.2.tar.gz
- {mmgp-3.3.1/src/mmgp.egg-info → mmgp-3.3.2}/PKG-INFO +2 -2
- {mmgp-3.3.1 → mmgp-3.3.2}/README.md +1 -1
- {mmgp-3.3.1 → mmgp-3.3.2}/pyproject.toml +1 -1
- {mmgp-3.3.1 → mmgp-3.3.2}/src/mmgp/offload.py +67 -50
- {mmgp-3.3.1 → mmgp-3.3.2}/src/mmgp/safetensors2.py +27 -19
- {mmgp-3.3.1 → mmgp-3.3.2/src/mmgp.egg-info}/PKG-INFO +2 -2
- {mmgp-3.3.1 → mmgp-3.3.2}/LICENSE.md +0 -0
- {mmgp-3.3.1 → mmgp-3.3.2}/setup.cfg +0 -0
- {mmgp-3.3.1 → mmgp-3.3.2}/src/__init__.py +0 -0
- {mmgp-3.3.1 → mmgp-3.3.2}/src/mmgp/__init__.py +0 -0
- {mmgp-3.3.1 → mmgp-3.3.2}/src/mmgp.egg-info/SOURCES.txt +0 -0
- {mmgp-3.3.1 → mmgp-3.3.2}/src/mmgp.egg-info/dependency_links.txt +0 -0
- {mmgp-3.3.1 → mmgp-3.3.2}/src/mmgp.egg-info/requires.txt +0 -0
- {mmgp-3.3.1 → mmgp-3.3.2}/src/mmgp.egg-info/top_level.txt +0 -0
{mmgp-3.3.1/src/mmgp.egg-info → mmgp-3.3.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.3.1
+Version: 3.3.2
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Dynamic: license-file


 <p align="center">
-<H2>Memory Management 3.3.1 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.3.2 for the GPU Poor by DeepBeepMeep</H2>
 </p>

{mmgp-3.3.1 → mmgp-3.3.2}/src/mmgp/offload.py

@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.3.1 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.3.2 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -258,11 +258,11 @@ def _move_to_pinned_tensor(source_tensor, big_tensor, offset, length):
     assert t.is_pinned()
     return t

-def _safetensors_load_file(file_path):
+def _safetensors_load_file(file_path, writable_tensors = True):
     from collections import OrderedDict
     sd = OrderedDict()

-    with safetensors2.safe_open(file_path, framework="pt", device="cpu") as f:
+    with safetensors2.safe_open(file_path, framework="pt", device="cpu", writable_tensors =writable_tensors) as f:
         for k in f.keys():
             sd[k] = f.get_tensor(k)
         metadata = f.metadata()
@@ -504,25 +504,25 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru

     failed_planned_allocation = False

-    for size in big_tensors_sizes:
-        try:
-            # if total > 7000 * ONE_MB:
-            #     raise Exception ("test no more reserved RAM")
-            current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
-            big_tensors.append(current_big_tensor)
-        except:
-            print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
-            max_pinnable_bytes = total + total_pinned_bytes
-            failed_planned_allocation = True
-            break
+    # for size in big_tensors_sizes:
+    #     try:
+    #         # if total > 7000 * ONE_MB:
+    #         #     raise Exception ("test no more reserved RAM")
+    #         current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
+    #         big_tensors.append(current_big_tensor)
+    #     except:
+    #         print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
+    #         max_pinnable_bytes = total + total_pinned_bytes
+    #         failed_planned_allocation = True
+    #         break

-        last_big_tensor += 1
-        total += size
+    #     last_big_tensor += 1
+    #     total += size


     gc.collect()

-
+    last_allocated_big_tensor = -1
     tensor_no = 0
     # prev_big_tensor = 0
     for n, (p, is_buffer) in params_dict.items():
@@ -543,37 +543,54 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
             assert p.data.is_pinned()
             q = None
         else:
+
             big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
+            if last_allocated_big_tensor < big_tensor_no:
+                last_allocated_big_tensor += 1
+                size = big_tensors_sizes[last_allocated_big_tensor]
+                try:
+                    # if total > 7000 * ONE_MB:
+                    #     raise Exception ("test no more reserved RAM")
+                    current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
+                    big_tensors.append(current_big_tensor)
+                except:
+                    print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
+                    max_pinnable_bytes = total + total_pinned_bytes
+                    failed_planned_allocation = True
+                    break
+
+                total += size
+
             # if big_tensor_no != prev_big_tensor:
             #     gc.collect()
             #     prev_big_tensor = big_tensor_no
             # match_param, match_isbuffer = tied_weights.get(n, (None, False))
             # if match_param != None:

-            if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
-
-
-
-
-
-
-
-
-
-
-                    else:
-                        length2 = torch.numel(p._scale) * p._scale.element_size()
-                        p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
-                        length3 = torch.numel(p._shift) * p._shift.element_size()
-                        p._shift = _move_to_pinned_tensor(p._shift, current_big_tensor, offset + length1 + length2, length3)
+            # if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
+            current_big_tensor = big_tensors[big_tensor_no]
+            if is_buffer :
+                _force_load_buffer(p) # otherwise potential memory leak
+            if isinstance(p, QTensor):
+                if p._qtype == qint4:
+                    length1 = torch.numel(p._data._data) * p._data._data.element_size()
+                    p._data._data = _move_to_pinned_tensor(p._data._data, current_big_tensor, offset, length1)
+                    if hasattr(p,"_scale_shift"):
+                        length2 = torch.numel(p._scale_shift) * p._scale_shift.element_size()
+                        p._scale_shift = _move_to_pinned_tensor(p._scale_shift, current_big_tensor, offset + length1, length2)
                     else:
-                        length1 = torch.numel(p._data) * p._data.element_size()
-                        p._data = _move_to_pinned_tensor(p._data, current_big_tensor, offset, length1)
                         length2 = torch.numel(p._scale) * p._scale.element_size()
                         p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
+                        length3 = torch.numel(p._shift) * p._shift.element_size()
+                        p._shift = _move_to_pinned_tensor(p._shift, current_big_tensor, offset + length1 + length2, length3)
                 else:
-
-                    p.
+                    length1 = torch.numel(p._data) * p._data.element_size()
+                    p._data = _move_to_pinned_tensor(p._data, current_big_tensor, offset, length1)
+                    length2 = torch.numel(p._scale) * p._scale.element_size()
+                    p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
+            else:
+                length = torch.numel(p.data) * p.data.element_size()
+                p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
             tensor_no += 1
         del p
     model._pinned_bytes = total
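Note on the hunk above: the pinned "big tensor" buffers are no longer all reserved up front; each buffer is now allocated the first time a parameter mapped to it is reached. A minimal standalone sketch of that lazy-allocation pattern (the names and sizes below are illustrative, not mmgp's own):

    import torch

    ONE_MB = 1024 * 1024
    big_tensors_sizes = [64 * ONE_MB, 32 * ONE_MB]   # hypothetical allocation plan
    big_tensors, total, last_allocated = [], 0, -1

    def pinned_buffer_for(big_tensor_no):
        """Allocate pinned CPU buffers lazily; once pinning fails, stop trying."""
        global total, last_allocated
        while last_allocated < big_tensor_no:
            try:
                buf = torch.empty(big_tensors_sizes[last_allocated + 1], dtype=torch.uint8,
                                  pin_memory=True, device="cpu")
            except RuntimeError:
                return None   # page-locked RAM exhausted; caller keeps the tensor unpinned
            last_allocated += 1
            big_tensors.append(buf)
            total += big_tensors_sizes[last_allocated]
        return big_tensors[big_tensor_no]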
@@ -598,7 +615,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.3.
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.3.3) by DeepBeepMeep ************{ENDC}{UNBOLD}")

 def _extract_num_from_str(num_in_str):
     size = len(num_in_str)
@@ -1151,7 +1168,7 @@ def move_loras_to_device(model, device="cpu" ):
         if ".lora_" in k:
             m.to(device)

-def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, modelClass=None, modelPrefix = None, verboseLevel = -1):
+def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, modelClass=None, modelPrefix = None, writable_tensors = True, verboseLevel = -1):
     """
     quick version of .LoadfromPretrained of the transformers library
     used to build a model and load the corresponding weights (quantized or not)
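fast_load_transformers_model (and load_model_data below) now accept a writable_tensors flag that is forwarded down to safetensors2. A hedged usage sketch; the checkpoint path is hypothetical:

    from mmgp import offload

    # Map the checkpoint read-only; fine as long as the loaded tensors are not
    # modified in place afterwards.
    text_encoder = offload.fast_load_transformers_model(
        "ckpts/text_encoder.safetensors",   # hypothetical path
        do_quantize=False,
        writable_tensors=False,
    )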
@@ -1167,7 +1184,7 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat

     verboseLevel = _compute_verbose_level(verboseLevel)

-    with safetensors2.safe_open(model_path) as f:
+    with safetensors2.safe_open(model_path, writable_tensors =writable_tensors) as f:
         metadata = f.metadata()

     if metadata is None:
@@ -1231,13 +1248,13 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat

     model._config = transformer_config

-    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, verboseLevel=verboseLevel )
+    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, writable_tensors =writable_tensors ,verboseLevel=verboseLevel )

     return model



-def load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, verboseLevel = -1):
+def load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, verboseLevel = -1):
     """
     Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
     """
@@ -1275,7 +1292,7 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
         if "module" in state_dict:
             state_dict = state_dict["module"]
     else:
         state_dict, metadata = _safetensors_load_file(file_path)
-        state_dict, metadata = _safetensors_load_file(file_path)
+        state_dict, metadata = _safetensors_load_file(file_path, writable_tensors =writable_tensors)

     if metadata is None:
         quantization_map = None
@@ -1470,7 +1487,6 @@ class offload:
     def __init__(self):
         self.active_models = []
         self.active_models_ids = []
-        self.active_subcaches = {}
         self.models = {}
         self.cotenants_map = {
             "text_encoder": ["vae", "text_encoder_2"],
@@ -1732,7 +1748,6 @@ class offload:

         self.active_models = []
         self.active_models_ids = []
-        self.active_subcaches = []
         torch.cuda.empty_cache()
         gc.collect()
         self.last_reserved_mem_check = time.time()
@@ -2051,20 +2066,23 @@ class offload:
         last_offload_obj = None

         self.unload_all()
-        self.
+        self.active_models = None
+        self.default_stream = None
+        self.transfer_stream = None
+        self.parameters_ref = None
         keys= [k for k in self.blocks_of_modules.keys()]
         for k in keys:
             del self.blocks_of_modules[k]

         self.blocks_of_modules = None

-
         for model_id, model in self.models.items():
             move_loras_to_device(model, "cpu")
             if hasattr(model, "_pinned_bytes"):
                 total_pinned_bytes -= model._pinned_bytes
             if hasattr(model, "_loras_model_data"):
                 unload_loras_from_model(model)
+            model = None

         self.models = None

@@ -2074,7 +2092,7 @@ class offload:



-def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
+def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertFloatToBfloat16 = True, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
    """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
    pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
    quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -2181,7 +2199,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
        modelPinned = (pinAllModels or model_id in modelsToPin) and not hasattr(current_model,"_already_pinned")

        current_model_size = 0
-
        for n, p in current_model.named_parameters():
            p.requires_grad = False
            if isinstance(p, QTensor):
@@ -2201,7 +2218,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                current_model_size += torch.numel(p._data) * p._data.element_size()

            else:
-                if p.data.dtype == torch.float32:
+                if convertFloatToBfloat16 and p.data.dtype == torch.float32:
                    # convert any left overs float32 weight to bloat16 to divide by 2 the model memory footprint
                    p.data = p.data.to(torch.bfloat16)
                current_model_size += torch.numel(p.data) * p.data.element_size()
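The new convertFloatToBfloat16 argument of all() makes the float32 → bfloat16 downcast shown above optional. A hedged sketch of opting out; pipe stands for an already-built pipeline or dict of modules:

    from mmgp import offload

    pipe = ...  # your pipeline object or a dict of modules, e.g. {"transformer": model}

    # keep any remaining float32 weights in float32 instead of halving them to bfloat16
    offload.all(pipe, pinnedMemory=True, convertFloatToBfloat16=False)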
{mmgp-3.3.1 → mmgp-3.3.2}/src/mmgp/safetensors2.py

@@ -1,4 +1,4 @@
-# ------------------ Safetensors2 1.
+# ------------------ Safetensors2 1.1 by DeepBeepMeep (mmgp)------------------
 #
 # This module entirely written in Python is a replacement for the safetensor library which requires much less RAM to load models.
 # It can be conveniently used to keep a low RAM consumption when handling transit data (for instance when quantizing or transferring tensors to reserver RAM)
@@ -16,12 +16,14 @@ import safetensors
 import accelerate
 import os
 from collections import OrderedDict
+import warnings

+warnings.filterwarnings("ignore", ".*The given buffer is not writable, and PyTorch does not support non-writable tensors*")

 _old_torch_load_file = None
 _old_safe_open = None

-
+all_tensors_are_read_only = False

 mmm = {}
 verboseLevel = 1
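Besides the per-call writable_tensors argument, the new module-level all_tensors_are_read_only flag forces every subsequent safe_open into read-only mode, and the warnings filter silences PyTorch's complaint about wrapping non-writable buffers. A hedged sketch of the global switch; the file name is hypothetical:

    from mmgp import safetensors2

    # Force read-only (mmap.ACCESS_READ) mappings for every loader below;
    # the resulting tensors must then not be written to in place.
    safetensors2.all_tensors_are_read_only = True

    with safetensors2.safe_open("model.safetensors") as f:   # hypothetical file
        first_key = f.keys()[0]
        tensor = f.get_tensor(first_key)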
@@ -232,7 +234,7 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None, extr
 class SafeTensorFile:
     """Main class for accessing safetensors files that provides memory-efficient access"""

-    def __init__(self, file_path, metadata, catalog, skip_bytes, lazy_loading = True):
+    def __init__(self, file_path, metadata, catalog, skip_bytes, lazy_loading = True, writable_tensors = True):
         self._file_path = file_path
         self._metadata = metadata
         self._catalog = catalog
@@ -241,19 +243,20 @@ class SafeTensorFile:
         self.sd = None
         self.mtracker = None
         self.lazy_loading = lazy_loading
+        self.writable_tensors = writable_tensors

     @classmethod
-    def load_metadata(cls, file_path, lazy_loading = True):
+    def load_metadata(cls, file_path, lazy_loading = True, writable_tensors = True):
         with open(file_path, 'rb') as f:
             catalog, metadata, skip_bytes = _read_safetensors_header(file_path, f)

-        return cls(file_path, metadata, catalog, skip_bytes, lazy_loading)
+        return cls(file_path, metadata, catalog, skip_bytes, lazy_loading, writable_tensors )

-    def init_tensors(self, lazyTensors = True):
+    def init_tensors(self, lazyTensors = True, writable_tensors = True):
         if self.sd is None:
             self.lazy_loading = lazyTensors
             if lazyTensors:
-                self.sd = self.create_tensors_with_mmap()
+                self.sd = self.create_tensors_with_mmap(writable_tensors)
             else:
                 self.sd = self.create_tensors_without_mmap()
         # else:
@@ -263,7 +266,7 @@ class SafeTensorFile:
         return self.sd


-    def create_tensors_with_mmap(self):
+    def create_tensors_with_mmap(self, writable_tensors = True):

         self.mtracker = MmapTracker(self._file_path)
         import mmap
@@ -302,7 +305,7 @@ class SafeTensorFile:
         with open(self._file_path, 'rb') as f:
             i = 0
             for map_start, map_size in maps_info:
-                mm = mmap.mmap(f.fileno(), map_size, offset=map_start, access=mmap.ACCESS_COPY)
+                mm = mmap.mmap(f.fileno(), map_size, offset=map_start, access= mmap.ACCESS_COPY if writable_tensors else mmap.ACCESS_READ)
                 maps.append((mm, map_start, map_size))
                 self.mtracker.register(mm, i, map_start, map_size)
                 i = i+ 1
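The functional difference is the mmap access mode: ACCESS_COPY creates a private copy-on-write mapping (in-place writes stay in RAM and never reach the file), while ACCESS_READ shares read-only pages and rejects writes. A standalone illustration, independent of mmgp; the file name is hypothetical and must point at a non-empty file:

    import mmap

    with open("weights.bin", "rb") as f:                           # hypothetical, non-empty file
        cow = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_COPY)    # private, copy-on-write
        ro = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)     # shared, read-only

        cow[0:1] = b"\x00"        # allowed: only the in-memory private copy changes
        try:
            ro[0:1] = b"\x00"     # rejected by the read-only mapping
        except TypeError:
            print("ACCESS_READ mappings cannot be written to")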
@@ -359,7 +362,7 @@ class SafeTensorFile:
     def get_tensor(self, name: str) -> torch.tensor:
         """Get a tensor by name"""
         # To do : switch to a JIT tensor creation per tensor
-        self.init_tensors()
+        self.init_tensors(self.lazy_loading, writable_tensors= self.writable_tensors)
         return self.sd[name]

     def keys(self) -> List[str]:
@@ -374,7 +377,7 @@ class SafeTensorFile:

     def tensors(self) -> Dict[str, torch.tensor]:
         """Get dictionary of all tensors"""
-        self.init_tensors(self.lazy_loading)
+        self.init_tensors(self.lazy_loading, writable_tensors= self.writable_tensors)
         return self.sd

     def metadata(self) -> Optional[Dict[str, str]]:
@@ -383,7 +386,7 @@ class SafeTensorFile:

     def __len__(self) -> int:
         """Get number of tensors"""
-        self.init_tensors(self.lazy_loading)
+        self.init_tensors(self.lazy_loading, writable_tensors= self.writable_tensors)
         return len(self.keys())

     def __contains__(self, key: str) -> bool:
@@ -401,17 +404,22 @@ class SafeTensorFile:
 class _SafeTensorLoader:
     """Context manager for loading SafeTensorFile"""

-    def __init__(self, filename: str ):
+    def __init__(self, filename: str, writable_tensors = True ):
         self.filename = Path(filename)
+        self.writable_tensors = writable_tensors
         self.sft = None
         if not self.filename.exists():
             raise FileNotFoundError(f"File not found: {filename}")

     def __enter__(self) -> SafeTensorFile:
         """Open file and return SafeTensorFile instance"""
-
+        writable_tensors = self.writable_tensors
+
+        if all_tensors_are_read_only:
+            writable_tensors = False
+
         try:
-            self.sft = SafeTensorFile.load_metadata(self.filename)
+            self.sft = SafeTensorFile.load_metadata(self.filename, writable_tensors= writable_tensors)
             return self.sft

         except Exception as e:
@@ -428,14 +436,14 @@ class _SafeTensorLoader:
         pass


-def safe_open(filename: str, framework: str = "pt",device = "cpu") -> _SafeTensorLoader:
+def safe_open(filename: str, framework: str = "pt",device = "cpu", writable_tensors = True) -> _SafeTensorLoader:
     if device != "cpu" or framework !="pt":
         return _old_safe_open(filename =filename, framework=framework, device=device)
-    return _SafeTensorLoader(filename)
+    return _SafeTensorLoader(filename, writable_tensors = writable_tensors)

-def torch_load_file( filename, device = 'cpu' ) -> Dict[str, torch.Tensor]:
+def torch_load_file( filename, device = 'cpu', writable_tensors = True) -> Dict[str, torch.Tensor]:
     sd = {}
-    with safe_open(filename, framework="pt", device = device ) as f:
+    with safe_open(filename, framework="pt", device = device, writable_tensors =writable_tensors ) as f:
         for k in f.keys():
             sd[k] = f.get_tensor(k)
     return sd
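Both public entry points now forward writable_tensors, so a plain state-dict load can also opt into read-only mappings. A hedged sketch; the file name is hypothetical:

    from mmgp import safetensors2

    sd = safetensors2.torch_load_file("model.safetensors", writable_tensors=False)
    print(f"loaded {len(sd)} tensors as read-only views")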
{mmgp-3.3.1 → mmgp-3.3.2/src/mmgp.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.3.1
+Version: 3.3.2
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Dynamic: license-file


 <p align="center">
-<H2>Memory Management 3.3.1 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.3.2 for the GPU Poor by DeepBeepMeep</H2>
 </p>
