mmgp 3.1.0__tar.gz → 3.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mmgp might be problematic.
- {mmgp-3.1.0/src/mmgp.egg-info → mmgp-3.1.2}/PKG-INFO +1 -1
- {mmgp-3.1.0 → mmgp-3.1.2}/pyproject.toml +1 -1
- {mmgp-3.1.0 → mmgp-3.1.2}/src/mmgp/offload.py +132 -67
- {mmgp-3.1.0 → mmgp-3.1.2}/src/mmgp/safetensors2.py +7 -6
- {mmgp-3.1.0 → mmgp-3.1.2/src/mmgp.egg-info}/PKG-INFO +1 -1
- {mmgp-3.1.0 → mmgp-3.1.2}/LICENSE.md +0 -0
- {mmgp-3.1.0 → mmgp-3.1.2}/README.md +0 -0
- {mmgp-3.1.0 → mmgp-3.1.2}/setup.cfg +0 -0
- {mmgp-3.1.0 → mmgp-3.1.2}/src/__init__.py +0 -0
- {mmgp-3.1.0 → mmgp-3.1.2}/src/mmgp/__init__.py +0 -0
- {mmgp-3.1.0 → mmgp-3.1.2}/src/mmgp.egg-info/SOURCES.txt +0 -0
- {mmgp-3.1.0 → mmgp-3.1.2}/src/mmgp.egg-info/dependency_links.txt +0 -0
- {mmgp-3.1.0 → mmgp-3.1.2}/src/mmgp.egg-info/requires.txt +0 -0
- {mmgp-3.1.0 → mmgp-3.1.2}/src/mmgp.egg-info/top_level.txt +0 -0
src/mmgp/offload.py

@@ -261,6 +261,29 @@ def _remove_model_wrapper(model):
        return sub_module
    return model

+# def force_load_tensor(t):
+#     c = torch.nn.Parameter(t + 0)
+#     torch.utils.swap_tensors(t, c)
+#     del c
+
+
+# for n,m in model_to_quantize.named_modules():
+#     # do not read quantized weights (detected them directly or behind an adapter)
+#     if isinstance(m, QModuleMixin) or hasattr(m, "base_layer") and isinstance(m.base_layer, QModuleMixin):
+#         if hasattr(m, "bias") and m.bias is not None:
+#             force_load_tensor(m.bias.data)
+#             # m.bias.data = m.bias.data + 0
+#     else:
+#         for n, p in m.named_parameters(recurse = False):
+#             data = getattr(m, n)
+#             force_load_tensor(data)
+#             # setattr(m,n, torch.nn.Parameter(data + 0 ) )
+
+#         for b in m.buffers(recurse = False):
+#             # b.data = b.data + 0
+#             b.data = torch.nn.Buffer(b.data + 0)
+#             force_load_tensor(b.data)
+


def _move_to_pinned_tensor(source_tensor, big_tensor, offset, length):
@@ -290,6 +313,17 @@ def _safetensors_load_file(file_path):

    return sd, metadata

+def _force_load_buffer(p):
+    # To do : check if buffer was persistent and transfer state, or maybe swap keep already this property ?
+    q = torch.nn.Buffer(p + 0)
+    torch.utils.swap_tensors(p, q)
+    del q
+
+def _force_load_parameter(p):
+    q = torch.nn.Parameter(p + 0)
+    torch.utils.swap_tensors(p, q)
+    del q
+
def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_max = 0, verboseLevel = 1):
    if verboseLevel>=1 :
        if partialPinning:
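Note on the two helpers added above: they rely on torch.utils.swap_tensors (available in recent PyTorch, roughly 2.2 and later) to force a lazily loaded, typically mmap-backed, tensor to be materialized in regular RAM while keeping every existing reference to the Parameter or Buffer object valid. A minimal standalone sketch of the same pattern; the helper name and toy module below are illustrative, not part of mmgp:

    import torch

    def force_materialize(p: torch.nn.Parameter) -> None:
        # `p + 0` copies the data out of the lazy / memory-mapped storage into a
        # fresh tensor; swap_tensors then swaps that storage into the original
        # Parameter object in place, so modules still holding `p` keep working.
        fresh = torch.nn.Parameter(p + 0, requires_grad=False)
        torch.utils.swap_tensors(p, fresh)
        del fresh  # the temporary now owns the old (mmap-backed) storage and releases it

    layer = torch.nn.Linear(8, 8).requires_grad_(False)  # toy inference-style module
    force_materialize(layer.weight)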
@@ -302,6 +336,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
    towers_names, _ = _detect_main_towers(model)
    towers_names = [n +"." for n in towers_names]

+
    BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
    current_big_tensor_size = 0
    big_tensor_no = 0
@@ -315,10 +350,10 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
        if partialPinning:
            include = any(k.startswith(pre) for pre in towers_names) if partialPinning else True
        if include:
-            params_list = params_list +
+            params_list = params_list + [ (k + '.' + n, p, False) for n, p in sub_module.named_parameters(recurse=False)] + [ (k + '.' + n, p, True) for n, p in sub_module.named_buffers(recurse=False)]

-
-    for p in params_list:
+
+    for n, p, _ in params_list:
        if isinstance(p, QTensor):
            if p._qtype == qint4:
                if hasattr(p,"_scale_shift"):
@@ -330,10 +365,16 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
        else:
            length = torch.numel(p.data) * p.data.element_size()

+
        if current_big_tensor_size + length > BIG_TENSOR_MAX_SIZE:
            big_tensors_sizes.append(current_big_tensor_size)
            current_big_tensor_size = 0
            big_tensor_no += 1
+
+
+        itemsize = p.data.dtype.itemsize
+        if current_big_tensor_size % itemsize:
+            current_big_tensor_size += itemsize - current_big_tensor_size % itemsize
        tensor_map_indexes.append((big_tensor_no, current_big_tensor_size, length ))
        current_big_tensor_size += length

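The itemsize lines added above pad the running offset so that each tensor starts at a multiple of its dtype's element size inside the pinned "big tensor". A small self-contained sketch of that padding rule (function name and values are illustrative only):

    def align_offset(offset: int, itemsize: int) -> int:
        # Round `offset` up to the next multiple of `itemsize`, mirroring the
        # adjustment made before recording (block_no, offset, length).
        remainder = offset % itemsize
        return offset if remainder == 0 else offset + (itemsize - remainder)

    assert align_offset(10, 4) == 12  # a 4-byte dtype must start on a 4-byte boundary
    assert align_offset(16, 2) == 16  # already aligned offsets stay unchanged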
@@ -362,12 +403,18 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma

    gc.collect()

+
    tensor_no = 0
-
+    # prev_big_tensor = 0
+    for n, p, is_buffer in params_list:
        big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
-
+        # if big_tensor_no != prev_big_tensor:
+        #     gc.collect()
+        #     prev_big_tensor = big_tensor_no
        if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
            current_big_tensor = big_tensors[big_tensor_no]
+            if is_buffer :
+                _force_load_buffer(p) # otherwise potential memory leak
            if isinstance(p, QTensor):
                if p._qtype == qint4:
                    length1 = torch.numel(p._data._data) * p._data._data.element_size()
@@ -395,7 +442,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
    gc.collect()

    if verboseLevel >=1:
-        if total_tensor_bytes
+        if total_tensor_bytes <= total:
            print(f"The whole model was pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
        else:
            print(f"{total/ONE_MB:.2f} MB were pinned to reserved RAM out of {total_tensor_bytes/ONE_MB:.2f} MB")
@@ -536,48 +583,49 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10

    print(f"Quantization of model '{model_id}' started to format '{weights}'")

+    tower_names ,_ = _detect_main_towers(model_to_quantize)
+    tower_names = [ n[:-1] for n in tower_names]
+
    for submodule_name, submodule in model_to_quantize.named_modules():
        if isinstance(submodule, QModuleMixin):
            if verboseLevel>=1:
                print("No quantization to do as model is already quantized")
            return False

-
        if submodule_name=='':
            continue

-
-
-
-        if
-            cur_blocks_prefix
-            flush = True
-        else:
-            #if cur_blocks_prefix != submodule_name[:len(cur_blocks_prefix)]:
-            if not submodule_name.startswith(cur_blocks_prefix):
+        size = compute_submodule_size(submodule)
+        if not any(submodule_name.startswith(pre) for pre in tower_names):
+            flush = False
+            if isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
+                if cur_blocks_prefix == None:
                    cur_blocks_prefix = submodule_name + "."
                    flush = True
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                else:
+                    if not submodule_name.startswith(cur_blocks_prefix):
+                        cur_blocks_prefix = submodule_name + "."
+                        flush = True
+            else:
+                if cur_blocks_prefix is not None:
+                    #if not cur_blocks_prefix == submodule_name[0:len(cur_blocks_prefix)]:
+                    if not submodule_name.startswith(cur_blocks_prefix):
+                        cur_blocks_prefix = None
+                        flush = True
+
+            if flush :
+                if submodule_size <= threshold :
+                    exclude_list += submodule_names
+                    if verboseLevel >=2:
+                        print(f"Excluded size {submodule_size/ONE_MB:.1f} MB: {prev_blocks_prefix} : {submodule_names}")
+                    total_excluded += submodule_size
+
+                submodule_size = 0
+                submodule_names = []
+                prev_blocks_prefix = cur_blocks_prefix
+            submodule_size += size
+            submodule_names.append(submodule_name)
        total_size += size
-        submodule_names.append(submodule_name)

    if submodule_size > 0 and submodule_size <= threshold:
        exclude_list += submodule_names
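The reworked loop above walks the module tree, tracks a current block prefix for ModuleList/Sequential "towers", and flushes any accumulated group whose total size stays at or below threshold into the quantization exclude list. A heavily simplified sketch of that grouping idea, operating on a plain list of (name, size) pairs instead of a real model; names and sizes are made up:

    from itertools import groupby

    def small_groups(named_sizes, threshold):
        # Group entries by their first dotted component (a stand-in for the
        # cur_blocks_prefix tracking above) and collect the names of every group
        # whose accumulated size is at or below the threshold.
        excluded = []
        key = lambda item: item[0].split(".")[0]
        for _, group in groupby(sorted(named_sizes, key=key), key=key):
            group = list(group)
            if sum(size for _, size in group) <= threshold:
                excluded += [name for name, _ in group]
        return excluded

    print(small_groups([("norm.w", 10), ("blocks.0", 900), ("blocks.1", 900)], 100))
    # ['norm.w']  -> tiny leftover modules are excluded, the big tower is quantized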
@@ -593,28 +641,29 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
        exclude_list = None


-
+    quantize(model_to_quantize,weights, exclude= exclude_list)
+    # quantize(model_to_quantize,weights, include= [ "*1.block.attn.to_out*"]) #"
+
+    # for name, m in model_to_quantize.named_modules():
+    #     if exclude_list is None or not any( name == module_name for module_name in exclude_list):
+    #         _quantize_submodule(model_to_quantize, name, m, weights=weights, activations=None, optimizer=None)

-    for name, m in model_to_quantize.named_modules():
-        if exclude_list is None or not any( name == module_name for module_name in exclude_list):
-            _quantize_submodule(model_to_quantize, name, m, weights=weights, activations=None, optimizer=None)

    # force to read non quantized parameters so that their lazy tensors and corresponding mmap are released
    # otherwise we may end up keeping in memory both the quantized and the non quantize model
-    for m in model_to_quantize.
+    for n,m in model_to_quantize.named_modules():
        # do not read quantized weights (detected them directly or behind an adapter)
        if isinstance(m, QModuleMixin) or hasattr(m, "base_layer") and isinstance(m.base_layer, QModuleMixin):
            if hasattr(m, "bias") and m.bias is not None:
-                m.bias
+                _force_load_parameter(m.bias)
        else:
-            for
-
-                setattr(m,n, torch.nn.Parameter(data + 0 ) )
+            for p in m.parameters(recurse = False):
+                _force_load_parameter(p)

            for b in m.buffers(recurse = False):
-                b
+                _force_load_buffer(b)
+

-

    freeze(model_to_quantize)
    torch.cuda.empty_cache()
@@ -631,7 +680,7 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10

    return True

-def load_loras_into_model(model, lora_path, lora_multi = None, verboseLevel = -1):
+def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, verboseLevel = -1,):
    verboseLevel = _compute_verbose_level(verboseLevel)

    if inject_adapter_in_model == None or set_weights_and_activate_adapters == None or get_peft_kwargs == None:
@@ -682,9 +731,6 @@ load_loras_into_model(model, lora_path, lora_multi = None, verboseLevel = -1

    # is_correct_format = all("lora" in key for key in state_dict.keys())

-
-
-
    # check with first key if is not in peft format
    # first_key = next(iter(state_dict.keys()))
    # if "lora_A" not in first_key:
@@ -721,8 +767,27 @@ load_loras_into_model(model, lora_path, lora_multi = None, verboseLevel = -1
            pass
        if verboseLevel >=1:
            print(f"Lora '{path}' was loaded in model '{_get_module_name(model)}'")
-
+    if activate_all_loras:
+        set_weights_and_activate_adapters(model,[ str(i) for i in range(len(lora_multi))], lora_multi)

+def activate_loras(model, lora_nos, lora_multi = None ):
+    if not isinstance(lora_nos, list):
+        lora_nos = [lora_nos]
+    lora_nos = [str(l) for l in lora_nos]
+    if lora_multi is None:
+        lora_multi = [1. for _ in lora_nos]
+    set_weights_and_activate_adapters(model, lora_nos, lora_multi)
+
+
+def move_loras_to_device(model, device="cpu" ):
+    if hasattr( model, "_lora_loadable_modules"):
+        for k in model._lora_loadable_modules:
+            move_loras_to_device(getattr(model,k), device)
+        return
+
+    for k, m in model.named_modules():
+        if ".lora_" in k:
+            m.to(device)

def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, verboseLevel = -1):
    """
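A hedged usage sketch of the LoRA entry points visible in this hunk, assuming the usual `from mmgp import offload` import, an already loaded `model`, and a placeholder LoRA path; exact argument handling may differ from this sketch:

    from mmgp import offload  # offload.py is the module patched by these hunks

    # Load a LoRA but defer activation (new activate_all_loras flag).
    offload.load_loras_into_model(model, "my_lora.safetensors",
                                  activate_all_loras=False, verboseLevel=1)

    # Adapters are registered under stringified indexes ("0", "1", ...):
    # activate the first one with the default multiplier of 1.0.
    offload.activate_loras(model, 0)

    # Park every ".lora_" submodule (or each _lora_loadable_modules entry) on CPU.
    offload.move_loras_to_device(model, "cpu")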
@@ -812,9 +877,6 @@ load_model_data(model, file_path: str, do_quantize = False, quantizationType

    model = _remove_model_wrapper(model)

-    # if pinToMemory and do_quantize:
-    #     raise Exception("Pinning and Quantization can not be used at the same time")
-
    if not (".safetensors" in file_path or ".sft" in file_path):
        if pinToMemory:
            raise Exception("Pinning to memory while loading only supported for safe tensors files")
@@ -933,7 +995,7 @@ class offload:

        for k,p in submodule.named_parameters(recurse=False):
            if isinstance(p, QTensor):
-                blocks_params.append( (submodule, k, p ) )
+                blocks_params.append( (submodule, k, p, False ) )

                if p._qtype == qint4:
                    if hasattr(p,"_scale_shift"):
@@ -947,11 +1009,11 @@ class offload:
                    blocks_params_size += torch.numel(p._scale) * p._scale.element_size()
                    blocks_params_size += torch.numel(p._data) * p._data.element_size()
            else:
-                blocks_params.append( (submodule, k, p
+                blocks_params.append( (submodule, k, p, False) )
                blocks_params_size += torch.numel(p.data) * p.data.element_size()

        for k, p in submodule.named_buffers(recurse=False):
-            blocks_params.append( (submodule, k, p) )
+            blocks_params.append( (submodule, k, p, True) )
            blocks_params_size += p.data.nbytes


@@ -981,9 +1043,12 @@ class offload:
        def cpu_to_gpu(stream_to_use, blocks_params): #, record_for_stream = None
            with torch.cuda.stream(stream_to_use):
                for param in blocks_params:
-                    parent_module, n, p = param
+                    parent_module, n, p, is_buffer = param
                    q = p.to("cuda", non_blocking=True)
-
+                    if is_buffer:
+                        q = torch.nn.Buffer(q)
+                    else:
+                        q = torch.nn.Parameter(q , requires_grad=False)
                    setattr(parent_module, n , q)
                    # if record_for_stream != None:
                    #     if isinstance(p, QTensor):
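Each entry in blocks_params now carries an is_buffer flag so the tensor arriving on the GPU can be rewrapped with the matching container class. A small standalone sketch of that dispatch, assuming torch.nn.Buffer is available (PyTorch 2.4 or later) and a CUDA device; the helper name and toy module are illustrative:

    import torch

    def rewrap_on_device(module: torch.nn.Module, name: str, tensor: torch.Tensor,
                         is_buffer: bool, device: str = "cuda") -> None:
        # Asynchronous copy; it only really overlaps when the source is pinned memory.
        moved = tensor.to(device, non_blocking=True)
        if is_buffer:
            setattr(module, name, torch.nn.Buffer(moved))
        else:
            setattr(module, name, torch.nn.Parameter(moved, requires_grad=False))

    m = torch.nn.BatchNorm1d(4).requires_grad_(False)  # has both parameters and buffers
    if torch.cuda.is_available():
        rewrap_on_device(m, "running_mean", m.running_mean, is_buffer=True)
        rewrap_on_device(m, "weight", m.weight, is_buffer=False)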
@@ -1030,8 +1095,11 @@ class offload:

        blocks_params = self.blocks_of_modules[blocks_name]
        for param in blocks_params:
-            parent_module, n, p = param
-
+            parent_module, n, p, is_buffer = param
+            if is_buffer:
+                q = torch.nn.Buffer(p)
+            else:
+                q = torch.nn.Parameter(p , requires_grad=False)
            setattr(parent_module, n , q)
        # cl.stop()
        # print(f"unload time: {cl.format_time_gap()}")
@@ -1403,19 +1471,16 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
        for model_id in models:
            current_model: torch.nn.Module = models[model_id]
            towers_names, towers_modules = _detect_main_towers(current_model)
-            if self.verboseLevel>=2 and len(towers_names)>0:
-                print(f"Potential iterative blocks found in model '{model_id}':{towers_names}")
            # compile main iterative modules stacks ("towers")
            compilationInThisOne = compileAllModels or model_id in modelsToCompile
            if compilationInThisOne:
                if self.verboseLevel>=1:
                    if len(towers_modules)>0:
-                        print(f"Pytorch compilation of
+                        print(f"Pytorch compilation of '{model_id}' is scheduled for these modules : {towers_names}.")
                    else:
                        print(f"Pytorch compilation of model '{model_id}' is not yet supported.")

                for submodel in towers_modules:
-                    # for submodel in tower:
                    submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode="default" ) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
                    #dynamic=True,

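As this hunk shows, compilation is applied per detected "tower" block rather than to the whole model, by swapping each block's bound forward for a torch.compile wrapper. A minimal sketch of that per-module pattern on a toy stack of blocks; it assumes a working torch.compile/inductor toolchain:

    import torch

    class Block(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.proj = torch.nn.Linear(16, 16)

        def forward(self, x):
            return torch.nn.functional.gelu(self.proj(x))

    blocks = torch.nn.ModuleList(Block() for _ in range(3))  # stand-in for a detected tower
    for block in blocks:
        # Compile only the repeated block; the surrounding Python glue stays eager.
        block.forward = torch.compile(block.forward, backend="inductor", mode="default")

    out = blocks[0](torch.randn(2, 16))  # the first call triggers compilation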
@@ -1559,7 +1624,7 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
        extraModelsToQuantize = default_extraModelsToQuantize
        budgets=default_budgets
        budgets["transformer"] = 400
-        asyncTransfers = False
+        #asyncTransfers = False
        info = "You have chosen the slowest profile that requires at least 24 GB of RAM and 10 GB of VRAM."
    else:
        raise Exception("Unknown profile")
src/mmgp/safetensors2.py

@@ -262,6 +262,7 @@ class SafeTensorFile:

        PAGE_SIZE = mmap.ALLOCATIONGRANULARITY
        MMAP_SIZE = 1024 * 1024 * 1024 # 1GB
+        # MMAP_SIZE = 256 * 1024 * 1024 # 1GB

        # First pass: find optimal aligned map boundaries
        skip_bytes = self._skip_bytes
@@ -322,6 +323,7 @@ class SafeTensorFile:
            current_pos += length

        return sd
+

    def create_tensors_without_mmap(self):
        sd = OrderedDict()
@@ -335,12 +337,11 @@ class SafeTensorFile:
            data_offsets = v["data_offsets"]
            length = data_offsets[1]-data_offsets[0]
            buffer = f.read(length)
-            if
-
-
-
-
-                t = t.view(dtype)
+            if length == 0:
+                t = torch.empty(0, dtype=dtype)
+            elif len(shape) == 0:
+                t = torch.frombuffer(bytearray(buffer), dtype=torch.uint8)
+                t = t.view(dtype)
            else:
                t = torch.frombuffer(bytearray(buffer), dtype=dtype)
                t = torch.reshape(t, shape)
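The rewritten branch above covers the two cases torch.frombuffer cannot handle directly: an empty payload (zero-length buffers are rejected) and a 0-d scalar (read byte-wise and then viewed as the target dtype). A small sketch of the same three-way split, using a made-up little-endian float32 payload; the helper name is illustrative:

    import struct
    import torch

    def tensor_from_bytes(buffer: bytes, dtype: torch.dtype, shape: tuple) -> torch.Tensor:
        if len(buffer) == 0:
            return torch.empty(0, dtype=dtype)           # frombuffer rejects empty buffers
        if len(shape) == 0:
            t = torch.frombuffer(bytearray(buffer), dtype=torch.uint8)
            return t.view(dtype).reshape(shape)          # reinterpret the bytes as one scalar
        # The bytearray copy also avoids frombuffer's non-writable-buffer warning.
        t = torch.frombuffer(bytearray(buffer), dtype=dtype)
        return torch.reshape(t, shape)

    payload = struct.pack("<2f", 1.0, 2.0)               # two float32 values
    print(tensor_from_bytes(payload, torch.float32, (2,)))   # tensor([1., 2.])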