bigdl-core-cpp 2.5.0b20240527__py3-none-win_amd64.whl → 2.5.0b20240529__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/convert-hf-to-gguf.py +1363 -338
- bigdl/cpp/convert.py +199 -52
- bigdl/cpp/gguf-py/gguf/__init__.py +2 -0
- bigdl/cpp/gguf-py/gguf/constants.py +102 -28
- bigdl/cpp/gguf-py/gguf/gguf_reader.py +9 -5
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +36 -11
- bigdl/cpp/gguf-py/gguf/lazy.py +236 -0
- bigdl/cpp/gguf-py/gguf/quants.py +123 -0
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +28 -1
- bigdl/cpp/gguf-py/gguf/vocab.py +3 -3
- bigdl/cpp/libs/baby-llama.exe +0 -0
- bigdl/cpp/libs/batched-bench.exe +0 -0
- bigdl/cpp/libs/batched.exe +0 -0
- bigdl/cpp/libs/beam-search.exe +0 -0
- bigdl/cpp/libs/benchmark.exe +0 -0
- bigdl/cpp/libs/common.lib +0 -0
- bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/embedding.exe +0 -0
- bigdl/cpp/libs/export-lora.exe +0 -0
- bigdl/cpp/libs/finetune.exe +0 -0
- bigdl/cpp/libs/ggml_shared.dll +0 -0
- bigdl/cpp/libs/gguf.exe +0 -0
- bigdl/cpp/libs/gritlm.exe +0 -0
- bigdl/cpp/libs/imatrix.exe +0 -0
- bigdl/cpp/libs/infill.exe +0 -0
- bigdl/cpp/libs/llama-bench.exe +0 -0
- bigdl/cpp/libs/llama.dll +0 -0
- bigdl/cpp/libs/llava-cli.exe +0 -0
- bigdl/cpp/libs/llava_shared.dll +0 -0
- bigdl/cpp/libs/lookahead.exe +0 -0
- bigdl/cpp/libs/lookup.exe +0 -0
- bigdl/cpp/libs/ls-sycl-device.exe +0 -0
- bigdl/cpp/libs/main.exe +0 -0
- bigdl/cpp/libs/ollama.exe +0 -0
- bigdl/cpp/libs/parallel.exe +0 -0
- bigdl/cpp/libs/passkey.exe +0 -0
- bigdl/cpp/libs/perplexity.exe +0 -0
- bigdl/cpp/libs/q8dot.exe +0 -0
- bigdl/cpp/libs/quantize-stats.exe +0 -0
- bigdl/cpp/libs/quantize.exe +0 -0
- bigdl/cpp/libs/save-load-state.exe +0 -0
- bigdl/cpp/libs/server.exe +0 -0
- bigdl/cpp/libs/simple.exe +0 -0
- bigdl/cpp/libs/speculative.exe +0 -0
- bigdl/cpp/libs/tokenize.exe +0 -0
- bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
- bigdl/cpp/libs/vdot.exe +0 -0
- {bigdl_core_cpp-2.5.0b20240527.dist-info → bigdl_core_cpp-2.5.0b20240529.dist-info}/METADATA +1 -1
- bigdl_core_cpp-2.5.0b20240529.dist-info/RECORD +61 -0
- bigdl_core_cpp-2.5.0b20240527.dist-info/RECORD +0 -59
- {bigdl_core_cpp-2.5.0b20240527.data → bigdl_core_cpp-2.5.0b20240529.data}/scripts/init-llama-cpp.bat +0 -0
- {bigdl_core_cpp-2.5.0b20240527.data → bigdl_core_cpp-2.5.0b20240529.data}/scripts/init-llama-cpp.ps1 +0 -0
- {bigdl_core_cpp-2.5.0b20240527.data → bigdl_core_cpp-2.5.0b20240529.data}/scripts/init-ollama.bat +0 -0
- {bigdl_core_cpp-2.5.0b20240527.dist-info → bigdl_core_cpp-2.5.0b20240529.dist-info}/WHEEL +0 -0
- {bigdl_core_cpp-2.5.0b20240527.dist-info → bigdl_core_cpp-2.5.0b20240529.dist-info}/top_level.txt +0 -0
bigdl/cpp/convert.py
CHANGED
@@ -24,7 +24,7 @@ from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
+from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
 
 import numpy as np
 from sentencepiece import SentencePieceProcessor
@@ -284,6 +284,7 @@ class Params:
         n_experts = None
         n_experts_used = None
         f_rope_freq_base = None
+        n_ff = None
 
         # hack to determine LLaMA v1 vs v2 vs CodeLlama
         if config.get("moe"):
@@ -308,6 +309,8 @@ class Params:
             n_experts_used = config["moe"]["num_experts_per_tok"]
             f_rope_freq_base = 1e6
 
+        assert n_ff is not None
+
         return Params(
             n_vocab = model["tok_embeddings.weight"].shape[0],
             n_embd = config["dim"],
@@ -341,10 +344,47 @@ class Params:
         return params
 
 
+@dataclass
+class Metadata:
+    name: Optional[str] = None
+    author: Optional[str] = None
+    version: Optional[str] = None
+    url: Optional[str] = None
+    description: Optional[str] = None
+    licence: Optional[str] = None
+    source_url: Optional[str] = None
+    source_hf_repo: Optional[str] = None
+
+    @staticmethod
+    def load(metadata_path: Path) -> Metadata:
+        if metadata_path is None or not metadata_path.exists():
+            return Metadata()
+
+        with open(metadata_path, 'r') as file:
+            data = json.load(file)
+
+        # Create a new Metadata instance
+        metadata = Metadata()
+
+        # Assigning values to Metadata attributes if they exist in the JSON file
+        # This is based on LLM_KV_NAMES mapping in llama.cpp
+        metadata.name = data.get("general.name")
+        metadata.author = data.get("general.author")
+        metadata.version = data.get("general.version")
+        metadata.url = data.get("general.url")
+        metadata.description = data.get("general.description")
+        metadata.license = data.get("general.license")
+        metadata.source_url = data.get("general.source.url")
+        metadata.source_hf_repo = data.get("general.source.huggingface.repository")
+
+        return metadata
+
+
 #
 # vocab
 #
 
+
 @runtime_checkable
 class BaseVocab(Protocol):
     tokenizer_model: ClassVar[str]
@@ -462,7 +502,8 @@ class SentencePieceVocab(Vocab):
                 # not found in alternate location either
                 raise FileNotFoundError('Cannot find tokenizer.model')
 
-        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        self.sentencepiece_tokenizer = SentencePieceProcessor()
+        self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
         vocab_size = self.sentencepiece_tokenizer.vocab_size()
 
         new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
@@ -482,23 +523,23 @@ class SentencePieceVocab(Vocab):
     def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.sentencepiece_tokenizer
         for i in range(tokenizer.vocab_size()):
-            piece = tokenizer.id_to_piece(i)
+            piece = tokenizer.IdToPiece(i)
             text = piece.encode("utf-8")
-            score: float = tokenizer.get_score(i)
+            score: float = tokenizer.GetScore(i)
 
             toktype = gguf.TokenType.NORMAL
-            if tokenizer.is_unknown(i):
+            if tokenizer.IsUnknown(i):
                 toktype = gguf.TokenType.UNKNOWN
-            if tokenizer.is_control(i):
+            if tokenizer.IsControl(i):
                 toktype = gguf.TokenType.CONTROL
 
             # NOTE: I think added_tokens are user defined.
             # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
             # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
 
-            if tokenizer.is_unused(i):
+            if tokenizer.IsUnused(i):
                 toktype = gguf.TokenType.UNUSED
-            if tokenizer.is_byte(i):
+            if tokenizer.IsByte(i):
                 toktype = gguf.TokenType.BYTE
 
             yield text, score, toktype
@@ -906,7 +947,7 @@ class LazyUnpickler(pickle.Unpickler):
     def rebuild_from_type_v2(func, new_type, args, state):
         return func(*args)
 
-    CLASSES = {
+    CLASSES: dict[tuple[str, str], type[LazyTensor] | LazyStorageKind] = {
         # getattr used here as a workaround for mypy not being smart enough to determine
         # the staticmethods have a __func__ attribute.
         ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
@@ -1062,21 +1103,42 @@ class OutputFile:
     def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
         self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
 
-    def add_meta_arch(self, params: Params) -> None:
+    def add_meta_model(self, params: Params, metadata: Metadata) -> None:
+        # Metadata About The Model And Its Provenence
         name = "LLaMA"
-
-        # TODO: better logic to determine model name
-        if params.n_ctx == 4096:
-            name = "LLaMA v2"
+        if metadata is not None and metadata.name is not None:
+            name = metadata.name
         elif params.path_model is not None:
-            name = str(params.path_model.parent).split('/')[-1]
-
-        self.gguf.add_name                (name)
-        self.gguf.add_vocab_size          (params.n_vocab)
-        self.gguf.add_context_length      (params.n_ctx)
-        self.gguf.add_embedding_length    (params.n_embd)
-        self.gguf.add_block_count         (params.n_layer)
-        self.gguf.add_feed_forward_length (params.n_ff)
+            name = params.path_model.name
+        elif params.n_ctx == 4096:
+            # Heuristic detection of LLaMA v2 model
+            name = "LLaMA v2"
+
+        self.gguf.add_name(name)
+
+        if metadata is not None:
+            if metadata.author is not None:
+                self.gguf.add_author(metadata.author)
+            if metadata.version is not None:
+                self.gguf.add_version(metadata.version)
+            if metadata.url is not None:
+                self.gguf.add_url(metadata.url)
+            if metadata.description is not None:
+                self.gguf.add_description(metadata.description)
+            if metadata.licence is not None:
+                self.gguf.add_licence(metadata.licence)
+            if metadata.source_url is not None:
+                self.gguf.add_source_url(metadata.source_url)
+            if metadata.source_hf_repo is not None:
+                self.gguf.add_source_hf_repo(metadata.source_hf_repo)
+
+    def add_meta_arch(self, params: Params) -> None:
+        # Metadata About The Neural Architecture Itself
+        self.gguf.add_vocab_size(params.n_vocab)
+        self.gguf.add_context_length(params.n_ctx)
+        self.gguf.add_embedding_length(params.n_embd)
+        self.gguf.add_block_count(params.n_layer)
+        self.gguf.add_feed_forward_length(params.n_ff)
         self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
         self.gguf.add_head_count          (params.n_head)
         self.gguf.add_head_count_kv       (params.n_head_kv)
@@ -1179,13 +1241,14 @@ class OutputFile:
     @staticmethod
     def write_vocab_only(
         fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
     ) -> None:
         check_vocab_size(params, vocab, pad_vocab=pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
         # meta data
+        of.add_meta_model(params, metadata)
         of.add_meta_arch(params)
         of.add_meta_vocab(vocab)
         of.add_meta_special_vocab(svocab)
@@ -1212,12 +1275,14 @@ class OutputFile:
         fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
         concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
         pad_vocab: bool = False,
+        metadata: Metadata = None,
     ) -> None:
         check_vocab_size(params, vocab, pad_vocab=pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
         # meta data
+        of.add_meta_model(params, metadata)
         of.add_meta_arch(params)
         if isinstance(vocab, Vocab):
             of.add_meta_vocab(vocab)
@@ -1253,6 +1318,37 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
     raise ValueError(f"Unexpected combination of types: {name_to_type}")
 
 
+def model_parameter_count(model: LazyModel) -> int:
+    total_model_parameters = 0
+    for i, (name, lazy_tensor) in enumerate(model.items()):
+        sum_weights_in_tensor = 1
+        for dim in lazy_tensor.shape:
+            sum_weights_in_tensor *= dim
+        total_model_parameters += sum_weights_in_tensor
+    return total_model_parameters
+
+
+def model_parameter_count_rounded_notation(model_params_count: int) -> str:
+    if model_params_count > 1e12 :
+        # Trillions Of Parameters
+        scaled_model_params = model_params_count * 1e-12
+        scale_suffix = "T"
+    elif model_params_count > 1e9 :
+        # Billions Of Parameters
+        scaled_model_params = model_params_count * 1e-9
+        scale_suffix = "B"
+    elif model_params_count > 1e6 :
+        # Millions Of Parameters
+        scaled_model_params = model_params_count * 1e-6
+        scale_suffix = "M"
+    else:
+        # Thousands Of Parameters
+        scaled_model_params = model_params_count * 1e-3
+        scale_suffix = "K"
+
+    return f"{round(scaled_model_params)}{scale_suffix}"
+
+
 def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
     return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
             for (name, tensor) in model.items()}
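A few illustrative values for the rounded notation above; the import path and parameter counts below are assumptions for the sketch, not part of the diff:

    # illustrative only
    from bigdl.cpp.convert import model_parameter_count_rounded_notation

    print(model_parameter_count_rounded_notation(125_239_040))     # "125M"
    print(model_parameter_count_rounded_notation(6_738_415_616))   # "7B"  (6.74e9 rounds to 7)
    print(model_parameter_count_rounded_notation(46_702_792_704))  # "47B"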
@@ -1432,13 +1528,35 @@ class VocabFactory:
         return vocab, special_vocab
 
 
-def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
-    namestr = {
-        GGMLFileType.AllF32:    "f32",
-        GGMLFileType.MostlyF16: "f16",
-        GGMLFileType.MostlyQ8_0:"q8_0",
+def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
+    quantization = {
+        GGMLFileType.AllF32: "F32",
+        GGMLFileType.MostlyF16: "F16",
+        GGMLFileType.MostlyQ8_0: "Q8_0",
     }[file_type]
-    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
+
+    parameters = model_parameter_count_rounded_notation(model_params_count)
+
+    expert_count = ""
+    if params.n_experts is not None:
+        expert_count = f"{params.n_experts}x"
+
+    version = ""
+    if metadata is not None and metadata.version is not None:
+        version = f"-{metadata.version}"
+
+    name = "ggml-model"
+    if metadata is not None and metadata.name is not None:
+        name = metadata.name
+    elif params.path_model is not None:
+        name = params.path_model.name
+
+    return f"{name}{version}-{expert_count}{parameters}-{quantization}"
+
+
+def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
+    default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
+    ret = model_paths[0].parent / f"{default_filename}.gguf"
     if ret in model_paths:
         logger.error(
             f"Error: Default output path ({ret}) would overwrite the input. "
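For illustration (model names and sizes are hypothetical): a model directory named llama-2-7b with roughly 6.7 billion parameters converted at F16 and no metadata file would default to llama-2-7b-7B-F16.gguf, while a metadata file supplying the name MyModel and version v1.0 would yield MyModel-v1.0-7B-F16.gguf; a model with n_experts = 8 quantized to Q8_0 would additionally pick up the expert prefix, e.g. MyModel-v1.0-8x47B-Q8_0.gguf.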
@@ -1476,17 +1594,30 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
     parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
     parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
+    parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file")
+    parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")
 
     args = parser.parse_args(args_in)
 
     if args.verbose:
         logging.basicConfig(level=logging.DEBUG)
-    elif args.dump_single or args.dump:
+    elif args.dump_single or args.dump or args.get_outfile:
         # Avoid printing anything besides the dump output
         logging.basicConfig(level=logging.WARNING)
     else:
         logging.basicConfig(level=logging.INFO)
 
+    metadata = Metadata.load(args.metadata)
+
+    if args.get_outfile:
+        model_plus = load_some_model(args.model)
+        params = Params.load(model_plus)
+        model = convert_model_names(model_plus.model, params, args.skip_unknown)
+        model_params_count = model_parameter_count(model_plus.model)
+        ftype = pick_output_type(model, args.outtype)
+        print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
+        return
+
     if args.no_vocab and args.vocab_only:
         raise ValueError("--vocab-only does not make sense with --no-vocab")
 
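A hypothetical way to exercise the two new flags programmatically; the import path and the model/metadata paths below are assumptions for the sketch, not part of the diff:

    # illustrative only
    from bigdl.cpp.convert import main

    # print the calculated default outfile name without converting
    main(["models/my-model", "--outtype", "f16", "--metadata", "metadata.json", "--get-outfile"])

    # full conversion, with the metadata overrides written into the GGUF
    main(["models/my-model", "--outtype", "f16", "--metadata", "metadata.json"])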
@@ -1500,6 +1631,9 @@ def main(args_in: list[str] | None = None) -> None:
     else:
         model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
 
+    model_params_count = model_parameter_count(model_plus.model)
+    logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")
+
     if args.dump:
         do_dump_model(model_plus)
         return
@@ -1508,25 +1642,27 @@ def main(args_in: list[str] | None = None) -> None:
     if args.big_endian:
         endianess = gguf.GGUFEndian.BIG
 
-    params = Params.load(model_plus)
-    if params.n_ctx == -1:
-        if args.ctx is None:
-            msg = """\
-                The model doesn't have a context size, and you didn't specify one with --ctx
-                Please specify one with --ctx:
-                 - LLaMA v1: --ctx 2048
-                 - LLaMA v2: --ctx 4096"""
-            parser.error(textwrap.dedent(msg))
-        params.n_ctx = args.ctx
-
-    if args.outtype:
-        params.ftype = {
-            "f32": GGMLFileType.AllF32,
-            "f16": GGMLFileType.MostlyF16,
-            "q8_0": GGMLFileType.MostlyQ8_0,
-        }[args.outtype]
-
-    logger.info(f"params = {params}")
+    params = None
+    if args.pad_vocab or not args.vocab_only:
+        params = Params.load(model_plus)
+        if params.n_ctx == -1:
+            if args.ctx is None:
+                msg = """\
+                    The model doesn't have a context size, and you didn't specify one with --ctx
+                    Please specify one with --ctx:
+                     - LLaMA v1: --ctx 2048
+                     - LLaMA v2: --ctx 4096"""
+                parser.error(textwrap.dedent(msg))
+            params.n_ctx = args.ctx
+
+        if args.outtype:
+            params.ftype = {
+                "f32": GGMLFileType.AllF32,
+                "f16": GGMLFileType.MostlyF16,
+                "q8_0": GGMLFileType.MostlyQ8_0,
+            }[args.outtype]
+
+        logger.info(f"params = {params}")
 
     model_parent_path = model_plus.paths[0].parent
     vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
@@ -1539,8 +1675,19 @@ def main(args_in: list[str] | None = None) -> None:
         if not args.outfile:
             raise ValueError("need --outfile if using --vocab-only")
         outfile = args.outfile
+        if params is None:
+            params = Params(
+                n_vocab    = vocab.vocab_size,
+                n_embd     = 1,
+                n_layer    = 1,
+                n_ctx      = 1,
+                n_ff       = 1,
+                n_head     = 1,
+                n_head_kv  = 1,
+                f_norm_eps = 1e-5,
+            )
         OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
-                                    endianess=endianess, pad_vocab=args.pad_vocab)
+                                    endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
         logger.info(f"Wrote {outfile}")
         return
 
@@ -1553,13 +1700,13 @@ def main(args_in: list[str] | None = None) -> None:
     model = convert_model_names(model, params, args.skip_unknown)
     ftype = pick_output_type(model, args.outtype)
     model = convert_to_output_type(model, ftype)
-    outfile = args.outfile or default_outfile(model_plus.paths, ftype)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)
 
     params.ftype = ftype
     logger.info(f"Writing {outfile}, format {ftype}")
 
     OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
-                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
+                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
     logger.info(f"Wrote {outfile}")
 
 
bigdl/cpp/gguf-py/gguf/constants.py
CHANGED
@@ -10,6 +10,7 @@ from typing import Any
 GGUF_MAGIC             = 0x46554747  # "GGUF"
 GGUF_VERSION           = 3
 GGUF_DEFAULT_ALIGNMENT = 32
+GGML_QUANT_VERSION     = 2  # GGML_QNT_VERSION from ggml.h
 
 #
 # metadata keys
@@ -56,12 +57,13 @@ class Keys:
         CAUSAL = "{arch}.attention.causal"
 
     class Rope:
         DIMENSION_COUNT      = "{arch}.rope.dimension_count"
         FREQ_BASE            = "{arch}.rope.freq_base"
         SCALING_TYPE         = "{arch}.rope.scaling.type"
         SCALING_FACTOR       = "{arch}.rope.scaling.factor"
+        SCALING_ATTN_FACTOR  = "{arch}.rope.scaling.attn_factor"
         SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
         SCALING_FINETUNED    = "{arch}.rope.scaling.finetuned"
 
     class SSM:
         CONV_KERNEL = "{arch}.ssm.conv_kernel"
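The Keys entries are format-string templates; for illustration (the architecture name is chosen arbitrarily), they are expanded with the concrete model architecture when metadata is written:

    # illustrative only
    key = "{arch}.rope.scaling.attn_factor".format(arch="llama")
    print(key)  # llama.rope.scaling.attn_factor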
@@ -114,10 +116,10 @@ class MODEL_ARCH(IntEnum):
     GPTNEOX = auto()
     MPT = auto()
     STARCODER = auto()
-    PERSIMMON = auto()
     REFACT = auto()
     BERT = auto()
     NOMIC_BERT = auto()
+    JINA_BERT_V2 = auto()
     BLOOM = auto()
     STABLELM = auto()
     QWEN = auto()
@@ -137,6 +139,7 @@ class MODEL_ARCH(IntEnum):
     COMMAND_R = auto()
     DBRX = auto()
     OLMO = auto()
+    ARCTIC = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -147,6 +150,8 @@ class MODEL_TENSOR(IntEnum):
     OUTPUT = auto()
     OUTPUT_NORM = auto()
     ROPE_FREQS = auto()
+    ROPE_FACTORS_LONG = auto()
+    ROPE_FACTORS_SHORT = auto()
     ATTN_Q = auto()
     ATTN_K = auto()
     ATTN_V = auto()
@@ -163,6 +168,7 @@ class MODEL_TENSOR(IntEnum):
     FFN_DOWN = auto()
     FFN_UP = auto()
     FFN_ACT = auto()
+    FFN_NORM_EXP = auto()
     FFN_GATE_EXP = auto()
     FFN_DOWN_EXP = auto()
     FFN_UP_EXP = auto()
@@ -191,10 +197,10 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.GPTNEOX: "gptneox",
     MODEL_ARCH.MPT: "mpt",
     MODEL_ARCH.STARCODER: "starcoder",
-    MODEL_ARCH.PERSIMMON: "persimmon",
     MODEL_ARCH.REFACT: "refact",
     MODEL_ARCH.BERT: "bert",
     MODEL_ARCH.NOMIC_BERT: "nomic-bert",
+    MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
     MODEL_ARCH.BLOOM: "bloom",
     MODEL_ARCH.STABLELM: "stablelm",
     MODEL_ARCH.QWEN: "qwen",
@@ -214,6 +220,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.COMMAND_R: "command-r",
     MODEL_ARCH.DBRX: "dbrx",
     MODEL_ARCH.OLMO: "olmo",
+    MODEL_ARCH.ARCTIC: "arctic",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -224,6 +231,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.OUTPUT_NORM: "output_norm",
     MODEL_TENSOR.OUTPUT: "output",
     MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
+    MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
+    MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
     MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
     MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
     MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
@@ -245,6 +254,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
     MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
     MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
+    MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
     MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
     MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
     MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
@@ -380,6 +390,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.LAYER_OUT_NORM,
     ],
+    MODEL_ARCH.JINA_BERT_V2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.TOKEN_TYPES,
+        MODEL_TENSOR.ATTN_OUT_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.LAYER_OUT_NORM,
+    ],
     MODEL_ARCH.MPT: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -407,20 +433,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
-    MODEL_ARCH.PERSIMMON: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_QKV,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-        MODEL_TENSOR.ATTN_Q_NORM,
-        MODEL_TENSOR.ATTN_K_NORM,
-        MODEL_TENSOR.ATTN_ROT_EMBD,
-    ],
     MODEL_ARCH.REFACT: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -724,6 +736,27 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.ARCTIC: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_NORM_EXP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
     # TODO
 }
 
@@ -737,9 +770,6 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
-    MODEL_ARCH.PERSIMMON: [
-        MODEL_TENSOR.ROPE_FREQS,
-    ],
     MODEL_ARCH.QWEN: [
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
@@ -817,6 +847,50 @@ class GGMLQuantizationType(IntEnum):
     I64     = 27
     F64     = 28
     IQ1_M   = 29
+    BF16    = 30
+
+
+# TODO: add GGMLFileType from ggml_ftype in ggml.h
+
+
+# from llama_ftype in llama.h
+# ALL VALUES SHOULD BE THE SAME HERE AS THEY ARE OVER THERE.
+class LlamaFileType(IntEnum):
+    ALL_F32              = 0
+    MOSTLY_F16           = 1   # except 1d tensors
+    MOSTLY_Q4_0          = 2   # except 1d tensors
+    MOSTLY_Q4_1          = 3   # except 1d tensors
+    MOSTLY_Q4_1_SOME_F16 = 4   # tok_embeddings.weight and output.weight are F16
+    # MOSTLY_Q4_2        = 5   # support has been removed
+    # MOSTLY_Q4_3        = 6   # support has been removed
+    MOSTLY_Q8_0          = 7   # except 1d tensors
+    MOSTLY_Q5_0          = 8   # except 1d tensors
+    MOSTLY_Q5_1          = 9   # except 1d tensors
+    MOSTLY_Q2_K          = 10  # except 1d tensors
+    MOSTLY_Q3_K_S        = 11  # except 1d tensors
+    MOSTLY_Q3_K_M        = 12  # except 1d tensors
+    MOSTLY_Q3_K_L        = 13  # except 1d tensors
+    MOSTLY_Q4_K_S        = 14  # except 1d tensors
+    MOSTLY_Q4_K_M        = 15  # except 1d tensors
+    MOSTLY_Q5_K_S        = 16  # except 1d tensors
+    MOSTLY_Q5_K_M        = 17  # except 1d tensors
+    MOSTLY_Q6_K          = 18  # except 1d tensors
+    MOSTLY_IQ2_XXS       = 19  # except 1d tensors
+    MOSTLY_IQ2_XS        = 20  # except 1d tensors
+    MOSTLY_Q2_K_S        = 21  # except 1d tensors
+    MOSTLY_IQ3_XS        = 22  # except 1d tensors
+    MOSTLY_IQ3_XXS       = 23  # except 1d tensors
+    MOSTLY_IQ1_S         = 24  # except 1d tensors
+    MOSTLY_IQ4_NL        = 25  # except 1d tensors
+    MOSTLY_IQ3_S         = 26  # except 1d tensors
+    MOSTLY_IQ3_M         = 27  # except 1d tensors
+    MOSTLY_IQ2_S         = 28  # except 1d tensors
+    MOSTLY_IQ2_M         = 29  # except 1d tensors
+    MOSTLY_IQ4_XS        = 30  # except 1d tensors
+    MOSTLY_IQ1_M         = 31  # except 1d tensors
+    MOSTLY_BF16          = 32  # except 1d tensors
+
+    GUESSED              = 1024  # not specified in the model file
 
 
 class GGUFEndian(IntEnum):
@@ -856,10 +930,9 @@ class GGUFValueType(IntEnum):
             raise ValueError(f"Unknown type: {type(val)}")
 
 
-# Note: Does not support GGML_QKK_64
-QK_K = 256
 # Items here are (block size, type size)
-GGML_QUANT_SIZES = {
+QK_K = 256
+GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
     GGMLQuantizationType.F32:   (1, 4),
     GGMLQuantizationType.F16:   (1, 2),
     GGMLQuantizationType.Q4_0:  (32, 2 + 16),
@@ -888,6 +961,7 @@ GGML_QUANT_SIZES = {
     GGMLQuantizationType.I64:   (1, 8),
     GGMLQuantizationType.F64:   (1, 8),
     GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
+    GGMLQuantizationType.BF16:  (1, 2),
 }
 
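A small sketch (the helper function and tensor size are hypothetical) of how the (block size, type size) pairs in GGML_QUANT_SIZES translate into on-disk bytes for a tensor:

    # illustrative only
    def quant_bytes(n_elements: int, block_size: int, type_size: int) -> int:
        assert n_elements % block_size == 0, "elements must fill whole blocks"
        return (n_elements // block_size) * type_size

    print(quant_bytes(4096 * 4096, 1, 2))    # BF16 (1, 2): 33554432 bytes
    print(quant_bytes(4096 * 4096, 32, 18))  # Q4_0 (32, 2 + 16): 9437184 bytes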