bigdl-core-cpp 2.5.0b20240527__py3-none-win_amd64.whl → 2.5.0b20240528__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. bigdl/cpp/convert-hf-to-gguf.py +1363 -338
  2. bigdl/cpp/convert.py +199 -52
  3. bigdl/cpp/gguf-py/gguf/__init__.py +2 -0
  4. bigdl/cpp/gguf-py/gguf/constants.py +102 -28
  5. bigdl/cpp/gguf-py/gguf/gguf_reader.py +9 -5
  6. bigdl/cpp/gguf-py/gguf/gguf_writer.py +36 -11
  7. bigdl/cpp/gguf-py/gguf/lazy.py +236 -0
  8. bigdl/cpp/gguf-py/gguf/quants.py +123 -0
  9. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +28 -1
  10. bigdl/cpp/gguf-py/gguf/vocab.py +3 -3
  11. bigdl/cpp/libs/baby-llama.exe +0 -0
  12. bigdl/cpp/libs/batched-bench.exe +0 -0
  13. bigdl/cpp/libs/batched.exe +0 -0
  14. bigdl/cpp/libs/beam-search.exe +0 -0
  15. bigdl/cpp/libs/benchmark.exe +0 -0
  16. bigdl/cpp/libs/common.lib +0 -0
  17. bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
  18. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
  19. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
  20. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
  21. bigdl/cpp/libs/embedding.exe +0 -0
  22. bigdl/cpp/libs/export-lora.exe +0 -0
  23. bigdl/cpp/libs/finetune.exe +0 -0
  24. bigdl/cpp/libs/ggml_shared.dll +0 -0
  25. bigdl/cpp/libs/gguf.exe +0 -0
  26. bigdl/cpp/libs/gritlm.exe +0 -0
  27. bigdl/cpp/libs/imatrix.exe +0 -0
  28. bigdl/cpp/libs/infill.exe +0 -0
  29. bigdl/cpp/libs/llama-bench.exe +0 -0
  30. bigdl/cpp/libs/llama.dll +0 -0
  31. bigdl/cpp/libs/llava-cli.exe +0 -0
  32. bigdl/cpp/libs/llava_shared.dll +0 -0
  33. bigdl/cpp/libs/lookahead.exe +0 -0
  34. bigdl/cpp/libs/lookup.exe +0 -0
  35. bigdl/cpp/libs/ls-sycl-device.exe +0 -0
  36. bigdl/cpp/libs/main.exe +0 -0
  37. bigdl/cpp/libs/ollama.exe +0 -0
  38. bigdl/cpp/libs/parallel.exe +0 -0
  39. bigdl/cpp/libs/passkey.exe +0 -0
  40. bigdl/cpp/libs/perplexity.exe +0 -0
  41. bigdl/cpp/libs/q8dot.exe +0 -0
  42. bigdl/cpp/libs/quantize-stats.exe +0 -0
  43. bigdl/cpp/libs/quantize.exe +0 -0
  44. bigdl/cpp/libs/save-load-state.exe +0 -0
  45. bigdl/cpp/libs/server.exe +0 -0
  46. bigdl/cpp/libs/simple.exe +0 -0
  47. bigdl/cpp/libs/speculative.exe +0 -0
  48. bigdl/cpp/libs/tokenize.exe +0 -0
  49. bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
  50. bigdl/cpp/libs/vdot.exe +0 -0
  51. {bigdl_core_cpp-2.5.0b20240527.dist-info → bigdl_core_cpp-2.5.0b20240528.dist-info}/METADATA +1 -1
  52. bigdl_core_cpp-2.5.0b20240528.dist-info/RECORD +61 -0
  53. bigdl_core_cpp-2.5.0b20240527.dist-info/RECORD +0 -59
  54. {bigdl_core_cpp-2.5.0b20240527.data → bigdl_core_cpp-2.5.0b20240528.data}/scripts/init-llama-cpp.bat +0 -0
  55. {bigdl_core_cpp-2.5.0b20240527.data → bigdl_core_cpp-2.5.0b20240528.data}/scripts/init-llama-cpp.ps1 +0 -0
  56. {bigdl_core_cpp-2.5.0b20240527.data → bigdl_core_cpp-2.5.0b20240528.data}/scripts/init-ollama.bat +0 -0
  57. {bigdl_core_cpp-2.5.0b20240527.dist-info → bigdl_core_cpp-2.5.0b20240528.dist-info}/WHEEL +0 -0
  58. {bigdl_core_cpp-2.5.0b20240527.dist-info → bigdl_core_cpp-2.5.0b20240528.dist-info}/top_level.txt +0 -0
bigdl/cpp/convert.py CHANGED
@@ -24,7 +24,7 @@ from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
+from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
 
 import numpy as np
 from sentencepiece import SentencePieceProcessor
@@ -284,6 +284,7 @@ class Params:
         n_experts = None
         n_experts_used = None
         f_rope_freq_base = None
+        n_ff = None
 
         # hack to determine LLaMA v1 vs v2 vs CodeLlama
         if config.get("moe"):
@@ -308,6 +309,8 @@ class Params:
             n_experts_used = config["moe"]["num_experts_per_tok"]
             f_rope_freq_base = 1e6
 
+        assert n_ff is not None
+
         return Params(
             n_vocab = model["tok_embeddings.weight"].shape[0],
             n_embd = config["dim"],
@@ -341,10 +344,47 @@ class Params:
         return params
 
 
+@dataclass
+class Metadata:
+    name: Optional[str] = None
+    author: Optional[str] = None
+    version: Optional[str] = None
+    url: Optional[str] = None
+    description: Optional[str] = None
+    licence: Optional[str] = None
+    source_url: Optional[str] = None
+    source_hf_repo: Optional[str] = None
+
+    @staticmethod
+    def load(metadata_path: Path) -> Metadata:
+        if metadata_path is None or not metadata_path.exists():
+            return Metadata()
+
+        with open(metadata_path, 'r') as file:
+            data = json.load(file)
+
+        # Create a new Metadata instance
+        metadata = Metadata()
+
+        # Assigning values to Metadata attributes if they exist in the JSON file
+        # This is based on LLM_KV_NAMES mapping in llama.cpp
+        metadata.name = data.get("general.name")
+        metadata.author = data.get("general.author")
+        metadata.version = data.get("general.version")
+        metadata.url = data.get("general.url")
+        metadata.description = data.get("general.description")
+        metadata.license = data.get("general.license")
+        metadata.source_url = data.get("general.source.url")
+        metadata.source_hf_repo = data.get("general.source.huggingface.repository")
+
+        return metadata
+
+
 #
 # vocab
 #
 
+
 @runtime_checkable
 class BaseVocab(Protocol):
     tokenizer_model: ClassVar[str]
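The Metadata.load helper added above reads a flat JSON file whose keys mirror llama.cpp's general.* KV names. A minimal sketch of creating such a file from Python (the metadata.json name and all field values are illustrative, not part of the package):

    import json

    example = {  # keys follow the general.* names read by Metadata.load
        "general.name": "MyModel",
        "general.author": "Example Author",
        "general.version": "v1.0",
        "general.url": "https://example.com/my-model",
        "general.description": "An example model",
        "general.source.url": "https://example.com/my-model-source",
        "general.source.huggingface.repository": "example/my-model",
    }
    with open("metadata.json", "w") as f:
        json.dump(example, f, indent=2)

Passing this path via the new --metadata flag (added to main() later in this diff) lets the converter embed these values into the output GGUF.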
@@ -462,7 +502,8 @@ class SentencePieceVocab(Vocab):
                 # not found in alternate location either
                 raise FileNotFoundError('Cannot find tokenizer.model')
 
-        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        self.sentencepiece_tokenizer = SentencePieceProcessor()
+        self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
         vocab_size = self.sentencepiece_tokenizer.vocab_size()
 
         new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
@@ -482,23 +523,23 @@ class SentencePieceVocab(Vocab):
     def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.sentencepiece_tokenizer
         for i in range(tokenizer.vocab_size()):
-            piece = tokenizer.id_to_piece(i)
+            piece = tokenizer.IdToPiece(i)
             text = piece.encode("utf-8")
-            score: float = tokenizer.get_score(i)
+            score: float = tokenizer.GetScore(i)
 
             toktype = gguf.TokenType.NORMAL
-            if tokenizer.is_unknown(i):
+            if tokenizer.IsUnknown(i):
                 toktype = gguf.TokenType.UNKNOWN
-            if tokenizer.is_control(i):
+            if tokenizer.IsControl(i):
                 toktype = gguf.TokenType.CONTROL
 
             # NOTE: I think added_tokens are user defined.
             # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
             # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
 
-            if tokenizer.is_unused(i):
+            if tokenizer.IsUnused(i):
                 toktype = gguf.TokenType.UNUSED
-            if tokenizer.is_byte(i):
+            if tokenizer.IsByte(i):
                 toktype = gguf.TokenType.BYTE
 
             yield text, score, toktype
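The vocab changes above switch from sentencepiece's snake_case wrappers to the CamelCase methods of the same API. A small sketch of the new call pattern (the tokenizer.model path is a placeholder):

    from sentencepiece import SentencePieceProcessor

    sp = SentencePieceProcessor()
    sp.LoadFromFile("tokenizer.model")   # replaces SentencePieceProcessor("tokenizer.model")
    piece = sp.IdToPiece(0)              # replaces sp.id_to_piece(0)
    score = sp.GetScore(0)               # replaces sp.get_score(0)
    is_ctrl = sp.IsControl(0)            # replaces sp.is_control(0)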
@@ -906,7 +947,7 @@ class LazyUnpickler(pickle.Unpickler):
     def rebuild_from_type_v2(func, new_type, args, state):
         return func(*args)
 
-    CLASSES = {
+    CLASSES: dict[tuple[str, str], type[LazyTensor] | LazyStorageKind] = {
         # getattr used here as a workaround for mypy not being smart enough to determine
         # the staticmethods have a __func__ attribute.
         ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
@@ -1062,21 +1103,42 @@ class OutputFile:
     def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
         self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
 
-    def add_meta_arch(self, params: Params) -> None:
+    def add_meta_model(self, params: Params, metadata: Metadata) -> None:
+        # Metadata About The Model And Its Provenence
         name = "LLaMA"
-
-        # TODO: better logic to determine model name
-        if params.n_ctx == 4096:
-            name = "LLaMA v2"
+        if metadata is not None and metadata.name is not None:
+            name = metadata.name
         elif params.path_model is not None:
-            name = str(params.path_model.parent).split('/')[-1]
-
-        self.gguf.add_name (name)
-        self.gguf.add_vocab_size (params.n_vocab)
-        self.gguf.add_context_length (params.n_ctx)
-        self.gguf.add_embedding_length (params.n_embd)
-        self.gguf.add_block_count (params.n_layer)
-        self.gguf.add_feed_forward_length (params.n_ff)
+            name = params.path_model.name
+        elif params.n_ctx == 4096:
+            # Heuristic detection of LLaMA v2 model
+            name = "LLaMA v2"
+
+        self.gguf.add_name(name)
+
+        if metadata is not None:
+            if metadata.author is not None:
+                self.gguf.add_author(metadata.author)
+            if metadata.version is not None:
+                self.gguf.add_version(metadata.version)
+            if metadata.url is not None:
+                self.gguf.add_url(metadata.url)
+            if metadata.description is not None:
+                self.gguf.add_description(metadata.description)
+            if metadata.licence is not None:
+                self.gguf.add_licence(metadata.licence)
+            if metadata.source_url is not None:
+                self.gguf.add_source_url(metadata.source_url)
+            if metadata.source_hf_repo is not None:
+                self.gguf.add_source_hf_repo(metadata.source_hf_repo)
+
+    def add_meta_arch(self, params: Params) -> None:
+        # Metadata About The Neural Architecture Itself
+        self.gguf.add_vocab_size(params.n_vocab)
+        self.gguf.add_context_length(params.n_ctx)
+        self.gguf.add_embedding_length(params.n_embd)
+        self.gguf.add_block_count(params.n_layer)
+        self.gguf.add_feed_forward_length(params.n_ff)
         self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
         self.gguf.add_head_count (params.n_head)
         self.gguf.add_head_count_kv (params.n_head_kv)
@@ -1179,13 +1241,14 @@ class OutputFile:
     @staticmethod
     def write_vocab_only(
         fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
     ) -> None:
         check_vocab_size(params, vocab, pad_vocab=pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
         # meta data
+        of.add_meta_model(params, metadata)
         of.add_meta_arch(params)
         of.add_meta_vocab(vocab)
         of.add_meta_special_vocab(svocab)
@@ -1212,12 +1275,14 @@ class OutputFile:
         fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
         concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
         pad_vocab: bool = False,
+        metadata: Metadata = None,
     ) -> None:
         check_vocab_size(params, vocab, pad_vocab=pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
         # meta data
+        of.add_meta_model(params, metadata)
         of.add_meta_arch(params)
         if isinstance(vocab, Vocab):
             of.add_meta_vocab(vocab)
@@ -1253,6 +1318,37 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
     raise ValueError(f"Unexpected combination of types: {name_to_type}")
 
 
+def model_parameter_count(model: LazyModel) -> int:
+    total_model_parameters = 0
+    for i, (name, lazy_tensor) in enumerate(model.items()):
+        sum_weights_in_tensor = 1
+        for dim in lazy_tensor.shape:
+            sum_weights_in_tensor *= dim
+        total_model_parameters += sum_weights_in_tensor
+    return total_model_parameters
+
+
+def model_parameter_count_rounded_notation(model_params_count: int) -> str:
+    if model_params_count > 1e12 :
+        # Trillions Of Parameters
+        scaled_model_params = model_params_count * 1e-12
+        scale_suffix = "T"
+    elif model_params_count > 1e9 :
+        # Billions Of Parameters
+        scaled_model_params = model_params_count * 1e-9
+        scale_suffix = "B"
+    elif model_params_count > 1e6 :
+        # Millions Of Parameters
+        scaled_model_params = model_params_count * 1e-6
+        scale_suffix = "M"
+    else:
+        # Thousands Of Parameters
+        scaled_model_params = model_params_count * 1e-3
+        scale_suffix = "K"
+
+    return f"{round(scaled_model_params)}{scale_suffix}"
+
+
 def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
     return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
             for (name, tensor) in model.items()}
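The two helpers added above count weights by multiplying each tensor's dimensions and then render the total with a K/M/B/T suffix. Illustrative values:

    print(model_parameter_count_rounded_notation(6_738_415_616))      # "7B"
    print(model_parameter_count_rounded_notation(46_700_000))         # "47M"
    print(model_parameter_count_rounded_notation(1_100_000_000_000))  # "1T"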
@@ -1432,13 +1528,35 @@ class VocabFactory:
         return vocab, special_vocab
 
 
-def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
-    namestr = {
-        GGMLFileType.AllF32: "f32",
-        GGMLFileType.MostlyF16: "f16",
-        GGMLFileType.MostlyQ8_0: "q8_0",
+def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
+    quantization = {
+        GGMLFileType.AllF32: "F32",
+        GGMLFileType.MostlyF16: "F16",
+        GGMLFileType.MostlyQ8_0: "Q8_0",
     }[file_type]
-    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
+
+    parameters = model_parameter_count_rounded_notation(model_params_count)
+
+    expert_count = ""
+    if params.n_experts is not None:
+        expert_count = f"{params.n_experts}x"
+
+    version = ""
+    if metadata is not None and metadata.version is not None:
+        version = f"-{metadata.version}"
+
+    name = "ggml-model"
+    if metadata is not None and metadata.name is not None:
+        name = metadata.name
+    elif params.path_model is not None:
+        name = params.path_model.name
+
+    return f"{name}{version}-{expert_count}{parameters}-{quantization}"
+
+
+def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
+    default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
+    ret = model_paths[0].parent / f"{default_filename}.gguf"
     if ret in model_paths:
         logger.error(
             f"Error: Default output path ({ret}) would overwrite the input. "
@@ -1476,17 +1594,30 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
     parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
     parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
+    parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file")
+    parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")
 
     args = parser.parse_args(args_in)
 
     if args.verbose:
         logging.basicConfig(level=logging.DEBUG)
-    elif args.dump_single or args.dump:
+    elif args.dump_single or args.dump or args.get_outfile:
         # Avoid printing anything besides the dump output
         logging.basicConfig(level=logging.WARNING)
     else:
         logging.basicConfig(level=logging.INFO)
 
+    metadata = Metadata.load(args.metadata)
+
+    if args.get_outfile:
+        model_plus = load_some_model(args.model)
+        params = Params.load(model_plus)
+        model = convert_model_names(model_plus.model, params, args.skip_unknown)
+        model_params_count = model_parameter_count(model_plus.model)
+        ftype = pick_output_type(model, args.outtype)
+        print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}")  # noqa: NP100
+        return
+
     if args.no_vocab and args.vocab_only:
         raise ValueError("--vocab-only does not make sense with --no-vocab")
 
@@ -1500,6 +1631,9 @@ def main(args_in: list[str] | None = None) -> None:
     else:
         model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
 
+    model_params_count = model_parameter_count(model_plus.model)
+    logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")
+
     if args.dump:
         do_dump_model(model_plus)
         return
@@ -1508,25 +1642,27 @@ def main(args_in: list[str] | None = None) -> None:
     if args.big_endian:
         endianess = gguf.GGUFEndian.BIG
 
-    params = Params.load(model_plus)
-    if params.n_ctx == -1:
-        if args.ctx is None:
-            msg = """\
-                The model doesn't have a context size, and you didn't specify one with --ctx
-                Please specify one with --ctx:
-                 - LLaMA v1: --ctx 2048
-                 - LLaMA v2: --ctx 4096"""
-            parser.error(textwrap.dedent(msg))
-        params.n_ctx = args.ctx
-
-    if args.outtype:
-        params.ftype = {
-            "f32": GGMLFileType.AllF32,
-            "f16": GGMLFileType.MostlyF16,
-            "q8_0": GGMLFileType.MostlyQ8_0,
-        }[args.outtype]
-
-    logger.info(f"params = {params}")
+    params = None
+    if args.pad_vocab or not args.vocab_only:
+        params = Params.load(model_plus)
+        if params.n_ctx == -1:
+            if args.ctx is None:
+                msg = """\
+                    The model doesn't have a context size, and you didn't specify one with --ctx
+                    Please specify one with --ctx:
+                     - LLaMA v1: --ctx 2048
+                     - LLaMA v2: --ctx 4096"""
+                parser.error(textwrap.dedent(msg))
+            params.n_ctx = args.ctx
+
+        if args.outtype:
+            params.ftype = {
+                "f32": GGMLFileType.AllF32,
+                "f16": GGMLFileType.MostlyF16,
+                "q8_0": GGMLFileType.MostlyQ8_0,
+            }[args.outtype]
+
+        logger.info(f"params = {params}")
 
     model_parent_path = model_plus.paths[0].parent
     vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
@@ -1539,8 +1675,19 @@ def main(args_in: list[str] | None = None) -> None:
         if not args.outfile:
             raise ValueError("need --outfile if using --vocab-only")
         outfile = args.outfile
+        if params is None:
+            params = Params(
+                n_vocab = vocab.vocab_size,
+                n_embd = 1,
+                n_layer = 1,
+                n_ctx = 1,
+                n_ff = 1,
+                n_head = 1,
+                n_head_kv = 1,
+                f_norm_eps = 1e-5,
+            )
         OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
-                                    endianess=endianess, pad_vocab=args.pad_vocab)
+                                    endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
         logger.info(f"Wrote {outfile}")
         return
 
@@ -1553,13 +1700,13 @@ def main(args_in: list[str] | None = None) -> None:
     model = convert_model_names(model, params, args.skip_unknown)
     ftype = pick_output_type(model, args.outtype)
     model = convert_to_output_type(model, ftype)
-    outfile = args.outfile or default_outfile(model_plus.paths, ftype)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)
 
     params.ftype = ftype
     logger.info(f"Writing {outfile}, format {ftype}")
 
     OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
-                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
+                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
     logger.info(f"Wrote {outfile}")
 
bigdl/cpp/gguf-py/gguf/__init__.py CHANGED
@@ -1,5 +1,7 @@
 from .constants import *
+from .lazy import *
 from .gguf_reader import *
 from .gguf_writer import *
+from .quants import *
 from .tensor_mapping import *
 from .vocab import *
bigdl/cpp/gguf-py/gguf/constants.py CHANGED
@@ -10,6 +10,7 @@ from typing import Any
 GGUF_MAGIC = 0x46554747  # "GGUF"
 GGUF_VERSION = 3
 GGUF_DEFAULT_ALIGNMENT = 32
+GGML_QUANT_VERSION = 2  # GGML_QNT_VERSION from ggml.h
 
 #
 # metadata keys
@@ -56,12 +57,13 @@ class Keys:
         CAUSAL = "{arch}.attention.causal"
 
     class Rope:
-        DIMENSION_COUNT = "{arch}.rope.dimension_count"
-        FREQ_BASE = "{arch}.rope.freq_base"
-        SCALING_TYPE = "{arch}.rope.scaling.type"
-        SCALING_FACTOR = "{arch}.rope.scaling.factor"
-        SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
-        SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
+        DIMENSION_COUNT = "{arch}.rope.dimension_count"
+        FREQ_BASE = "{arch}.rope.freq_base"
+        SCALING_TYPE = "{arch}.rope.scaling.type"
+        SCALING_FACTOR = "{arch}.rope.scaling.factor"
+        SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"
+        SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
+        SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
 
     class SSM:
         CONV_KERNEL = "{arch}.ssm.conv_kernel"
@@ -114,10 +116,10 @@ class MODEL_ARCH(IntEnum):
     GPTNEOX = auto()
     MPT = auto()
     STARCODER = auto()
-    PERSIMMON = auto()
     REFACT = auto()
     BERT = auto()
     NOMIC_BERT = auto()
+    JINA_BERT_V2 = auto()
     BLOOM = auto()
     STABLELM = auto()
     QWEN = auto()
@@ -137,6 +139,7 @@ class MODEL_ARCH(IntEnum):
     COMMAND_R = auto()
     DBRX = auto()
     OLMO = auto()
+    ARCTIC = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -147,6 +150,8 @@ class MODEL_TENSOR(IntEnum):
     OUTPUT = auto()
     OUTPUT_NORM = auto()
     ROPE_FREQS = auto()
+    ROPE_FACTORS_LONG = auto()
+    ROPE_FACTORS_SHORT = auto()
     ATTN_Q = auto()
     ATTN_K = auto()
     ATTN_V = auto()
@@ -163,6 +168,7 @@ class MODEL_TENSOR(IntEnum):
     FFN_DOWN = auto()
     FFN_UP = auto()
     FFN_ACT = auto()
+    FFN_NORM_EXP = auto()
     FFN_GATE_EXP = auto()
     FFN_DOWN_EXP = auto()
     FFN_UP_EXP = auto()
@@ -191,10 +197,10 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.GPTNEOX: "gptneox",
     MODEL_ARCH.MPT: "mpt",
     MODEL_ARCH.STARCODER: "starcoder",
-    MODEL_ARCH.PERSIMMON: "persimmon",
     MODEL_ARCH.REFACT: "refact",
     MODEL_ARCH.BERT: "bert",
     MODEL_ARCH.NOMIC_BERT: "nomic-bert",
+    MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
     MODEL_ARCH.BLOOM: "bloom",
     MODEL_ARCH.STABLELM: "stablelm",
     MODEL_ARCH.QWEN: "qwen",
@@ -214,6 +220,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.COMMAND_R: "command-r",
     MODEL_ARCH.DBRX: "dbrx",
     MODEL_ARCH.OLMO: "olmo",
+    MODEL_ARCH.ARCTIC: "arctic",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -224,6 +231,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.OUTPUT_NORM: "output_norm",
     MODEL_TENSOR.OUTPUT: "output",
     MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
+    MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
+    MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
     MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
     MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
     MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
@@ -245,6 +254,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
     MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
     MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
+    MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
     MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
     MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
     MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
@@ -380,6 +390,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.LAYER_OUT_NORM,
     ],
+    MODEL_ARCH.JINA_BERT_V2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.TOKEN_TYPES,
+        MODEL_TENSOR.ATTN_OUT_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.LAYER_OUT_NORM,
+    ],
     MODEL_ARCH.MPT: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -407,20 +433,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
-    MODEL_ARCH.PERSIMMON: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_QKV,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-        MODEL_TENSOR.ATTN_Q_NORM,
-        MODEL_TENSOR.ATTN_K_NORM,
-        MODEL_TENSOR.ATTN_ROT_EMBD,
-    ],
     MODEL_ARCH.REFACT: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -724,6 +736,27 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.ARCTIC: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_NORM_EXP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
     # TODO
 }
 
@@ -737,9 +770,6 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
-    MODEL_ARCH.PERSIMMON: [
-        MODEL_TENSOR.ROPE_FREQS,
-    ],
     MODEL_ARCH.QWEN: [
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
@@ -817,6 +847,50 @@ class GGMLQuantizationType(IntEnum):
     I64 = 27
     F64 = 28
     IQ1_M = 29
+    BF16 = 30
+
+
+# TODO: add GGMLFileType from ggml_ftype in ggml.h
+
+
+# from llama_ftype in llama.h
+# ALL VALUES SHOULD BE THE SAME HERE AS THEY ARE OVER THERE.
+class LlamaFileType(IntEnum):
+    ALL_F32 = 0
+    MOSTLY_F16 = 1  # except 1d tensors
+    MOSTLY_Q4_0 = 2  # except 1d tensors
+    MOSTLY_Q4_1 = 3  # except 1d tensors
+    MOSTLY_Q4_1_SOME_F16 = 4  # tok_embeddings.weight and output.weight are F16
+    # MOSTLY_Q4_2 = 5  # support has been removed
+    # MOSTLY_Q4_3 = 6  # support has been removed
+    MOSTLY_Q8_0 = 7  # except 1d tensors
+    MOSTLY_Q5_0 = 8  # except 1d tensors
+    MOSTLY_Q5_1 = 9  # except 1d tensors
+    MOSTLY_Q2_K = 10  # except 1d tensors
+    MOSTLY_Q3_K_S = 11  # except 1d tensors
+    MOSTLY_Q3_K_M = 12  # except 1d tensors
+    MOSTLY_Q3_K_L = 13  # except 1d tensors
+    MOSTLY_Q4_K_S = 14  # except 1d tensors
+    MOSTLY_Q4_K_M = 15  # except 1d tensors
+    MOSTLY_Q5_K_S = 16  # except 1d tensors
+    MOSTLY_Q5_K_M = 17  # except 1d tensors
+    MOSTLY_Q6_K = 18  # except 1d tensors
+    MOSTLY_IQ2_XXS = 19  # except 1d tensors
+    MOSTLY_IQ2_XS = 20  # except 1d tensors
+    MOSTLY_Q2_K_S = 21  # except 1d tensors
+    MOSTLY_IQ3_XS = 22  # except 1d tensors
+    MOSTLY_IQ3_XXS = 23  # except 1d tensors
+    MOSTLY_IQ1_S = 24  # except 1d tensors
+    MOSTLY_IQ4_NL = 25  # except 1d tensors
+    MOSTLY_IQ3_S = 26  # except 1d tensors
+    MOSTLY_IQ3_M = 27  # except 1d tensors
+    MOSTLY_IQ2_S = 28  # except 1d tensors
+    MOSTLY_IQ2_M = 29  # except 1d tensors
+    MOSTLY_IQ4_XS = 30  # except 1d tensors
+    MOSTLY_IQ1_M = 31  # except 1d tensors
+    MOSTLY_BF16 = 32  # except 1d tensors
+
+    GUESSED = 1024  # not specified in the model file
 
 
 class GGUFEndian(IntEnum):
@@ -856,10 +930,9 @@ class GGUFValueType(IntEnum):
         raise ValueError(f"Unknown type: {type(val)}")
 
 
-# Note: Does not support GGML_QKK_64
-QK_K = 256
 # Items here are (block size, type size)
-GGML_QUANT_SIZES = {
+QK_K = 256
+GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
     GGMLQuantizationType.F32: (1, 4),
     GGMLQuantizationType.F16: (1, 2),
     GGMLQuantizationType.Q4_0: (32, 2 + 16),
@@ -888,6 +961,7 @@ GGML_QUANT_SIZES = {
     GGMLQuantizationType.I64: (1, 8),
     GGMLQuantizationType.F64: (1, 8),
     GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
+    GGMLQuantizationType.BF16: (1, 2),
 }
 
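Each GGML_QUANT_SIZES entry pairs a block size (elements per block) with a type size (bytes per block), so a tensor's byte size follows from its element count. A small sketch (the tensor_nbytes helper is illustrative and assumes the gguf package is importable):

    from gguf.constants import GGML_QUANT_SIZES, GGMLQuantizationType

    def tensor_nbytes(n_elements: int, qtype: GGMLQuantizationType) -> int:
        # element count is assumed to be a multiple of the block size
        block_size, type_size = GGML_QUANT_SIZES[qtype]
        return n_elements // block_size * type_size

    print(tensor_nbytes(4096 * 4096, GGMLQuantizationType.BF16))  # 33554432 bytes with the new (1, 2) entry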