bigdl-core-cpp 2.5.0b20240421__py3-none-win_amd64.whl → 2.5.0b20240423__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. bigdl/cpp/convert.py +276 -189
  2. bigdl/cpp/gguf-py/__init__.py +0 -0
  3. bigdl/cpp/gguf-py/gguf/__init__.py +5 -0
  4. bigdl/cpp/gguf-py/gguf/constants.py +943 -0
  5. bigdl/cpp/gguf-py/gguf/gguf.py +15 -0
  6. bigdl/cpp/gguf-py/gguf/gguf_reader.py +279 -0
  7. bigdl/cpp/gguf-py/gguf/gguf_writer.py +518 -0
  8. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +434 -0
  9. bigdl/cpp/gguf-py/gguf/vocab.py +181 -0
  10. bigdl/cpp/libs/baby-llama.exe +0 -0
  11. bigdl/cpp/libs/batched-bench.exe +0 -0
  12. bigdl/cpp/libs/batched.exe +0 -0
  13. bigdl/cpp/libs/beam-search.exe +0 -0
  14. bigdl/cpp/libs/benchmark.exe +0 -0
  15. bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
  16. bigdl/cpp/libs/embedding.exe +0 -0
  17. bigdl/cpp/libs/export-lora.exe +0 -0
  18. bigdl/cpp/libs/finetune.exe +0 -0
  19. bigdl/cpp/libs/gguf.exe +0 -0
  20. bigdl/cpp/libs/gritlm.exe +0 -0
  21. bigdl/cpp/libs/imatrix.exe +0 -0
  22. bigdl/cpp/libs/infill.exe +0 -0
  23. bigdl/cpp/libs/llama-bench.exe +0 -0
  24. bigdl/cpp/libs/llava-cli.exe +0 -0
  25. bigdl/cpp/libs/lookahead.exe +0 -0
  26. bigdl/cpp/libs/lookup.exe +0 -0
  27. bigdl/cpp/libs/ls-sycl-device.exe +0 -0
  28. bigdl/cpp/libs/main.exe +0 -0
  29. bigdl/cpp/libs/ollama.exe +0 -0
  30. bigdl/cpp/libs/parallel.exe +0 -0
  31. bigdl/cpp/libs/passkey.exe +0 -0
  32. bigdl/cpp/libs/perplexity.exe +0 -0
  33. bigdl/cpp/libs/q8dot.exe +0 -0
  34. bigdl/cpp/libs/quantize-stats.exe +0 -0
  35. bigdl/cpp/libs/quantize.exe +0 -0
  36. bigdl/cpp/libs/save-load-state.exe +0 -0
  37. bigdl/cpp/libs/server.exe +0 -0
  38. bigdl/cpp/libs/simple.exe +0 -0
  39. bigdl/cpp/libs/speculative.exe +0 -0
  40. bigdl/cpp/libs/tokenize.exe +0 -0
  41. bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
  42. bigdl/cpp/libs/vdot.exe +0 -0
  43. {bigdl_core_cpp-2.5.0b20240421.data → bigdl_core_cpp-2.5.0b20240423.data}/scripts/init-llama-cpp.bat +1 -0
  44. {bigdl_core_cpp-2.5.0b20240421.dist-info → bigdl_core_cpp-2.5.0b20240423.dist-info}/METADATA +3 -3
  45. bigdl_core_cpp-2.5.0b20240423.dist-info/RECORD +50 -0
  46. bigdl_core_cpp-2.5.0b20240421.dist-info/RECORD +0 -42
  47. {bigdl_core_cpp-2.5.0b20240421.data → bigdl_core_cpp-2.5.0b20240423.data}/scripts/init-llama-cpp.ps1 +0 -0
  48. {bigdl_core_cpp-2.5.0b20240421.data → bigdl_core_cpp-2.5.0b20240423.data}/scripts/init-ollama.bat +0 -0
  49. {bigdl_core_cpp-2.5.0b20240421.dist-info → bigdl_core_cpp-2.5.0b20240423.dist-info}/WHEEL +0 -0
  50. {bigdl_core_cpp-2.5.0b20240421.dist-info → bigdl_core_cpp-2.5.0b20240423.dist-info}/top_level.txt +0 -0
bigdl/cpp/convert.py CHANGED
@@ -1,6 +1,4 @@
  #!/usr/bin/env python3
- # this file is copied from https://github.com/ggerganov/llama.cpp/blob/1e35d619a6fb0b9c5e3dc955345980ff056ddbaf/convert.py
-
  from __future__ import annotations

  import argparse
@@ -18,13 +16,14 @@ import re
  import signal
  import struct
  import sys
+ import textwrap
  import time
  import zipfile
- from abc import ABCMeta, abstractmethod
+ from abc import ABC, abstractmethod
  from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
  from dataclasses import dataclass
  from pathlib import Path
- from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
+ from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable

  import numpy as np
  from sentencepiece import SentencePieceProcessor
@@ -34,7 +33,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
  import gguf

  if TYPE_CHECKING:
- from typing import TypeAlias
+ from typing_extensions import Self, TypeAlias

  if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
  faulthandler.register(signal.SIGUSR1)
@@ -45,6 +44,9 @@ ARCH = gguf.MODEL_ARCH.LLAMA

  DEFAULT_CONCURRENCY = 8

+ ADDED_TOKENS_FILE = 'added_tokens.json'
+ FAST_TOKENIZER_FILE = 'tokenizer.json'
+
  #
  # data types
  #
@@ -137,7 +139,8 @@ class GGMLFileType(enum.IntEnum):
  dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
  if dt is None:
  raise ValueError(self)
- # 1D tensors are always F32.
+ # Convert all 1D tensors to F32. Most of the codebase that takes in 1D tensors only handles F32 tensors, and most of the outputs tensors are F32.
+ # Also The 1d tensors aren't much of a performance/size issue. So instead of having to have separate F32 and F16 implementations of both, just convert everything to F32 for now.
  return dt if len(tensor.shape) > 1 else DT_F32


@@ -190,8 +193,10 @@ class Params:
  n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)

  if n_layer < 1:
- raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
- "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+ msg = """\
+ failed to guess 'n_layer'. This model is unknown or unsupported.
+ Suggestion: provide 'config.json' of the model in the same directory containing model files."""
+ raise KeyError(textwrap.dedent(msg))

  n_head = n_embd // 128 # guessed
  n_mult = 256 # guessed
@@ -213,7 +218,8 @@ class Params:

  @staticmethod
  def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
- config = json.load(open(config_path))
+ with open(config_path) as f:
+ config = json.load(f)

  rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
  rope_scaling = config.get("rope_scaling")
@@ -235,8 +241,10 @@ class Params:
  elif "max_position_embeddings" in config:
  n_ctx = config["max_position_embeddings"]
  else:
- raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
- "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+ msg = """\
+ failed to guess 'n_ctx'. This model is unknown or unsupported.
+ Suggestion: provide 'config.json' of the model in the same directory containing model files."""
+ raise KeyError(textwrap.dedent(msg))

  n_experts = None
  n_experts_used = None
@@ -267,7 +275,8 @@ class Params:
  # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
  @staticmethod
  def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
- config = json.load(open(config_path))
+ with open(config_path) as f:
+ config = json.load(f)

  n_experts = None
  n_experts_used = None
@@ -333,44 +342,86 @@ class Params:
  # vocab
  #

- class BpeVocab:
- def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
- self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
- if isinstance(self.bpe_tokenizer.get('model'), dict):
- self.vocab = self.bpe_tokenizer["model"]["vocab"]
- else:
- self.vocab = self.bpe_tokenizer
- added_tokens: dict[str, int]
- if fname_added_tokens is not None:
- # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
- added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
+ @runtime_checkable
+ class BaseVocab(Protocol):
+ tokenizer_model: ClassVar[str]
+ name: ClassVar[str]
+
+
+ class NoVocab(BaseVocab):
+ tokenizer_model = "no_vocab"
+ name = "no_vocab"
+
+ def __repr__(self) -> str:
+ return "<NoVocab for a model without integrated vocabulary>"
+
+
+ @runtime_checkable
+ class Vocab(BaseVocab, Protocol):
+ vocab_size: int
+ added_tokens_dict: dict[str, int]
+ added_tokens_list: list[str]
+ fname_tokenizer: Path
+
+ def __init__(self, base_path: Path): ...
+ def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
+
+
+ class BpeVocab(Vocab):
+ tokenizer_model = "gpt2"
+ name = "bpe"
+
+ def __init__(self, base_path: Path):
+ added_tokens: dict[str, int] = {}
+
+ if (fname_tokenizer := base_path / 'vocab.json').exists():
+ # "slow" tokenizer
+ with open(fname_tokenizer, encoding="utf-8") as f:
+ self.vocab = json.load(f)
+
+ try:
+ # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
+ with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
+ added_tokens = json.load(f)
+ except FileNotFoundError:
+ pass
  else:
- # Fall back to trying to find the added tokens in tokenizer.json
- tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
- if not tokenizer_json_file.is_file():
- added_tokens = {}
- else:
- tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
- added_tokens = dict(
- (item['content'], item['id'])
- for item in tokenizer_json.get('added_tokens', [])
- # Added tokens here can be duplicates of the main vocabulary.
- if item['content'] not in self.bpe_tokenizer)
-
- vocab_size: int = len(self.vocab)
- expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
- actual_ids = sorted(added_tokens.values())
+ # "fast" tokenizer
+ fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+
+ # if this fails, FileNotFoundError propagates to caller
+ with open(fname_tokenizer, encoding="utf-8") as f:
+ tokenizer_json = json.load(f)
+
+ tokenizer_model: dict[str, Any] = tokenizer_json['model']
+ if (
+ tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
+ or tokenizer_json['decoder']['type'] != 'ByteLevel'
+ ):
+ raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
+
+ self.vocab = tokenizer_model["vocab"]
+
+ if (added := tokenizer_json.get('added_tokens')) is not None:
+ # Added tokens here can be duplicates of the main vocabulary.
+ added_tokens = {item['content']: item['id']
+ for item in added
+ if item['content'] not in self.vocab}
+
+ vocab_size = len(self.vocab)
+ expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+ actual_ids = sorted(added_tokens.values())
  if expected_ids != actual_ids:
  expected_end_id = vocab_size + len(actual_ids) - 1
- raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
+ raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
+ f"{vocab_size} - {expected_end_id}; got {actual_ids}")

  items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
  self.added_tokens_dict = added_tokens
  self.added_tokens_list = [text for (text, idx) in items]
- self.vocab_size_base: int = vocab_size
- self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+ self.vocab_size_base = vocab_size
+ self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
  self.fname_tokenizer = fname_tokenizer
- self.fname_added_tokens = fname_added_tokens

  def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
  reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
@@ -391,16 +442,25 @@ class BpeVocab:
  return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


- class SentencePieceVocab:
- def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
- self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
- added_tokens: dict[str, int]
- if fname_added_tokens is not None:
- added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
- else:
- added_tokens = {}
+ class SentencePieceVocab(Vocab):
+ tokenizer_model = "llama"
+ name = "spm"
+
+ def __init__(self, base_path: Path):
+ added_tokens: dict[str, int] = {}
+ if (fname_tokenizer := base_path / 'tokenizer.model').exists():
+ # normal location
+ try:
+ with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
+ added_tokens = json.load(f)
+ except FileNotFoundError:
+ pass
+ elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
+ # not found in alternate location either
+ raise FileNotFoundError('Cannot find tokenizer.model')

- vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+ self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+ vocab_size = self.sentencepiece_tokenizer.vocab_size()

  new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
  expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
@@ -410,18 +470,17 @@ class SentencePieceVocab:
  raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")

  # Token pieces that were added to the base vocabulary.
- self.added_tokens_dict = added_tokens
+ self.added_tokens_dict = added_tokens
  self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
  self.vocab_size_base = vocab_size
  self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
  self.fname_tokenizer = fname_tokenizer
- self.fname_added_tokens = fname_added_tokens

  def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
  tokenizer = self.sentencepiece_tokenizer
  for i in range(tokenizer.vocab_size()):
  piece = tokenizer.id_to_piece(i)
- text: bytes = piece.encode("utf-8")
+ text = piece.encode("utf-8")
  score: float = tokenizer.get_score(i)

  toktype = gguf.TokenType.NORMAL
@@ -454,24 +513,47 @@ class SentencePieceVocab:
  return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


- class HfVocab:
- def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
+ class LlamaHfVocab(Vocab):
+ tokenizer_model = "llama"
+ name = "hfft"
+
+ def __init__(self, base_path: Path):
+ fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+ # if this fails, FileNotFoundError propagates to caller
+ with open(fname_tokenizer, encoding='utf-8') as f:
+ tokenizer_json = json.load(f)
+
+ # pre-check so we know if we need transformers
+ tokenizer_model: dict[str, Any] = tokenizer_json['model']
+ is_llama3 = (
+ tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
+ and not tokenizer_model.get('byte_fallback', True)
+ )
+ if is_llama3:
+ raise TypeError('Llama 3 must be converted with BpeVocab')
+
+ if not is_llama3 and (
+ tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
+ or tokenizer_json['decoder']['type'] != 'Sequence'
+ ):
+ raise FileNotFoundError('Cannot find Llama BPE tokenizer')
+
  try:
  from transformers import AutoTokenizer
  except ImportError as e:
  raise ImportError(
- "To use HfVocab, please install the `transformers` package. "
+ "To use LlamaHfVocab, please install the `transformers` package. "
  "You can install it with `pip install transformers`."
  ) from e

- print("fname_tokenizer:", fname_tokenizer)
  # Allow the tokenizer to default to slow or fast versions.
  # Explicitly set tokenizer to use local paths.
  self.tokenizer = AutoTokenizer.from_pretrained(
- fname_tokenizer,
- cache_dir=fname_tokenizer,
+ base_path,
+ cache_dir=base_path,
  local_files_only=True,
  )
+ assert self.tokenizer.is_fast # assume tokenizer.json is used

  # Initialize lists and dictionaries for added tokens
  self.added_tokens_list = []
@@ -499,8 +581,7 @@ class HfVocab:
  self.vocab_size_base = self.tokenizer.vocab_size
  self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)

- self.fname_tokenizer = fname_tokenizer
- self.fname_added_tokens = fname_added_tokens
+ self.fname_tokenizer = fname_tokenizer

  def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
  reverse_vocab = {
@@ -552,10 +633,7 @@ class HfVocab:
  yield from self.added_tokens()

  def __repr__(self) -> str:
- return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
- Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
+ return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


  #
@@ -573,17 +651,18 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
  .reshape(weights.shape))


- class Tensor(metaclass=ABCMeta):
+ class Tensor(ABC):
+ ndarray: NDArray
  data_type: DataType

  @abstractmethod
- def astype(self, data_type: DataType) -> Tensor: ...
+ def astype(self, data_type: DataType) -> Self: ...
  @abstractmethod
- def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
+ def permute(self, n_head: int, n_head_kv: int) -> Self: ...
  @abstractmethod
- def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
+ def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ...
  @abstractmethod
- def part(self, n_part: int) -> UnquantizedTensor: ...
+ def part(self, n_part: int) -> Self: ...
  @abstractmethod
  def to_ggml(self) -> GGMLCompatibleTensor: ...

@@ -595,18 +674,18 @@ def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:


  class UnquantizedTensor(Tensor):
- def __init__(self, ndarray: NDArray) -> None:
+ def __init__(self, ndarray: NDArray):
  assert isinstance(ndarray, np.ndarray)
  self.ndarray = ndarray
  self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]

- def astype(self, data_type: DataType) -> Tensor:
+ def astype(self, data_type: DataType) -> UnquantizedTensor:
  dtype = data_type.dtype
  if self.data_type == DT_BF16:
  self.ndarray = bf16_to_fp32(self.ndarray)
  return UnquantizedTensor(self.ndarray.astype(dtype))

- def to_ggml(self) -> UnquantizedTensor:
+ def to_ggml(self) -> Self:
  return self

  def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
@@ -674,7 +753,7 @@ class ModelPlus:
  model: LazyModel
  paths: list[Path] # Where this was read from.
  format: Literal['ggml', 'torch', 'safetensors', 'none']
- vocab: Vocab | None # For GGML models (which have vocab built in), the vocab.
+ vocab: BaseVocab | None # For GGML models (which have vocab built in), the vocab.


  def merge_sharded(models: list[LazyModel]) -> LazyModel:
@@ -683,7 +762,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
  names = {name: None for model in models for name in model}

  def convert(name: str) -> LazyTensor:
- lazy_tensors: list[LazyTensor] = [model[name] for model in models]
+ lazy_tensors = [model[name] for model in models]
  if len(lazy_tensors) == 1:
  # only one file; don't go through this procedure since there might
  # be quantized tensors
@@ -704,7 +783,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:

  def load() -> UnquantizedTensor:
  ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
- concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
+ concatenated = np.concatenate(ndarrays, axis=axis)
  return UnquantizedTensor(concatenated)
  description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
  return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
@@ -756,6 +835,15 @@ def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
  return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)


+ def pack_experts_lazy(lazy_tensors: list[LazyTensor]) -> LazyTensor:
+ def load() -> Tensor:
+ tensors = [lazy_tensor.load() for lazy_tensor in lazy_tensors]
+ return UnquantizedTensor(np.array([tensor.ndarray for tensor in tensors]))
+ s = lazy_tensors[0].shape.copy()
+ s.insert(0, len(lazy_tensors))
+ return LazyTensor(load, s, lazy_tensors[0].data_type, 'pack_experts ' + ' | '.join(lt.description for lt in lazy_tensors))
+
+
  # Functionality that simulates `torch.load` but where individual tensors are
  # only loaded into memory on demand, not all at once.
  # PyTorch can't do this natively as of time of writing:
@@ -792,10 +880,10 @@ class LazyUnpickler(pickle.Unpickler):

  def load(offset: int, elm_count: int) -> NDArray:
  dtype = data_type.dtype
- fp = self.zip_file.open(info)
- fp.seek(offset * dtype.itemsize)
- size = elm_count * dtype.itemsize
- data = fp.read(size)
+ with self.zip_file.open(info) as fp:
+ fp.seek(offset * dtype.itemsize)
+ size = elm_count * dtype.itemsize
+ data = fp.read(size)
  assert len(data) == size
  return np.frombuffer(data, dtype)
  description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
@@ -816,7 +904,7 @@ class LazyUnpickler(pickle.Unpickler):
  def rebuild_from_type_v2(func, new_type, args, state):
  return func(*args)

- CLASSES: dict[tuple[str, str], Any] = {
+ CLASSES = {
  # getattr used here as a workaround for mypy not being smart enough to determine
  # the staticmethods have a __func__ attribute.
  ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
@@ -875,7 +963,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
  def must_read(fp: IO[bytes], length: int) -> bytes:
  ret = fp.read(length)
  if len(ret) < length:
- raise Exception("unexpectedly reached end of file")
+ raise EOFError("unexpectedly reached end of file")
  return ret

@@ -933,12 +1021,15 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
  yield result


- def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
+ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) -> None:
  # Handle special case where the model's vocab size is not set
  if params.n_vocab == -1:
  raise ValueError(
- f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?"
+ "The model's vocab size is set to -1 in params.json. Please update it manually."
+ + (f" Maybe {vocab.vocab_size}?" if isinstance(vocab, Vocab) else ""),
  )
+ if not isinstance(vocab, Vocab):
+ return # model has no vocab

  # Check for a vocab size mismatch
  if params.n_vocab == vocab.vocab_size:
@@ -962,11 +1053,11 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
  if vocab.vocab_size < params.n_vocab:
  msg += " Add the --pad-vocab option and try again."

- raise Exception(msg)
+ raise ValueError(msg)


  class OutputFile:
- def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
+ def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
  self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

  def add_meta_arch(self, params: Params) -> None:
@@ -979,6 +1070,7 @@ class OutputFile:
  name = str(params.path_model.parent).split('/')[-1]

  self.gguf.add_name (name)
+ self.gguf.add_vocab_size (params.n_vocab)
  self.gguf.add_context_length (params.n_ctx)
  self.gguf.add_embedding_length (params.n_embd)
  self.gguf.add_block_count (params.n_layer)
@@ -1015,20 +1107,6 @@ class OutputFile:
  if params.ftype is not None:
  self.gguf.add_file_type(params.ftype)

- def handle_tokenizer_model(self, vocab: Vocab) -> str:
- # Map the vocab types to the supported tokenizer models
- tokenizer_model = {
- SentencePieceVocab: "llama",
- HfVocab: "llama",
- BpeVocab: "gpt2",
- }.get(type(vocab))
-
- # Block if vocab type is not predefined
- if tokenizer_model is None:
- raise ValueError("Unknown vocab type: Not supported")
-
- return tokenizer_model
-
  def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
  tokens = []
  scores = []
@@ -1045,11 +1123,8 @@ class OutputFile:
  return tokens, scores, toktypes

  def add_meta_vocab(self, vocab: Vocab) -> None:
- # Handle the tokenizer model
- tokenizer_model = self.handle_tokenizer_model(vocab)
-
  # Ensure that tokenizer_model is added to the GGUF model
- self.gguf.add_tokenizer_model(tokenizer_model)
+ self.gguf.add_tokenizer_model(vocab.tokenizer_model)

  # Extract model vocabulary for model conversion
  tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
@@ -1076,6 +1151,26 @@ class OutputFile:
  def write_tensor_info(self) -> None:
  self.gguf.write_ti_data_to_file()

+ def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
+ ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
+ if ftype == GGMLFileType.MostlyQ8_0:
+ ndarrays = bounded_parallel_map(
+ OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
+ use_processpool_executor=True,
+ )
+ else:
+ ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
+
+ start = time.time()
+ for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+ elapsed = time.time() - start
+ size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
+ padi = len(str(len(model)))
+ print(
+ f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
+ )
+ self.gguf.write_tensor_data(ndarray)
+
  def close(self) -> None:
  self.gguf.close()

@@ -1084,7 +1179,7 @@ class OutputFile:
  fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
  endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
  ) -> None:
- check_vocab_size(params, vocab, pad_vocab = pad_vocab)
+ check_vocab_size(params, vocab, pad_vocab=pad_vocab)

  of = OutputFile(fname_out, endianess=endianess)

@@ -1112,7 +1207,7 @@ class OutputFile:

  @staticmethod
  def write_all(
- fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
+ fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
  concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
  pad_vocab: bool = False,
  ) -> None:
@@ -1122,8 +1217,11 @@ class OutputFile:

  # meta data
  of.add_meta_arch(params)
- of.add_meta_vocab(vocab)
- of.add_meta_special_vocab(svocab)
+ if isinstance(vocab, Vocab):
+ of.add_meta_vocab(vocab)
+ of.add_meta_special_vocab(svocab)
+ else: # NoVocab
+ of.gguf.add_tokenizer_model(vocab.tokenizer_model)

  # tensor info
  for name, lazy_tensor in model.items():
@@ -1133,24 +1231,7 @@ class OutputFile:
  of.write_tensor_info()

  # tensor data
- ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
- if ftype == GGMLFileType.MostlyQ8_0:
- ndarrays = bounded_parallel_map(
- OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
- use_processpool_executor=True,
- )
- else:
- ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
-
- start = time.time()
- for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
- elapsed = time.time() - start
- size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
- padi = len(str(len(model)))
- print(
- f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
- )
- of.gguf.write_tensor_data(ndarray)
+ of.write_tensor_data(ftype, model, concurrency)

  of.close()

@@ -1158,16 +1239,16 @@ class OutputFile:
  def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
  wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type

- if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
+ if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
  return GGMLFileType.AllF32
- if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
+ if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
  return GGMLFileType.MostlyF16
  if output_type_str == "q8_0":
  return GGMLFileType.MostlyQ8_0

  name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}

- raise Exception(f"Unexpected combination of types: {name_to_type}")
+ raise ValueError(f"Unexpected combination of types: {name_to_type}")


  def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
@@ -1177,10 +1258,26 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM

  def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
  tmap = gguf.TensorNameMap(ARCH, params.n_layer)
- should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
+ should_skip = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))

  tmp = model

+ # merge experts into one tensor
+ if params.n_experts and params.n_experts > 0:
+ for i_l in range(params.n_layer):
+ for w in range(1, 4):
+ experts = []
+ for e in range(params.n_experts):
+ if f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight" in model:
+ experts.append(model[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"])
+ del tmp[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"]
+ elif f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight" in model:
+ experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"])
+ del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]
+ else:
+ raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight")
+ tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts)
+
  # HF models permut or pack some of the tensors, so we need to undo that
  for i in itertools.count():
  if f"model.layers.{i}.self_attn.q_proj.weight" in model:
@@ -1204,8 +1301,7 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
  if skip_unknown:
  print(f"Unexpected tensor name: {name} - skipping")
  continue
- else:
- raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
+ raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")

  if tensor_type in should_skip:
  print(f"skipping tensor {name_new}")
@@ -1222,7 +1318,7 @@ def nth_multifile_path(path: Path, n: int) -> Path | None:
  the nth path in the model.
  '''
  # Support the following patterns:
- patterns: list[tuple[str, str]] = [
+ patterns = [
  # - x.00.pth, x.01.pth, etc.
  (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
  # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
@@ -1261,16 +1357,16 @@ def load_some_model(path: Path) -> ModelPlus:
  # Be extra-friendly and accept either a file or a directory:
  if path.is_dir():
  # Check if it's a set of safetensors files first
- globs = ["model-00001-of-*.safetensors", "model.safetensors"]
+ globs = ["model-00001-of-*.safetensors", "model.safetensors", "consolidated.safetensors"]
  files = [file for glob in globs for file in path.glob(glob)]
  if not files:
  # Try the PyTorch patterns too, with lower priority
  globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
  files = [file for glob in globs for file in path.glob(glob)]
  if not files:
- raise Exception(f"Can't find model in directory {path}")
+ raise FileNotFoundError(f"Can't find model in directory {path}")
  if len(files) > 1:
- raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
+ raise ValueError(f"Found multiple models in {path}, not sure which to pick: {files}")
  path = files[0]

  paths = find_multifile_paths(path)
@@ -1284,36 +1380,14 @@ def load_some_model(path: Path) -> ModelPlus:


  class VocabFactory:
- _FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}
+ _VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab]

  def __init__(self, path: Path):
  self.path = path
- self.file_paths = self._detect_files()
- print(f"Found vocab files: {self.file_paths}")
-
- def _detect_files(self) -> dict[str, Path | None]:
- def locate(file: str) -> Path | None:
- if (path := self.path / file).exists():
- return path
- if (path := self.path.parent / file).exists():
- return path
- return None
-
- return {vt: locate(f) for vt, f in self._FILES.items()}

- def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
- for vtype in vocab_types:
- try:
- path = self.file_paths[vtype]
- except KeyError:
- raise ValueError(f"Unsupported vocabulary type {vtype}") from None
- if path is not None:
- return vtype, path
- raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
-
- def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
- load_merges = vocabtype == "bpe"
- n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
+ def _create_special_vocab(self, vocab: BaseVocab, model_parent_path: Path) -> gguf.SpecialVocab:
+ load_merges = vocab.name == "bpe"
+ n_vocab = vocab.vocab_size if isinstance(vocab, Vocab) else None
  return gguf.SpecialVocab(
  model_parent_path,
  load_merges=load_merges,
@@ -1321,30 +1395,36 @@ class VocabFactory:
  n_vocab=n_vocab,
  )

- def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
- vocab_type, path = self._select_file(vocab_types)
- print(f"Loading vocab file {path!r}, type {vocab_type!r}")
+ def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
+ vocab_classes: dict[str, type[Vocab]] = {cls.name: cls for cls in self._VOCAB_CLASSES}
+ selected_vocabs: dict[str, type[Vocab]] = {}
+ for vtype in vocab_types:
+ try:
+ selected_vocabs[vtype] = vocab_classes[vtype]
+ except KeyError:
+ raise ValueError(f"Unsupported vocabulary type {vtype}") from None

- added_tokens_path = path.parent / "added_tokens.json"
- vocab: Vocab
- if vocab_type == "bpe":
- vocab = BpeVocab(
- path, added_tokens_path if added_tokens_path.exists() else None
- )
- elif vocab_type == "spm":
- vocab = SentencePieceVocab(
- path, added_tokens_path if added_tokens_path.exists() else None
- )
- elif vocab_type == "hfft":
- vocab = HfVocab(
- path.parent, added_tokens_path if added_tokens_path.exists() else None
- )
+ for vtype, cls in selected_vocabs.items():
+ try:
+ vocab = cls(self.path)
+ break
+ except FileNotFoundError:
+ pass # ignore unavailable tokenizers
+ else:
+ raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
+
+ print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
+ return vocab
+
+ def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
+ vocab: BaseVocab
+ if vocab_types is None:
+ vocab = NoVocab()
  else:
- raise ValueError(vocab_type)
+ vocab = self._create_vocab_by_path(vocab_types)
  # FIXME: Respect --vocab-dir?
  special_vocab = self._create_special_vocab(
  vocab,
- vocab_type,
  model_parent_path,
  )
  return vocab, special_vocab
@@ -1382,6 +1462,7 @@ def main(args_in: list[str] | None = None) -> None:
  parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
  parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
  parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+ parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab")
  parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
  parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
  parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
@@ -1394,6 +1475,8 @@ def main(args_in: list[str] | None = None) -> None:
  parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")

  args = parser.parse_args(args_in)
+ if args.no_vocab and args.vocab_only:
+ raise ValueError("--vocab-only does not make sense with --no-vocab")

  if args.dump_single:
  model_plus = lazy_load_file(args.model)
@@ -1415,10 +1498,12 @@ def main(args_in: list[str] | None = None) -> None:
  params = Params.load(model_plus)
  if params.n_ctx == -1:
  if args.ctx is None:
- raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
- "Please specify one with --ctx:\n"
- " - LLaMA v1: --ctx 2048\n"
- " - LLaMA v2: --ctx 4096\n")
+ msg = """\
+ The model doesn't have a context size, and you didn't specify one with --ctx
+ Please specify one with --ctx:
+ - LLaMA v1: --ctx 2048
+ - LLaMA v2: --ctx 4096"""
+ parser.error(textwrap.dedent(msg))
  params.n_ctx = args.ctx

  if args.outtype:
@@ -1433,9 +1518,11 @@ def main(args_in: list[str] | None = None) -> None:
  model_parent_path = model_plus.paths[0].parent
  vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
  vocab_factory = VocabFactory(vocab_path)
- vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type.split(","), model_parent_path)
+ vocab_types = None if args.no_vocab else args.vocab_type.split(",")
+ vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)

  if args.vocab_only:
+ assert isinstance(vocab, Vocab)
  if not args.outfile:
  raise ValueError("need --outfile if using --vocab-only")
  outfile = args.outfile
@@ -1444,7 +1531,7 @@ def main(args_in: list[str] | None = None) -> None:
  print(f"Wrote {outfile}")
  return

- if model_plus.vocab is not None and args.vocab_dir is None:
+ if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
  vocab = model_plus.vocab

  print(f"Vocab info: {vocab}")
@@ -1465,4 +1552,4 @@ def main(args_in: list[str] | None = None) -> None:


  if __name__ == '__main__':
- main()
+ main()
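
For orientation, a minimal sketch of driving the updated converter with the new --no-vocab option added in this release. The model directory ./model, the f16 output type, and the output filename are illustrative assumptions, not values taken from the package; main() accepts an argument list exactly as it would be given on the command line.

  from bigdl.cpp import convert

  # Hypothetical example: convert a local checkpoint directory to GGUF without
  # embedding a vocabulary (tensors only), writing an f16 output file.
  convert.main(["./model", "--outtype", "f16", "--no-vocab", "--outfile", "model-f16.gguf"])

The same flags apply when convert.py is run directly as a script.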