bigdl-core-cpp 2.5.0b20240421__py3-none-win_amd64.whl → 2.5.0b20240422__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/convert.py +276 -189
- bigdl/cpp/gguf-py/__init__.py +0 -0
- bigdl/cpp/gguf-py/gguf/__init__.py +5 -0
- bigdl/cpp/gguf-py/gguf/constants.py +943 -0
- bigdl/cpp/gguf-py/gguf/gguf.py +15 -0
- bigdl/cpp/gguf-py/gguf/gguf_reader.py +279 -0
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +518 -0
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +434 -0
- bigdl/cpp/gguf-py/gguf/vocab.py +181 -0
- bigdl/cpp/libs/baby-llama.exe +0 -0
- bigdl/cpp/libs/batched-bench.exe +0 -0
- bigdl/cpp/libs/batched.exe +0 -0
- bigdl/cpp/libs/beam-search.exe +0 -0
- bigdl/cpp/libs/benchmark.exe +0 -0
- bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
- bigdl/cpp/libs/embedding.exe +0 -0
- bigdl/cpp/libs/export-lora.exe +0 -0
- bigdl/cpp/libs/finetune.exe +0 -0
- bigdl/cpp/libs/gguf.exe +0 -0
- bigdl/cpp/libs/gritlm.exe +0 -0
- bigdl/cpp/libs/imatrix.exe +0 -0
- bigdl/cpp/libs/infill.exe +0 -0
- bigdl/cpp/libs/llama-bench.exe +0 -0
- bigdl/cpp/libs/llava-cli.exe +0 -0
- bigdl/cpp/libs/lookahead.exe +0 -0
- bigdl/cpp/libs/lookup.exe +0 -0
- bigdl/cpp/libs/ls-sycl-device.exe +0 -0
- bigdl/cpp/libs/main.exe +0 -0
- bigdl/cpp/libs/ollama.exe +0 -0
- bigdl/cpp/libs/parallel.exe +0 -0
- bigdl/cpp/libs/passkey.exe +0 -0
- bigdl/cpp/libs/perplexity.exe +0 -0
- bigdl/cpp/libs/q8dot.exe +0 -0
- bigdl/cpp/libs/quantize-stats.exe +0 -0
- bigdl/cpp/libs/quantize.exe +0 -0
- bigdl/cpp/libs/save-load-state.exe +0 -0
- bigdl/cpp/libs/server.exe +0 -0
- bigdl/cpp/libs/simple.exe +0 -0
- bigdl/cpp/libs/speculative.exe +0 -0
- bigdl/cpp/libs/tokenize.exe +0 -0
- bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
- bigdl/cpp/libs/vdot.exe +0 -0
- {bigdl_core_cpp-2.5.0b20240421.data → bigdl_core_cpp-2.5.0b20240422.data}/scripts/init-llama-cpp.bat +1 -0
- {bigdl_core_cpp-2.5.0b20240421.dist-info → bigdl_core_cpp-2.5.0b20240422.dist-info}/METADATA +3 -3
- bigdl_core_cpp-2.5.0b20240422.dist-info/RECORD +50 -0
- bigdl_core_cpp-2.5.0b20240421.dist-info/RECORD +0 -42
- {bigdl_core_cpp-2.5.0b20240421.data → bigdl_core_cpp-2.5.0b20240422.data}/scripts/init-llama-cpp.ps1 +0 -0
- {bigdl_core_cpp-2.5.0b20240421.data → bigdl_core_cpp-2.5.0b20240422.data}/scripts/init-ollama.bat +0 -0
- {bigdl_core_cpp-2.5.0b20240421.dist-info → bigdl_core_cpp-2.5.0b20240422.dist-info}/WHEEL +0 -0
- {bigdl_core_cpp-2.5.0b20240421.dist-info → bigdl_core_cpp-2.5.0b20240422.dist-info}/top_level.txt +0 -0
bigdl/cpp/convert.py
CHANGED
@@ -1,6 +1,4 @@
 #!/usr/bin/env python3
-# this file is copied from https://github.com/ggerganov/llama.cpp/blob/1e35d619a6fb0b9c5e3dc955345980ff056ddbaf/convert.py
-
 from __future__ import annotations

 import argparse
@@ -18,13 +16,14 @@ import re
 import signal
 import struct
 import sys
+import textwrap
 import time
 import zipfile
-from abc import
+from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import
+from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable

 import numpy as np
 from sentencepiece import SentencePieceProcessor
@@ -34,7 +33,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
 import gguf

 if TYPE_CHECKING:
-from
+from typing_extensions import Self, TypeAlias

 if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
 faulthandler.register(signal.SIGUSR1)
@@ -45,6 +44,9 @@ ARCH = gguf.MODEL_ARCH.LLAMA

 DEFAULT_CONCURRENCY = 8

+ADDED_TOKENS_FILE = 'added_tokens.json'
+FAST_TOKENIZER_FILE = 'tokenizer.json'
+
 #
 # data types
 #
@@ -137,7 +139,8 @@ class GGMLFileType(enum.IntEnum):
 dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
 if dt is None:
 raise ValueError(self)
-# 1D tensors
+# Convert all 1D tensors to F32. Most of the codebase that takes in 1D tensors only handles F32 tensors, and most of the outputs tensors are F32.
+# Also The 1d tensors aren't much of a performance/size issue. So instead of having to have separate F32 and F16 implementations of both, just convert everything to F32 for now.
 return dt if len(tensor.shape) > 1 else DT_F32


@@ -190,8 +193,10 @@ class Params:
 n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)

 if n_layer < 1:
-
-
+msg = """\
+failed to guess 'n_layer'. This model is unknown or unsupported.
+Suggestion: provide 'config.json' of the model in the same directory containing model files."""
+raise KeyError(textwrap.dedent(msg))

 n_head = n_embd // 128 # guessed
 n_mult = 256 # guessed
@@ -213,7 +218,8 @@ class Params:

 @staticmethod
 def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
-
+with open(config_path) as f:
+config = json.load(f)

 rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
 rope_scaling = config.get("rope_scaling")
@@ -235,8 +241,10 @@ class Params:
 elif "max_position_embeddings" in config:
 n_ctx = config["max_position_embeddings"]
 else:
-
-
+msg = """\
+failed to guess 'n_ctx'. This model is unknown or unsupported.
+Suggestion: provide 'config.json' of the model in the same directory containing model files."""
+raise KeyError(textwrap.dedent(msg))

 n_experts = None
 n_experts_used = None
@@ -267,7 +275,8 @@ class Params:
 # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
 @staticmethod
 def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
-
+with open(config_path) as f:
+config = json.load(f)

 n_experts = None
 n_experts_used = None
@@ -333,44 +342,86 @@ class Params:
 # vocab
 #

-
-
-
-
-
-
-
-
-
-
-
+@runtime_checkable
+class BaseVocab(Protocol):
+tokenizer_model: ClassVar[str]
+name: ClassVar[str]
+
+
+class NoVocab(BaseVocab):
+tokenizer_model = "no_vocab"
+name = "no_vocab"
+
+def __repr__(self) -> str:
+return "<NoVocab for a model without integrated vocabulary>"
+
+
+@runtime_checkable
+class Vocab(BaseVocab, Protocol):
+vocab_size: int
+added_tokens_dict: dict[str, int]
+added_tokens_list: list[str]
+fname_tokenizer: Path
+
+def __init__(self, base_path: Path): ...
+def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
+
+
+class BpeVocab(Vocab):
+tokenizer_model = "gpt2"
+name = "bpe"
+
+def __init__(self, base_path: Path):
+added_tokens: dict[str, int] = {}
+
+if (fname_tokenizer := base_path / 'vocab.json').exists():
+# "slow" tokenizer
+with open(fname_tokenizer, encoding="utf-8") as f:
+self.vocab = json.load(f)
+
+try:
+# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
+with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
+added_tokens = json.load(f)
+except FileNotFoundError:
+pass
 else:
-#
-
-
-
-
-tokenizer_json = json.load(
-
-
-
-
-
-
-
-
-
+# "fast" tokenizer
+fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+
+# if this fails, FileNotFoundError propagates to caller
+with open(fname_tokenizer, encoding="utf-8") as f:
+tokenizer_json = json.load(f)
+
+tokenizer_model: dict[str, Any] = tokenizer_json['model']
+if (
+tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
+or tokenizer_json['decoder']['type'] != 'ByteLevel'
+):
+raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
+
+self.vocab = tokenizer_model["vocab"]
+
+if (added := tokenizer_json.get('added_tokens')) is not None:
+# Added tokens here can be duplicates of the main vocabulary.
+added_tokens = {item['content']: item['id']
+for item in added
+if item['content'] not in self.vocab}
+
+vocab_size = len(self.vocab)
+expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+actual_ids = sorted(added_tokens.values())
 if expected_ids != actual_ids:
 expected_end_id = vocab_size + len(actual_ids) - 1
-raise
+raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
+f"{vocab_size} - {expected_end_id}; got {actual_ids}")

 items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
 self.added_tokens_dict = added_tokens
 self.added_tokens_list = [text for (text, idx) in items]
-self.vocab_size_base
-self.vocab_size
+self.vocab_size_base = vocab_size
+self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
 self.fname_tokenizer = fname_tokenizer
-self.fname_added_tokens = fname_added_tokens

 def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
 reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
@@ -391,16 +442,25 @@ class BpeVocab:
 return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


-class SentencePieceVocab:
-
-
-
-
-
-
-
+class SentencePieceVocab(Vocab):
+tokenizer_model = "llama"
+name = "spm"
+
+def __init__(self, base_path: Path):
+added_tokens: dict[str, int] = {}
+if (fname_tokenizer := base_path / 'tokenizer.model').exists():
+# normal location
+try:
+with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
+added_tokens = json.load(f)
+except FileNotFoundError:
+pass
+elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
+# not found in alternate location either
+raise FileNotFoundError('Cannot find tokenizer.model')

-
+self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+vocab_size = self.sentencepiece_tokenizer.vocab_size()

 new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
 expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
@@ -410,18 +470,17 @@ class SentencePieceVocab:
 raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")

 # Token pieces that were added to the base vocabulary.
-self.added_tokens_dict
+self.added_tokens_dict = added_tokens
 self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
 self.vocab_size_base = vocab_size
 self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
 self.fname_tokenizer = fname_tokenizer
-self.fname_added_tokens = fname_added_tokens

 def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
 tokenizer = self.sentencepiece_tokenizer
 for i in range(tokenizer.vocab_size()):
 piece = tokenizer.id_to_piece(i)
-text
+text = piece.encode("utf-8")
 score: float = tokenizer.get_score(i)

 toktype = gguf.TokenType.NORMAL
@@ -454,24 +513,47 @@ class SentencePieceVocab:
 return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


-class
-
+class LlamaHfVocab(Vocab):
+tokenizer_model = "llama"
+name = "hfft"
+
+def __init__(self, base_path: Path):
+fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+# if this fails, FileNotFoundError propagates to caller
+with open(fname_tokenizer, encoding='utf-8') as f:
+tokenizer_json = json.load(f)
+
+# pre-check so we know if we need transformers
+tokenizer_model: dict[str, Any] = tokenizer_json['model']
+is_llama3 = (
+tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
+and not tokenizer_model.get('byte_fallback', True)
+)
+if is_llama3:
+raise TypeError('Llama 3 must be converted with BpeVocab')
+
+if not is_llama3 and (
+tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
+or tokenizer_json['decoder']['type'] != 'Sequence'
+):
+raise FileNotFoundError('Cannot find Llama BPE tokenizer')
+
 try:
 from transformers import AutoTokenizer
 except ImportError as e:
 raise ImportError(
-"To use
+"To use LlamaHfVocab, please install the `transformers` package. "
 "You can install it with `pip install transformers`."
 ) from e

-print("fname_tokenizer:", fname_tokenizer)
 # Allow the tokenizer to default to slow or fast versions.
 # Explicitly set tokenizer to use local paths.
 self.tokenizer = AutoTokenizer.from_pretrained(
-
-cache_dir=
+base_path,
+cache_dir=base_path,
 local_files_only=True,
 )
+assert self.tokenizer.is_fast # assume tokenizer.json is used

 # Initialize lists and dictionaries for added tokens
 self.added_tokens_list = []
@@ -499,8 +581,7 @@ class HfVocab:
 self.vocab_size_base = self.tokenizer.vocab_size
 self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)

-self.fname_tokenizer
-self.fname_added_tokens = fname_added_tokens
+self.fname_tokenizer = fname_tokenizer

 def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
 reverse_vocab = {
@@ -552,10 +633,7 @@ class HfVocab:
 yield from self.added_tokens()

 def __repr__(self) -> str:
-return f"<
-
-
-Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
+return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


 #
@@ -573,17 +651,18 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
 .reshape(weights.shape))


-class Tensor(
+class Tensor(ABC):
+ndarray: NDArray
 data_type: DataType

 @abstractmethod
-def astype(self, data_type: DataType) ->
+def astype(self, data_type: DataType) -> Self: ...
 @abstractmethod
-def permute(self, n_head: int, n_head_kv: int) ->
+def permute(self, n_head: int, n_head_kv: int) -> Self: ...
 @abstractmethod
-def permute_part(self, n_part: int, n_head: int, n_head_kv: int) ->
+def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ...
 @abstractmethod
-def part(self, n_part: int) ->
+def part(self, n_part: int) -> Self: ...
 @abstractmethod
 def to_ggml(self) -> GGMLCompatibleTensor: ...

@@ -595,18 +674,18 @@ def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:


 class UnquantizedTensor(Tensor):
-def __init__(self, ndarray: NDArray)
+def __init__(self, ndarray: NDArray):
 assert isinstance(ndarray, np.ndarray)
 self.ndarray = ndarray
 self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]

-def astype(self, data_type: DataType) ->
+def astype(self, data_type: DataType) -> UnquantizedTensor:
 dtype = data_type.dtype
 if self.data_type == DT_BF16:
 self.ndarray = bf16_to_fp32(self.ndarray)
 return UnquantizedTensor(self.ndarray.astype(dtype))

-def to_ggml(self) ->
+def to_ggml(self) -> Self:
 return self

 def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
@@ -674,7 +753,7 @@ class ModelPlus:
 model: LazyModel
 paths: list[Path] # Where this was read from.
 format: Literal['ggml', 'torch', 'safetensors', 'none']
-vocab:
+vocab: BaseVocab | None # For GGML models (which have vocab built in), the vocab.


 def merge_sharded(models: list[LazyModel]) -> LazyModel:
@@ -683,7 +762,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
 names = {name: None for model in models for name in model}

 def convert(name: str) -> LazyTensor:
-lazy_tensors
+lazy_tensors = [model[name] for model in models]
 if len(lazy_tensors) == 1:
 # only one file; don't go through this procedure since there might
 # be quantized tensors
@@ -704,7 +783,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:

 def load() -> UnquantizedTensor:
 ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
-concatenated
+concatenated = np.concatenate(ndarrays, axis=axis)
 return UnquantizedTensor(concatenated)
 description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
 return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
@@ -756,6 +835,15 @@ def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
 return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)


+def pack_experts_lazy(lazy_tensors: list[LazyTensor]) -> LazyTensor:
+def load() -> Tensor:
+tensors = [lazy_tensor.load() for lazy_tensor in lazy_tensors]
+return UnquantizedTensor(np.array([tensor.ndarray for tensor in tensors]))
+s = lazy_tensors[0].shape.copy()
+s.insert(0, len(lazy_tensors))
+return LazyTensor(load, s, lazy_tensors[0].data_type, 'pack_experts ' + ' | '.join(lt.description for lt in lazy_tensors))
+
+
 # Functionality that simulates `torch.load` but where individual tensors are
 # only loaded into memory on demand, not all at once.
 # PyTorch can't do this natively as of time of writing:
@@ -792,10 +880,10 @@ class LazyUnpickler(pickle.Unpickler):

 def load(offset: int, elm_count: int) -> NDArray:
 dtype = data_type.dtype
-
-
-
-
+with self.zip_file.open(info) as fp:
+fp.seek(offset * dtype.itemsize)
+size = elm_count * dtype.itemsize
+data = fp.read(size)
 assert len(data) == size
 return np.frombuffer(data, dtype)
 description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
@@ -816,7 +904,7 @@ class LazyUnpickler(pickle.Unpickler):
 def rebuild_from_type_v2(func, new_type, args, state):
 return func(*args)

-CLASSES
+CLASSES = {
 # getattr used here as a workaround for mypy not being smart enough to determine
 # the staticmethods have a __func__ attribute.
 ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
@@ -875,7 +963,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
 def must_read(fp: IO[bytes], length: int) -> bytes:
 ret = fp.read(length)
 if len(ret) < length:
-raise
+raise EOFError("unexpectedly reached end of file")
 return ret

@@ -933,12 +1021,15 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
 yield result


-def check_vocab_size(params: Params, vocab:
+def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) -> None:
 # Handle special case where the model's vocab size is not set
 if params.n_vocab == -1:
 raise ValueError(
-
+"The model's vocab size is set to -1 in params.json. Please update it manually."
++ (f" Maybe {vocab.vocab_size}?" if isinstance(vocab, Vocab) else ""),
 )
+if not isinstance(vocab, Vocab):
+return # model has no vocab

 # Check for a vocab size mismatch
 if params.n_vocab == vocab.vocab_size:
@@ -962,11 +1053,11 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
 if vocab.vocab_size < params.n_vocab:
 msg += " Add the --pad-vocab option and try again."

-raise
+raise ValueError(msg)


 class OutputFile:
-def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE)
+def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
 self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

 def add_meta_arch(self, params: Params) -> None:
@@ -979,6 +1070,7 @@ class OutputFile:
 name = str(params.path_model.parent).split('/')[-1]

 self.gguf.add_name (name)
+self.gguf.add_vocab_size (params.n_vocab)
 self.gguf.add_context_length (params.n_ctx)
 self.gguf.add_embedding_length (params.n_embd)
 self.gguf.add_block_count (params.n_layer)
@@ -1015,20 +1107,6 @@ class OutputFile:
 if params.ftype is not None:
 self.gguf.add_file_type(params.ftype)

-def handle_tokenizer_model(self, vocab: Vocab) -> str:
-# Map the vocab types to the supported tokenizer models
-tokenizer_model = {
-SentencePieceVocab: "llama",
-HfVocab: "llama",
-BpeVocab: "gpt2",
-}.get(type(vocab))
-
-# Block if vocab type is not predefined
-if tokenizer_model is None:
-raise ValueError("Unknown vocab type: Not supported")
-
-return tokenizer_model
-
 def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
 tokens = []
 scores = []
@@ -1045,11 +1123,8 @@ class OutputFile:
 return tokens, scores, toktypes

 def add_meta_vocab(self, vocab: Vocab) -> None:
-# Handle the tokenizer model
-tokenizer_model = self.handle_tokenizer_model(vocab)
-
 # Ensure that tokenizer_model is added to the GGUF model
-self.gguf.add_tokenizer_model(tokenizer_model)
+self.gguf.add_tokenizer_model(vocab.tokenizer_model)

 # Extract model vocabulary for model conversion
 tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
@@ -1076,6 +1151,26 @@ class OutputFile:
 def write_tensor_info(self) -> None:
 self.gguf.write_ti_data_to_file()

+def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
+ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
+if ftype == GGMLFileType.MostlyQ8_0:
+ndarrays = bounded_parallel_map(
+OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
+use_processpool_executor=True,
+)
+else:
+ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
+
+start = time.time()
+for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+elapsed = time.time() - start
+size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
+padi = len(str(len(model)))
+print(
+f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
+)
+self.gguf.write_tensor_data(ndarray)
+
 def close(self) -> None:
 self.gguf.close()

@@ -1084,7 +1179,7 @@ class OutputFile:
 fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
 endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
 ) -> None:
-check_vocab_size(params, vocab, pad_vocab
+check_vocab_size(params, vocab, pad_vocab=pad_vocab)

 of = OutputFile(fname_out, endianess=endianess)

@@ -1112,7 +1207,7 @@ class OutputFile:

 @staticmethod
 def write_all(
-fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab:
+fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
 concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
 pad_vocab: bool = False,
 ) -> None:
@@ -1122,8 +1217,11 @@ class OutputFile:

 # meta data
 of.add_meta_arch(params)
-
-
+if isinstance(vocab, Vocab):
+of.add_meta_vocab(vocab)
+of.add_meta_special_vocab(svocab)
+else: # NoVocab
+of.gguf.add_tokenizer_model(vocab.tokenizer_model)

 # tensor info
 for name, lazy_tensor in model.items():
@@ -1133,24 +1231,7 @@ class OutputFile:
 of.write_tensor_info()

 # tensor data
-
-if ftype == GGMLFileType.MostlyQ8_0:
-ndarrays = bounded_parallel_map(
-OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
-use_processpool_executor=True,
-)
-else:
-ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
-
-start = time.time()
-for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
-elapsed = time.time() - start
-size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
-padi = len(str(len(model)))
-print(
-f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
-)
-of.gguf.write_tensor_data(ndarray)
+of.write_tensor_data(ftype, model, concurrency)

 of.close()

@@ -1158,16 +1239,16 @@ class OutputFile:
 def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
 wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type

-if output_type_str == "f32" or (output_type_str is None and wq_type
+if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
 return GGMLFileType.AllF32
-if output_type_str == "f16" or (output_type_str is None and wq_type
+if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
 return GGMLFileType.MostlyF16
 if output_type_str == "q8_0":
 return GGMLFileType.MostlyQ8_0

 name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}

-raise
+raise ValueError(f"Unexpected combination of types: {name_to_type}")


 def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
@@ -1177,10 +1258,26 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM

 def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
 tmap = gguf.TensorNameMap(ARCH, params.n_layer)
-should_skip
+should_skip = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))

 tmp = model

+# merge experts into one tensor
+if params.n_experts and params.n_experts > 0:
+for i_l in range(params.n_layer):
+for w in range(1, 4):
+experts = []
+for e in range(params.n_experts):
+if f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight" in model:
+experts.append(model[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"])
+del tmp[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"]
+elif f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight" in model:
+experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"])
+del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]
+else:
+raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight")
+tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts)
+
 # HF models permut or pack some of the tensors, so we need to undo that
 for i in itertools.count():
 if f"model.layers.{i}.self_attn.q_proj.weight" in model:
@@ -1204,8 +1301,7 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
 if skip_unknown:
 print(f"Unexpected tensor name: {name} - skipping")
 continue
-
-raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
+raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")

 if tensor_type in should_skip:
 print(f"skipping tensor {name_new}")
@@ -1222,7 +1318,7 @@ def nth_multifile_path(path: Path, n: int) -> Path | None:
 the nth path in the model.
 '''
 # Support the following patterns:
-patterns
+patterns = [
 # - x.00.pth, x.01.pth, etc.
 (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
 # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
@@ -1261,16 +1357,16 @@ def load_some_model(path: Path) -> ModelPlus:
 # Be extra-friendly and accept either a file or a directory:
 if path.is_dir():
 # Check if it's a set of safetensors files first
-globs = ["model-00001-of-*.safetensors", "model.safetensors"]
+globs = ["model-00001-of-*.safetensors", "model.safetensors", "consolidated.safetensors"]
 files = [file for glob in globs for file in path.glob(glob)]
 if not files:
 # Try the PyTorch patterns too, with lower priority
 globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
 files = [file for glob in globs for file in path.glob(glob)]
 if not files:
-raise
+raise FileNotFoundError(f"Can't find model in directory {path}")
 if len(files) > 1:
-raise
+raise ValueError(f"Found multiple models in {path}, not sure which to pick: {files}")
 path = files[0]

 paths = find_multifile_paths(path)
@@ -1284,36 +1380,14 @@ def load_some_model(path: Path) -> ModelPlus:


 class VocabFactory:
-
+_VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab]

 def __init__(self, path: Path):
 self.path = path
-self.file_paths = self._detect_files()
-print(f"Found vocab files: {self.file_paths}")
-
-def _detect_files(self) -> dict[str, Path | None]:
-def locate(file: str) -> Path | None:
-if (path := self.path / file).exists():
-return path
-if (path := self.path.parent / file).exists():
-return path
-return None
-
-return {vt: locate(f) for vt, f in self._FILES.items()}

-def
-
-
-path = self.file_paths[vtype]
-except KeyError:
-raise ValueError(f"Unsupported vocabulary type {vtype}") from None
-if path is not None:
-return vtype, path
-raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
-
-def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
-load_merges = vocabtype == "bpe"
-n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
+def _create_special_vocab(self, vocab: BaseVocab, model_parent_path: Path) -> gguf.SpecialVocab:
+load_merges = vocab.name == "bpe"
+n_vocab = vocab.vocab_size if isinstance(vocab, Vocab) else None
 return gguf.SpecialVocab(
 model_parent_path,
 load_merges=load_merges,
@@ -1321,30 +1395,36 @@ class VocabFactory:
 n_vocab=n_vocab,
 )

-def
-
-
+def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
+vocab_classes: dict[str, type[Vocab]] = {cls.name: cls for cls in self._VOCAB_CLASSES}
+selected_vocabs: dict[str, type[Vocab]] = {}
+for vtype in vocab_types:
+try:
+selected_vocabs[vtype] = vocab_classes[vtype]
+except KeyError:
+raise ValueError(f"Unsupported vocabulary type {vtype}") from None

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+for vtype, cls in selected_vocabs.items():
+try:
+vocab = cls(self.path)
+break
+except FileNotFoundError:
+pass # ignore unavailable tokenizers
+else:
+raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
+
+print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
+return vocab
+
+def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
+vocab: BaseVocab
+if vocab_types is None:
+vocab = NoVocab()
 else:
-
+vocab = self._create_vocab_by_path(vocab_types)
 # FIXME: Respect --vocab-dir?
 special_vocab = self._create_special_vocab(
 vocab,
-vocab_type,
 model_parent_path,
 )
 return vocab, special_vocab
@@ -1382,6 +1462,7 @@ def main(args_in: list[str] | None = None) -> None:
 parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
 parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
 parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab")
 parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
 parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
 parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
@@ -1394,6 +1475,8 @@ def main(args_in: list[str] | None = None) -> None:
 parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")

 args = parser.parse_args(args_in)
+if args.no_vocab and args.vocab_only:
+raise ValueError("--vocab-only does not make sense with --no-vocab")

 if args.dump_single:
 model_plus = lazy_load_file(args.model)
@@ -1415,10 +1498,12 @@ def main(args_in: list[str] | None = None) -> None:
 params = Params.load(model_plus)
 if params.n_ctx == -1:
 if args.ctx is None:
-
-
-
-
+msg = """\
+The model doesn't have a context size, and you didn't specify one with --ctx
+Please specify one with --ctx:
+- LLaMA v1: --ctx 2048
+- LLaMA v2: --ctx 4096"""
+parser.error(textwrap.dedent(msg))
 params.n_ctx = args.ctx

 if args.outtype:
@@ -1433,9 +1518,11 @@ def main(args_in: list[str] | None = None) -> None:
 model_parent_path = model_plus.paths[0].parent
 vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
 vocab_factory = VocabFactory(vocab_path)
-
+vocab_types = None if args.no_vocab else args.vocab_type.split(",")
+vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)

 if args.vocab_only:
+assert isinstance(vocab, Vocab)
 if not args.outfile:
 raise ValueError("need --outfile if using --vocab-only")
 outfile = args.outfile
@@ -1444,7 +1531,7 @@ def main(args_in: list[str] | None = None) -> None:
 print(f"Wrote {outfile}")
 return

-if model_plus.vocab is not None and args.vocab_dir is None:
+if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
 vocab = model_plus.vocab

 print(f"Vocab info: {vocab}")
@@ -1465,4 +1552,4 @@ def main(args_in: list[str] | None = None) -> None:


 if __name__ == '__main__':
-main()
+main()