bigdl-core-cpp 2.5.0rc1__py3-none-win_amd64.whl → 2.6.0b20240828__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/{convert-hf-to-gguf.py → convert_hf_to_gguf.py} +413 -67
- bigdl/cpp/convert_hf_to_gguf_update.py +354 -0
- bigdl/cpp/convert_llama_ggml_to_gguf.py +454 -0
- bigdl/cpp/convert_lora_to_gguf.py +393 -0
- bigdl/cpp/gguf-py/gguf/__init__.py +1 -1
- bigdl/cpp/gguf-py/gguf/constants.py +71 -2
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +16 -1
- bigdl/cpp/gguf-py/gguf/lazy.py +4 -1
- bigdl/cpp/gguf-py/gguf/metadata.py +70 -63
- bigdl/cpp/gguf-py/gguf/quants.py +1129 -64
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +23 -15
- bigdl/cpp/gguf-py/gguf/utility.py +1 -1
- bigdl/cpp/gguf-py/gguf/vocab.py +301 -1
- bigdl/cpp/libs/common.lib +0 -0
- bigdl/cpp/libs/{gguf.exe → dist/windows-amd64/lib/ollama/runners/cpu/ggml.dll} +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/llama.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ggml.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/llama.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ggml.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/llama.dll +0 -0
- bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/{ggml_shared.dll → ggml.dll} +0 -0
- bigdl/cpp/libs/llama-batched.exe +0 -0
- bigdl/cpp/libs/llama-bench.exe +0 -0
- bigdl/cpp/libs/llama-cli.exe +0 -0
- bigdl/cpp/libs/llama-embedding.exe +0 -0
- bigdl/cpp/libs/llama-gguf.exe +0 -0
- bigdl/cpp/libs/llama-llava-cli.exe +0 -0
- bigdl/cpp/libs/llama-lookup.exe +0 -0
- bigdl/cpp/libs/{ls-sycl-device.exe → llama-ls-sycl-device.exe} +0 -0
- bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
- bigdl/cpp/libs/llama-perplexity.exe +0 -0
- bigdl/cpp/libs/llama-quantize.exe +0 -0
- bigdl/cpp/libs/llama-server.exe +0 -0
- bigdl/cpp/libs/llama-simple.exe +0 -0
- bigdl/cpp/libs/llama-speculative.exe +0 -0
- bigdl/cpp/libs/llama-tokenize.exe +0 -0
- bigdl/cpp/libs/llama.dll +0 -0
- bigdl/cpp/libs/llava_shared.dll +0 -0
- bigdl/cpp/libs/ollama.exe +0 -0
- {bigdl_core_cpp-2.5.0rc1.data → bigdl_core_cpp-2.6.0b20240828.data}/scripts/init-llama-cpp.bat +7 -2
- {bigdl_core_cpp-2.5.0rc1.data → bigdl_core_cpp-2.6.0b20240828.data}/scripts/init-ollama.bat +6 -0
- {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0b20240828.dist-info}/METADATA +1 -1
- bigdl_core_cpp-2.6.0b20240828.dist-info/RECORD +54 -0
- bigdl/cpp/convert.py +0 -1714
- bigdl/cpp/libs/baby-llama.exe +0 -0
- bigdl/cpp/libs/batched-bench.exe +0 -0
- bigdl/cpp/libs/batched.exe +0 -0
- bigdl/cpp/libs/beam-search.exe +0 -0
- bigdl/cpp/libs/benchmark.exe +0 -0
- bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/embedding.exe +0 -0
- bigdl/cpp/libs/export-lora.exe +0 -0
- bigdl/cpp/libs/finetune.exe +0 -0
- bigdl/cpp/libs/gritlm.exe +0 -0
- bigdl/cpp/libs/imatrix.exe +0 -0
- bigdl/cpp/libs/infill.exe +0 -0
- bigdl/cpp/libs/llava-cli.exe +0 -0
- bigdl/cpp/libs/lookahead.exe +0 -0
- bigdl/cpp/libs/lookup.exe +0 -0
- bigdl/cpp/libs/main.exe +0 -0
- bigdl/cpp/libs/parallel.exe +0 -0
- bigdl/cpp/libs/passkey.exe +0 -0
- bigdl/cpp/libs/perplexity.exe +0 -0
- bigdl/cpp/libs/q8dot.exe +0 -0
- bigdl/cpp/libs/quantize-stats.exe +0 -0
- bigdl/cpp/libs/quantize.exe +0 -0
- bigdl/cpp/libs/save-load-state.exe +0 -0
- bigdl/cpp/libs/server.exe +0 -0
- bigdl/cpp/libs/simple.exe +0 -0
- bigdl/cpp/libs/speculative.exe +0 -0
- bigdl/cpp/libs/tokenize.exe +0 -0
- bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
- bigdl/cpp/libs/vdot.exe +0 -0
- bigdl_core_cpp-2.5.0rc1.dist-info/RECORD +0 -63
- {bigdl_core_cpp-2.5.0rc1.data → bigdl_core_cpp-2.6.0b20240828.data}/scripts/init-llama-cpp.ps1 +0 -0
- {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0b20240828.dist-info}/WHEEL +0 -0
- {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0b20240828.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,393 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
from __future__ import annotations
|
5
|
+
|
6
|
+
from dataclasses import dataclass
|
7
|
+
import logging
|
8
|
+
import argparse
|
9
|
+
import os
|
10
|
+
import sys
|
11
|
+
import json
|
12
|
+
from math import prod
|
13
|
+
from pathlib import Path
|
14
|
+
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
|
15
|
+
|
16
|
+
import torch
|
17
|
+
|
18
|
+
if TYPE_CHECKING:
|
19
|
+
from torch import Tensor
|
20
|
+
|
21
|
+
if 'NO_LOCAL_GGUF' not in os.environ:
|
22
|
+
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
23
|
+
import gguf
|
24
|
+
|
25
|
+
# reuse model definitions from convert_hf_to_gguf.py
|
26
|
+
from convert_hf_to_gguf import LazyTorchTensor, Model
|
27
|
+
|
28
|
+
logger = logging.getLogger("lora-to-gguf")
|
29
|
+
|
30
|
+
|
31
|
+
@dataclass
|
32
|
+
class PartialLoraTensor:
|
33
|
+
A: Tensor | None = None
|
34
|
+
B: Tensor | None = None
|
35
|
+
|
36
|
+
|
37
|
+
# magic to support tensor shape modifications and splitting
|
38
|
+
class LoraTorchTensor:
|
39
|
+
_lora_A: Tensor # (n_rank, row_size)
|
40
|
+
_lora_B: Tensor # (col_size, n_rank)
|
41
|
+
_rank: int
|
42
|
+
|
43
|
+
def __init__(self, A: Tensor, B: Tensor):
|
44
|
+
assert len(A.shape) == len(B.shape)
|
45
|
+
assert A.shape[-2] == B.shape[-1]
|
46
|
+
if A.dtype != B.dtype:
|
47
|
+
A = A.to(torch.float32)
|
48
|
+
B = B.to(torch.float32)
|
49
|
+
self._lora_A = A
|
50
|
+
self._lora_B = B
|
51
|
+
self._rank = B.shape[-1]
|
52
|
+
|
53
|
+
def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
|
54
|
+
return (self._lora_A, self._lora_B)
|
55
|
+
|
56
|
+
def __getitem__(
|
57
|
+
self,
|
58
|
+
indices: (
|
59
|
+
SupportsIndex
|
60
|
+
| slice
|
61
|
+
| tuple[SupportsIndex | slice | Tensor, ...] # TODO: add ellipsis in the type signature
|
62
|
+
),
|
63
|
+
) -> LoraTorchTensor:
|
64
|
+
shape = self.shape
|
65
|
+
if isinstance(indices, SupportsIndex):
|
66
|
+
if len(shape) > 2:
|
67
|
+
return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
|
68
|
+
else:
|
69
|
+
raise NotImplementedError # can't return a vector
|
70
|
+
elif isinstance(indices, slice):
|
71
|
+
if len(shape) > 2:
|
72
|
+
return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
|
73
|
+
else:
|
74
|
+
return LoraTorchTensor(self._lora_A, self._lora_B[indices])
|
75
|
+
elif isinstance(indices, tuple):
|
76
|
+
assert len(indices) > 0
|
77
|
+
if indices[-1] is Ellipsis:
|
78
|
+
return self[indices[:-1]]
|
79
|
+
# expand ellipsis
|
80
|
+
indices = tuple(
|
81
|
+
u
|
82
|
+
for v in (
|
83
|
+
(
|
84
|
+
(slice(None, None) for _ in range(len(indices) - 1))
|
85
|
+
if i is Ellipsis
|
86
|
+
else (i,)
|
87
|
+
)
|
88
|
+
for i in indices
|
89
|
+
)
|
90
|
+
for u in v
|
91
|
+
)
|
92
|
+
|
93
|
+
if len(indices) < len(shape):
|
94
|
+
indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape))))
|
95
|
+
|
96
|
+
# TODO: make sure this is correct
|
97
|
+
indices_A = (
|
98
|
+
*(
|
99
|
+
(
|
100
|
+
j.__index__() % self._lora_A.shape[i]
|
101
|
+
if isinstance(j, SupportsIndex)
|
102
|
+
else slice(None, None)
|
103
|
+
)
|
104
|
+
for i, j in enumerate(indices[:-2])
|
105
|
+
),
|
106
|
+
slice(None, None),
|
107
|
+
indices[-1],
|
108
|
+
)
|
109
|
+
indices_B = indices[:-1]
|
110
|
+
return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
|
111
|
+
else:
|
112
|
+
raise NotImplementedError # unknown indice type
|
113
|
+
|
114
|
+
@property
|
115
|
+
def dtype(self) -> torch.dtype:
|
116
|
+
assert self._lora_A.dtype == self._lora_B.dtype
|
117
|
+
return self._lora_A.dtype
|
118
|
+
|
119
|
+
@property
|
120
|
+
def shape(self) -> tuple[int, ...]:
|
121
|
+
assert len(self._lora_A.shape) == len(self._lora_B.shape)
|
122
|
+
return (*self._lora_B.shape[:-1], self._lora_A.shape[-1])
|
123
|
+
|
124
|
+
def size(self, dim=None):
|
125
|
+
assert dim is None
|
126
|
+
return self.shape
|
127
|
+
|
128
|
+
def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
|
129
|
+
if isinstance(shape[0], tuple):
|
130
|
+
new_shape: tuple[int, ...] = shape[0]
|
131
|
+
else:
|
132
|
+
new_shape = cast(tuple[int, ...], shape)
|
133
|
+
orig_shape = self.shape
|
134
|
+
if len(new_shape) < 2:
|
135
|
+
raise NotImplementedError # can't become a vector
|
136
|
+
|
137
|
+
# expand -1 in the shape
|
138
|
+
if any(dim == -1 for dim in new_shape):
|
139
|
+
n_elems = prod(orig_shape)
|
140
|
+
n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
|
141
|
+
assert n_elems % n_new_elems == 0
|
142
|
+
new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),)
|
143
|
+
|
144
|
+
if new_shape[-1] != orig_shape[-1]:
|
145
|
+
raise NotImplementedError # can't reshape the row size trivially
|
146
|
+
|
147
|
+
shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
|
148
|
+
shape_B = (*new_shape[:-1], self._rank)
|
149
|
+
return LoraTorchTensor(
|
150
|
+
self._lora_A.reshape(shape_A),
|
151
|
+
self._lora_B.reshape(shape_B),
|
152
|
+
)
|
153
|
+
|
154
|
+
def reshape_as(self, other: Tensor) -> LoraTorchTensor:
|
155
|
+
return self.reshape(*other.shape)
|
156
|
+
|
157
|
+
def view(self, *size: int) -> LoraTorchTensor:
|
158
|
+
return self.reshape(*size)
|
159
|
+
|
160
|
+
def permute(self, *dims: int) -> LoraTorchTensor:
|
161
|
+
shape = self.shape
|
162
|
+
dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
|
163
|
+
if dims[-1] == -1:
|
164
|
+
# TODO: support higher dimensional A shapes bigger than 1
|
165
|
+
assert all(dim == 1 for dim in self._lora_A.shape[:-2])
|
166
|
+
return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
|
167
|
+
if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
|
168
|
+
return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims))
|
169
|
+
else:
|
170
|
+
# TODO: compose the above two
|
171
|
+
raise NotImplementedError
|
172
|
+
|
173
|
+
def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
|
174
|
+
shape = self.shape
|
175
|
+
dims = [i for i in range(len(shape))]
|
176
|
+
dims[dim0], dims[dim1] = dims[dim1], dims[dim0]
|
177
|
+
return self.permute(*dims)
|
178
|
+
|
179
|
+
def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
|
180
|
+
return self.transpose(axis0, axis1)
|
181
|
+
|
182
|
+
def to(self, *args, **kwargs):
|
183
|
+
return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))
|
184
|
+
|
185
|
+
@classmethod
|
186
|
+
def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
|
187
|
+
del types # unused
|
188
|
+
|
189
|
+
if kwargs is None:
|
190
|
+
kwargs = {}
|
191
|
+
|
192
|
+
if func is torch.permute:
|
193
|
+
return type(args[0]).permute(*args, **kwargs)
|
194
|
+
elif func is torch.reshape:
|
195
|
+
return type(args[0]).reshape(*args, **kwargs)
|
196
|
+
elif func is torch.stack:
|
197
|
+
assert isinstance(args[0], Sequence)
|
198
|
+
dim = kwargs.get("dim", 0)
|
199
|
+
assert dim == 0
|
200
|
+
return LoraTorchTensor(
|
201
|
+
torch.stack([a._lora_A for a in args[0]], dim),
|
202
|
+
torch.stack([b._lora_B for b in args[0]], dim),
|
203
|
+
)
|
204
|
+
elif func is torch.cat:
|
205
|
+
assert isinstance(args[0], Sequence)
|
206
|
+
dim = kwargs.get("dim", 0)
|
207
|
+
assert dim == 0
|
208
|
+
if len(args[0][0].shape) > 2:
|
209
|
+
return LoraTorchTensor(
|
210
|
+
torch.cat([a._lora_A for a in args[0]], dim),
|
211
|
+
torch.cat([b._lora_B for b in args[0]], dim),
|
212
|
+
)
|
213
|
+
elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]):
|
214
|
+
return LoraTorchTensor(
|
215
|
+
args[0][0]._lora_A,
|
216
|
+
torch.cat([b._lora_B for b in args[0]], dim),
|
217
|
+
)
|
218
|
+
else:
|
219
|
+
raise NotImplementedError
|
220
|
+
else:
|
221
|
+
raise NotImplementedError
|
222
|
+
|
223
|
+
|
224
|
+
def get_base_tensor_name(lora_tensor_name: str) -> str:
|
225
|
+
base_name = lora_tensor_name.replace("base_model.model.", "")
|
226
|
+
base_name = base_name.replace(".lora_A.weight", ".weight")
|
227
|
+
base_name = base_name.replace(".lora_B.weight", ".weight")
|
228
|
+
return base_name
|
229
|
+
|
230
|
+
|
231
|
+
def parse_args() -> argparse.Namespace:
|
232
|
+
parser = argparse.ArgumentParser(
|
233
|
+
description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
|
234
|
+
parser.add_argument(
|
235
|
+
"--outfile", type=Path,
|
236
|
+
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
|
237
|
+
)
|
238
|
+
parser.add_argument(
|
239
|
+
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
|
240
|
+
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
|
241
|
+
)
|
242
|
+
parser.add_argument(
|
243
|
+
"--bigendian", action="store_true",
|
244
|
+
help="model is executed on big endian machine",
|
245
|
+
)
|
246
|
+
parser.add_argument(
|
247
|
+
"--no-lazy", action="store_true",
|
248
|
+
help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
|
249
|
+
)
|
250
|
+
parser.add_argument(
|
251
|
+
"--verbose", action="store_true",
|
252
|
+
help="increase output verbosity",
|
253
|
+
)
|
254
|
+
parser.add_argument(
|
255
|
+
"--dry-run", action="store_true",
|
256
|
+
help="only print out what will be done, without writing any new files",
|
257
|
+
)
|
258
|
+
parser.add_argument(
|
259
|
+
"--base", type=Path, required=True,
|
260
|
+
help="directory containing base model file",
|
261
|
+
)
|
262
|
+
parser.add_argument(
|
263
|
+
"lora_path", type=Path,
|
264
|
+
help="directory containing LoRA adapter file",
|
265
|
+
)
|
266
|
+
|
267
|
+
return parser.parse_args()
|
268
|
+
|
269
|
+
|
270
|
+
if __name__ == '__main__':
|
271
|
+
args = parse_args()
|
272
|
+
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
273
|
+
|
274
|
+
ftype_map: dict[str, gguf.LlamaFileType] = {
|
275
|
+
"f32": gguf.LlamaFileType.ALL_F32,
|
276
|
+
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
277
|
+
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
278
|
+
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
279
|
+
"auto": gguf.LlamaFileType.GUESSED,
|
280
|
+
}
|
281
|
+
|
282
|
+
ftype = ftype_map[args.outtype]
|
283
|
+
|
284
|
+
dir_base_model: Path = args.base
|
285
|
+
dir_lora: Path = args.lora_path
|
286
|
+
lora_config = dir_lora / "adapter_config.json"
|
287
|
+
input_model = dir_lora / "adapter_model.safetensors"
|
288
|
+
|
289
|
+
if args.outfile is not None:
|
290
|
+
fname_out = args.outfile
|
291
|
+
else:
|
292
|
+
# output in the same directory as the model by default
|
293
|
+
fname_out = dir_lora
|
294
|
+
|
295
|
+
if os.path.exists(input_model):
|
296
|
+
# lazy import load_file only if lora is in safetensors format.
|
297
|
+
from safetensors.torch import load_file
|
298
|
+
|
299
|
+
lora_model = load_file(input_model, device="cpu")
|
300
|
+
else:
|
301
|
+
input_model = os.path.join(dir_lora, "adapter_model.bin")
|
302
|
+
lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
|
303
|
+
|
304
|
+
# load base model
|
305
|
+
logger.info(f"Loading base model: {dir_base_model.name}")
|
306
|
+
hparams = Model.load_hparams(dir_base_model)
|
307
|
+
with torch.inference_mode():
|
308
|
+
try:
|
309
|
+
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
310
|
+
except NotImplementedError:
|
311
|
+
logger.error(f"Model {hparams['architectures'][0]} is not supported")
|
312
|
+
sys.exit(1)
|
313
|
+
|
314
|
+
class LoraModel(model_class):
|
315
|
+
model_arch = model_class.model_arch
|
316
|
+
|
317
|
+
lora_alpha: float
|
318
|
+
|
319
|
+
def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
|
320
|
+
|
321
|
+
super().__init__(*args, **kwargs)
|
322
|
+
|
323
|
+
self.dir_model_card = dir_lora_model
|
324
|
+
self.lora_alpha = float(lora_alpha)
|
325
|
+
|
326
|
+
def set_type(self):
|
327
|
+
self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
|
328
|
+
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
|
329
|
+
|
330
|
+
def set_gguf_parameters(self):
|
331
|
+
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
|
332
|
+
super().set_gguf_parameters()
|
333
|
+
|
334
|
+
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
|
335
|
+
tensor_map: dict[str, PartialLoraTensor] = {}
|
336
|
+
|
337
|
+
for name, tensor in lora_model.items():
|
338
|
+
if self.lazy:
|
339
|
+
tensor = LazyTorchTensor.from_eager(tensor)
|
340
|
+
base_name = get_base_tensor_name(name)
|
341
|
+
is_lora_a = ".lora_A.weight" in name
|
342
|
+
is_lora_b = ".lora_B.weight" in name
|
343
|
+
if not is_lora_a and not is_lora_b:
|
344
|
+
if ".base_layer.weight" in name:
|
345
|
+
continue
|
346
|
+
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
|
347
|
+
sys.exit(1)
|
348
|
+
|
349
|
+
if base_name in tensor_map:
|
350
|
+
if is_lora_a:
|
351
|
+
tensor_map[base_name].A = tensor
|
352
|
+
else:
|
353
|
+
tensor_map[base_name].B = tensor
|
354
|
+
else:
|
355
|
+
if is_lora_a:
|
356
|
+
tensor_map[base_name] = PartialLoraTensor(A=tensor)
|
357
|
+
else:
|
358
|
+
tensor_map[base_name] = PartialLoraTensor(B=tensor)
|
359
|
+
|
360
|
+
for name, tensor in tensor_map.items():
|
361
|
+
assert tensor.A is not None
|
362
|
+
assert tensor.B is not None
|
363
|
+
yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
|
364
|
+
|
365
|
+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
366
|
+
dest = super().modify_tensors(data_torch, name, bid)
|
367
|
+
for dest_name, dest_data in dest:
|
368
|
+
assert isinstance(dest_data, LoraTorchTensor)
|
369
|
+
lora_a, lora_b = dest_data.get_lora_A_B()
|
370
|
+
|
371
|
+
yield (dest_name + ".lora_a", lora_a)
|
372
|
+
yield (dest_name + ".lora_b", lora_b)
|
373
|
+
|
374
|
+
with open(lora_config, "r") as f:
|
375
|
+
lparams: dict[str, Any] = json.load(f)
|
376
|
+
|
377
|
+
alpha: float = lparams["lora_alpha"]
|
378
|
+
|
379
|
+
model_instance = LoraModel(
|
380
|
+
dir_base_model,
|
381
|
+
ftype,
|
382
|
+
fname_out,
|
383
|
+
is_big_endian=args.bigendian,
|
384
|
+
use_temp_file=False,
|
385
|
+
eager=args.no_lazy,
|
386
|
+
dry_run=args.dry_run,
|
387
|
+
dir_lora_model=dir_lora,
|
388
|
+
lora_alpha=alpha,
|
389
|
+
)
|
390
|
+
|
391
|
+
logger.info("Exporting model...")
|
392
|
+
model_instance.write()
|
393
|
+
logger.info(f"Model successfully exported to {model_instance.fname_out}")
|
@@ -130,6 +130,7 @@ class Keys:
|
|
130
130
|
INNER_SIZE = "{arch}.ssm.inner_size"
|
131
131
|
STATE_SIZE = "{arch}.ssm.state_size"
|
132
132
|
TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
|
133
|
+
DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"
|
133
134
|
|
134
135
|
class Tokenizer:
|
135
136
|
MODEL = "tokenizer.ggml.model"
|
@@ -161,6 +162,7 @@ class Keys:
|
|
161
162
|
SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
|
162
163
|
MIDDLE_ID = "tokenizer.ggml.middle_token_id"
|
163
164
|
EOT_ID = "tokenizer.ggml.eot_token_id"
|
165
|
+
EOM_ID = "tokenizer.ggml.eom_token_id"
|
164
166
|
|
165
167
|
class Adapter:
|
166
168
|
TYPE = "adapter.type"
|
@@ -216,7 +218,10 @@ class MODEL_ARCH(IntEnum):
|
|
216
218
|
CHATGLM = auto()
|
217
219
|
BITNET = auto()
|
218
220
|
T5 = auto()
|
221
|
+
T5ENCODER = auto()
|
219
222
|
JAIS = auto()
|
223
|
+
NEMOTRON = auto()
|
224
|
+
EXAONE = auto()
|
220
225
|
|
221
226
|
|
222
227
|
class MODEL_TENSOR(IntEnum):
|
@@ -343,7 +348,10 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|
343
348
|
MODEL_ARCH.CHATGLM: "chatglm",
|
344
349
|
MODEL_ARCH.BITNET: "bitnet",
|
345
350
|
MODEL_ARCH.T5: "t5",
|
351
|
+
MODEL_ARCH.T5ENCODER: "t5encoder",
|
346
352
|
MODEL_ARCH.JAIS: "jais",
|
353
|
+
MODEL_ARCH.NEMOTRON: "nemotron",
|
354
|
+
MODEL_ARCH.EXAONE: "exaone",
|
347
355
|
}
|
348
356
|
|
349
357
|
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
@@ -1035,6 +1043,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|
1035
1043
|
MODEL_TENSOR.ENC_FFN_UP,
|
1036
1044
|
MODEL_TENSOR.ENC_OUTPUT_NORM,
|
1037
1045
|
],
|
1046
|
+
MODEL_ARCH.T5ENCODER: [
|
1047
|
+
MODEL_TENSOR.TOKEN_EMBD,
|
1048
|
+
MODEL_TENSOR.OUTPUT,
|
1049
|
+
MODEL_TENSOR.ENC_ATTN_NORM,
|
1050
|
+
MODEL_TENSOR.ENC_ATTN_Q,
|
1051
|
+
MODEL_TENSOR.ENC_ATTN_K,
|
1052
|
+
MODEL_TENSOR.ENC_ATTN_V,
|
1053
|
+
MODEL_TENSOR.ENC_ATTN_OUT,
|
1054
|
+
MODEL_TENSOR.ENC_ATTN_REL_B,
|
1055
|
+
MODEL_TENSOR.ENC_FFN_NORM,
|
1056
|
+
MODEL_TENSOR.ENC_FFN_GATE,
|
1057
|
+
MODEL_TENSOR.ENC_FFN_DOWN,
|
1058
|
+
MODEL_TENSOR.ENC_FFN_UP,
|
1059
|
+
MODEL_TENSOR.ENC_OUTPUT_NORM,
|
1060
|
+
],
|
1038
1061
|
MODEL_ARCH.JAIS: [
|
1039
1062
|
MODEL_TENSOR.TOKEN_EMBD,
|
1040
1063
|
MODEL_TENSOR.OUTPUT_NORM,
|
@@ -1047,6 +1070,37 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|
1047
1070
|
MODEL_TENSOR.FFN_GATE,
|
1048
1071
|
MODEL_TENSOR.FFN_UP,
|
1049
1072
|
],
|
1073
|
+
MODEL_ARCH.NEMOTRON: [
|
1074
|
+
MODEL_TENSOR.TOKEN_EMBD,
|
1075
|
+
MODEL_TENSOR.OUTPUT_NORM,
|
1076
|
+
MODEL_TENSOR.OUTPUT,
|
1077
|
+
MODEL_TENSOR.ROPE_FREQS,
|
1078
|
+
MODEL_TENSOR.ATTN_NORM,
|
1079
|
+
MODEL_TENSOR.ATTN_Q,
|
1080
|
+
MODEL_TENSOR.ATTN_K,
|
1081
|
+
MODEL_TENSOR.ATTN_V,
|
1082
|
+
MODEL_TENSOR.ATTN_OUT,
|
1083
|
+
MODEL_TENSOR.ATTN_ROT_EMBD,
|
1084
|
+
MODEL_TENSOR.FFN_NORM,
|
1085
|
+
MODEL_TENSOR.FFN_DOWN,
|
1086
|
+
MODEL_TENSOR.FFN_UP,
|
1087
|
+
],
|
1088
|
+
MODEL_ARCH.EXAONE: [
|
1089
|
+
MODEL_TENSOR.TOKEN_EMBD,
|
1090
|
+
MODEL_TENSOR.OUTPUT_NORM,
|
1091
|
+
MODEL_TENSOR.OUTPUT,
|
1092
|
+
MODEL_TENSOR.ROPE_FREQS,
|
1093
|
+
MODEL_TENSOR.ATTN_NORM,
|
1094
|
+
MODEL_TENSOR.ATTN_Q,
|
1095
|
+
MODEL_TENSOR.ATTN_K,
|
1096
|
+
MODEL_TENSOR.ATTN_V,
|
1097
|
+
MODEL_TENSOR.ATTN_OUT,
|
1098
|
+
MODEL_TENSOR.ATTN_ROT_EMBD,
|
1099
|
+
MODEL_TENSOR.FFN_NORM,
|
1100
|
+
MODEL_TENSOR.FFN_GATE,
|
1101
|
+
MODEL_TENSOR.FFN_DOWN,
|
1102
|
+
MODEL_TENSOR.FFN_UP,
|
1103
|
+
],
|
1050
1104
|
# TODO
|
1051
1105
|
}
|
1052
1106
|
|
@@ -1087,6 +1141,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|
1087
1141
|
MODEL_ARCH.CHATGLM: [
|
1088
1142
|
MODEL_TENSOR.ROPE_FREQS,
|
1089
1143
|
],
|
1144
|
+
MODEL_ARCH.NEMOTRON: [
|
1145
|
+
MODEL_TENSOR.ROPE_FREQS,
|
1146
|
+
MODEL_TENSOR.ATTN_ROT_EMBD,
|
1147
|
+
],
|
1090
1148
|
}
|
1091
1149
|
|
1092
1150
|
#
|
@@ -1145,6 +1203,9 @@ class GGMLQuantizationType(IntEnum):
|
|
1145
1203
|
F64 = 28
|
1146
1204
|
IQ1_M = 29
|
1147
1205
|
BF16 = 30
|
1206
|
+
Q4_0_4_4 = 31
|
1207
|
+
Q4_0_4_8 = 32
|
1208
|
+
Q4_0_8_8 = 33
|
1148
1209
|
|
1149
1210
|
|
1150
1211
|
# TODO: add GGMLFileType from ggml_ftype in ggml.h
|
@@ -1157,7 +1218,7 @@ class LlamaFileType(IntEnum):
|
|
1157
1218
|
MOSTLY_F16 = 1 # except 1d tensors
|
1158
1219
|
MOSTLY_Q4_0 = 2 # except 1d tensors
|
1159
1220
|
MOSTLY_Q4_1 = 3 # except 1d tensors
|
1160
|
-
MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16
|
1221
|
+
# MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16
|
1161
1222
|
# MOSTLY_Q4_2 = 5 # support has been removed
|
1162
1223
|
# MOSTLY_Q4_3 = 6 # support has been removed
|
1163
1224
|
MOSTLY_Q8_0 = 7 # except 1d tensors
|
@@ -1186,6 +1247,9 @@ class LlamaFileType(IntEnum):
|
|
1186
1247
|
MOSTLY_IQ4_XS = 30 # except 1d tensors
|
1187
1248
|
MOSTLY_IQ1_M = 31 # except 1d tensors
|
1188
1249
|
MOSTLY_BF16 = 32 # except 1d tensors
|
1250
|
+
MOSTLY_Q4_0_4_4 = 33 # except 1d tensors
|
1251
|
+
MOSTLY_Q4_0_4_8 = 34 # except 1d tensors
|
1252
|
+
MOSTLY_Q4_0_8_8 = 35 # except 1d tensors
|
1189
1253
|
|
1190
1254
|
GUESSED = 1024 # not specified in the model file
|
1191
1255
|
|
@@ -1259,6 +1323,9 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
|
|
1259
1323
|
GGMLQuantizationType.F64: (1, 8),
|
1260
1324
|
GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
|
1261
1325
|
GGMLQuantizationType.BF16: (1, 2),
|
1326
|
+
GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16),
|
1327
|
+
GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16),
|
1328
|
+
GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16),
|
1262
1329
|
}
|
1263
1330
|
|
1264
1331
|
|
@@ -1306,6 +1373,7 @@ KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL
|
|
1306
1373
|
KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE
|
1307
1374
|
KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE
|
1308
1375
|
KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
|
1376
|
+
KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS
|
1309
1377
|
|
1310
1378
|
# tokenization
|
1311
1379
|
KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
|
@@ -1326,4 +1394,5 @@ KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
|
|
1326
1394
|
KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID
|
1327
1395
|
KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
|
1328
1396
|
KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
|
1329
|
-
KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
|
1397
|
+
KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
|
1398
|
+
KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID
|
@@ -312,6 +312,8 @@ class GGUFWriter:
|
|
312
312
|
self.add_key_value(key, val, GGUFValueType.STRING)
|
313
313
|
|
314
314
|
def add_array(self, key: str, val: Sequence[Any]) -> None:
|
315
|
+
if len(val) == 0:
|
316
|
+
return
|
315
317
|
self.add_key_value(key, val, GGUFValueType.ARRAY)
|
316
318
|
|
317
319
|
@staticmethod
|
@@ -728,6 +730,9 @@ class GGUFWriter:
|
|
728
730
|
def add_ssm_time_step_rank(self, value: int) -> None:
|
729
731
|
self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value)
|
730
732
|
|
733
|
+
def add_ssm_dt_b_c_rms(self, value: bool) -> None:
|
734
|
+
self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value)
|
735
|
+
|
731
736
|
def add_tokenizer_model(self, model: str) -> None:
|
732
737
|
self.add_string(Keys.Tokenizer.MODEL, model)
|
733
738
|
|
@@ -826,6 +831,9 @@ class GGUFWriter:
|
|
826
831
|
def add_eot_token_id(self, id: int) -> None:
|
827
832
|
self.add_uint32(Keys.Tokenizer.EOT_ID, id)
|
828
833
|
|
834
|
+
def add_eom_token_id(self, id: int) -> None:
|
835
|
+
self.add_uint32(Keys.Tokenizer.EOM_ID, id)
|
836
|
+
|
829
837
|
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
|
830
838
|
pack_prefix = ''
|
831
839
|
if not skip_pack_prefix:
|
@@ -845,7 +853,14 @@ class GGUFWriter:
|
|
845
853
|
encoded_val = val.encode("utf-8") if isinstance(val, str) else val
|
846
854
|
kv_data += self._pack("Q", len(encoded_val))
|
847
855
|
kv_data += encoded_val
|
848
|
-
elif vtype == GGUFValueType.ARRAY
|
856
|
+
elif vtype == GGUFValueType.ARRAY:
|
857
|
+
|
858
|
+
if not isinstance(val, Sequence):
|
859
|
+
raise ValueError("Invalid GGUF metadata array, expecting sequence")
|
860
|
+
|
861
|
+
if len(val) == 0:
|
862
|
+
raise ValueError("Invalid GGUF metadata array. Empty array")
|
863
|
+
|
849
864
|
if isinstance(val, bytes):
|
850
865
|
ltype = GGUFValueType.UINT8
|
851
866
|
else:
|
bigdl/cpp/gguf-py/gguf/lazy.py
CHANGED
@@ -191,6 +191,8 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
|
191
191
|
class LazyNumpyTensor(LazyBase):
|
192
192
|
_tensor_type = np.ndarray
|
193
193
|
|
194
|
+
shape: tuple[int, ...] # Makes the type checker happy in quants.py
|
195
|
+
|
194
196
|
@classmethod
|
195
197
|
def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
|
196
198
|
# The initial idea was to use np.nan as the fill value,
|
@@ -208,4 +210,5 @@ class LazyNumpyTensor(LazyBase):
|
|
208
210
|
eager = LazyNumpyTensor.to_eager(self)
|
209
211
|
return eager.tofile(*args, **kwargs)
|
210
212
|
|
211
|
-
|
213
|
+
|
214
|
+
# TODO: __array_function__
|