bigdl-core-cpp 2.5.0b20240524__py3-none-win_amd64.whl → 2.5.0b20240528__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigdl/cpp/convert-hf-to-gguf.py +1363 -338
- bigdl/cpp/convert.py +199 -52
- bigdl/cpp/gguf-py/gguf/__init__.py +2 -0
- bigdl/cpp/gguf-py/gguf/constants.py +102 -28
- bigdl/cpp/gguf-py/gguf/gguf_reader.py +9 -5
- bigdl/cpp/gguf-py/gguf/gguf_writer.py +36 -11
- bigdl/cpp/gguf-py/gguf/lazy.py +236 -0
- bigdl/cpp/gguf-py/gguf/quants.py +123 -0
- bigdl/cpp/gguf-py/gguf/tensor_mapping.py +28 -1
- bigdl/cpp/gguf-py/gguf/vocab.py +3 -3
- bigdl/cpp/libs/baby-llama.exe +0 -0
- bigdl/cpp/libs/batched-bench.exe +0 -0
- bigdl/cpp/libs/batched.exe +0 -0
- bigdl/cpp/libs/beam-search.exe +0 -0
- bigdl/cpp/libs/benchmark.exe +0 -0
- bigdl/cpp/libs/common.lib +0 -0
- bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
- bigdl/cpp/libs/embedding.exe +0 -0
- bigdl/cpp/libs/export-lora.exe +0 -0
- bigdl/cpp/libs/finetune.exe +0 -0
- bigdl/cpp/libs/ggml_shared.dll +0 -0
- bigdl/cpp/libs/gguf.exe +0 -0
- bigdl/cpp/libs/gritlm.exe +0 -0
- bigdl/cpp/libs/imatrix.exe +0 -0
- bigdl/cpp/libs/infill.exe +0 -0
- bigdl/cpp/libs/llama-bench.exe +0 -0
- bigdl/cpp/libs/llama.dll +0 -0
- bigdl/cpp/libs/llava-cli.exe +0 -0
- bigdl/cpp/libs/llava_shared.dll +0 -0
- bigdl/cpp/libs/lookahead.exe +0 -0
- bigdl/cpp/libs/lookup.exe +0 -0
- bigdl/cpp/libs/ls-sycl-device.exe +0 -0
- bigdl/cpp/libs/main.exe +0 -0
- bigdl/cpp/libs/ollama.exe +0 -0
- bigdl/cpp/libs/parallel.exe +0 -0
- bigdl/cpp/libs/passkey.exe +0 -0
- bigdl/cpp/libs/perplexity.exe +0 -0
- bigdl/cpp/libs/q8dot.exe +0 -0
- bigdl/cpp/libs/quantize-stats.exe +0 -0
- bigdl/cpp/libs/quantize.exe +0 -0
- bigdl/cpp/libs/save-load-state.exe +0 -0
- bigdl/cpp/libs/server.exe +0 -0
- bigdl/cpp/libs/simple.exe +0 -0
- bigdl/cpp/libs/speculative.exe +0 -0
- bigdl/cpp/libs/tokenize.exe +0 -0
- bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
- bigdl/cpp/libs/vdot.exe +0 -0
- {bigdl_core_cpp-2.5.0b20240524.dist-info → bigdl_core_cpp-2.5.0b20240528.dist-info}/METADATA +1 -1
- bigdl_core_cpp-2.5.0b20240528.dist-info/RECORD +61 -0
- bigdl_core_cpp-2.5.0b20240524.dist-info/RECORD +0 -59
- {bigdl_core_cpp-2.5.0b20240524.data → bigdl_core_cpp-2.5.0b20240528.data}/scripts/init-llama-cpp.bat +0 -0
- {bigdl_core_cpp-2.5.0b20240524.data → bigdl_core_cpp-2.5.0b20240528.data}/scripts/init-llama-cpp.ps1 +0 -0
- {bigdl_core_cpp-2.5.0b20240524.data → bigdl_core_cpp-2.5.0b20240528.data}/scripts/init-ollama.bat +0 -0
- {bigdl_core_cpp-2.5.0b20240524.dist-info → bigdl_core_cpp-2.5.0b20240528.dist-info}/WHEEL +0 -0
- {bigdl_core_cpp-2.5.0b20240524.dist-info → bigdl_core_cpp-2.5.0b20240528.dist-info}/top_level.txt +0 -0
bigdl/cpp/gguf-py/gguf/gguf_reader.py
CHANGED
@@ -12,6 +12,8 @@ from typing import Any, Literal, NamedTuple, TypeVar, Union
 import numpy as np
 import numpy.typing as npt

+from .quants import quant_shape_to_byte_shape
+
 if __name__ == "__main__":
     import sys
     from pathlib import Path
@@ -65,7 +67,7 @@ class ReaderTensor(NamedTuple):

 class GGUFReader:
     # I - same as host, S - swapped
-    byte_order: Literal['I' | 'S'] = 'I'
+    byte_order: Literal['I'] | Literal['S'] = 'I'
     alignment: int = GGUF_DEFAULT_ALIGNMENT

     # Note: Internal helper, API may change.
@@ -83,7 +85,7 @@ class GGUFReader:
         GGUFValueType.BOOL: np.bool_,
     }

-    def __init__(self, path: os.PathLike[str] | str, mode: Literal['r' | 'r+' | 'c'] = 'r'):
+    def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'):
         self.data = np.memmap(path, mode = mode)
         offs = 0
         if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
@@ -128,7 +130,7 @@ class GGUFReader:
         return self.tensors[idx]

     def _get(
-        self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I' | 'S' | '<'] = None,
+        self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I'] | Literal['S'] | Literal['<'] = None,
     ) -> npt.NDArray[Any]:
         count = int(count)
         itemsize = int(np.empty([], dtype = dtype).itemsize)
@@ -250,7 +252,8 @@ class GGUFReader:
                 raise ValueError(f'Found duplicated tensor with name {tensor_name}')
             tensor_names.add(tensor_name)
             ggml_type = GGMLQuantizationType(raw_dtype[0])
-            n_elems = np.prod(dims)
+            n_elems = int(np.prod(dims))
+            np_dims = tuple(reversed(dims.tolist()))
             block_size, type_size = GGML_QUANT_SIZES[ggml_type]
             n_bytes = n_elems * type_size // block_size
             data_offs = int(start_offs + offset_tensor[0])
@@ -279,6 +282,7 @@ class GGUFReader:
             else:
                 item_count = n_bytes
                 item_type = np.uint8
+                np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
             tensors.append(ReaderTensor(
                 name = tensor_name,
                 tensor_type = ggml_type,
@@ -286,7 +290,7 @@ class GGUFReader:
                 n_elements = n_elems,
                 n_bytes = n_bytes,
                 data_offset = data_offs,
-                data = self._get(data_offs, item_type, item_count),
+                data = self._get(data_offs, item_type, item_count).reshape(np_dims),
                 field = field,
             ))
         self.tensors = tensors
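One practical consequence of these reader changes: ReaderTensor.data now comes back already reshaped to the tensor's dimensions (and to the byte shape for quantized types) instead of as a flat 1-D array. A minimal sketch of inspecting a file with the updated reader, assuming gguf-py is importable as `gguf` (as in upstream llama.cpp) and using a placeholder model path:

import numpy as np  # noqa: F401 (shapes/dtypes below are numpy objects)
from gguf.gguf_reader import GGUFReader  # assumed import path

reader = GGUFReader("model.gguf")  # placeholder path
for t in reader.tensors:
    # t.data previously had to be reshaped by the caller; after this change
    # its shape already reflects the tensor dims (byte shape if quantized).
    print(t.name, t.tensor_type.name, t.data.shape, t.data.dtype)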
bigdl/cpp/gguf-py/gguf/gguf_writer.py
CHANGED
@@ -25,6 +25,8 @@ from .constants import (
     TokenType,
 )

+from .quants import quant_shape_from_byte_shape
+
 logger = logging.getLogger(__name__)


@@ -176,7 +178,7 @@ class GGUFWriter:
         if pack_fmt is not None:
             self.kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL)
         elif vtype == GGUFValueType.STRING:
-            encoded_val = val.encode("
+            encoded_val = val.encode("utf-8") if isinstance(val, str) else val
             self.kv_data += self._pack("Q", len(encoded_val))
             self.kv_data += encoded_val
         elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
@@ -195,7 +197,7 @@ class GGUFWriter:
         return ((x + n - 1) // n) * n

     def add_tensor_info(
-        self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype
+        self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype,
         tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None,
     ) -> None:
         if self.state is not WriterState.EMPTY:
@@ -205,13 +207,9 @@ class GGUFWriter:
             raise ValueError(f'Duplicated tensor name {name}')
         self.ti_names.add(name)

-        encoded_name = name.encode("
+        encoded_name = name.encode("utf-8")
         self.ti_data += self._pack("Q", len(encoded_name))
         self.ti_data += encoded_name
-        n_dims = len(tensor_shape)
-        self.ti_data += self._pack("I", n_dims)
-        for i in range(n_dims):
-            self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
         if raw_dtype is None:
             if tensor_dtype == np.float16:
                 dtype = GGMLQuantizationType.F16
@@ -231,6 +229,12 @@ class GGUFWriter:
                 raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
         else:
             dtype = raw_dtype
+            if tensor_dtype == np.uint8:
+                tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
+        n_dims = len(tensor_shape)
+        self.ti_data += self._pack("I", n_dims)
+        for i in range(n_dims):
+            self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
         self.ti_data += self._pack("I", dtype)
         self.ti_data += self._pack("Q", self.offset_tensor)
         self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
@@ -272,15 +276,33 @@ class GGUFWriter:
         tensor.tofile(self.fout)
         self.write_padding(self.fout, tensor.nbytes)

-    def write_tensors_to_file(self) -> None:
+    def write_tensors_to_file(self, *, progress: bool = False) -> None:
         self.write_ti_data_to_file()

         self.write_padding(self.fout, self.fout.tell())

         if self.temp_file is None:
+            self.tensors.reverse() # to pop from the "beginning" in constant time
+
+            if progress:
+                from tqdm import tqdm
+
+                total_bytes = sum(t.nbytes for t in self.tensors)
+
+                bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
+
+                while True:
+                    try:
+                        tensor = self.tensors.pop()
+                    except IndexError:
+                        break
+                    tensor.tofile(self.fout)
+                    bar.update(tensor.nbytes)
+                    self.write_padding(self.fout, tensor.nbytes)
+                return
             while True:
                 try:
-                    tensor = self.tensors.pop(
+                    tensor = self.tensors.pop()
                 except IndexError:
                     break
                 tensor.tofile(self.fout)
@@ -332,7 +354,7 @@ class GGUFWriter:
     def add_name(self, name: str) -> None:
         self.add_string(Keys.General.NAME, name)

-    def add_quantization_version(self, quantization_version:
+    def add_quantization_version(self, quantization_version: int) -> None:
         self.add_uint32(
             Keys.General.QUANTIZATION_VERSION, quantization_version)

@@ -409,6 +431,9 @@ class GGUFWriter:
     def add_rope_scaling_factor(self, value: float) -> None:
         self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value)

+    def add_rope_scaling_attn_factors(self, value: Sequence[float]) -> None:
+        self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value)
+
     def add_rope_scaling_orig_ctx_len(self, value: int) -> None:
         self.add_uint32(Keys.Rope.SCALING_ORIG_CTX_LEN.format(arch=self.arch), value)

@@ -479,7 +504,7 @@ class GGUFWriter:
         self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)

     def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
-        if isinstance(value,
+        if not isinstance(value, str):
             template_default = None
             template_names = set()

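The new keyword-only progress flag only imports tqdm when it is actually requested. A rough usage sketch, not taken from this package's docs, assuming gguf-py is importable as `gguf` and using placeholder names throughout:

import numpy as np
from gguf import GGUFWriter  # assumed upstream-style gguf-py import

# use_temp_file=False streams tensors directly, which is the branch the
# progress bar covers; path and architecture below are placeholders.
writer = GGUFWriter("out.gguf", "llama", use_temp_file=False)
writer.add_tensor("blk.0.ffn_up.weight", np.zeros((32, 32), dtype=np.float32))

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file(progress=True)  # progress=True lazily imports tqdm
writer.close()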
bigdl/cpp/gguf-py/gguf/lazy.py
ADDED
@@ -0,0 +1,236 @@
+from __future__ import annotations
+from abc import ABC, ABCMeta, abstractmethod
+
+import logging
+from typing import Any, Callable
+from collections import deque
+
+import numpy as np
+from numpy._typing import _Shape
+from numpy.typing import DTypeLike
+
+
+logger = logging.getLogger(__name__)
+
+
+class LazyMeta(ABCMeta):
+
+    def __new__(cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs):
+        def __getattr__(self, __name: str) -> Any:
+            meta_attr = getattr(self._meta, __name)
+            if callable(meta_attr):
+                return type(self)._wrap_fn(
+                    (lambda s, *args, **kwargs: getattr(s, __name)(*args, **kwargs)),
+                    use_self=self,
+                )
+            elif isinstance(meta_attr, self._tensor_type):
+                # e.g. self.T with torch.Tensor should still be wrapped
+                return type(self)._wrap_fn(lambda s: getattr(s, __name))(self)
+            else:
+                # no need to wrap non-tensor properties,
+                # and they likely don't depend on the actual contents of the tensor
+                return meta_attr
+
+        namespace["__getattr__"] = __getattr__
+
+        # need to make a builder for the wrapped wrapper to copy the name,
+        # or else it fails with very cryptic error messages,
+        # because somehow the same string would end up in every closures
+        def mk_wrap(op_name: str, *, meta_noop: bool = False):
+            # need to wrap the wrapper to get self
+            def wrapped_special_op(self, *args, **kwargs):
+                return type(self)._wrap_fn(
+                    getattr(type(self)._tensor_type, op_name),
+                    meta_noop=meta_noop,
+                )(self, *args, **kwargs)
+            return wrapped_special_op
+
+        # special methods bypass __getattr__, so they need to be added manually
+        # ref: https://docs.python.org/3/reference/datamodel.html#special-lookup
+        # NOTE: doing this from a metaclass is very convenient
+        # TODO: make this even more comprehensive
+        for binary_op in (
+            "lt", "le", "eq", "ne", "ge", "gt", "not"
+            "abs", "add", "and", "floordiv", "invert", "lshift", "mod", "mul", "matmul",
+            "neg", "or", "pos", "pow", "rshift", "sub", "truediv", "xor",
+            "iadd", "iand", "ifloordiv", "ilshift", "imod", "imul", "ior", "irshift", "isub", "ixor",
+            "radd", "rand", "rfloordiv", "rmul", "ror", "rpow", "rsub", "rtruediv", "rxor",
+        ):
+            attr_name = f"__{binary_op}__"
+            # the result of these operators usually has the same shape and dtype as the input,
+            # so evaluation on the meta tensor can be skipped.
+            namespace[attr_name] = mk_wrap(attr_name, meta_noop=True)
+
+        for special_op in (
+            "getitem", "setitem", "len",
+        ):
+            attr_name = f"__{special_op}__"
+            namespace[attr_name] = mk_wrap(attr_name, meta_noop=False)
+
+        return super().__new__(cls, name, bases, namespace, **kwargs)
+
+
+# Tree of lazy tensors
+class LazyBase(ABC, metaclass=LazyMeta):
+    _tensor_type: type
+    _meta: Any
+    _data: Any | None
+    _lazy: deque[LazyBase] # shared within a graph, to avoid deep recursion when making eager
+    _args: tuple
+    _func: Callable[[tuple], Any] | None
+
+    def __init__(self, *, meta: Any, data: Any | None = None, lazy: deque[LazyBase] | None = None, args: tuple = (), func: Callable[[tuple], Any] | None = None):
+        super().__init__()
+        self._meta = meta
+        self._data = data
+        self._lazy = lazy if lazy is not None else deque()
+        self._args = args
+        self._func = func
+        assert self._func is not None or self._data is not None
+        if self._data is None:
+            self._lazy.append(self)
+
+    def __init_subclass__(cls) -> None:
+        if "_tensor_type" not in cls.__dict__:
+            raise TypeError(f"property '_tensor_type' must be defined for {cls!r}")
+        return super().__init_subclass__()
+
+    @staticmethod
+    def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any:
+        # TODO: dict and set
+        if isinstance(o, (list, tuple)):
+            L = []
+            for item in o:
+                L.append(LazyBase._recurse_apply(item, fn))
+            if isinstance(o, tuple):
+                L = tuple(L)
+            return L
+        elif isinstance(o, LazyBase):
+            return fn(o)
+        else:
+            return o
+
+    @classmethod
+    def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]:
+        def wrapped_fn(*args, **kwargs):
+            if kwargs is None:
+                kwargs = {}
+            args = ((use_self,) if use_self is not None else ()) + args
+
+            meta_args = LazyBase._recurse_apply(args, lambda t: t._meta)
+
+            if isinstance(meta_noop, bool) and not meta_noop:
+                try:
+                    res = fn(*meta_args, **kwargs)
+                except NotImplementedError:
+                    # running some operations on PyTorch's Meta tensors can cause this exception
+                    res = None
+            else:
+                # some operators don't need to actually run on the meta tensors
+                assert len(args) > 0
+                res = args[0]
+                assert isinstance(res, cls)
+                res = res._meta
+                # allow operations to override the dtype and shape
+                if meta_noop is not True:
+                    if isinstance(meta_noop, tuple):
+                        dtype, shape = meta_noop
+                        assert callable(shape)
+                        res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape))
+                    else:
+                        res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)
+
+            if isinstance(res, cls._tensor_type):
+                def collect_replace(t: LazyBase):
+                    if collect_replace.shared_lazy is None:
+                        collect_replace.shared_lazy = t._lazy
+                    else:
+                        collect_replace.shared_lazy.extend(t._lazy)
+                        t._lazy = collect_replace.shared_lazy
+
+                # emulating a static variable
+                collect_replace.shared_lazy = None
+
+                LazyBase._recurse_apply(args, collect_replace)
+
+                shared_lazy = collect_replace.shared_lazy
+
+                return cls(meta=cls.eager_to_meta(res), lazy=shared_lazy, args=args, func=lambda a: fn(*a, **kwargs))
+            else:
+                del res # not needed
+                # non-tensor return likely relies on the contents of the args
+                # (e.g. the result of torch.equal)
+                eager_args = cls.to_eager(args)
+                return fn(*eager_args, **kwargs)
+        return wrapped_fn
+
+    @classmethod
+    def to_eager(cls, t: Any) -> Any:
+        def simple_to_eager(_t: LazyBase) -> Any:
+            def already_eager_to_eager(_t: LazyBase) -> Any:
+                assert _t._data is not None
+                return _t._data
+
+            while _t._data is None:
+                lt = _t._lazy.popleft()
+                if lt._data is not None:
+                    # Lazy tensor did not belong in the lazy queue.
+                    # Weirdly only happens with Bloom models...
+                    # likely because tensors aren't unique in the queue.
+                    # The final output is still the same as in eager mode,
+                    # so it's safe to ignore this.
+                    continue
+                assert lt._func is not None
+                lt._args = cls._recurse_apply(lt._args, already_eager_to_eager)
+                lt._data = lt._func(lt._args)
+                # sanity check
+                assert lt._data.dtype == lt._meta.dtype
+                assert lt._data.shape == lt._meta.shape
+
+            return _t._data
+
+        # recurse into lists and/or tuples, keeping their structure
+        return cls._recurse_apply(t, simple_to_eager)
+
+    @classmethod
+    def eager_to_meta(cls, t: Any) -> Any:
+        return cls.meta_with_dtype_and_shape(t.dtype, t.shape)
+
+    # must be overridden, meta tensor init is backend-specific
+    @classmethod
+    @abstractmethod
+    def meta_with_dtype_and_shape(cls, dtype: Any, shape: Any) -> Any: pass
+
+    @classmethod
+    def from_eager(cls, t: Any) -> Any:
+        if type(t) is cls:
+            # already eager
+            return t
+        elif isinstance(t, cls._tensor_type):
+            return cls(meta=cls.eager_to_meta(t), data=t)
+        else:
+            return TypeError(f"{type(t)!r} is not compatible with {cls._tensor_type!r}")
+
+
+class LazyNumpyTensor(LazyBase):
+    _tensor_type = np.ndarray
+
+    @classmethod
+    def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: _Shape) -> np.ndarray[Any, Any]:
+        # The initial idea was to use np.nan as the fill value,
+        # but non-float types like np.int16 can't use that.
+        # So zero it is.
+        cheat = np.zeros(1, dtype)
+        return np.lib.stride_tricks.as_strided(cheat, shape, (0 for _ in shape))
+
+    def astype(self, dtype, *args, **kwargs):
+        meta = type(self).meta_with_dtype_and_shape(dtype, self._meta.shape)
+        full_args = (self, dtype,) + args
+        # very important to pass the shared _lazy deque, or else there's an infinite loop somewhere.
+        return type(self)(meta=meta, args=full_args, lazy=self._lazy, func=(lambda a: a[0].astype(*a[1:], **kwargs)))
+
+    def tofile(self, *args, **kwargs):
+        eager = LazyNumpyTensor.to_eager(self)
+        return eager.tofile(*args, **kwargs)
+
+    # TODO: __array_function__
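To make the mechanism above concrete: operations on a LazyNumpyTensor are only recorded against a zero-strided "meta" array, and the real numpy work runs when to_eager (or tofile) is called. A small, hedged illustration, assuming this module is importable as gguf.lazy as in upstream llama.cpp:

import numpy as np
from gguf.lazy import LazyNumpyTensor  # assumed import path

a = LazyNumpyTensor.from_eager(np.arange(8, dtype=np.float32))
b = (a * 2).astype(np.float16)       # recorded lazily; only the meta array carries dtype/shape
print(type(b).__name__)              # LazyNumpyTensor, nothing has been multiplied yet
real = LazyNumpyTensor.to_eager(b)   # the deferred graph is evaluated here
print(real.dtype, real[:4])          # float16 [0. 2. 4. 6.]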
bigdl/cpp/gguf-py/gguf/quants.py
ADDED
@@ -0,0 +1,123 @@
+from __future__ import annotations
+from typing import Callable, Sequence
+
+from numpy.typing import DTypeLike
+
+from .constants import GGML_QUANT_SIZES, GGMLQuantizationType
+from .lazy import LazyNumpyTensor
+
+import numpy as np
+
+
+def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
+    block_size, type_size = GGML_QUANT_SIZES[quant_type]
+    if shape[-1] % block_size != 0:
+        raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
+    return (*shape[:-1], shape[-1] // block_size * type_size)
+
+
+def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
+    block_size, type_size = GGML_QUANT_SIZES[quant_type]
+    if shape[-1] % type_size != 0:
+        raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
+    return (*shape[:-1], shape[-1] // type_size * block_size)
+
+
+# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
+def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
+    n = n.astype(np.float32, copy=False).view(np.int32)
+    # force nan to quiet
+    n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
+    # flush subnormals to zero
+    n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
+    # round to nearest even
+    n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
+    return n.astype(np.int16)
+
+
+# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
+def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
+    rows = arr.reshape((-1, arr.shape[-1]))
+    osize = 1
+    for dim in oshape:
+        osize *= dim
+    out = np.empty(shape=osize, dtype=otype)
+    # compute over groups of 16 rows (arbitrary, but seems good for performance)
+    n_groups = rows.shape[0] // 16
+    np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)
+    return out.reshape(oshape)
+
+
+def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:
+    return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.int16, oshape=n.shape)
+
+
+__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.int16)
+
+
+def quantize_bf16(n: np.ndarray):
+    if type(n) is LazyNumpyTensor:
+        return __quantize_bf16_lazy(n)
+    else:
+        return __quantize_bf16_array(n)
+
+
+__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]
+
+
+def can_quantize_to_q8_0(n: np.ndarray) -> bool:
+    return n.shape[-1] % __q8_block_size == 0
+
+
+# round away from zero
+# ref: https://stackoverflow.com/a/59143326/22827863
+def np_roundf(n: np.ndarray) -> np.ndarray:
+    a = abs(n)
+    floored = np.floor(a)
+    b = floored + np.floor(2 * (a - floored))
+    return np.sign(n) * b
+
+
+def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
+    return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size)
+
+
+# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
+def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
+    shape = n.shape
+    assert shape[-1] % __q8_block_size == 0
+
+    n_blocks = n.size // __q8_block_size
+
+    blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False)
+
+    d = abs(blocks).max(axis=1, keepdims=True) / 127
+    with np.errstate(divide="ignore"):
+        id = np.where(d == 0, 0, 1 / d)
+    qs = np_roundf(blocks * id)
+
+    # (n_blocks, 2)
+    d = d.astype(np.float16).view(np.uint8)
+    # (n_blocks, block_size)
+    qs = qs.astype(np.int8).view(np.uint8)
+
+    assert d.shape[1] + qs.shape[1] == __q8_type_size
+
+    return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape))
+
+
+def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray:
+    return __apply_over_grouped_rows(__quantize_q8_0_rows, arr=n, otype=np.uint8, oshape=__quantize_q8_0_shape_change(n.shape))
+
+
+__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn(
+    __quantize_q8_0_array,
+    meta_noop=(np.uint8, __quantize_q8_0_shape_change),
+)
+
+
+def quantize_q8_0(data: np.ndarray):
+    if type(data) is LazyNumpyTensor:
+        return __quantize_q8_0_lazy(data)
+    else:
+        return __quantize_q8_0_array(data)
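For Q8_0, GGML_QUANT_SIZES maps each block of 32 float values to 34 bytes (a float16 scale plus 32 int8 quants), which is exactly the arithmetic quant_shape_to_byte_shape performs. A quick, hedged illustration of the new helpers, assuming gguf-py is importable as `gguf`; the random data is only there to exercise the shape math:

import numpy as np
from gguf import quants  # assumed upstream-style gguf-py import
from gguf.constants import GGMLQuantizationType

data = np.random.rand(64, 4096).astype(np.float32)

assert quants.can_quantize_to_q8_0(data)   # last dim must be a multiple of the Q8_0 block size (32)
q = quants.quantize_q8_0(data)             # eager path, since data is a plain ndarray
print(q.dtype, q.shape)                    # uint8 (64, 4352): 4096 // 32 * 34 bytes per row

# the same byte-shape arithmetic, done directly:
print(quants.quant_shape_to_byte_shape((64, 4096), GGMLQuantizationType.Q8_0))  # (64, 4352)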
bigdl/cpp/gguf-py/gguf/tensor_mapping.py
CHANGED
@@ -137,6 +137,7 @@ class TensorNameMap:
             "layers.{bid}.attention.wk", # llama-pth
             "encoder.layer.{bid}.attention.self.key", # bert
             "transformer.h.{bid}.attn.k_proj", # gpt-j
+            "transformer.h.{bid}.attn.k", # refact
             "model.layers.layers.{bid}.self_attn.k_proj", # plamo
             "model.layers.{bid}.attention.wk", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok
@@ -148,6 +149,7 @@ class TensorNameMap:
             "layers.{bid}.attention.wv", # llama-pth
             "encoder.layer.{bid}.attention.self.value", # bert
             "transformer.h.{bid}.attn.v_proj", # gpt-j
+            "transformer.h.{bid}.attn.v", # refact
             "model.layers.layers.{bid}.self_attn.v_proj", # plamo
             "model.layers.{bid}.attention.wv", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok
@@ -229,6 +231,7 @@ class TensorNameMap:
             "layers.{bid}.feed_forward.w3", # llama-pth
             "encoder.layer.{bid}.intermediate.dense", # bert
             "transformer.h.{bid}.mlp.fc_in", # gpt-j
+            "transformer.h.{bid}.mlp.linear_3", # refact
             "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
             "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon
             "transformer.h.{bid}.mlp.w1", # qwen
@@ -240,6 +243,8 @@ class TensorNameMap:
             "model.layers.{bid}.feed_forward.w3", # internlm2
             "encoder.layers.{bid}.mlp.fc11", # nomic-bert
             "model.layers.{bid}.mlp.c_fc", # starcoder2
+            "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
+            "model.layers.{bid}.residual_mlp.w3", # arctic
         ),

         MODEL_TENSOR.FFN_UP_EXP: (
@@ -266,6 +271,9 @@ class TensorNameMap:
             "model.layers.layers.{bid}.mlp.gate_proj", # plamo
             "model.layers.{bid}.feed_forward.w1", # internlm2
             "encoder.layers.{bid}.mlp.fc12", # nomic-bert
+            "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
+            "transformer.h.{bid}.mlp.linear_1", # refact
+            "model.layers.{bid}.residual_mlp.w1", # arctic
         ),

         MODEL_TENSOR.FFN_GATE_EXP: (
@@ -299,6 +307,8 @@ class TensorNameMap:
             "model.layers.{bid}.feed_forward.w2", # internlm2
             "encoder.layers.{bid}.mlp.fc2", # nomic-bert
             "model.layers.{bid}.mlp.c_proj", # starcoder2
+            "encoder.layer.{bid}.mlp.wo", # jina-bert-v2
+            "model.layers.{bid}.residual_mlp.w2", # arctic
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -317,6 +327,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.q_layernorm", # persimmon
             "model.layers.{bid}.self_attn.q_norm", # cohere
             "transformer.blocks.{bid}.attn.q_ln", # sea-lion
+            "encoder.layer.{bid}.attention.self.layer_norm_q" # jina-bert-v2
         ),

         MODEL_TENSOR.ATTN_K_NORM: (
@@ -324,6 +335,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.k_layernorm", # persimmon
             "model.layers.{bid}.self_attn.k_norm", # cohere
             "transformer.blocks.{bid}.attn.k_ln", # sea-lion
+            "encoder.layer.{bid}.attention.self.layer_norm_k" # jina-bert-v2
         ),

         MODEL_TENSOR.ROPE_FREQS: (
@@ -334,6 +346,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.output.LayerNorm", # bert
             "encoder.layers.{bid}.norm2", # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
+            "encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2
         ),

         MODEL_TENSOR.SSM_IN: (
@@ -372,6 +385,18 @@ class TensorNameMap:
         ),
     }

+    # architecture-specific block mappings
+    arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = {
+        MODEL_ARCH.ARCTIC: {
+            MODEL_TENSOR.FFN_NORM: (
+                "model.layers.{bid}.residual_layernorm",
+            ),
+            MODEL_TENSOR.FFN_NORM_EXP: (
+                "model.layers.{bid}.post_attention_layernorm",
+            ),
+        },
+    }
+
     mapping: dict[str, tuple[MODEL_TENSOR, str]]

     def __init__(self, arch: MODEL_ARCH, n_blocks: int):
@@ -383,12 +408,14 @@ class TensorNameMap:
             self.mapping[tensor_name] = (tensor, tensor_name)
             for key in keys:
                 self.mapping[key] = (tensor, tensor_name)
+        if arch in self.arch_block_mappings_cfg:
+            self.block_mappings_cfg.update(self.arch_block_mappings_cfg[arch])
         for bid in range(n_blocks):
             for tensor, keys in self.block_mappings_cfg.items():
                 if tensor not in MODEL_TENSORS[arch]:
                     continue
                 # TODO: make this configurable
-                n_experts =
+                n_experts = 128
                 for xid in range(n_experts):
                     tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
                     self.mapping[tensor_name] = (tensor, tensor_name)
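The effect of arch_block_mappings_cfg is that, for Arctic only, model.layers.{bid}.residual_layernorm is routed to FFN_NORM while post_attention_layernorm is routed to FFN_NORM_EXP, instead of the generic mapping. A hedged sketch of what that lookup looks like, using only what the diff shows plus the assumption that gguf-py is importable as `gguf`; the printed result is illustrative:

from gguf.constants import MODEL_ARCH
from gguf.tensor_mapping import TensorNameMap  # assumed import path

tmap = TensorNameMap(MODEL_ARCH.ARCTIC, n_blocks=35)  # block count is illustrative
# the Arctic-specific override maps residual_layernorm onto the ffn_norm slot
print(tmap.mapping["model.layers.0.residual_layernorm"])
# expected roughly: (MODEL_TENSOR.FFN_NORM, 'blk.0.ffn_norm')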
bigdl/cpp/gguf-py/gguf/vocab.py
CHANGED
@@ -4,7 +4,7 @@ import logging
 import json
 import os
 from pathlib import Path
-from typing import Any, Callable
+from typing import Any, Callable, Sequence, Mapping, Iterable

 from .gguf_writer import GGUFWriter

@@ -15,11 +15,11 @@ class SpecialVocab:
     merges: list[str]
     add_special_token: dict[str, bool]
     special_token_ids: dict[str, int]
-    chat_template: str | None
+    chat_template: str | Sequence[Mapping[str, str]] | None

     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,
-        special_token_types:
+        special_token_types: Iterable[str] | None = None,
         n_vocab: int | None = None,
     ):
         self.special_token_ids = {}
bigdl/cpp/libs/baby-llama.exe
CHANGED
Binary file

bigdl/cpp/libs/batched-bench.exe
CHANGED
Binary file

bigdl/cpp/libs/batched.exe
CHANGED
Binary file

bigdl/cpp/libs/beam-search.exe
CHANGED
Binary file

bigdl/cpp/libs/benchmark.exe
CHANGED
Binary file

bigdl/cpp/libs/common.lib
CHANGED
Binary file