bigdl-core-cpp 2.5.0b20240527__py3-none-win_amd64.whl → 2.5.0b20240529__py3-none-win_amd64.whl

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (58)
  1. bigdl/cpp/convert-hf-to-gguf.py +1363 -338
  2. bigdl/cpp/convert.py +199 -52
  3. bigdl/cpp/gguf-py/gguf/__init__.py +2 -0
  4. bigdl/cpp/gguf-py/gguf/constants.py +102 -28
  5. bigdl/cpp/gguf-py/gguf/gguf_reader.py +9 -5
  6. bigdl/cpp/gguf-py/gguf/gguf_writer.py +36 -11
  7. bigdl/cpp/gguf-py/gguf/lazy.py +236 -0
  8. bigdl/cpp/gguf-py/gguf/quants.py +123 -0
  9. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +28 -1
  10. bigdl/cpp/gguf-py/gguf/vocab.py +3 -3
  11. bigdl/cpp/libs/baby-llama.exe +0 -0
  12. bigdl/cpp/libs/batched-bench.exe +0 -0
  13. bigdl/cpp/libs/batched.exe +0 -0
  14. bigdl/cpp/libs/beam-search.exe +0 -0
  15. bigdl/cpp/libs/benchmark.exe +0 -0
  16. bigdl/cpp/libs/common.lib +0 -0
  17. bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
  18. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
  19. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
  20. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
  21. bigdl/cpp/libs/embedding.exe +0 -0
  22. bigdl/cpp/libs/export-lora.exe +0 -0
  23. bigdl/cpp/libs/finetune.exe +0 -0
  24. bigdl/cpp/libs/ggml_shared.dll +0 -0
  25. bigdl/cpp/libs/gguf.exe +0 -0
  26. bigdl/cpp/libs/gritlm.exe +0 -0
  27. bigdl/cpp/libs/imatrix.exe +0 -0
  28. bigdl/cpp/libs/infill.exe +0 -0
  29. bigdl/cpp/libs/llama-bench.exe +0 -0
  30. bigdl/cpp/libs/llama.dll +0 -0
  31. bigdl/cpp/libs/llava-cli.exe +0 -0
  32. bigdl/cpp/libs/llava_shared.dll +0 -0
  33. bigdl/cpp/libs/lookahead.exe +0 -0
  34. bigdl/cpp/libs/lookup.exe +0 -0
  35. bigdl/cpp/libs/ls-sycl-device.exe +0 -0
  36. bigdl/cpp/libs/main.exe +0 -0
  37. bigdl/cpp/libs/ollama.exe +0 -0
  38. bigdl/cpp/libs/parallel.exe +0 -0
  39. bigdl/cpp/libs/passkey.exe +0 -0
  40. bigdl/cpp/libs/perplexity.exe +0 -0
  41. bigdl/cpp/libs/q8dot.exe +0 -0
  42. bigdl/cpp/libs/quantize-stats.exe +0 -0
  43. bigdl/cpp/libs/quantize.exe +0 -0
  44. bigdl/cpp/libs/save-load-state.exe +0 -0
  45. bigdl/cpp/libs/server.exe +0 -0
  46. bigdl/cpp/libs/simple.exe +0 -0
  47. bigdl/cpp/libs/speculative.exe +0 -0
  48. bigdl/cpp/libs/tokenize.exe +0 -0
  49. bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
  50. bigdl/cpp/libs/vdot.exe +0 -0
  51. {bigdl_core_cpp-2.5.0b20240527.dist-info → bigdl_core_cpp-2.5.0b20240529.dist-info}/METADATA +1 -1
  52. bigdl_core_cpp-2.5.0b20240529.dist-info/RECORD +61 -0
  53. bigdl_core_cpp-2.5.0b20240527.dist-info/RECORD +0 -59
  54. {bigdl_core_cpp-2.5.0b20240527.data → bigdl_core_cpp-2.5.0b20240529.data}/scripts/init-llama-cpp.bat +0 -0
  55. {bigdl_core_cpp-2.5.0b20240527.data → bigdl_core_cpp-2.5.0b20240529.data}/scripts/init-llama-cpp.ps1 +0 -0
  56. {bigdl_core_cpp-2.5.0b20240527.data → bigdl_core_cpp-2.5.0b20240529.data}/scripts/init-ollama.bat +0 -0
  57. {bigdl_core_cpp-2.5.0b20240527.dist-info → bigdl_core_cpp-2.5.0b20240529.dist-info}/WHEEL +0 -0
  58. {bigdl_core_cpp-2.5.0b20240527.dist-info → bigdl_core_cpp-2.5.0b20240529.dist-info}/top_level.txt +0 -0
bigdl/cpp/gguf-py/gguf/gguf_reader.py CHANGED
@@ -12,6 +12,8 @@ from typing import Any, Literal, NamedTuple, TypeVar, Union
 import numpy as np
 import numpy.typing as npt
 
+from .quants import quant_shape_to_byte_shape
+
 if __name__ == "__main__":
     import sys
     from pathlib import Path
@@ -65,7 +67,7 @@ class ReaderTensor(NamedTuple):
 
 class GGUFReader:
     # I - same as host, S - swapped
-    byte_order: Literal['I' | 'S'] = 'I'
+    byte_order: Literal['I'] | Literal['S'] = 'I'
     alignment: int = GGUF_DEFAULT_ALIGNMENT
 
     # Note: Internal helper, API may change.
@@ -83,7 +85,7 @@ class GGUFReader:
         GGUFValueType.BOOL: np.bool_,
     }
 
-    def __init__(self, path: os.PathLike[str] | str, mode: Literal['r' | 'r+' | 'c'] = 'r'):
+    def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'):
         self.data = np.memmap(path, mode = mode)
         offs = 0
         if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
@@ -128,7 +130,7 @@ class GGUFReader:
         return self.tensors[idx]
 
     def _get(
-        self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I' | 'S' | '<'] = None,
+        self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I'] | Literal['S'] | Literal['<'] = None,
    ) -> npt.NDArray[Any]:
         count = int(count)
         itemsize = int(np.empty([], dtype = dtype).itemsize)
@@ -250,7 +252,8 @@ class GGUFReader:
                 raise ValueError(f'Found duplicated tensor with name {tensor_name}')
             tensor_names.add(tensor_name)
             ggml_type = GGMLQuantizationType(raw_dtype[0])
-            n_elems = np.prod(dims)
+            n_elems = int(np.prod(dims))
+            np_dims = tuple(reversed(dims.tolist()))
             block_size, type_size = GGML_QUANT_SIZES[ggml_type]
             n_bytes = n_elems * type_size // block_size
             data_offs = int(start_offs + offset_tensor[0])
@@ -279,6 +282,7 @@ class GGUFReader:
             else:
                 item_count = n_bytes
                 item_type = np.uint8
+                np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
             tensors.append(ReaderTensor(
                 name = tensor_name,
                 tensor_type = ggml_type,
@@ -286,7 +290,7 @@ class GGUFReader:
                 n_elements = n_elems,
                 n_bytes = n_bytes,
                 data_offset = data_offs,
-                data = self._get(data_offs, item_type, item_count),
+                data = self._get(data_offs, item_type, item_count).reshape(np_dims),
                 field = field,
             ))
         self.tensors = tensors
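
A minimal usage sketch (not part of the diff) of the reader after this change; the file name "model.gguf" and the import path are illustrative assumptions:

    # Tensor data is now returned reshaped to the tensor's dimensions
    # (for quantized types the last axis is the packed byte width).
    from gguf import GGUFReader   # assumed import path for the bundled gguf-py package

    reader = GGUFReader("model.gguf")   # "model.gguf" is a placeholder path
    for tensor in reader.tensors:
        print(tensor.name, tensor.tensor_type.name, tensor.data.shape)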
bigdl/cpp/gguf-py/gguf/gguf_writer.py CHANGED
@@ -25,6 +25,8 @@ from .constants import (
     TokenType,
 )
 
+from .quants import quant_shape_from_byte_shape
+
 logger = logging.getLogger(__name__)
 
 
@@ -176,7 +178,7 @@ class GGUFWriter:
         if pack_fmt is not None:
             self.kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL)
         elif vtype == GGUFValueType.STRING:
-            encoded_val = val.encode("utf8") if isinstance(val, str) else val
+            encoded_val = val.encode("utf-8") if isinstance(val, str) else val
             self.kv_data += self._pack("Q", len(encoded_val))
             self.kv_data += encoded_val
         elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
@@ -195,7 +197,7 @@ class GGUFWriter:
         return ((x + n - 1) // n) * n
 
     def add_tensor_info(
-        self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32],
+        self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype,
         tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None,
     ) -> None:
         if self.state is not WriterState.EMPTY:
@@ -205,13 +207,9 @@ class GGUFWriter:
             raise ValueError(f'Duplicated tensor name {name}')
         self.ti_names.add(name)
 
-        encoded_name = name.encode("utf8")
+        encoded_name = name.encode("utf-8")
         self.ti_data += self._pack("Q", len(encoded_name))
         self.ti_data += encoded_name
-        n_dims = len(tensor_shape)
-        self.ti_data += self._pack("I", n_dims)
-        for i in range(n_dims):
-            self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
         if raw_dtype is None:
             if tensor_dtype == np.float16:
                 dtype = GGMLQuantizationType.F16
@@ -231,6 +229,12 @@ class GGUFWriter:
                 raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
         else:
             dtype = raw_dtype
+            if tensor_dtype == np.uint8:
+                tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
+        n_dims = len(tensor_shape)
+        self.ti_data += self._pack("I", n_dims)
+        for i in range(n_dims):
+            self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
         self.ti_data += self._pack("I", dtype)
         self.ti_data += self._pack("Q", self.offset_tensor)
         self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
@@ -272,15 +276,33 @@ class GGUFWriter:
         tensor.tofile(self.fout)
         self.write_padding(self.fout, tensor.nbytes)
 
-    def write_tensors_to_file(self) -> None:
+    def write_tensors_to_file(self, *, progress: bool = False) -> None:
         self.write_ti_data_to_file()
 
         self.write_padding(self.fout, self.fout.tell())
 
         if self.temp_file is None:
+            self.tensors.reverse()  # to pop from the "beginning" in constant time
+
+            if progress:
+                from tqdm import tqdm
+
+                total_bytes = sum(t.nbytes for t in self.tensors)
+
+                bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
+
+                while True:
+                    try:
+                        tensor = self.tensors.pop()
+                    except IndexError:
+                        break
+                    tensor.tofile(self.fout)
+                    bar.update(tensor.nbytes)
+                    self.write_padding(self.fout, tensor.nbytes)
+                return
             while True:
                 try:
-                    tensor = self.tensors.pop(0)
+                    tensor = self.tensors.pop()
                 except IndexError:
                     break
                 tensor.tofile(self.fout)
@@ -332,7 +354,7 @@ class GGUFWriter:
     def add_name(self, name: str) -> None:
         self.add_string(Keys.General.NAME, name)
 
-    def add_quantization_version(self, quantization_version: GGMLQuantizationType) -> None:
+    def add_quantization_version(self, quantization_version: int) -> None:
         self.add_uint32(
             Keys.General.QUANTIZATION_VERSION, quantization_version)
 
@@ -409,6 +431,9 @@ class GGUFWriter:
     def add_rope_scaling_factor(self, value: float) -> None:
         self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value)
 
+    def add_rope_scaling_attn_factors(self, value: Sequence[float]) -> None:
+        self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value)
+
     def add_rope_scaling_orig_ctx_len(self, value: int) -> None:
         self.add_uint32(Keys.Rope.SCALING_ORIG_CTX_LEN.format(arch=self.arch), value)
 
@@ -479,7 +504,7 @@ class GGUFWriter:
         self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
 
     def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
-        if isinstance(value, list):
+        if not isinstance(value, str):
             template_default = None
             template_names = set()
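
A minimal sketch (not part of the diff) of the new progress option on write_tensors_to_file; the output path, architecture name, and use_temp_file=False argument are illustrative assumptions, and the progress bar requires tqdm to be installed:

    import numpy as np
    from gguf import GGUFWriter   # assumed import path

    writer = GGUFWriter("out.gguf", "llama", use_temp_file=False)
    writer.add_name("tiny-example")
    writer.add_tensor("token_embd.weight", np.zeros((8, 4), dtype=np.float32))
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file(progress=True)   # shows a tqdm bar over total tensor bytes
    writer.close()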
bigdl/cpp/gguf-py/gguf/lazy.py ADDED
@@ -0,0 +1,236 @@
+from __future__ import annotations
+from abc import ABC, ABCMeta, abstractmethod
+
+import logging
+from typing import Any, Callable
+from collections import deque
+
+import numpy as np
+from numpy._typing import _Shape
+from numpy.typing import DTypeLike
+
+
+logger = logging.getLogger(__name__)
+
+
+class LazyMeta(ABCMeta):
+
+    def __new__(cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs):
+        def __getattr__(self, __name: str) -> Any:
+            meta_attr = getattr(self._meta, __name)
+            if callable(meta_attr):
+                return type(self)._wrap_fn(
+                    (lambda s, *args, **kwargs: getattr(s, __name)(*args, **kwargs)),
+                    use_self=self,
+                )
+            elif isinstance(meta_attr, self._tensor_type):
+                # e.g. self.T with torch.Tensor should still be wrapped
+                return type(self)._wrap_fn(lambda s: getattr(s, __name))(self)
+            else:
+                # no need to wrap non-tensor properties,
+                # and they likely don't depend on the actual contents of the tensor
+                return meta_attr
+
+        namespace["__getattr__"] = __getattr__
+
+        # need to make a builder for the wrapped wrapper to copy the name,
+        # or else it fails with very cryptic error messages,
+        # because somehow the same string would end up in every closures
+        def mk_wrap(op_name: str, *, meta_noop: bool = False):
+            # need to wrap the wrapper to get self
+            def wrapped_special_op(self, *args, **kwargs):
+                return type(self)._wrap_fn(
+                    getattr(type(self)._tensor_type, op_name),
+                    meta_noop=meta_noop,
+                )(self, *args, **kwargs)
+            return wrapped_special_op
+
+        # special methods bypass __getattr__, so they need to be added manually
+        # ref: https://docs.python.org/3/reference/datamodel.html#special-lookup
+        # NOTE: doing this from a metaclass is very convenient
+        # TODO: make this even more comprehensive
+        for binary_op in (
+            "lt", "le", "eq", "ne", "ge", "gt", "not"
+            "abs", "add", "and", "floordiv", "invert", "lshift", "mod", "mul", "matmul",
+            "neg", "or", "pos", "pow", "rshift", "sub", "truediv", "xor",
+            "iadd", "iand", "ifloordiv", "ilshift", "imod", "imul", "ior", "irshift", "isub", "ixor",
+            "radd", "rand", "rfloordiv", "rmul", "ror", "rpow", "rsub", "rtruediv", "rxor",
+        ):
+            attr_name = f"__{binary_op}__"
+            # the result of these operators usually has the same shape and dtype as the input,
+            # so evaluation on the meta tensor can be skipped.
+            namespace[attr_name] = mk_wrap(attr_name, meta_noop=True)
+
+        for special_op in (
+            "getitem", "setitem", "len",
+        ):
+            attr_name = f"__{special_op}__"
+            namespace[attr_name] = mk_wrap(attr_name, meta_noop=False)
+
+        return super().__new__(cls, name, bases, namespace, **kwargs)
+
+
+# Tree of lazy tensors
+class LazyBase(ABC, metaclass=LazyMeta):
+    _tensor_type: type
+    _meta: Any
+    _data: Any | None
+    _lazy: deque[LazyBase]  # shared within a graph, to avoid deep recursion when making eager
+    _args: tuple
+    _func: Callable[[tuple], Any] | None
+
+    def __init__(self, *, meta: Any, data: Any | None = None, lazy: deque[LazyBase] | None = None, args: tuple = (), func: Callable[[tuple], Any] | None = None):
+        super().__init__()
+        self._meta = meta
+        self._data = data
+        self._lazy = lazy if lazy is not None else deque()
+        self._args = args
+        self._func = func
+        assert self._func is not None or self._data is not None
+        if self._data is None:
+            self._lazy.append(self)
+
+    def __init_subclass__(cls) -> None:
+        if "_tensor_type" not in cls.__dict__:
+            raise TypeError(f"property '_tensor_type' must be defined for {cls!r}")
+        return super().__init_subclass__()
+
+    @staticmethod
+    def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any:
+        # TODO: dict and set
+        if isinstance(o, (list, tuple)):
+            L = []
+            for item in o:
+                L.append(LazyBase._recurse_apply(item, fn))
+            if isinstance(o, tuple):
+                L = tuple(L)
+            return L
+        elif isinstance(o, LazyBase):
+            return fn(o)
+        else:
+            return o
+
+    @classmethod
+    def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]:
+        def wrapped_fn(*args, **kwargs):
+            if kwargs is None:
+                kwargs = {}
+            args = ((use_self,) if use_self is not None else ()) + args
+
+            meta_args = LazyBase._recurse_apply(args, lambda t: t._meta)
+
+            if isinstance(meta_noop, bool) and not meta_noop:
+                try:
+                    res = fn(*meta_args, **kwargs)
+                except NotImplementedError:
+                    # running some operations on PyTorch's Meta tensors can cause this exception
+                    res = None
+            else:
+                # some operators don't need to actually run on the meta tensors
+                assert len(args) > 0
+                res = args[0]
+                assert isinstance(res, cls)
+                res = res._meta
+                # allow operations to override the dtype and shape
+                if meta_noop is not True:
+                    if isinstance(meta_noop, tuple):
+                        dtype, shape = meta_noop
+                        assert callable(shape)
+                        res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape))
+                    else:
+                        res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)
+
+            if isinstance(res, cls._tensor_type):
+                def collect_replace(t: LazyBase):
+                    if collect_replace.shared_lazy is None:
+                        collect_replace.shared_lazy = t._lazy
+                    else:
+                        collect_replace.shared_lazy.extend(t._lazy)
+                        t._lazy = collect_replace.shared_lazy
+
+                # emulating a static variable
+                collect_replace.shared_lazy = None
+
+                LazyBase._recurse_apply(args, collect_replace)
+
+                shared_lazy = collect_replace.shared_lazy
+
+                return cls(meta=cls.eager_to_meta(res), lazy=shared_lazy, args=args, func=lambda a: fn(*a, **kwargs))
+            else:
+                del res  # not needed
+                # non-tensor return likely relies on the contents of the args
+                # (e.g. the result of torch.equal)
+                eager_args = cls.to_eager(args)
+                return fn(*eager_args, **kwargs)
+        return wrapped_fn
+
+    @classmethod
+    def to_eager(cls, t: Any) -> Any:
+        def simple_to_eager(_t: LazyBase) -> Any:
+            def already_eager_to_eager(_t: LazyBase) -> Any:
+                assert _t._data is not None
+                return _t._data
+
+            while _t._data is None:
+                lt = _t._lazy.popleft()
+                if lt._data is not None:
+                    # Lazy tensor did not belong in the lazy queue.
+                    # Weirdly only happens with Bloom models...
+                    # likely because tensors aren't unique in the queue.
+                    # The final output is still the same as in eager mode,
+                    # so it's safe to ignore this.
+                    continue
+                assert lt._func is not None
+                lt._args = cls._recurse_apply(lt._args, already_eager_to_eager)
+                lt._data = lt._func(lt._args)
+                # sanity check
+                assert lt._data.dtype == lt._meta.dtype
+                assert lt._data.shape == lt._meta.shape
+
+            return _t._data
+
+        # recurse into lists and/or tuples, keeping their structure
+        return cls._recurse_apply(t, simple_to_eager)
+
+    @classmethod
+    def eager_to_meta(cls, t: Any) -> Any:
+        return cls.meta_with_dtype_and_shape(t.dtype, t.shape)
+
+    # must be overridden, meta tensor init is backend-specific
+    @classmethod
+    @abstractmethod
+    def meta_with_dtype_and_shape(cls, dtype: Any, shape: Any) -> Any: pass
+
+    @classmethod
+    def from_eager(cls, t: Any) -> Any:
+        if type(t) is cls:
+            # already eager
+            return t
+        elif isinstance(t, cls._tensor_type):
+            return cls(meta=cls.eager_to_meta(t), data=t)
+        else:
+            return TypeError(f"{type(t)!r} is not compatible with {cls._tensor_type!r}")
+
+
+class LazyNumpyTensor(LazyBase):
+    _tensor_type = np.ndarray
+
+    @classmethod
+    def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: _Shape) -> np.ndarray[Any, Any]:
+        # The initial idea was to use np.nan as the fill value,
+        # but non-float types like np.int16 can't use that.
+        # So zero it is.
+        cheat = np.zeros(1, dtype)
+        return np.lib.stride_tricks.as_strided(cheat, shape, (0 for _ in shape))
+
+    def astype(self, dtype, *args, **kwargs):
+        meta = type(self).meta_with_dtype_and_shape(dtype, self._meta.shape)
+        full_args = (self, dtype,) + args
+        # very important to pass the shared _lazy deque, or else there's an infinite loop somewhere.
+        return type(self)(meta=meta, args=full_args, lazy=self._lazy, func=(lambda a: a[0].astype(*a[1:], **kwargs)))
+
+    def tofile(self, *args, **kwargs):
+        eager = LazyNumpyTensor.to_eager(self)
+        return eager.tofile(*args, **kwargs)
+
+    # TODO: __array_function__
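
A minimal sketch (not part of the diff) of how these lazy tensors are meant to be used: operations on a LazyNumpyTensor only record work against a dtype/shape "meta" array, and the real computation runs when the data is materialized with to_eager (or tofile). The import path is an illustrative assumption:

    import numpy as np
    from gguf.lazy import LazyNumpyTensor   # assumed import path

    lazy = LazyNumpyTensor.from_eager(np.arange(16, dtype=np.float32))
    scaled = (lazy * 2).astype(np.float16)       # nothing is computed yet
    result = LazyNumpyTensor.to_eager(scaled)    # the recorded graph runs here
    print(result.dtype, result.shape)            # float16 (16,)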
bigdl/cpp/gguf-py/gguf/quants.py ADDED
@@ -0,0 +1,123 @@
+from __future__ import annotations
+from typing import Callable, Sequence
+
+from numpy.typing import DTypeLike
+
+from .constants import GGML_QUANT_SIZES, GGMLQuantizationType
+from .lazy import LazyNumpyTensor
+
+import numpy as np
+
+
+def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
+    block_size, type_size = GGML_QUANT_SIZES[quant_type]
+    if shape[-1] % block_size != 0:
+        raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
+    return (*shape[:-1], shape[-1] // block_size * type_size)
+
+
+def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
+    block_size, type_size = GGML_QUANT_SIZES[quant_type]
+    if shape[-1] % type_size != 0:
+        raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
+    return (*shape[:-1], shape[-1] // type_size * block_size)
+
+
+# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
+def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
+    n = n.astype(np.float32, copy=False).view(np.int32)
+    # force nan to quiet
+    n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
+    # flush subnormals to zero
+    n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
+    # round to nearest even
+    n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
+    return n.astype(np.int16)
+
+
+# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
+def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
+    rows = arr.reshape((-1, arr.shape[-1]))
+    osize = 1
+    for dim in oshape:
+        osize *= dim
+    out = np.empty(shape=osize, dtype=otype)
+    # compute over groups of 16 rows (arbitrary, but seems good for performance)
+    n_groups = rows.shape[0] // 16
+    np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)
+    return out.reshape(oshape)
+
+
+def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:
+    return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.int16, oshape=n.shape)
+
+
+__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.int16)
+
+
+def quantize_bf16(n: np.ndarray):
+    if type(n) is LazyNumpyTensor:
+        return __quantize_bf16_lazy(n)
+    else:
+        return __quantize_bf16_array(n)
+
+
+__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]
+
+
+def can_quantize_to_q8_0(n: np.ndarray) -> bool:
+    return n.shape[-1] % __q8_block_size == 0
+
+
+# round away from zero
+# ref: https://stackoverflow.com/a/59143326/22827863
+def np_roundf(n: np.ndarray) -> np.ndarray:
+    a = abs(n)
+    floored = np.floor(a)
+    b = floored + np.floor(2 * (a - floored))
+    return np.sign(n) * b
+
+
+def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
+    return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size)
+
+
+# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
+def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
+    shape = n.shape
+    assert shape[-1] % __q8_block_size == 0
+
+    n_blocks = n.size // __q8_block_size
+
+    blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False)
+
+    d = abs(blocks).max(axis=1, keepdims=True) / 127
+    with np.errstate(divide="ignore"):
+        id = np.where(d == 0, 0, 1 / d)
+    qs = np_roundf(blocks * id)
+
+    # (n_blocks, 2)
+    d = d.astype(np.float16).view(np.uint8)
+    # (n_blocks, block_size)
+    qs = qs.astype(np.int8).view(np.uint8)
+
+    assert d.shape[1] + qs.shape[1] == __q8_type_size
+
+    return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape))
+
+
+def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray:
+    return __apply_over_grouped_rows(__quantize_q8_0_rows, arr=n, otype=np.uint8, oshape=__quantize_q8_0_shape_change(n.shape))
+
+
+__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn(
+    __quantize_q8_0_array,
+    meta_noop=(np.uint8, __quantize_q8_0_shape_change),
+)
+
+
+def quantize_q8_0(data: np.ndarray):
+    if type(data) is LazyNumpyTensor:
+        return __quantize_q8_0_lazy(data)
+    else:
+        return __quantize_q8_0_array(data)
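
A minimal sketch (not part of the diff) showing the new NumPy Q8_0 quantizer together with the shape helpers; the import paths and array sizes are illustrative (the row length must be a multiple of the Q8_0 block size, 32):

    import numpy as np
    from gguf.quants import quantize_q8_0, quant_shape_to_byte_shape   # assumed import path
    from gguf.constants import GGMLQuantizationType

    data = np.random.rand(64, 64).astype(np.float32)
    packed = quantize_q8_0(data)   # uint8 blocks: one float16 scale plus 32 int8 values each
    assert packed.shape == quant_shape_to_byte_shape(data.shape, GGMLQuantizationType.Q8_0)
    print(packed.shape)            # (64, 68)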
bigdl/cpp/gguf-py/gguf/tensor_mapping.py CHANGED
@@ -137,6 +137,7 @@ class TensorNameMap:
             "layers.{bid}.attention.wk", # llama-pth
             "encoder.layer.{bid}.attention.self.key", # bert
             "transformer.h.{bid}.attn.k_proj", # gpt-j
+            "transformer.h.{bid}.attn.k", # refact
             "model.layers.layers.{bid}.self_attn.k_proj", # plamo
             "model.layers.{bid}.attention.wk", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok
@@ -148,6 +149,7 @@ class TensorNameMap:
             "layers.{bid}.attention.wv", # llama-pth
             "encoder.layer.{bid}.attention.self.value", # bert
             "transformer.h.{bid}.attn.v_proj", # gpt-j
+            "transformer.h.{bid}.attn.v", # refact
             "model.layers.layers.{bid}.self_attn.v_proj", # plamo
             "model.layers.{bid}.attention.wv", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok
@@ -229,6 +231,7 @@ class TensorNameMap:
             "layers.{bid}.feed_forward.w3", # llama-pth
             "encoder.layer.{bid}.intermediate.dense", # bert
             "transformer.h.{bid}.mlp.fc_in", # gpt-j
+            "transformer.h.{bid}.mlp.linear_3", # refact
             "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
             "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon
             "transformer.h.{bid}.mlp.w1", # qwen
@@ -240,6 +243,8 @@ class TensorNameMap:
             "model.layers.{bid}.feed_forward.w3", # internlm2
             "encoder.layers.{bid}.mlp.fc11", # nomic-bert
             "model.layers.{bid}.mlp.c_fc", # starcoder2
+            "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
+            "model.layers.{bid}.residual_mlp.w3", # arctic
         ),
 
         MODEL_TENSOR.FFN_UP_EXP: (
@@ -266,6 +271,9 @@ class TensorNameMap:
             "model.layers.layers.{bid}.mlp.gate_proj", # plamo
             "model.layers.{bid}.feed_forward.w1", # internlm2
             "encoder.layers.{bid}.mlp.fc12", # nomic-bert
+            "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
+            "transformer.h.{bid}.mlp.linear_1", # refact
+            "model.layers.{bid}.residual_mlp.w1", # arctic
         ),
 
         MODEL_TENSOR.FFN_GATE_EXP: (
@@ -299,6 +307,8 @@ class TensorNameMap:
             "model.layers.{bid}.feed_forward.w2", # internlm2
             "encoder.layers.{bid}.mlp.fc2", # nomic-bert
             "model.layers.{bid}.mlp.c_proj", # starcoder2
+            "encoder.layer.{bid}.mlp.wo", # jina-bert-v2
+            "model.layers.{bid}.residual_mlp.w2", # arctic
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -317,6 +327,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.q_layernorm", # persimmon
             "model.layers.{bid}.self_attn.q_norm", # cohere
             "transformer.blocks.{bid}.attn.q_ln", # sea-lion
+            "encoder.layer.{bid}.attention.self.layer_norm_q" # jina-bert-v2
         ),
 
         MODEL_TENSOR.ATTN_K_NORM: (
@@ -324,6 +335,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.k_layernorm", # persimmon
             "model.layers.{bid}.self_attn.k_norm", # cohere
             "transformer.blocks.{bid}.attn.k_ln", # sea-lion
+            "encoder.layer.{bid}.attention.self.layer_norm_k" # jina-bert-v2
         ),
 
         MODEL_TENSOR.ROPE_FREQS: (
@@ -334,6 +346,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.output.LayerNorm", # bert
             "encoder.layers.{bid}.norm2", # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
+            "encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2
         ),
 
         MODEL_TENSOR.SSM_IN: (
@@ -372,6 +385,18 @@ class TensorNameMap:
         ),
     }
 
+    # architecture-specific block mappings
+    arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = {
+        MODEL_ARCH.ARCTIC: {
+            MODEL_TENSOR.FFN_NORM: (
+                "model.layers.{bid}.residual_layernorm",
+            ),
+            MODEL_TENSOR.FFN_NORM_EXP: (
+                "model.layers.{bid}.post_attention_layernorm",
+            ),
+        },
+    }
+
     mapping: dict[str, tuple[MODEL_TENSOR, str]]
 
     def __init__(self, arch: MODEL_ARCH, n_blocks: int):
@@ -383,12 +408,14 @@ class TensorNameMap:
             self.mapping[tensor_name] = (tensor, tensor_name)
             for key in keys:
                 self.mapping[key] = (tensor, tensor_name)
+        if arch in self.arch_block_mappings_cfg:
+            self.block_mappings_cfg.update(self.arch_block_mappings_cfg[arch])
         for bid in range(n_blocks):
            for tensor, keys in self.block_mappings_cfg.items():
                 if tensor not in MODEL_TENSORS[arch]:
                     continue
                 # TODO: make this configurable
-                n_experts = 60
+                n_experts = 128
                 for xid in range(n_experts):
                     tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
                     self.mapping[tensor_name] = (tensor, tensor_name)
bigdl/cpp/gguf-py/gguf/vocab.py CHANGED
@@ -4,7 +4,7 @@ import logging
 import json
 import os
 from pathlib import Path
-from typing import Any, Callable
+from typing import Any, Callable, Sequence, Mapping, Iterable
 
 from .gguf_writer import GGUFWriter
 
@@ -15,11 +15,11 @@ class SpecialVocab:
     merges: list[str]
     add_special_token: dict[str, bool]
     special_token_ids: dict[str, int]
-    chat_template: str | None
+    chat_template: str | Sequence[Mapping[str, str]] | None
 
     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,
-        special_token_types: tuple[str, ...] | None = None,
+        special_token_types: Iterable[str] | None = None,
         n_vocab: int | None = None,
     ):
        self.special_token_ids = {}
bigdl/cpp/libs/common.lib CHANGED
Binary file