bigdl-core-cpp 2.1.0b2__py3-none-manylinux2010_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. bigdl/cpp/__init__.py +0 -0
  2. bigdl/cpp/cli/init-llama-cpp +14 -0
  3. bigdl/cpp/cli/init-ollama +8 -0
  4. bigdl/cpp/convert-hf-to-gguf.py +2856 -0
  5. bigdl/cpp/convert.py +1714 -0
  6. bigdl/cpp/gguf-py/__init__.py +0 -0
  7. bigdl/cpp/gguf-py/gguf/__init__.py +7 -0
  8. bigdl/cpp/gguf-py/gguf/constants.py +1033 -0
  9. bigdl/cpp/gguf-py/gguf/gguf.py +15 -0
  10. bigdl/cpp/gguf-py/gguf/gguf_reader.py +296 -0
  11. bigdl/cpp/gguf-py/gguf/gguf_writer.py +554 -0
  12. bigdl/cpp/gguf-py/gguf/lazy.py +236 -0
  13. bigdl/cpp/gguf-py/gguf/py.typed +0 -0
  14. bigdl/cpp/gguf-py/gguf/quants.py +123 -0
  15. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +463 -0
  16. bigdl/cpp/gguf-py/gguf/vocab.py +165 -0
  17. bigdl/cpp/libs/baby-llama +0 -0
  18. bigdl/cpp/libs/batched +0 -0
  19. bigdl/cpp/libs/batched-bench +0 -0
  20. bigdl/cpp/libs/benchmark +0 -0
  21. bigdl/cpp/libs/embedding +0 -0
  22. bigdl/cpp/libs/gguf +0 -0
  23. bigdl/cpp/libs/imatrix +0 -0
  24. bigdl/cpp/libs/llama-bench +0 -0
  25. bigdl/cpp/libs/llava-cli +0 -0
  26. bigdl/cpp/libs/lookahead +0 -0
  27. bigdl/cpp/libs/lookup +0 -0
  28. bigdl/cpp/libs/ls-sycl-device +0 -0
  29. bigdl/cpp/libs/main +0 -0
  30. bigdl/cpp/libs/ollama +0 -0
  31. bigdl/cpp/libs/perplexity +0 -0
  32. bigdl/cpp/libs/quantize +0 -0
  33. bigdl/cpp/libs/quantize-stats +0 -0
  34. bigdl/cpp/libs/save-load-state +0 -0
  35. bigdl/cpp/libs/server +0 -0
  36. bigdl/cpp/libs/speculative +0 -0
  37. bigdl/cpp/libs/tokenize +0 -0
  38. bigdl_core_cpp-2.1.0b2.data/scripts/init-llama-cpp +14 -0
  39. bigdl_core_cpp-2.1.0b2.data/scripts/init-ollama +8 -0
  40. bigdl_core_cpp-2.1.0b2.dist-info/METADATA +18 -0
  41. bigdl_core_cpp-2.1.0b2.dist-info/RECORD +43 -0
  42. bigdl_core_cpp-2.1.0b2.dist-info/WHEEL +5 -0
  43. bigdl_core_cpp-2.1.0b2.dist-info/top_level.txt +1 -0
bigdl/cpp/gguf-py/gguf/lazy.py
@@ -0,0 +1,236 @@
+ from __future__ import annotations
+ from abc import ABC, ABCMeta, abstractmethod
+
+ import logging
+ from typing import Any, Callable
+ from collections import deque
+
+ import numpy as np
+ from numpy._typing import _Shape
+ from numpy.typing import DTypeLike
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class LazyMeta(ABCMeta):
+
+     def __new__(cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs):
+         def __getattr__(self, __name: str) -> Any:
+             meta_attr = getattr(self._meta, __name)
+             if callable(meta_attr):
+                 return type(self)._wrap_fn(
+                     (lambda s, *args, **kwargs: getattr(s, __name)(*args, **kwargs)),
+                     use_self=self,
+                 )
+             elif isinstance(meta_attr, self._tensor_type):
+                 # e.g. self.T with torch.Tensor should still be wrapped
+                 return type(self)._wrap_fn(lambda s: getattr(s, __name))(self)
+             else:
+                 # no need to wrap non-tensor properties,
+                 # and they likely don't depend on the actual contents of the tensor
+                 return meta_attr
+
+         namespace["__getattr__"] = __getattr__
+
+         # need to make a builder for the wrapped wrapper to copy the name,
+         # or else it fails with very cryptic error messages,
+         # because somehow the same string would end up in every closure
+         def mk_wrap(op_name: str, *, meta_noop: bool = False):
+             # need to wrap the wrapper to get self
+             def wrapped_special_op(self, *args, **kwargs):
+                 return type(self)._wrap_fn(
+                     getattr(type(self)._tensor_type, op_name),
+                     meta_noop=meta_noop,
+                 )(self, *args, **kwargs)
+             return wrapped_special_op
+
+         # special methods bypass __getattr__, so they need to be added manually
+         # ref: https://docs.python.org/3/reference/datamodel.html#special-lookup
+         # NOTE: doing this from a metaclass is very convenient
+         # TODO: make this even more comprehensive
+         for binary_op in (
+             "lt", "le", "eq", "ne", "ge", "gt", "not",
+             "abs", "add", "and", "floordiv", "invert", "lshift", "mod", "mul", "matmul",
+             "neg", "or", "pos", "pow", "rshift", "sub", "truediv", "xor",
+             "iadd", "iand", "ifloordiv", "ilshift", "imod", "imul", "ior", "irshift", "isub", "ixor",
+             "radd", "rand", "rfloordiv", "rmul", "ror", "rpow", "rsub", "rtruediv", "rxor",
+         ):
+             attr_name = f"__{binary_op}__"
+             # the result of these operators usually has the same shape and dtype as the input,
+             # so evaluation on the meta tensor can be skipped.
+             namespace[attr_name] = mk_wrap(attr_name, meta_noop=True)
+
+         for special_op in (
+             "getitem", "setitem", "len",
+         ):
+             attr_name = f"__{special_op}__"
+             namespace[attr_name] = mk_wrap(attr_name, meta_noop=False)
+
+         return super().__new__(cls, name, bases, namespace, **kwargs)
+
+
+ # Tree of lazy tensors
+ class LazyBase(ABC, metaclass=LazyMeta):
+     _tensor_type: type
+     _meta: Any
+     _data: Any | None
+     _lazy: deque[LazyBase]  # shared within a graph, to avoid deep recursion when making eager
+     _args: tuple
+     _func: Callable[[tuple], Any] | None
+
+     def __init__(self, *, meta: Any, data: Any | None = None, lazy: deque[LazyBase] | None = None, args: tuple = (), func: Callable[[tuple], Any] | None = None):
+         super().__init__()
+         self._meta = meta
+         self._data = data
+         self._lazy = lazy if lazy is not None else deque()
+         self._args = args
+         self._func = func
+         assert self._func is not None or self._data is not None
+         if self._data is None:
+             self._lazy.append(self)
+
+     def __init_subclass__(cls) -> None:
+         if "_tensor_type" not in cls.__dict__:
+             raise TypeError(f"property '_tensor_type' must be defined for {cls!r}")
+         return super().__init_subclass__()
+
+     @staticmethod
+     def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any:
+         # TODO: dict and set
+         if isinstance(o, (list, tuple)):
+             L = []
+             for item in o:
+                 L.append(LazyBase._recurse_apply(item, fn))
+             if isinstance(o, tuple):
+                 L = tuple(L)
+             return L
+         elif isinstance(o, LazyBase):
+             return fn(o)
+         else:
+             return o
+
+     @classmethod
+     def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]:
+         def wrapped_fn(*args, **kwargs):
+             if kwargs is None:
+                 kwargs = {}
+             args = ((use_self,) if use_self is not None else ()) + args
+
+             meta_args = LazyBase._recurse_apply(args, lambda t: t._meta)
+
+             if isinstance(meta_noop, bool) and not meta_noop:
+                 try:
+                     res = fn(*meta_args, **kwargs)
+                 except NotImplementedError:
+                     # running some operations on PyTorch's Meta tensors can cause this exception
+                     res = None
+             else:
+                 # some operators don't need to actually run on the meta tensors
+                 assert len(args) > 0
+                 res = args[0]
+                 assert isinstance(res, cls)
+                 res = res._meta
+                 # allow operations to override the dtype and shape
+                 if meta_noop is not True:
+                     if isinstance(meta_noop, tuple):
+                         dtype, shape = meta_noop
+                         assert callable(shape)
+                         res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape))
+                     else:
+                         res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)
+
+             if isinstance(res, cls._tensor_type):
+                 def collect_replace(t: LazyBase):
+                     if collect_replace.shared_lazy is None:
+                         collect_replace.shared_lazy = t._lazy
+                     else:
+                         collect_replace.shared_lazy.extend(t._lazy)
+                         t._lazy = collect_replace.shared_lazy
+
+                 # emulating a static variable
+                 collect_replace.shared_lazy = None
+
+                 LazyBase._recurse_apply(args, collect_replace)
+
+                 shared_lazy = collect_replace.shared_lazy
+
+                 return cls(meta=cls.eager_to_meta(res), lazy=shared_lazy, args=args, func=lambda a: fn(*a, **kwargs))
+             else:
+                 del res  # not needed
+                 # non-tensor return likely relies on the contents of the args
+                 # (e.g. the result of torch.equal)
+                 eager_args = cls.to_eager(args)
+                 return fn(*eager_args, **kwargs)
+         return wrapped_fn
+
+     @classmethod
+     def to_eager(cls, t: Any) -> Any:
+         def simple_to_eager(_t: LazyBase) -> Any:
+             def already_eager_to_eager(_t: LazyBase) -> Any:
+                 assert _t._data is not None
+                 return _t._data
+
+             while _t._data is None:
+                 lt = _t._lazy.popleft()
+                 if lt._data is not None:
+                     # Lazy tensor did not belong in the lazy queue.
+                     # Weirdly only happens with Bloom models...
+                     # likely because tensors aren't unique in the queue.
+                     # The final output is still the same as in eager mode,
+                     # so it's safe to ignore this.
+                     continue
+                 assert lt._func is not None
+                 lt._args = cls._recurse_apply(lt._args, already_eager_to_eager)
+                 lt._data = lt._func(lt._args)
+                 # sanity check
+                 assert lt._data.dtype == lt._meta.dtype
+                 assert lt._data.shape == lt._meta.shape
+
+             return _t._data
+
+         # recurse into lists and/or tuples, keeping their structure
+         return cls._recurse_apply(t, simple_to_eager)
+
+     @classmethod
+     def eager_to_meta(cls, t: Any) -> Any:
+         return cls.meta_with_dtype_and_shape(t.dtype, t.shape)
+
+     # must be overridden, meta tensor init is backend-specific
+     @classmethod
+     @abstractmethod
+     def meta_with_dtype_and_shape(cls, dtype: Any, shape: Any) -> Any: pass
+
+     @classmethod
+     def from_eager(cls, t: Any) -> Any:
+         if type(t) is cls:
+             # already lazy
+             return t
+         elif isinstance(t, cls._tensor_type):
+             return cls(meta=cls.eager_to_meta(t), data=t)
+         else:
+             raise TypeError(f"{type(t)!r} is not compatible with {cls._tensor_type!r}")
+
+
+ class LazyNumpyTensor(LazyBase):
+     _tensor_type = np.ndarray
+
+     @classmethod
+     def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: _Shape) -> np.ndarray[Any, Any]:
+         # The initial idea was to use np.nan as the fill value,
+         # but non-float types like np.int16 can't use that.
+         # So zero it is.
+         cheat = np.zeros(1, dtype)
+         return np.lib.stride_tricks.as_strided(cheat, shape, (0 for _ in shape))
+
+     def astype(self, dtype, *args, **kwargs):
+         meta = type(self).meta_with_dtype_and_shape(dtype, self._meta.shape)
+         full_args = (self, dtype,) + args
+         # very important to pass the shared _lazy deque, or else there's an infinite loop somewhere.
+         return type(self)(meta=meta, args=full_args, lazy=self._lazy, func=(lambda a: a[0].astype(*a[1:], **kwargs)))
+
+     def tofile(self, *args, **kwargs):
+         eager = LazyNumpyTensor.to_eager(self)
+         return eager.tofile(*args, **kwargs)
+
+     # TODO: __array_function__
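
The lazy machinery above only records dtypes and shapes (via zero-strided "meta" arrays) and defers the real computation until `to_eager` is called. The sketch below is a minimal illustration of that behaviour, not part of the package; it assumes the wheel's `bigdl/cpp/gguf-py` directory has been added to `sys.path` so the code above is importable as the `gguf` package.

```python
import numpy as np

# Assumption: the gguf-py directory shipped in this wheel is importable as `gguf`,
# e.g. after sys.path.insert(0, "/path/to/site-packages/bigdl/cpp/gguf-py")
from gguf.lazy import LazyNumpyTensor

# Wrap an eager array; from here on only dtype/shape are tracked in _meta.
x = LazyNumpyTensor.from_eager(np.arange(8, dtype=np.float32))

# Arithmetic and astype() build deferred nodes instead of computing anything.
y = (x * 2).astype(np.float16)

print(y._meta.dtype, y._meta.shape)  # float16 (8,) -- no real work done yet

# to_eager() drains the shared deque of pending nodes and materializes the result.
print(LazyNumpyTensor.to_eager(y))   # [ 0.  2.  4.  6.  8. 10. 12. 14.]
```

The `meta_with_dtype_and_shape` trick of striding a single zero element over an arbitrary shape keeps the meta tensors essentially free, no matter how large a shape they claim to have.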
bigdl/cpp/gguf-py/gguf/py.typed (file without changes)
bigdl/cpp/gguf-py/gguf/quants.py
@@ -0,0 +1,123 @@
+ from __future__ import annotations
+ from typing import Callable, Sequence
+
+ from numpy.typing import DTypeLike
+
+ from .constants import GGML_QUANT_SIZES, GGMLQuantizationType
+ from .lazy import LazyNumpyTensor
+
+ import numpy as np
+
+
+ def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
+     block_size, type_size = GGML_QUANT_SIZES[quant_type]
+     if shape[-1] % block_size != 0:
+         raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
+     return (*shape[:-1], shape[-1] // block_size * type_size)
+
+
+ def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
+     block_size, type_size = GGML_QUANT_SIZES[quant_type]
+     if shape[-1] % type_size != 0:
+         raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
+     return (*shape[:-1], shape[-1] // type_size * block_size)
+
+
+ # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
+ def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
+     n = n.astype(np.float32, copy=False).view(np.int32)
+     # force nan to quiet
+     n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
+     # flush subnormals to zero
+     n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
+     # round to nearest even
+     n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
+     return n.astype(np.int16)
+
+
+ # This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
+ def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
+     rows = arr.reshape((-1, arr.shape[-1]))
+     osize = 1
+     for dim in oshape:
+         osize *= dim
+     out = np.empty(shape=osize, dtype=otype)
+     # compute over groups of 16 rows (arbitrary, but seems good for performance)
+     n_groups = (rows.shape[0] // 16) or 1
+     np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)
+     return out.reshape(oshape)
+
+
+ def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:
+     return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.int16, oshape=n.shape)
+
+
+ __quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.int16)
+
+
+ def quantize_bf16(n: np.ndarray):
+     if type(n) is LazyNumpyTensor:
+         return __quantize_bf16_lazy(n)
+     else:
+         return __quantize_bf16_array(n)
+
+
+ __q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]
+
+
+ def can_quantize_to_q8_0(n: np.ndarray) -> bool:
+     return n.shape[-1] % __q8_block_size == 0
+
+
+ # round away from zero
+ # ref: https://stackoverflow.com/a/59143326/22827863
+ def np_roundf(n: np.ndarray) -> np.ndarray:
+     a = abs(n)
+     floored = np.floor(a)
+     b = floored + np.floor(2 * (a - floored))
+     return np.sign(n) * b
+
+
+ def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
+     return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size)
+
+
+ # Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
+ def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
+     shape = n.shape
+     assert shape[-1] % __q8_block_size == 0
+
+     n_blocks = n.size // __q8_block_size
+
+     blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False)
+
+     d = abs(blocks).max(axis=1, keepdims=True) / 127
+     with np.errstate(divide="ignore"):
+         id = np.where(d == 0, 0, 1 / d)
+     qs = np_roundf(blocks * id)
+
+     # (n_blocks, 2)
+     d = d.astype(np.float16).view(np.uint8)
+     # (n_blocks, block_size)
+     qs = qs.astype(np.int8).view(np.uint8)
+
+     assert d.shape[1] + qs.shape[1] == __q8_type_size
+
+     return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape))
+
+
+ def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray:
+     return __apply_over_grouped_rows(__quantize_q8_0_rows, arr=n, otype=np.uint8, oshape=__quantize_q8_0_shape_change(n.shape))
+
+
+ __quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn(
+     __quantize_q8_0_array,
+     meta_noop=(np.uint8, __quantize_q8_0_shape_change),
+ )
+
+
+ def quantize_q8_0(data: np.ndarray):
+     if type(data) is LazyNumpyTensor:
+         return __quantize_q8_0_lazy(data)
+     else:
+         return __quantize_q8_0_array(data)
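
As the code above shows, a Q8_0 block packs 32 float32 values into 34 bytes: a float16 scale `d` followed by 32 int8 quants, each original value being approximately `d * q`. The hedged sketch below (not part of the package) quantizes a small array with the functions above and decodes it again; as in the earlier example it assumes `bigdl/cpp/gguf-py` is on `sys.path` so the modules import as `gguf`.

```python
import numpy as np

# Assumption: bigdl/cpp/gguf-py is on sys.path, so the code above imports as `gguf`.
from gguf.constants import GGML_QUANT_SIZES, GGMLQuantizationType
from gguf.quants import quant_shape_to_byte_shape, quantize_q8_0

block_size, type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]  # 32 values -> 34 bytes

data = np.random.default_rng(0).normal(size=(16, 64)).astype(np.float32)

q = quantize_q8_0(data)
# Each row of 64 floats becomes 2 blocks of 34 bytes, so (16, 68) uint8.
assert q.shape == quant_shape_to_byte_shape(data.shape, GGMLQuantizationType.Q8_0)

# Decode: per block, bytes 0..1 hold the float16 scale, bytes 2..33 the int8 quants.
blocks = q.reshape(-1, type_size)
d = blocks[:, :2].copy().view(np.float16).astype(np.float32)   # (n_blocks, 1)
qs = blocks[:, 2:].copy().view(np.int8).astype(np.float32)     # (n_blocks, 32)
restored = (d * qs).reshape(data.shape)

# Q8_0 is lossy; the round-trip is close but not exact.
print(np.max(np.abs(restored - data)))
```

`quant_shape_from_byte_shape` above is the inverse of this shape mapping, recovering the logical row length from the stored bytes per row.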