bigdl-core-cpp 2.5.0rc1__py3-none-win_amd64.whl → 2.6.0b20230911__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. bigdl/cpp/{convert-hf-to-gguf.py → convert_hf_to_gguf.py} +413 -67
  2. bigdl/cpp/convert_hf_to_gguf_update.py +354 -0
  3. bigdl/cpp/convert_llama_ggml_to_gguf.py +454 -0
  4. bigdl/cpp/convert_lora_to_gguf.py +393 -0
  5. bigdl/cpp/gguf-py/gguf/__init__.py +1 -1
  6. bigdl/cpp/gguf-py/gguf/constants.py +71 -2
  7. bigdl/cpp/gguf-py/gguf/gguf_writer.py +16 -1
  8. bigdl/cpp/gguf-py/gguf/lazy.py +4 -1
  9. bigdl/cpp/gguf-py/gguf/metadata.py +70 -63
  10. bigdl/cpp/gguf-py/gguf/quants.py +1129 -64
  11. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +23 -15
  12. bigdl/cpp/gguf-py/gguf/utility.py +1 -1
  13. bigdl/cpp/gguf-py/gguf/vocab.py +301 -1
  14. bigdl/cpp/libs/common.lib +0 -0
  15. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/ggml.dll +0 -0
  16. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/llama.dll +0 -0
  17. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/ollama_llama_server.exe +0 -0
  18. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ggml.dll +0 -0
  19. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/llama.dll +0 -0
  20. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ollama_llama_server.exe +0 -0
  21. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ggml.dll +0 -0
  22. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/llama.dll +0 -0
  23. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ollama_llama_server.exe +0 -0
  24. bigdl/cpp/libs/ggml.dll +0 -0
  25. bigdl/cpp/libs/llama-batched.exe +0 -0
  26. bigdl/cpp/libs/llama-bench.exe +0 -0
  27. bigdl/cpp/libs/llama-cli.exe +0 -0
  28. bigdl/cpp/libs/llama-embedding.exe +0 -0
  29. bigdl/cpp/libs/llama-gguf.exe +0 -0
  30. bigdl/cpp/libs/llama-llava-cli.exe +0 -0
  31. bigdl/cpp/libs/llama-lookup.exe +0 -0
  32. bigdl/cpp/libs/{ls-sycl-device.exe → llama-ls-sycl-device.exe} +0 -0
  33. bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
  34. bigdl/cpp/libs/llama-perplexity.exe +0 -0
  35. bigdl/cpp/libs/llama-quantize.exe +0 -0
  36. bigdl/cpp/libs/llama-server.exe +0 -0
  37. bigdl/cpp/libs/llama-simple.exe +0 -0
  38. bigdl/cpp/libs/llama-speculative.exe +0 -0
  39. bigdl/cpp/libs/llama-tokenize.exe +0 -0
  40. bigdl/cpp/libs/llama.dll +0 -0
  41. bigdl/cpp/libs/llava_shared.dll +0 -0
  42. bigdl/cpp/libs/ollama.exe +0 -0
  43. {bigdl_core_cpp-2.5.0rc1.data → bigdl_core_cpp-2.6.0b20230911.data}/scripts/init-llama-cpp.bat +7 -2
  44. {bigdl_core_cpp-2.5.0rc1.data → bigdl_core_cpp-2.6.0b20230911.data}/scripts/init-ollama.bat +6 -0
  45. {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0b20230911.dist-info}/METADATA +3 -3
  46. bigdl_core_cpp-2.6.0b20230911.dist-info/RECORD +54 -0
  47. bigdl/cpp/convert.py +0 -1714
  48. bigdl/cpp/libs/baby-llama.exe +0 -0
  49. bigdl/cpp/libs/batched-bench.exe +0 -0
  50. bigdl/cpp/libs/batched.exe +0 -0
  51. bigdl/cpp/libs/beam-search.exe +0 -0
  52. bigdl/cpp/libs/benchmark.exe +0 -0
  53. bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
  54. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
  55. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
  56. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
  57. bigdl/cpp/libs/embedding.exe +0 -0
  58. bigdl/cpp/libs/export-lora.exe +0 -0
  59. bigdl/cpp/libs/finetune.exe +0 -0
  60. bigdl/cpp/libs/ggml_shared.dll +0 -0
  61. bigdl/cpp/libs/gguf.exe +0 -0
  62. bigdl/cpp/libs/gritlm.exe +0 -0
  63. bigdl/cpp/libs/imatrix.exe +0 -0
  64. bigdl/cpp/libs/infill.exe +0 -0
  65. bigdl/cpp/libs/llava-cli.exe +0 -0
  66. bigdl/cpp/libs/lookahead.exe +0 -0
  67. bigdl/cpp/libs/lookup.exe +0 -0
  68. bigdl/cpp/libs/main.exe +0 -0
  69. bigdl/cpp/libs/parallel.exe +0 -0
  70. bigdl/cpp/libs/passkey.exe +0 -0
  71. bigdl/cpp/libs/perplexity.exe +0 -0
  72. bigdl/cpp/libs/q8dot.exe +0 -0
  73. bigdl/cpp/libs/quantize-stats.exe +0 -0
  74. bigdl/cpp/libs/quantize.exe +0 -0
  75. bigdl/cpp/libs/save-load-state.exe +0 -0
  76. bigdl/cpp/libs/server.exe +0 -0
  77. bigdl/cpp/libs/simple.exe +0 -0
  78. bigdl/cpp/libs/speculative.exe +0 -0
  79. bigdl/cpp/libs/tokenize.exe +0 -0
  80. bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
  81. bigdl/cpp/libs/vdot.exe +0 -0
  82. bigdl_core_cpp-2.5.0rc1.dist-info/RECORD +0 -63
  83. {bigdl_core_cpp-2.5.0rc1.data → bigdl_core_cpp-2.6.0b20230911.data}/scripts/init-llama-cpp.ps1 +0 -0
  84. {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0b20230911.dist-info}/WHEEL +0 -0
  85. {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0b20230911.dist-info}/top_level.txt +0 -0
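The largest change shown below is item 10, bigdl/cpp/gguf-py/gguf/quants.py: the old one-off helpers (quantize_bf16, quantize_q8_0, can_quantize_to_q8_0) are replaced by a per-type trait class (__Quant) whose subclasses register themselves in a _type_traits table, so callers go through two generic entry points, quantize(data, qtype) and dequantize(data, qtype). A minimal sketch of how those entry points are meant to be called follows; this example is not part of the package, and it assumes the bundled gguf-py directory is importable as `gguf` (e.g. after the init-llama-cpp script has set things up):

    import numpy as np
    from gguf import quants
    from gguf.constants import GGMLQuantizationType

    # Q8_0 packs each row in blocks of 32 values (2-byte fp16 scale + 32 int8),
    # so the last dimension of the input must be a multiple of 32.
    data = np.random.random((16, 64)).astype(np.float32)

    packed = quants.quantize(data, GGMLQuantizationType.Q8_0)        # raw uint8 blocks
    restored = quants.dequantize(packed, GGMLQuantizationType.Q8_0)  # back to float32

    assert restored.shape == data.shape  # byte shape maps back to the original shape

In this snapshot, quantize_blocks is implemented for BF16 and the Q4_0/Q4_1/Q5_0/Q5_1/Q8_0 block formats; the K-quants and IQ types below provide dequantize_blocks only, so quantizing to them raises NotImplementedError, and QuantError is raised when a tensor's row size is not a multiple of the block size.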
@@ -1,123 +1,1188 @@
  from __future__ import annotations
- from typing import Callable, Sequence
+ from abc import ABC, abstractmethod
+ from typing import Any, Callable, Sequence
+ from math import log2, ceil

  from numpy.typing import DTypeLike

- from .constants import GGML_QUANT_SIZES, GGMLQuantizationType
+ from .constants import GGML_QUANT_SIZES, GGMLQuantizationType, QK_K
  from .lazy import LazyNumpyTensor

  import numpy as np


- def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
+ def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]:
      block_size, type_size = GGML_QUANT_SIZES[quant_type]
      if shape[-1] % block_size != 0:
          raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
      return (*shape[:-1], shape[-1] // block_size * type_size)


- def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
+ def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]:
      block_size, type_size = GGML_QUANT_SIZES[quant_type]
      if shape[-1] % type_size != 0:
          raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
      return (*shape[:-1], shape[-1] // type_size * block_size)


- # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
- def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
-     n = n.astype(np.float32, copy=False).view(np.int32)
-     # force nan to quiet
-     n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
-     # flush subnormals to zero
-     n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
-     # round to nearest even
-     n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
-     return n.astype(np.int16)
-
-
  # This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
- def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
+ def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
      rows = arr.reshape((-1, arr.shape[-1]))
      osize = 1
      for dim in oshape:
          osize *= dim
      out = np.empty(shape=osize, dtype=otype)
      # compute over groups of 16 rows (arbitrary, but seems good for performance)
-     n_groups = rows.shape[0] // 16
+     n_groups = (rows.shape[0] // 16) or 1
      np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)
      return out.reshape(oshape)


- def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:
-     return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.int16, oshape=n.shape)
+ # round away from zero
+ # ref: https://stackoverflow.com/a/59143326/22827863
+ def np_roundf(n: np.ndarray) -> np.ndarray:
+     a = abs(n)
+     floored = np.floor(a)
+     b = floored + np.floor(2 * (a - floored))
+     return np.sign(n) * b
+
+
+ class QuantError(Exception): ...


- __quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.int16)
+ _type_traits: dict[GGMLQuantizationType, type[__Quant]] = {}


- def quantize_bf16(n: np.ndarray):
-     if type(n) is LazyNumpyTensor:
-         return __quantize_bf16_lazy(n)
+ def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
+     if qtype == GGMLQuantizationType.F32:
+         return data.astype(np.float32, copy=False)
+     elif qtype == GGMLQuantizationType.F16:
+         return data.astype(np.float16, copy=False)
+     elif (q := _type_traits.get(qtype)) is not None:
+         return q.quantize(data)
      else:
-         return __quantize_bf16_array(n)
+         raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented")


- __q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]
+ def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
+     if qtype == GGMLQuantizationType.F32:
+         return data.view(np.float32)
+     elif qtype == GGMLQuantizationType.F16:
+         return data.view(np.float16).astype(np.float32)
+     elif (q := _type_traits.get(qtype)) is not None:
+         return q.dequantize(data)
+     else:
+         raise NotImplementedError(f"Dequantization for {qtype.name} is not yet implemented")


- def can_quantize_to_q8_0(n: np.ndarray) -> bool:
-     return n.shape[-1] % __q8_block_size == 0
+ class __Quant(ABC):
+     qtype: GGMLQuantizationType
+     block_size: int
+     type_size: int

+     grid: np.ndarray[Any, np.dtype[np.float32]] | None = None
+     grid_shape: tuple[int, int] = (0, 0)
+     grid_map: tuple[int | float, ...] = ()
+     grid_hex: bytes | None = None

- # round away from zero
- # ref: https://stackoverflow.com/a/59143326/22827863
- def np_roundf(n: np.ndarray) -> np.ndarray:
-     a = abs(n)
-     floored = np.floor(a)
-     b = floored + np.floor(2 * (a - floored))
-     return np.sign(n) * b
+     def __init__(self):
+         return TypeError("Quant conversion classes can't have instances")

+     def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None:
+         cls.qtype = qtype
+         cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype]
+         cls.__quantize_lazy = LazyNumpyTensor._wrap_fn(
+             cls.__quantize_array,
+             meta_noop=(np.uint8, cls.__shape_to_bytes)
+         )
+         cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn(
+             cls.__dequantize_array,
+             meta_noop=(np.float32, cls.__shape_from_bytes)
+         )
+         assert qtype not in _type_traits
+         _type_traits[qtype] = cls

- def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
-     return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size)
+     @classmethod
+     def init_grid(cls):
+         if cls.grid is not None or cls.grid_hex is None:
+             return

+         bits_per_elem = ceil(log2(len(cls.grid_map)))
+         assert bits_per_elem != 0, cls.qtype.name
+         elems_per_byte = 8 // bits_per_elem

- # Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
- def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
-     shape = n.shape
-     assert shape[-1] % __q8_block_size == 0
+         grid = np.frombuffer(cls.grid_hex, dtype=np.uint8)
+         # decode hexadecimal chars from grid
+         grid = grid.reshape((-1, 2))
+         grid = (np.where(grid > 0x40, grid + 9, grid) & 0x0F) << np.array([4, 0], dtype=np.uint8).reshape((1, 2))
+         grid = grid[..., 0] | grid[..., 1]
+         # unpack the grid values
+         grid = grid.reshape((-1, 1)) >> np.array([i for i in range(0, 8, 8 // elems_per_byte)], dtype=np.uint8).reshape((1, elems_per_byte))
+         grid = (grid & ((1 << bits_per_elem) - 1)).reshape((-1, 1))
+         grid_map = np.array(cls.grid_map, dtype=np.float32).reshape((1, -1))
+         grid = np.take_along_axis(grid_map, grid, axis=-1)
+         cls.grid = grid.reshape((1, 1, *cls.grid_shape))

-     n_blocks = n.size // __q8_block_size
+     @classmethod
+     @abstractmethod
+     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         raise NotImplementedError

-     blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False)
+     @classmethod
+     @abstractmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         raise NotImplementedError

-     d = abs(blocks).max(axis=1, keepdims=True) / 127
-     with np.errstate(divide="ignore"):
-         id = np.where(d == 0, 0, 1 / d)
-     qs = np_roundf(blocks * id)
+     @classmethod
+     def quantize_rows(cls, rows: np.ndarray) -> np.ndarray:
+         rows = rows.astype(np.float32, copy=False)
+         shape = rows.shape
+         n_blocks = rows.size // cls.block_size
+         blocks = rows.reshape((n_blocks, cls.block_size))
+         blocks = cls.quantize_blocks(blocks)
+         assert blocks.dtype == np.uint8
+         assert blocks.shape[-1] == cls.type_size
+         return blocks.reshape(cls.__shape_to_bytes(shape))

-     # (n_blocks, 2)
-     d = d.astype(np.float16).view(np.uint8)
-     # (n_blocks, block_size)
-     qs = qs.astype(np.int8).view(np.uint8)
+     @classmethod
+     def dequantize_rows(cls, rows: np.ndarray) -> np.ndarray:
+         rows = rows.view(np.uint8)
+         shape = rows.shape
+         n_blocks = rows.size // cls.type_size
+         blocks = rows.reshape((n_blocks, cls.type_size))
+         blocks = cls.dequantize_blocks(blocks)
+         assert blocks.dtype == np.float32
+         assert blocks.shape[-1] == cls.block_size
+         return blocks.reshape(cls.__shape_from_bytes(shape))

-     assert d.shape[1] + qs.shape[1] == __q8_type_size
+     @classmethod
+     def __shape_to_bytes(cls, shape: Sequence[int]):
+         return quant_shape_to_byte_shape(shape, cls.qtype)

-     return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape))
+     @classmethod
+     def __shape_from_bytes(cls, shape: Sequence[int]):
+         return quant_shape_from_byte_shape(shape, cls.qtype)

+     @classmethod
+     def __quantize_array(cls, array: np.ndarray) -> np.ndarray:
+         return _apply_over_grouped_rows(cls.quantize_rows, arr=array, otype=np.uint8, oshape=cls.__shape_to_bytes(array.shape))

- def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray:
-     return __apply_over_grouped_rows(__quantize_q8_0_rows, arr=n, otype=np.uint8, oshape=__quantize_q8_0_shape_change(n.shape))
+     @classmethod
+     def __dequantize_array(cls, array: np.ndarray) -> np.ndarray:
+         cls.init_grid()
+         return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape))

+     @classmethod
+     def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
+         pass

- __quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn(
-     __quantize_q8_0_array,
-     meta_noop=(np.uint8, __quantize_q8_0_shape_change),
- )
+     @classmethod
+     def __dequantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
+         pass

+     @classmethod
+     def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool:
+         return tensor.shape[-1] % cls.block_size == 0

- def quantize_q8_0(data: np.ndarray):
-     if type(data) is LazyNumpyTensor:
-         return __quantize_q8_0_lazy(data)
-     else:
-         return __quantize_q8_0_array(data)
+     @classmethod
+     def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
+         if not cls.can_quantize(tensor):
+             raise QuantError(f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}")
+         if isinstance(tensor, LazyNumpyTensor):
+             return cls.__quantize_lazy(tensor)
+         else:
+             return cls.__quantize_array(tensor)
+
+     @classmethod
+     def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
+         if isinstance(tensor, LazyNumpyTensor):
+             return cls.__dequantize_lazy(tensor)
+         else:
+             return cls.__dequantize_array(tensor)
+
+
+ class BF16(__Quant, qtype=GGMLQuantizationType.BF16):
+     @classmethod
+     # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
+     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n = blocks.view(np.uint32)
+         # force nan to quiet
+         n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n)
+         # round to nearest even
+         n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16
+         return n.astype(np.uint16).view(np.uint8)
+
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32)
+
+
+ class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0):
+     @classmethod
+     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         imax = abs(blocks).argmax(axis=-1, keepdims=True)
+         max = np.take_along_axis(blocks, imax, axis=-1)
+
+         d = max / -8
+         with np.errstate(divide="ignore"):
+             id = np.where(d == 0, 0, 1 / d)
+         # FIXME: Q4_0's reference rounding is cursed and depends on FMA
+         qs = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(8.5), dtype=np.float32).astype(np.uint8).clip(0, 15)
+
+         qs = qs.reshape((n_blocks, 2, cls.block_size // 2))
+         qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4))
+
+         d = d.astype(np.float16).view(np.uint8)
+
+         return np.concatenate([d, qs], axis=-1)
+
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         d, qs = np.hsplit(blocks, [2])
+
+         d = d.view(np.float16).astype(np.float32)
+
+         qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+         qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1)).astype(np.int8) - np.int8(8)
+
+         return (d * qs.astype(np.float32))
+
+
+ class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1):
+     @classmethod
+     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         max = blocks.max(axis=-1, keepdims=True)
+         min = blocks.min(axis=-1, keepdims=True)
+
+         d = (max - min) / 15
+         with np.errstate(divide="ignore"):
+             id = np.where(d == 0, 0, 1 / d)
+         qs = np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32).astype(np.uint8).clip(0, 15)
+
+         qs = qs.reshape((n_blocks, 2, cls.block_size // 2))
+         qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4))
+
+         d = d.astype(np.float16).view(np.uint8)
+         m = min.astype(np.float16).view(np.uint8)
+
+         return np.concatenate([d, m, qs], axis=-1)
+
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         d, rest = np.hsplit(blocks, [2])
+         m, qs = np.hsplit(rest, [2])
+
+         d = d.view(np.float16).astype(np.float32)
+         m = m.view(np.float16).astype(np.float32)
+
+         qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+         qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1)).astype(np.float32)
+
+         return (d * qs) + m
+
+
+ class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0):
+     @classmethod
+     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         imax = abs(blocks).argmax(axis=-1, keepdims=True)
+         max = np.take_along_axis(blocks, imax, axis=-1)
+
+         d = max / -16
+         with np.errstate(divide="ignore"):
+             id = np.where(d == 0, 0, 1 / d)
+         # FIXME: Q5_0's reference rounding is cursed and depends on FMA
+         q = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(16.5), dtype=np.float32).astype(np.uint8).clip(0, 31)
+
+         qs = q.reshape((n_blocks, 2, cls.block_size // 2))
+         qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4))
+
+         qh = np.packbits(q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little").reshape(n_blocks, 4)
+
+         d = d.astype(np.float16).view(np.uint8)
+
+         return np.concatenate([d, qh, qs], axis=-1)
+
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         d, rest = np.hsplit(blocks, [2])
+         qh, qs = np.hsplit(rest, [4])
+
+         d = d.view(np.float16).astype(np.float32)
+         qh = qh.view(np.uint32)
+
+         qh = qh.reshape((n_blocks, 1)) >> np.array([i for i in range(32)], dtype=np.uint32).reshape((1, 32))
+         ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+         qh = (qh & np.uint32(0x01)).astype(np.uint8)
+         ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1))
+
+         qs = (ql | (qh << np.uint8(4))).astype(np.int8) - np.int8(16)
+
+         return (d * qs.astype(np.float32))
+
+
+ class Q5_1(__Quant, qtype=GGMLQuantizationType.Q5_1):
+     @classmethod
+     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         max = blocks.max(axis=-1, keepdims=True)
+         min = blocks.min(axis=-1, keepdims=True)
+
+         d = (max - min) / 31
+         with np.errstate(divide="ignore"):
+             id = np.where(d == 0, 0, 1 / d)
+         q = np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32).astype(np.uint8).clip(0, 31)
+
+         qs = q.reshape((n_blocks, 2, cls.block_size // 2))
+         qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4))
+
+         qh = np.packbits(q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little").reshape(n_blocks, 4)
+
+         d = d.astype(np.float16).view(np.uint8)
+         m = min.astype(np.float16).view(np.uint8)
+
+         return np.concatenate([d, m, qh, qs], axis=-1)
+
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         d, rest = np.hsplit(blocks, [2])
+         m, rest = np.hsplit(rest, [2])
+         qh, qs = np.hsplit(rest, [4])
+
+         d = d.view(np.float16).astype(np.float32)
+         m = m.view(np.float16).astype(np.float32)
+         qh = qh.view(np.uint32)
+
+         qh = qh.reshape((n_blocks, 1)) >> np.array([i for i in range(32)], dtype=np.uint32).reshape((1, 32))
+         ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+         qh = (qh & np.uint32(0x01)).astype(np.uint8)
+         ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1))
+
+         qs = (ql | (qh << np.uint8(4))).astype(np.float32)
+
+         return (d * qs) + m
+
+
+ class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0):
+     @classmethod
+     # Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
+     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+
+         d = abs(blocks).max(axis=1, keepdims=True) / 127
+         with np.errstate(divide="ignore"):
+             id = np.where(d == 0, 0, 1 / d)
+         qs = np_roundf(blocks * id)
+
+         # (n_blocks, 2)
+         d = d.astype(np.float16).view(np.uint8)
+         # (n_blocks, block_size)
+         qs = qs.astype(np.int8).view(np.uint8)
+
+         return np.concatenate([d, qs], axis=1)
+
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         d, x = np.split(blocks, [2], axis=1)
+         d = d.view(np.float16).astype(np.float32)
+         x = x.view(np.int8).astype(np.float32)
+
+         return (x * d)
+
+
+ class Q2_K(__Quant, qtype=GGMLQuantizationType.Q2_K):
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         scales, rest = np.hsplit(blocks, [QK_K // 16])
+         qs, rest = np.hsplit(rest, [QK_K // 4])
+         d, dmin = np.hsplit(rest, [2])
+
+         d = d.view(np.float16).astype(np.float32)
+         dmin = dmin.view(np.float16).astype(np.float32)
+
+         # (n_blocks, 16, 1)
+         dl = (d * (scales & 0xF).astype(np.float32)).reshape((n_blocks, QK_K // 16, 1))
+         ml = (dmin * (scales >> 4).astype(np.float32)).reshape((n_blocks, QK_K // 16, 1))
+
+         shift = np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
+
+         qs = (qs.reshape((n_blocks, -1, 1, 32)) >> shift) & np.uint8(3)
+
+         qs = qs.reshape((n_blocks, QK_K // 16, 16)).astype(np.float32)
+
+         qs = dl * qs - ml
+
+         return qs.reshape((n_blocks, -1))
+
+
+ class Q3_K(__Quant, qtype=GGMLQuantizationType.Q3_K):
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         hmask, rest = np.hsplit(blocks, [QK_K // 8])
+         qs, rest = np.hsplit(rest, [QK_K // 4])
+         scales, d = np.hsplit(rest, [12])
+
+         d = d.view(np.float16).astype(np.float32)
+
+         # The scales are packed at 6-bit each in this pattern:
+         #  0: IIIIAAAA
+         #  1: JJJJBBBB
+         #  2: KKKKCCCC
+         #  3: LLLLDDDD
+         #  4: MMMMEEEE
+         #  5: NNNNFFFF
+         #  6: OOOOGGGG
+         #  7: PPPPHHHH
+         #  8: MMIIEEAA
+         #  9: NNJJFFBB
+         # 10: OOKKGGCC
+         # 11: PPLLHHDD
+         lscales, hscales = np.hsplit(scales, [8])
+         lscales = lscales.reshape((n_blocks, 1, 8)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 2, 1))
+         lscales = lscales.reshape((n_blocks, 16))
+         hscales = hscales.reshape((n_blocks, 1, 4)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 4, 1))
+         hscales = hscales.reshape((n_blocks, 16))
+         scales = (lscales & np.uint8(0x0F)) | ((hscales & np.uint8(0x03)) << np.uint8(4))
+         scales = (scales.astype(np.int8) - np.int8(32)).astype(np.float32)
+
+         dl = (d * scales).reshape((n_blocks, 16, 1))
+
+         ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
+         qh = hmask.reshape(n_blocks, -1, 1, 32) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8, 1))
+         ql = ql.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(3)
+         qh = (qh.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(1))
+         qh = qh ^ np.uint8(1)  # strangely, the offset is zero when the bitmask is 1
+         q = (ql.astype(np.int8) - (qh << np.uint8(2)).astype(np.int8)).astype(np.float32)
+
+         return (dl * q).reshape((n_blocks, QK_K))
+
+
+ class Q4_K(__Quant, qtype=GGMLQuantizationType.Q4_K):
+     K_SCALE_SIZE = 12
+
+     @staticmethod
+     def get_scale_min(scales: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+         n_blocks = scales.shape[0]
+         scales = scales.view(np.uint8)
+         ### Unpacking the following: ###
+         #  0 EEAAAAAA
+         #  1 FFBBBBBB
+         #  2 GGCCCCCC
+         #  3 HHDDDDDD
+         #  4 eeaaaaaa
+         #  5 ffbbbbbb
+         #  6 ggcccccc
+         #  7 hhdddddd
+         #  8 eeeeEEEE
+         #  9 ffffFFFF
+         # 10 ggggGGGG
+         # 11 hhhhHHHH
+         scales = scales.reshape((n_blocks, 3, 4))
+         d, m, m_d = np.split(scales, 3, axis=-2)
+
+         sc = np.concatenate([d & 0x3F, (m_d & 0x0F) | ((d >> 2) & 0x30)], axis=-1)
+         min = np.concatenate([m & 0x3F, (m_d >> 4) | ((m >> 2) & 0x30)], axis=-1)
+
+         return (sc.reshape((n_blocks, 8)), min.reshape((n_blocks, 8)))
+
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         d, rest = np.hsplit(blocks, [2])
+         dmin, rest = np.hsplit(rest, [2])
+         scales, qs = np.hsplit(rest, [cls.K_SCALE_SIZE])
+
+         d = d.view(np.float16).astype(np.float32)
+         dmin = dmin.view(np.float16).astype(np.float32)
+
+         sc, m = Q4_K.get_scale_min(scales)
+
+         d = (d * sc.astype(np.float32)).reshape((n_blocks, -1, 1))
+         dm = (dmin * m.astype(np.float32)).reshape((n_blocks, -1, 1))
+
+         qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+         qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1, 32)).astype(np.float32)
+
+         return (d * qs - dm).reshape((n_blocks, QK_K))
+
+
+ class Q5_K(__Quant, qtype=GGMLQuantizationType.Q5_K):
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         d, rest = np.hsplit(blocks, [2])
+         dmin, rest = np.hsplit(rest, [2])
+         scales, rest = np.hsplit(rest, [Q4_K.K_SCALE_SIZE])
+         qh, qs = np.hsplit(rest, [QK_K // 8])
+
+         d = d.view(np.float16).astype(np.float32)
+         dmin = dmin.view(np.float16).astype(np.float32)
+
+         sc, m = Q4_K.get_scale_min(scales)
+
+         d = (d * sc.astype(np.float32)).reshape((n_blocks, -1, 1))
+         dm = (dmin * m.astype(np.float32)).reshape((n_blocks, -1, 1))
+
+         ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+         qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8, 1))
+         ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1, 32))
+         qh = (qh & np.uint8(0x01)).reshape((n_blocks, -1, 32))
+         q = (ql | (qh << np.uint8(4))).astype(np.float32)
+
+         return (d * q - dm).reshape((n_blocks, QK_K))
+
+
+ class Q6_K(__Quant, qtype=GGMLQuantizationType.Q6_K):
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         ql, rest = np.hsplit(blocks, [QK_K // 2])
+         qh, rest = np.hsplit(rest, [QK_K // 4])
+         scales, d = np.hsplit(rest, [QK_K // 16])
+
+         scales = scales.view(np.int8).astype(np.float32)
+         d = d.view(np.float16).astype(np.float32)
+         d = (d * scales).reshape((n_blocks, QK_K // 16, 1))
+
+         ql = ql.reshape((n_blocks, -1, 1, 64)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+         ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1, 32))
+         qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
+         qh = (qh & np.uint8(0x03)).reshape((n_blocks, -1, 32))
+         q = (ql | (qh << np.uint8(4))).astype(np.int8) - np.int8(32)
+         q = q.reshape((n_blocks, QK_K // 16, -1)).astype(np.float32)
+
+         return (d * q).reshape((n_blocks, QK_K))
+
+
+ class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS):
+     ksigns: bytes = (
+         b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f"
+         b"\x90\x11\x12\x93\x14\x95\x96\x17\x18\x99\x9a\x1b\x9c\x1d\x1e\x9f"
+         b"\xa0\x21\x22\xa3\x24\xa5\xa6\x27\x28\xa9\xaa\x2b\xac\x2d\x2e\xaf"
+         b"\x30\xb1\xb2\x33\xb4\x35\x36\xb7\xb8\x39\x3a\xbb\x3c\xbd\xbe\x3f"
+         b"\xc0\x41\x42\xc3\x44\xc5\xc6\x47\x48\xc9\xca\x4b\xcc\x4d\x4e\xcf"
+         b"\x50\xd1\xd2\x53\xd4\x55\x56\xd7\xd8\x59\x5a\xdb\x5c\xdd\xde\x5f"
+         b"\x60\xe1\xe2\x63\xe4\x65\x66\xe7\xe8\x69\x6a\xeb\x6c\xed\xee\x6f"
+         b"\xf0\x71\x72\xf3\x74\xf5\xf6\x77\x78\xf9\xfa\x7b\xfc\x7d\x7e\xff"
+     )
+
+     # iq2xxs_grid, but with each byte of the original packed in 2 bits,
+     # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2.
+     grid_shape = (256, 8)
+     grid_map = (0x08, 0x19, 0x2b)
+     grid_hex = (
+         b"00000200050008000a00110014002000220028002a0041004400500058006100"
+         b"6400800082008a00a20001010401100115014001840198010002020222028202"
+         b"010404041004210424044004420448046004810484049004a404000502050805"
+         b"200546056905800591050906100640068406a406000805080808140828084108"
+         b"440850085208880804094009020a140a01100410101021104010601084109010"
+         b"951000110811201150115a118011241245120014081420142514491480141815"
+         b"6215001616160118041810184018811800190519a019511a002002200a204420"
+         b"6120802082202921482100220222012404241024402456240025412564259026"
+         b"082820289428442a014004401040184021402440404048405640604081408440"
+         b"9040004120416141804185410142104248425642684200440844204480449944"
+         b"124524450046014804481048404845480049584961498249454a904a00500850"
+         b"1150195020508050885004514251a4519152905492540a550156545600581158"
+         b"195864584059085a046010604060686000615561186260620064056410651265"
+         b"84654268008002800a8041808280048118814081118201840484108415844084"
+         b"608400854685948509864086608602880489118a0490109024904090a1901691"
+         b"8091459200942294449451958198209902a050a085a009a100a218a450a804a9"
+     )
+
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         d, qs = np.hsplit(blocks, [2])
+
+         d = d.view(np.float16).astype(np.float32)
+
+         qs = qs.view(np.uint32).reshape(n_blocks, -1, 2)
+
+         db = d * (np.float32(0.5) + (qs[..., 1] >> 28).astype(np.float32)) * np.float32(0.25)
+         db = db.reshape((n_blocks, -1, 1, 1))
+
+         # get the sign indices and unpack the bits
+         signs = qs[..., 1].reshape((n_blocks, -1, 1)) >> np.array([0, 7, 14, 21], dtype=np.uint32).reshape((1, 1, 4))
+         ksigns = np.frombuffer(cls.ksigns, dtype=np.uint8).reshape((1, 1, 1, 128))
+         signs = (signs & np.uint32(0x7F)).reshape((n_blocks, -1, 4, 1))
+         signs = np.take_along_axis(ksigns, signs, axis=-1)
+         signs = signs.reshape((n_blocks, -1, 4, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 1, 8))
+         signs = signs & np.uint8(0x01)
+         signs = np.where(signs == 0, np.float32(1), np.float32(-1))
+         signs = signs.reshape((n_blocks, -1, 4, 8))
+
+         assert cls.grid is not None
+         grid = np.take_along_axis(cls.grid, qs[..., 0].copy().view(np.uint8).reshape((n_blocks, -1, 1, 1)), axis=-2)
+         grid = grid.reshape((n_blocks, -1, 4, 8))
+
+         return (db * grid * signs).reshape((n_blocks, -1))
+
+
+ class IQ2_XS(__Quant, qtype=GGMLQuantizationType.IQ2_XS):
+     # iq2xs_grid, but with each byte of the original packed in 2 bits,
+     # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2.
+     grid_shape = (512, 8)
+     grid_map = (0x08, 0x19, 0x2b)
+     grid_hex = (
+         b"00000200050008000a0011001400160019002000220025002800410044004600"
+         b"49005000520055005800610064008000820085008800910094009900a0000101"
+         b"04010601090110011201150118011a0121012401400142014501480151015401"
+         b"6001680181018401900100020202050208021102140220024102440250025502"
+         b"80028a0201040404060409041004120415041804210424044004420445044804"
+         b"5104540456046004810484049004000502050505080511051405200541054405"
+         b"500561058005010604061006260640064206840600080208050808080a081108"
+         b"14082008250841084408500858088008a008aa08010904091009400981098909"
+         b"000a200a280a960aa00a01100410061009101010121015101810211024104010"
+         b"4210451048105110541060106a10811084109010001102110511081111111411"
+         b"2011411144115011801194119611011204120612101240126012001402140514"
+         b"0814111414142014411444144914501464148014011504151015401500161416"
+         b"49160118041810181218401854188618001905196619511aa91a002002200520"
+         b"08200a201120142020204120442050208020a020012104211021402148216521"
+         b"002222228022a82201240424102429244024002541255225992501261a26a626"
+         b"002808280a28202855288828a22868299029082a202a822a882a8a2a01400440"
+         b"0640094010401240154018402140244040404240454048404a40514054406040"
+         b"6540814084409040004102410541084111411441204141414441504180418541"
+         b"a241014204421042124229424042004402440544084411441444194420444144"
+         b"4444504480449444014504451045244540459a4500460a464446504601480448"
+         b"1048404845485448624800491149444950496949044a00500250055008501150"
+         b"145020502850415044505050805001510451105115514051425100524452aa52"
+         b"0154045410542154405460548154a154005508558055885521566856a1560058"
+         b"14584158505899581a5940594259855a0160046010604060546062608660a960"
+         b"006124624a62926200641664106540654565a46501686a682569066a546a626a"
+         b"00800280058008801180148020802a8041804480508080808280a880aa800181"
+         b"0481068110814081518159810082208280828282a082a8820184048410841284"
+         b"158440846084898400854485a58518866a860088088825885a8880888288a888"
+         b"0689228a808a888a968aa88a0190049010904090569084900091229164915692"
+         b"89920094059444945094589429959095929541965198a6984999159a609a00a0"
+         b"02a008a00aa020a02aa0a0a051a159a1a6a100a202a208a22aa280a2a0a240a4"
+         b"95a465a698a60aa820a822a828a8a0a8a8a804a984a986a928aa2aaa91aaaaaa"
+     )
+
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         d, rest = np.hsplit(blocks, [2])
+         qs, scales = np.hsplit(rest, [2 * QK_K // 8])
+
+         d = d.view(np.float16).astype(np.float32)
+         qs = qs.view(np.uint16)
+
+         scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
+         scales = (scales & 0x0F).reshape((n_blocks, -1))
+         db = d * (np.float32(0.5) + scales) * np.float32(0.25)
+         db = db.reshape((n_blocks, -1, 1, 1))
+
+         # get the sign indices and unpack the bits
+         signs = np.frombuffer(IQ2_XXS.ksigns, dtype=np.uint8).reshape(1, 1, 128)
+         signs = np.take_along_axis(signs, (qs >> 9).reshape((n_blocks, -1, 1)), axis=-1)
+         signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8))
+         signs = signs & np.uint8(0x01)
+         signs = np.where(signs == 0, np.float32(1), np.float32(-1))
+         signs = signs.reshape((n_blocks, -1, 2, 8))
+
+         assert cls.grid is not None
+         grid = np.take_along_axis(cls.grid, (qs & np.uint16(511)).reshape((n_blocks, -1, 1, 1)), axis=-2)
+         grid = grid.reshape((n_blocks, -1, 2, 8))
+
+         return (db * grid * signs).reshape((n_blocks, -1))
+
+
+ class IQ2_S(__Quant, qtype=GGMLQuantizationType.IQ2_S):
+     # iq2s_grid, but with each byte of the original packed in 2 bits,
+     # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2.
+     grid_shape = (1024, 8)
+     grid_map = (0x08, 0x19, 0x2b)
+     grid_hex = (
+         b"00000200050008000a0011001400160019002000220025002800410044004600"
+         b"490050005200550058006100640066006900800082008500880091009400a000"
+         b"a500aa0001010401060109011001120115011801210124014001420145014801"
+         b"510154015601590160016501680181018401900192019501a101a40100020202"
+         b"050208021102140220022a02410244024602490250025502800285028a029402"
+         b"a202010404040604090410041204150418042104240426042904400442044504"
+         b"48044a0451045404560459046004620465048104840486048904900495049804"
+         b"a104a40400050205050508050a05110514051605190520052505280541054405"
+         b"46054905500552055505580561056405800582058505880591059405a0050106"
+         b"0406060609061006150640064506480651065406600681068406900600080208"
+         b"050808081108140816081908200825082a084108440846084908500852085508"
+         b"580861086408800885089408aa08010904091009120915091809210940094509"
+         b"480951095409600981099009000a110a140a220a280a2a0a500a990a01100410"
+         b"0610091010101210151018102110241026104010421045104810511054105610"
+         b"59106010621065106810811084108610901095109810a110a410001102110511"
+         b"08110a1111111411161119112011221125112811411144114611491150115211"
+         b"5511581161116411801182118511881191119411011204120912101215122112"
+         b"2412401245125112541281128412901200140214051408141114141416141914"
+         b"2014251428144114441446144914501452145514581461146414801482148514"
+         b"881491149414a014011504150615091510151215151518152115241540154215"
+         b"4515481551155415601581158415901500160516081611161416201641164416"
+         b"50168016aa160118041806180918101815181818211840184218451848185118"
+         b"541860188118841800190219051908191119141920194119441950196919a219"
+         b"041a101a401a561a00200220052008201120142016201920202025202a204120"
+         b"4420502052205520642080208a209420aa200121042110211221152121214021"
+         b"4221452151215421602181218421902100220a22222228222a22442250228822"
+         b"8a22a82201240424062409241024152418242124242440244224452448245124"
+         b"5424602481248424902400250525082511251425202541254425502566258025"
+         b"0126042610264026592600280528112814284128442850288a28aa2801290429"
+         b"102995290a2a222a642a882a8a2a014004400640094010401240154018401a40"
+         b"21402440264040404240454048404a4051405440564059406040624065408140"
+         b"8440904095409840a140a4400041024105410841114114411641194120412241"
+         b"2541414144414641494150415241554158416141644180418241854188419141"
+         b"9441a04101420442104212421542184224424042454248425142544260428142"
+         b"844200440244054408440a441144144416441944204422442544284441444444"
+         b"46444944504452445544584461446444804482448544884491449444a0440145"
+         b"0445064509451045124515451845214524454045424545454845514554456045"
+         b"6a4581458445904500460246054608461146144620464146444650468046a546"
+         b"0148044809481048124815481848214824484048424845484848514854486048"
+         b"84489048004902490549084911491449204941494449504980499649014a044a"
+         b"104a404a00500250055008501150145016501950205022502550285041504450"
+         b"4650495050505250555058506150645080508250855088509150945001510451"
+         b"0651095110511251155118512151245140514251455148515151545160518151"
+         b"8451905100520552085211521452205241524452505269528052015404540654"
+         b"0954105412541554185421542454405442544554485451545454605481548454"
+         b"9054005502550555085511551455205541554455505580550156045610562656"
+         b"405600580258055808581158145820584158445850585a588058015904591059"
+         b"4059005a195a855aa85a01600460066010601260156018602160246040604560"
+         b"4860516054606060846090600061026105610861116114612061416144615061"
+         b"806199610462106240625662a162006405640864116414642064416444645064"
+         b"806401650465106540654a656865926500669466016804681068656898680069"
+         b"2a69426aa16a0080028005800880118014801980208025804180448050805280"
+         b"5580588061808080858091809480018104810981108112811581188121812481"
+         b"408142814581488151815481818184819081a981008205820a82118214824182"
+         b"4482508201840484068409841084128415841884218440844284458448845184"
+         b"5484608481848484908400850285058508851185148520854185448550858085"
+         b"8a85018604861086298640860088058811881488418844885088a28801890489"
+         b"40896589228a588a5a8a828aa28a019004900990109012901590189024904090"
+         b"4290459048905190549060908190849090900091059111911491419144915091"
+         b"5a910192049210924092a6920094029405940894119414942094419444945094"
+         b"8094969401950495109540959895a19500964696649601980498109826984098"
+         b"a998009949995299909a00a005a00aa014a022a02aa041a044a050a0a2a0aaa0"
+         b"40a165a102a20aa222a228a22aa282a288a28aa2a8a201a404a410a440a489a4"
+         b"a4a400a519a551a60aa828a8a2a854a986a908aa0aaa20aa22aa28aa88aaaaaa"
+     )
+
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         d, rest = np.hsplit(blocks, [2])
+         qs, rest = np.hsplit(rest, [QK_K // 8])
+         signs, rest = np.hsplit(rest, [QK_K // 8])
+         qh, scales = np.hsplit(rest, [QK_K // 32])
+
+         d = d.view(np.float16).astype(np.float32)
+
+         scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
+         scales = (scales & 0x0F).reshape((n_blocks, -1))
+         db = d * (np.float32(0.5) + scales) * np.float32(0.25)
+         db = db.reshape((n_blocks, -1, 1, 1))
+
+         # unpack the sign bits
+         signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8))
+         signs = signs & np.uint8(0x01)
+         signs = np.where(signs == 0, np.float32(1), np.float32(-1))
+         signs = signs.reshape((n_blocks, -1, 2, 8))
+
+         qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4))
+         qs = qs.astype(np.uint16) | ((qh & 0x03).astype(np.uint16) << 8).reshape((n_blocks, -1))
+
+         assert cls.grid is not None
+         grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2)
+         grid = grid.reshape((n_blocks, -1, 2, 8))
+
+         return (db * grid * signs).reshape((n_blocks, -1))
+
+
+ class IQ3_XXS(__Quant, qtype=GGMLQuantizationType.IQ3_XXS):
+     grid_shape = (256, 4)
+     grid_map = (0x04, 0x0c, 0x14, 0x1c, 0x24, 0x2c, 0x34, 0x3e)
+     grid_hex = (
+         b"0000020004001100130017002000220031004200730075000101030110011201"
+         b"2101250130013201410154017001000202020402110220022202310233023702"
+         b"5102570275020103070310031203250370031304370444045704730475040105"
+         b"0705320552053506640610071407160743076107011003101010121021102310"
+         b"3010321034104710501000110211111120112211011203121012121221123012"
+         b"7212001302132013311346136613011405145014201524154615711505162217"
+         b"4017002002201120132020202220262031204220012103210521102112212121"
+         b"3021632167217021002202221122172220222222372240225522012310231423"
+         b"7023742335245324032527254125742501270327162745270130103012302130"
+         b"2330503065307230003102312031313144314631013203321032253252327232"
+         b"1133333330344734723400350635223555351436363663363337603704401740"
+         b"3540374053405740744120423742404260426642074345430444514464442545"
+         b"4345704505471047124730471250415070500051065126515551145232527252"
+         b"0253535310542354275472540255315550562457425724604460466064602161"
+         b"6161176264623063366344640565526533660367216703700570077010703270"
+         b"5270267140711272457252720073157333736073217441740075027524753076"
+     )
+
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         d, rest = np.hsplit(blocks, [2])
+         qs, scales = np.hsplit(rest, [QK_K // 4])
+
+         d = d.view(np.float16).astype(np.float32)
+         scales = scales.view(np.uint32)
+
+         db = d * (np.float32(0.5) + (scales >> 28).astype(np.float32)) * np.float32(0.5)
+         db = db.reshape((n_blocks, -1, 1, 1))
+
+         # get the sign indices and unpack the bits
+         signs = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 7, 14, 21], dtype=np.uint32).reshape((1, 1, 4))
+         ksigns = np.frombuffer(IQ2_XXS.ksigns, dtype=np.uint8).reshape((1, 1, 1, 128))
+         signs = (signs & np.uint32(0x7F)).reshape((n_blocks, -1, 4, 1))
+         signs = np.take_along_axis(ksigns, signs, axis=-1)
+         signs = signs.reshape((n_blocks, -1, 4, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 1, 8))
+         signs = signs & np.uint8(0x01)
+         signs = np.where(signs == 0, np.float32(1), np.float32(-1))
+         signs = signs.reshape((n_blocks, -1, 4, 8))
+
+         assert cls.grid is not None
+         grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2)
+         grid = grid.reshape((n_blocks, -1, 4, 8))
+
+         return (db * grid * signs).reshape((n_blocks, -1))
+
+
+ class IQ3_S(__Quant, qtype=GGMLQuantizationType.IQ3_S):
+     grid_shape = (512, 4)
+     grid_map = (0x01, 0x03, 0x05, 0x07, 0x09, 0x0b, 0x0d, 0x0f)
+     grid_hex = (
+         b"0000010002000500070010001100120014001600200021002500330040004200"
+         b"4500470051005300600062007100740077000001010102010401100111011501"
+         b"2001230127013101350144016101650172010002010205020702100213021602"
+         b"2102250230023402420245024702510253027002730203031103150320032203"
+         b"3103330336034403500352036703710375030004130417042104240432044004"
+         b"4304510470040205040520052205260533054105450547056605730506061106"
+         b"1306310652067106000702070407200722072607330750075407001001100210"
+         b"0410101011101310151017102010221031103410361054105610611072100011"
+         b"0111031106111011141121113011331141115011521170117611001212121512"
+         b"1712201224123212401243125512601272120113041307131013131321132713"
+         b"3013341341136213701303140514121414143114331442144614501454140115"
+         b"1015131521153015321551152016241627164416461601170317101712172117"
+         b"3517411762177017002001200320052007201020122014201620212023202720"
+         b"3020322041204320452050205220672070207320752000210221102113211721"
+         b"2221252131213421422151210122042207222122232230223722412253225722"
+         b"7122742200230223052311232223242331233323422350236623012407242024"
+         b"2324322435244124722475240425112522253725402553257025002602260726"
+         b"2126552661260527112726273027432750270230113013301530173022303130"
+         b"3330353042304430473051306330713001310331053114312131233140316031"
+         b"7231763100321232203232323432503201331033143321332333273330334133"
+         b"4333473355337333033411341634223431345234603464340135103512352535"
+         b"3235443556357335163641360137033720372237353700400440124020402440"
+         b"2740324041405040704002410741114113412241304135414341514155410142"
+         b"0342104215422142334240425742624270420443114313432043224331433543"
+         b"0044024424443744404471440545074521456245134634466046104715473047"
+         b"4347514702501050145022504050445047505250665074500151035105511251"
+         b"2151325172510052115223523052365253520253075310532753445351536553"
+         b"7353015404542054325446541255265551555355425602570457225711601360"
+         b"1560316033606060006120612761646112623462426255626262706200631463"
+         b"2163406325644364626400650365346560650566406611671367007004700770"
+         b"2070227036704070547062700271117124714371457101720472107216722172"
+         b"3072517202733273357353730174057413742074507422754275027631760077"
+     )
+
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         d, rest = np.hsplit(blocks, [2])
+         qs, rest = np.hsplit(rest, [QK_K // 4])
+         qh, rest = np.hsplit(rest, [QK_K // 32])
+         signs, scales = np.hsplit(rest, [QK_K // 8])
+
+         d = d.view(np.float16).astype(np.float32)
+
+         scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
+         scales = (scales & 0x0F).reshape((n_blocks, -1))
+         db = d * (1 + 2 * scales)
+         db = db.reshape((n_blocks, -1, 1, 1))
+
+         # unpack the sign bits
+         signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8))
+         signs = signs & np.uint8(0x01)
+         signs = np.where(signs == 0, np.float32(1), np.float32(-1))
+         signs = signs.reshape((n_blocks, -1, 4, 8))
+
+         qh = qh.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8)
+         qh = (qh & 0x01).astype(np.uint16).reshape((n_blocks, -1))
+         qs = qs.astype(np.uint16) | (qh << 8)
+
+         assert cls.grid is not None
+         grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2)
+         grid = grid.reshape((n_blocks, -1, 4, 8))
+
+         return (db * grid * signs).reshape((n_blocks, -1))
+
+
+ class IQ1_S(__Quant, qtype=GGMLQuantizationType.IQ1_S):
+     # iq1s_grid, with each byte packed into 2 bits
+     # -1, 0, 1 <=> 0, 1, 2
+     grid_shape = (2048, 8)
+     grid_map = (-1, 0, 1)
+     grid_hex = (
+         b"00000200050008000a00110015002000220028002a0045005100540056006500"
+         b"8000820088008a009500a000a200a800aa000401050111011401160119011a01"
+         b"2501410146014901520155015a0161016401660168018501910194019601a501"
+         b"0002020208020a0215022002220228022a024502510259026402690280028202"
+         b"88028a02910295029902a002a202a802aa021104140416042504410449045504"
+         b"5a046404650491049904a5040105040505050605150518051a05290540054505"
+         b"4a0550055105540555055605590560056205650568056a058105910595059805"
+         b"9a05a105a405a505a605a9051406190641064406500652065506580660066106"
+         b"6606690685069106940699060008020808080a0815082008220828082a084508"
+         b"5108560865088008820888088a089508a008a208a808aa080509110914091909"
+         b"2409250941095009510955096109640969099109940996099909a509000a020a"
+         b"080a0a0a150a200a220a280a2a0a450a510a590a610a650a800a820a850a880a"
+         b"8a0a950aa00aa20aa80aaa0a1010111014101910241025104110441050105510"
+         b"58106110641065106910911094109610a110a510011104110611091110111211"
+         b"1511181121112411291145114a11501151115211541155115611591160116511"
+         b"841192119511a111a41111121412161225124012461249125212551258125a12"
+         b"641266128512911294129612a512011406140914141415141814191421142614"
+         b"41144514461448144a1451145414551456145914621465146814841489149014"
+         b"94149514981499149a14a114a414a514a914021505150a151115141515151615"
+         b"191520152215251528152a154115441545154615511552155415551556155915"
+         b"5a1561156415651566156915801582158415851588158a159015911594159515"
+         b"961599159a15a015a215a51501160416051606161516161618161a1621162616"
+         b"401642164416451648164a165116551656165816591661166416651668166916"
+         b"6a1686168a1692169516a416a916111816182518411844184618491850185518"
+         b"58185a1860186118641866186918851891189418a5181019121915191a192119"
+         b"25194219441945194819511954195519561959195a19601965196a1989199119"
+         b"921995199819a119a619a919091a161a241a261a441a461a491a501a521a551a"
+         b"581a611a661a691a851a911a961a9a1a0020022008200a201520202022202520"
+         b"28202a20452051205920612065208020822088208a209520a020a220a520a820"
+         b"aa2005211121142119212521422144214921552158215a216121642165216621"
+         b"8521902196219921a521012208220a22112215222022222228222a2245225122"
+         b"562259226522812288228a2291229522a022a222a822aa220524142416241924"
+         b"252444244524462449245224552458245a2466248524912494249924a124a524"
+         b"0925152521252925402545254825512554255525592562256525682589259025"
+         b"9425952598259a25a125a425a625a92505261026122619262526412649265526"
+         b"6026612669268426862690269a260028022808280a2815282028222828282a28"
+         b"45285128542865288028822888288a28a028a228a828aa280929112914291929"
+         b"2529462949295229552961296429662969298529902996299929a429a529002a"
+         b"022a082a0a2a202a222a282a2a2a452a512a562a592a652a802a822a882a8a2a"
+         b"952aa02aa22aa82aaa2a054011401640254049405240554058405a4061406440"
+         b"664094409940a140a6400041014104410641094112411541164118411a412141"
+         b"26412941454148414a41514154415541564159415a41654168416a4181418441"
+         b"8641904192419541a041a141a241054211421442164225424142524255425a42"
+         b"6442694289429442a5420144154419442944454448444a445144544455445644"
+         b"61446244654468446a44814486448944904492449544a044a144a94401450245"
+         b"05450a4511451445154516451945204525452a45414544454545464549455045"
+         b"5145544555455645584559456145644565456645694582458445854588459145"
+         b"94459545964599459a45a545a845aa450146054609461446154618461a462146"
+         b"2446294640464246454648465046514652465546564659466246654668468146"
+         b"85468a4694469546a146a446a6460548114815481a4825484248494850485548"
+         b"5848614864486648694885489148944896489948a5480149054906490a491049"
+         b"144915491849214924492649404945494a495149524954495549564959496049"
+         b"6249654966496a49864989499249954996499849a149a449a649a949164a444a"
+         b"464a494a554a584a5a4a644a694a944aa54a0150045005500650095012501550"
+         b"1a50215024502950405045504850515054505550565059506550685086508950"
+         b"95509850a050a150a650a9500551085109510a51115114511551165118511951"
+         b"20512551265128512a5141514451455146514951505151515251545155515651"
+         b"585159515a51615164516551665169518251855191519451955196519951a051"
+         b"a551aa5101520652125215521a5221522452425245524a525152545255525652"
+         b"595262526552855290529252955299529a52a452045405541154145415541654"
+         b"185419542154255428542a54415444544554465449544a545054515454545554"
+         b"5654585459545a54615462546454655466546954805488548a54915494549554"
+         b"96549954a154a454a554aa540155025504550555065509551055115512551455"
+         b"1555165519551a55215524552555265529554055415542554455455546554855"
+         b"4955505551555255545555555655585559555a55605561556455655566556855"
+         b"69556a5581558455855589558a559055915594559555965598559955a155a455"
+         b"a555a655a9550056015602560456065608560956115614561556185619562056"
+         b"2156225624562556265628562956415645564656485649564a56505651565256"
+         b"545655565656585659565a566156645665566956825685568656885689568a56"
+         b"915695569a56a256a556a656a856a95604580558065809581058155818582158"
+         b"2a58455848584a58515854585558565858585958605862586458655882588958"
+         b"9058925895589858a158a9580159025905590a59115914591559165919592559"
+         b"41594459455946594959505951595259545955595659585959595a5961596459"
+         b"655966596959815985598959915994599559965998599959a559045a085a155a"
+         b"1a5a205a255a265a295a455a485a495a515a555a565a585a595a625a655a685a"
+         b"6a5a815a8a5a925a955a965a985a9a5aa15a0560146016601960256044605060"
+         b"5560566058605a60616064606660696081609660a56001610461066109611261"
+         b"15612161226126612961456149615161556156615961656166616a6184618a61"
+         b"92619561a161a661a96111621662196240624162466255625662586260628562"
+         b"91629662a56211641264156416641a6421642664296440644264456448644a64"
+         b"516454645564566459645a646064626465648464856489649064926494649564"
+         b"966498649a64a164a464a964056508650a651165156516651965446545654665"
+         b"496550655165546555655665596561656465656566656965866589658a659165"
+         b"9565966599659a65a265a565a665a86502660966156620662666286629664066"
+         b"456648664a66516654665566566658665a666066656668668066826685668a66"
+         b"9466966698669966a066a466a666aa661668196825684168526855685a686168"
+         b"6968856891689868a66801690469106915692169246926692969406941694569"
+         b"4669486951695469556956695969606965696a69826984698a699569a169a469"
+         b"a569a969116a166a186a416a446a496a506a556a586a5a6a646a656a696a866a"
+         b"946a986a9a6aa66a0080028008800a802080228028802a804580508051805480"
+         b"5680598065808080828088808a809580a080a280a880aa800581118114811681"
+         b"1981258141814481498150815281558156815881598164816681698185818981"
+         b"948196819981a5810082028208820a8215822082228228822a82518254825982"
+         b"65828082828288828a829582a082a282a882aa82148419844184448451845584"
+         b"5a846184648469849484998401850985128515851a8526852985408541854585"
+         b"4885518554855585568559855a856585668568856a8581858485868589859085"
+         b"928595859885a68511861686198625864186448649864a865086558659865a86"
+         b"618666866a86858691869a86a4860088028808880a8815882088228828882a88"
+         b"41884588518854885988658869888088828888888a889588a088a288a888aa88"
+         b"05890689118914891689258941894489468949895089528955895a8961896489"
+         b"858996899989a589008a028a088a0a8a158a208a228a288a2a8a458a518a548a"
+         b"568a808a828a888a8a8a958aa08aa28aa88aaa8a059011901690189019902590"
+         b"419046904990559058905a9069906a9085909190949096909990a59001910491"
+         b"069109911091159118911a912191249126912991409145915091519154915591"
+         b"569159916291659184918691929195919891a191a491a691a991059211921492"
+         b"19922592449246924992509252925592589266926992859294929692a9920194"
+         b"04940694109415941894269440944a9451945494559456945894599460946194"
+         b"62946594849486949294949495949894a194a9940095059508950a9510951195"
+         b"14951595169519952195259529952a9541954495459546954995509551955295"
+         b"549555955695589559955a956195649565956695699581958595889591959295"
+         b"94959595969599959a95a095a295a595a895aa95019604961096159619962096"
+         b"2696299645964896499651965296559656965996659668968296849689968a96"
+         b"929694969596a496a696a9960598169819982598419846985098529855985698"
+         b"5a98649865988598919896989998a59804990699099910991299159918991a99"
+         b"209921992499269940994299459948994a995199549955995699599962996599"
+         b"66996a99819984999099929995999a99a199a699059a159a259a449a469a499a"
+         b"509a559a589a619a859a919a949a959a969a00a002a008a00aa015a020a022a0"
+         b"28a02aa045a051a054a056a059a080a082a088a08aa095a0a0a0a2a0a8a0aaa0"
+         b"05a109a111a114a116a119a11aa146a149a151a155a158a15aa161a164a185a1"
1064
+ b"90a192a196a199a102a208a20aa210a219a222a228a22aa245a251a256a259a2"
1065
+ b"65a280a282a288a28aa295a2a0a2a2a2a8a2aaa219a425a441a444a450a454a4"
1066
+ b"55a458a45aa461a465a466a468a469a485a406a509a510a512a515a518a526a5"
1067
+ b"29a542a545a551a554a555a556a559a565a56aa581a584a585a586a589a592a5"
1068
+ b"95a598a505a611a616a61aa621a625a644a646a64aa652a655a656a658a660a6"
1069
+ b"62a686a690a695a696a699a6a1a6a4a6a6a600a802a808a80aa820a822a828a8"
1070
+ b"2aa851a854a856a859a880a882a888a88aa895a8a0a8a2a8a8a8aaa805a914a9"
1071
+ b"19a921a925a941a950a955a95aa961a966a969a990a996a900aa02aa08aa0aaa"
1072
+ b"20aa22aa28aa2aaa51aa54aa56aa80aa82aa88aa8aaa95aaa0aaa2aaa8aaaaaa"
1073
+ )
1074
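+     # Note (inferred from the code below, not from any spec text): the byte
+     # string above completes the hex-encoded codebook shared by IQ1_S and
+     # IQ1_M. The dequantizers address it with 11-bit indices (one qs byte
+     # plus three high bits taken from qh), i.e. 2048 rows of 8 values each,
+     # with entries drawn from grid_map.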
+
+     delta = np.float32(0.125)
+
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         d, rest = np.hsplit(blocks, [2])
+         qs, qh = np.hsplit(rest, [QK_K // 8])
+
+         d = d.view(np.float16).astype(np.float32)
+         qh = qh.view(np.uint16)
+
+         # Bits 12-14 of each qh word hold a 3-bit sub-block scale; bit 15
+         # selects the sign applied to delta.
+         dl = d * (2 * ((qh >> 12) & 7) + 1)
+         dl = dl.reshape((n_blocks, -1, 1, 1))
+         delta = np.where((qh & np.uint16(0x8000)) == 0, cls.delta, -cls.delta)
+         delta = delta.reshape((n_blocks, -1, 1, 1))
+
+         # Bits 0-11 of each qh word supply four 3-bit high parts, one per qs
+         # byte, forming the 11-bit grid indices.
+         qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 3, 6, 9], dtype=np.uint16).reshape((1, 1, 4))
+         qs = qs.astype(np.uint16) | ((qh & 7) << 8).reshape((n_blocks, -1))
+
+         assert cls.grid is not None
+         grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2)
+         grid = grid.reshape((n_blocks, -1, 4, 8))
+
+         return (dl * (grid + delta)).reshape((n_blocks, -1))
+
+
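+ # Layout recap for IQ1_S, inferred from dequantize_blocks above (a sketch
+ # assuming QK_K == 256, not normative documentation):
+ #
+ #   block = d (2 bytes, f16) | qs (QK_K // 8 = 32 bytes) | qh (8 x uint16)
+ #
+ # Each qh word governs four qs bytes (32 weights): bits 0-11 hold four 3-bit
+ # high index parts, bits 12-14 a sub-block scale, and bit 15 the sign applied
+ # to delta. That is 50 bytes per 256 weights, i.e. 1.5625 bits per weight.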
+ class IQ1_M(__Quant, qtype=GGMLQuantizationType.IQ1_M):
+     grid_shape = IQ1_S.grid_shape
+     grid_map = IQ1_S.grid_map
+     grid_hex = IQ1_S.grid_hex
+
+     delta = IQ1_S.delta
+
+     # Okay *this* type is weird. It's the only one which stores the f16 scales
+     # in multiple parts; see the nibble-packing example after this class.
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         qs, rest = np.hsplit(blocks, [QK_K // 8])
+         qh, scales = np.hsplit(rest, [QK_K // 16])
+
+         # The f16 scale is packed across multiple bytes: each of the four
+         # uint16 scale words contributes its top nibble to the block scale d.
+         scales = scales.view(np.uint16)
+         d = (scales.reshape((n_blocks, 4)) & np.uint16(0xF000)) >> np.array([12, 8, 4, 0], dtype=np.uint16).reshape((1, 4))
+         d = d[..., 0] | d[..., 1] | d[..., 2] | d[..., 3]
+         d = d.view(np.float16).astype(np.float32).reshape((n_blocks, 1))
+
+         # The remaining bits of the scale words hold 3-bit sub-block scales.
+         scales = scales.reshape(n_blocks, -1, 1) >> np.array([0, 3, 6, 9], dtype=np.uint16).reshape((1, 1, 4))
+         scales = (scales & 0x07).reshape((n_blocks, -1))
+         dl = d * (2 * scales + 1)
+         dl = dl.reshape((n_blocks, -1, 2, 1, 1))
+
+         # Each qh byte carries two 4-bit fields: 3 high index bits plus a
+         # delta sign bit.
+         qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
+         qs = qs.astype(np.uint16) | ((qh & 0x07).astype(np.uint16) << 8).reshape((n_blocks, -1))
+
+         delta = np.where(qh & 0x08 == 0, cls.delta, -cls.delta)
+         delta = delta.reshape((n_blocks, -1, 2, 2, 1))
+
+         assert cls.grid is not None
+         grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2)
+         grid = grid.reshape((n_blocks, -1, 2, 2, 8))
+
+         return (dl * (grid + delta)).reshape((n_blocks, -1))
+
+
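+ # Worked example of the IQ1_M scale reassembly above (hypothetical nibble
+ # values, purely illustrative): if the four uint16 scale words have top
+ # nibbles 0x3, 0x5, 0x2 and 0x4, the shifts [12, 8, 4, 0] place them at bits
+ # 0-3, 4-7, 8-11 and 12-15 respectively, so the OR-reduction yields 0x4253,
+ # which is then reinterpreted as the f16 block scale d.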
+ class IQ4_NL(__Quant, qtype=GGMLQuantizationType.IQ4_NL):
+     # Non-linear 4-bit codebook: each 4-bit code selects one of these int8 levels.
+     kvalues = (-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113)
+
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         d, qs = np.hsplit(blocks, [2])
+
+         d = d.view(np.float16).astype(np.float32)
+
+         # Unpack the low and high nibbles of each byte into separate 4-bit codes.
+         qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+
+         qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1, 1))
+
+         kvalues = np.array(cls.kvalues, dtype=np.int8).reshape(1, 1, 16)
+         qs = np.take_along_axis(kvalues, qs, axis=-1).astype(np.float32).reshape((n_blocks, -1))
+
+         return (d * qs)
+
+
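+ # The IQ4_NL lookup above is plain table indexing: a 4-bit code c becomes
+ # kvalues[c] (e.g. c == 0 maps to -127, c == 15 to 113), scaled by the
+ # per-block f16 d. A toy check of the indexing (illustrative only):
+ #
+ #   k = np.array(IQ4_NL.kvalues, dtype=np.int8).reshape(1, 1, 16)
+ #   np.take_along_axis(k, np.array([[[0], [15]]]), axis=-1)
+ #   # -> array([[[-127], [113]]], dtype=int8)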
+ class IQ4_XS(__Quant, qtype=GGMLQuantizationType.IQ4_XS):
+     @classmethod
+     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+         n_blocks = blocks.shape[0]
+
+         d, rest = np.hsplit(blocks, [2])
+         scales_h, rest = np.hsplit(rest, [2])
+         scales_l, qs = np.hsplit(rest, [QK_K // 64])
+
+         d = d.view(np.float16).astype(np.float32)
+         scales_h = scales_h.view(np.uint16)
+
+         # Each sub-block scale is 6 bits: a low nibble from scales_l plus two
+         # high bits from scales_h, re-centered by subtracting 32.
+         scales_l = scales_l.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
+         scales_h = scales_h.reshape((n_blocks, 1, -1)) >> np.array([2 * i for i in range(QK_K // 32)], dtype=np.uint16).reshape((1, -1, 1))
+         scales_l = scales_l.reshape((n_blocks, -1)) & np.uint8(0x0F)
+         scales_h = scales_h.reshape((n_blocks, -1)).astype(np.uint8) & np.uint8(0x03)
+
+         scales = (scales_l | (scales_h << np.uint8(4))).astype(np.int8) - np.int8(32)
+         dl = (d * scales.astype(np.float32)).reshape((n_blocks, -1, 1))
+
+         # Nibble-unpack the codes and map them through the shared IQ4_NL codebook.
+         qs = qs.reshape((n_blocks, -1, 1, 16)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
+         qs = qs.reshape((n_blocks, -1, 32, 1)) & np.uint8(0x0F)
+
+         kvalues = np.array(IQ4_NL.kvalues, dtype=np.int8).reshape((1, 1, 1, -1))
+         qs = np.take_along_axis(kvalues, qs, axis=-1).astype(np.float32).reshape((n_blocks, -1, 32))
+
+         return (dl * qs).reshape((n_blocks, -1))
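+
+ # Usage sketch for these block dequantizers (illustrative; the reshape width
+ # is the IQ4_XS block byte size implied by the hsplits above, namely
+ # 2 + 2 + QK_K // 64 + QK_K // 2 = 136 bytes when QK_K == 256):
+ #
+ #   raw = np.frombuffer(tensor_bytes, dtype=np.uint8)  # hypothetical buffer
+ #   blocks = raw.reshape((-1, 136))
+ #   values = IQ4_XS.dequantize_blocks(blocks)          # shape (n_blocks, 256)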