bigdl-core-cpp 2.5.0rc1__py3-none-win_amd64.whl → 2.6.0b20240827__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. bigdl/cpp/{convert-hf-to-gguf.py → convert_hf_to_gguf.py} +413 -67
  2. bigdl/cpp/convert_hf_to_gguf_update.py +354 -0
  3. bigdl/cpp/convert_llama_ggml_to_gguf.py +454 -0
  4. bigdl/cpp/convert_lora_to_gguf.py +393 -0
  5. bigdl/cpp/gguf-py/gguf/__init__.py +1 -1
  6. bigdl/cpp/gguf-py/gguf/constants.py +71 -2
  7. bigdl/cpp/gguf-py/gguf/gguf_writer.py +16 -1
  8. bigdl/cpp/gguf-py/gguf/lazy.py +4 -1
  9. bigdl/cpp/gguf-py/gguf/metadata.py +70 -63
  10. bigdl/cpp/gguf-py/gguf/quants.py +1129 -64
  11. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +23 -15
  12. bigdl/cpp/gguf-py/gguf/utility.py +1 -1
  13. bigdl/cpp/gguf-py/gguf/vocab.py +301 -1
  14. bigdl/cpp/libs/common.lib +0 -0
  15. bigdl/cpp/libs/{gguf.exe → dist/windows-amd64/lib/ollama/runners/cpu/ggml.dll} +0 -0
  16. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/llama.dll +0 -0
  17. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/ollama_llama_server.exe +0 -0
  18. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ggml.dll +0 -0
  19. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/llama.dll +0 -0
  20. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ollama_llama_server.exe +0 -0
  21. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ggml.dll +0 -0
  22. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/llama.dll +0 -0
  23. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ollama_llama_server.exe +0 -0
  24. bigdl/cpp/libs/{ggml_shared.dll → ggml.dll} +0 -0
  25. bigdl/cpp/libs/llama-batched.exe +0 -0
  26. bigdl/cpp/libs/llama-bench.exe +0 -0
  27. bigdl/cpp/libs/llama-cli.exe +0 -0
  28. bigdl/cpp/libs/llama-embedding.exe +0 -0
  29. bigdl/cpp/libs/llama-gguf.exe +0 -0
  30. bigdl/cpp/libs/llama-llava-cli.exe +0 -0
  31. bigdl/cpp/libs/llama-lookup.exe +0 -0
  32. bigdl/cpp/libs/{ls-sycl-device.exe → llama-ls-sycl-device.exe} +0 -0
  33. bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
  34. bigdl/cpp/libs/llama-perplexity.exe +0 -0
  35. bigdl/cpp/libs/llama-quantize.exe +0 -0
  36. bigdl/cpp/libs/llama-server.exe +0 -0
  37. bigdl/cpp/libs/llama-simple.exe +0 -0
  38. bigdl/cpp/libs/llama-speculative.exe +0 -0
  39. bigdl/cpp/libs/llama-tokenize.exe +0 -0
  40. bigdl/cpp/libs/llama.dll +0 -0
  41. bigdl/cpp/libs/llava_shared.dll +0 -0
  42. bigdl/cpp/libs/ollama.exe +0 -0
  43. {bigdl_core_cpp-2.5.0rc1.data → bigdl_core_cpp-2.6.0b20240827.data}/scripts/init-llama-cpp.bat +7 -2
  44. {bigdl_core_cpp-2.5.0rc1.data → bigdl_core_cpp-2.6.0b20240827.data}/scripts/init-ollama.bat +6 -0
  45. {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0b20240827.dist-info}/METADATA +1 -1
  46. bigdl_core_cpp-2.6.0b20240827.dist-info/RECORD +54 -0
  47. bigdl/cpp/convert.py +0 -1714
  48. bigdl/cpp/libs/baby-llama.exe +0 -0
  49. bigdl/cpp/libs/batched-bench.exe +0 -0
  50. bigdl/cpp/libs/batched.exe +0 -0
  51. bigdl/cpp/libs/beam-search.exe +0 -0
  52. bigdl/cpp/libs/benchmark.exe +0 -0
  53. bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
  54. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
  55. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
  56. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
  57. bigdl/cpp/libs/embedding.exe +0 -0
  58. bigdl/cpp/libs/export-lora.exe +0 -0
  59. bigdl/cpp/libs/finetune.exe +0 -0
  60. bigdl/cpp/libs/gritlm.exe +0 -0
  61. bigdl/cpp/libs/imatrix.exe +0 -0
  62. bigdl/cpp/libs/infill.exe +0 -0
  63. bigdl/cpp/libs/llava-cli.exe +0 -0
  64. bigdl/cpp/libs/lookahead.exe +0 -0
  65. bigdl/cpp/libs/lookup.exe +0 -0
  66. bigdl/cpp/libs/main.exe +0 -0
  67. bigdl/cpp/libs/parallel.exe +0 -0
  68. bigdl/cpp/libs/passkey.exe +0 -0
  69. bigdl/cpp/libs/perplexity.exe +0 -0
  70. bigdl/cpp/libs/q8dot.exe +0 -0
  71. bigdl/cpp/libs/quantize-stats.exe +0 -0
  72. bigdl/cpp/libs/quantize.exe +0 -0
  73. bigdl/cpp/libs/save-load-state.exe +0 -0
  74. bigdl/cpp/libs/server.exe +0 -0
  75. bigdl/cpp/libs/simple.exe +0 -0
  76. bigdl/cpp/libs/speculative.exe +0 -0
  77. bigdl/cpp/libs/tokenize.exe +0 -0
  78. bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
  79. bigdl/cpp/libs/vdot.exe +0 -0
  80. bigdl_core_cpp-2.5.0rc1.dist-info/RECORD +0 -63
  81. {bigdl_core_cpp-2.5.0rc1.data → bigdl_core_cpp-2.6.0b20240827.data}/scripts/init-llama-cpp.ps1 +0 -0
  82. {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0b20240827.dist-info}/WHEEL +0 -0
  83. {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0b20240827.dist-info}/top_level.txt +0 -0
bigdl/cpp/convert_lora_to_gguf.py
@@ -0,0 +1,393 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ import logging
+ import argparse
+ import os
+ import sys
+ import json
+ from math import prod
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
+
+ import torch
+
+ if TYPE_CHECKING:
+     from torch import Tensor
+
+ if 'NO_LOCAL_GGUF' not in os.environ:
+     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+ import gguf
+
+ # reuse model definitions from convert_hf_to_gguf.py
+ from convert_hf_to_gguf import LazyTorchTensor, Model
+
+ logger = logging.getLogger("lora-to-gguf")
+
+
+ @dataclass
+ class PartialLoraTensor:
+     A: Tensor | None = None
+     B: Tensor | None = None
+
+
+ # magic to support tensor shape modifications and splitting
+ class LoraTorchTensor:
+     _lora_A: Tensor # (n_rank, row_size)
+     _lora_B: Tensor # (col_size, n_rank)
+     _rank: int
+
+     def __init__(self, A: Tensor, B: Tensor):
+         assert len(A.shape) == len(B.shape)
+         assert A.shape[-2] == B.shape[-1]
+         if A.dtype != B.dtype:
+             A = A.to(torch.float32)
+             B = B.to(torch.float32)
+         self._lora_A = A
+         self._lora_B = B
+         self._rank = B.shape[-1]
+
+     def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
+         return (self._lora_A, self._lora_B)
+
+     def __getitem__(
+         self,
+         indices: (
+             SupportsIndex
+             | slice
+             | tuple[SupportsIndex | slice | Tensor, ...] # TODO: add ellipsis in the type signature
+         ),
+     ) -> LoraTorchTensor:
+         shape = self.shape
+         if isinstance(indices, SupportsIndex):
+             if len(shape) > 2:
+                 return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
+             else:
+                 raise NotImplementedError # can't return a vector
+         elif isinstance(indices, slice):
+             if len(shape) > 2:
+                 return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
+             else:
+                 return LoraTorchTensor(self._lora_A, self._lora_B[indices])
+         elif isinstance(indices, tuple):
+             assert len(indices) > 0
+             if indices[-1] is Ellipsis:
+                 return self[indices[:-1]]
+             # expand ellipsis
+             indices = tuple(
+                 u
+                 for v in (
+                     (
+                         (slice(None, None) for _ in range(len(indices) - 1))
+                         if i is Ellipsis
+                         else (i,)
+                     )
+                     for i in indices
+                 )
+                 for u in v
+             )
+
+             if len(indices) < len(shape):
+                 indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape))))
+
+             # TODO: make sure this is correct
+             indices_A = (
+                 *(
+                     (
+                         j.__index__() % self._lora_A.shape[i]
+                         if isinstance(j, SupportsIndex)
+                         else slice(None, None)
+                     )
+                     for i, j in enumerate(indices[:-2])
+                 ),
+                 slice(None, None),
+                 indices[-1],
+             )
+             indices_B = indices[:-1]
+             return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
+         else:
+             raise NotImplementedError # unknown indice type
+
+     @property
+     def dtype(self) -> torch.dtype:
+         assert self._lora_A.dtype == self._lora_B.dtype
+         return self._lora_A.dtype
+
+     @property
+     def shape(self) -> tuple[int, ...]:
+         assert len(self._lora_A.shape) == len(self._lora_B.shape)
+         return (*self._lora_B.shape[:-1], self._lora_A.shape[-1])
+
+     def size(self, dim=None):
+         assert dim is None
+         return self.shape
+
+     def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
+         if isinstance(shape[0], tuple):
+             new_shape: tuple[int, ...] = shape[0]
+         else:
+             new_shape = cast(tuple[int, ...], shape)
+         orig_shape = self.shape
+         if len(new_shape) < 2:
+             raise NotImplementedError # can't become a vector
+
+         # expand -1 in the shape
+         if any(dim == -1 for dim in new_shape):
+             n_elems = prod(orig_shape)
+             n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
+             assert n_elems % n_new_elems == 0
+             new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),)
+
+         if new_shape[-1] != orig_shape[-1]:
+             raise NotImplementedError # can't reshape the row size trivially
+
+         shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
+         shape_B = (*new_shape[:-1], self._rank)
+         return LoraTorchTensor(
+             self._lora_A.reshape(shape_A),
+             self._lora_B.reshape(shape_B),
+         )
+
+     def reshape_as(self, other: Tensor) -> LoraTorchTensor:
+         return self.reshape(*other.shape)
+
+     def view(self, *size: int) -> LoraTorchTensor:
+         return self.reshape(*size)
+
+     def permute(self, *dims: int) -> LoraTorchTensor:
+         shape = self.shape
+         dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
+         if dims[-1] == -1:
+             # TODO: support higher dimensional A shapes bigger than 1
+             assert all(dim == 1 for dim in self._lora_A.shape[:-2])
+             return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
+         if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
+             return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims))
+         else:
+             # TODO: compose the above two
+             raise NotImplementedError
+
+     def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
+         shape = self.shape
+         dims = [i for i in range(len(shape))]
+         dims[dim0], dims[dim1] = dims[dim1], dims[dim0]
+         return self.permute(*dims)
+
+     def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
+         return self.transpose(axis0, axis1)
+
+     def to(self, *args, **kwargs):
+         return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))
+
+     @classmethod
+     def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
+         del types # unused
+
+         if kwargs is None:
+             kwargs = {}
+
+         if func is torch.permute:
+             return type(args[0]).permute(*args, **kwargs)
+         elif func is torch.reshape:
+             return type(args[0]).reshape(*args, **kwargs)
+         elif func is torch.stack:
+             assert isinstance(args[0], Sequence)
+             dim = kwargs.get("dim", 0)
+             assert dim == 0
+             return LoraTorchTensor(
+                 torch.stack([a._lora_A for a in args[0]], dim),
+                 torch.stack([b._lora_B for b in args[0]], dim),
+             )
+         elif func is torch.cat:
+             assert isinstance(args[0], Sequence)
+             dim = kwargs.get("dim", 0)
+             assert dim == 0
+             if len(args[0][0].shape) > 2:
+                 return LoraTorchTensor(
+                     torch.cat([a._lora_A for a in args[0]], dim),
+                     torch.cat([b._lora_B for b in args[0]], dim),
+                 )
+             elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]):
+                 return LoraTorchTensor(
+                     args[0][0]._lora_A,
+                     torch.cat([b._lora_B for b in args[0]], dim),
+                 )
+             else:
+                 raise NotImplementedError
+         else:
+             raise NotImplementedError
+
+
+ def get_base_tensor_name(lora_tensor_name: str) -> str:
+     base_name = lora_tensor_name.replace("base_model.model.", "")
+     base_name = base_name.replace(".lora_A.weight", ".weight")
+     base_name = base_name.replace(".lora_B.weight", ".weight")
+     return base_name
+
+
+ def parse_args() -> argparse.Namespace:
+     parser = argparse.ArgumentParser(
+         description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
+     parser.add_argument(
+         "--outfile", type=Path,
+         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
+     )
+     parser.add_argument(
+         "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
+         help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+     )
+     parser.add_argument(
+         "--bigendian", action="store_true",
+         help="model is executed on big endian machine",
+     )
+     parser.add_argument(
+         "--no-lazy", action="store_true",
+         help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
+     )
+     parser.add_argument(
+         "--verbose", action="store_true",
+         help="increase output verbosity",
+     )
+     parser.add_argument(
+         "--dry-run", action="store_true",
+         help="only print out what will be done, without writing any new files",
+     )
+     parser.add_argument(
+         "--base", type=Path, required=True,
+         help="directory containing base model file",
+     )
+     parser.add_argument(
+         "lora_path", type=Path,
+         help="directory containing LoRA adapter file",
+     )
+
+     return parser.parse_args()
+
+
+ if __name__ == '__main__':
+     args = parse_args()
+     logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+     ftype_map: dict[str, gguf.LlamaFileType] = {
+         "f32": gguf.LlamaFileType.ALL_F32,
+         "f16": gguf.LlamaFileType.MOSTLY_F16,
+         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+         "auto": gguf.LlamaFileType.GUESSED,
+     }
+
+     ftype = ftype_map[args.outtype]
+
+     dir_base_model: Path = args.base
+     dir_lora: Path = args.lora_path
+     lora_config = dir_lora / "adapter_config.json"
+     input_model = dir_lora / "adapter_model.safetensors"
+
+     if args.outfile is not None:
+         fname_out = args.outfile
+     else:
+         # output in the same directory as the model by default
+         fname_out = dir_lora
+
+     if os.path.exists(input_model):
+         # lazy import load_file only if lora is in safetensors format.
+         from safetensors.torch import load_file
+
+         lora_model = load_file(input_model, device="cpu")
+     else:
+         input_model = os.path.join(dir_lora, "adapter_model.bin")
+         lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
+
+     # load base model
+     logger.info(f"Loading base model: {dir_base_model.name}")
+     hparams = Model.load_hparams(dir_base_model)
+     with torch.inference_mode():
+         try:
+             model_class = Model.from_model_architecture(hparams["architectures"][0])
+         except NotImplementedError:
+             logger.error(f"Model {hparams['architectures'][0]} is not supported")
+             sys.exit(1)
+
+         class LoraModel(model_class):
+             model_arch = model_class.model_arch
+
+             lora_alpha: float
+
+             def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
+
+                 super().__init__(*args, **kwargs)
+
+                 self.dir_model_card = dir_lora_model
+                 self.lora_alpha = float(lora_alpha)
+
+             def set_type(self):
+                 self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
+                 self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
+
+             def set_gguf_parameters(self):
+                 self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
+                 super().set_gguf_parameters()
+
+             def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+                 tensor_map: dict[str, PartialLoraTensor] = {}
+
+                 for name, tensor in lora_model.items():
+                     if self.lazy:
+                         tensor = LazyTorchTensor.from_eager(tensor)
+                     base_name = get_base_tensor_name(name)
+                     is_lora_a = ".lora_A.weight" in name
+                     is_lora_b = ".lora_B.weight" in name
+                     if not is_lora_a and not is_lora_b:
+                         if ".base_layer.weight" in name:
+                             continue
+                         logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
+                         sys.exit(1)
+
+                     if base_name in tensor_map:
+                         if is_lora_a:
+                             tensor_map[base_name].A = tensor
+                         else:
+                             tensor_map[base_name].B = tensor
+                     else:
+                         if is_lora_a:
+                             tensor_map[base_name] = PartialLoraTensor(A=tensor)
+                         else:
+                             tensor_map[base_name] = PartialLoraTensor(B=tensor)
+
+                 for name, tensor in tensor_map.items():
+                     assert tensor.A is not None
+                     assert tensor.B is not None
+                     yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
+
+             def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+                 dest = super().modify_tensors(data_torch, name, bid)
+                 for dest_name, dest_data in dest:
+                     assert isinstance(dest_data, LoraTorchTensor)
+                     lora_a, lora_b = dest_data.get_lora_A_B()
+
+                     yield (dest_name + ".lora_a", lora_a)
+                     yield (dest_name + ".lora_b", lora_b)
+
+         with open(lora_config, "r") as f:
+             lparams: dict[str, Any] = json.load(f)
+
+         alpha: float = lparams["lora_alpha"]
+
+         model_instance = LoraModel(
+             dir_base_model,
+             ftype,
+             fname_out,
+             is_big_endian=args.bigendian,
+             use_temp_file=False,
+             eager=args.no_lazy,
+             dry_run=args.dry_run,
+             dir_lora_model=dir_lora,
+             lora_alpha=alpha,
+         )
+
+         logger.info("Exporting model...")
+         model_instance.write()
+         logger.info(f"Model successfully exported to {model_instance.fname_out}")
bigdl/cpp/gguf-py/gguf/__init__.py
@@ -6,4 +6,4 @@ from .quants import *
  from .tensor_mapping import *
  from .vocab import *
  from .utility import *
- from .metadata import *
+ from .metadata import *
bigdl/cpp/gguf-py/gguf/constants.py
@@ -130,6 +130,7 @@ class Keys:
          INNER_SIZE = "{arch}.ssm.inner_size"
          STATE_SIZE = "{arch}.ssm.state_size"
          TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
+         DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"

      class Tokenizer:
          MODEL = "tokenizer.ggml.model"
@@ -161,6 +162,7 @@ class Keys:
          SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
          MIDDLE_ID = "tokenizer.ggml.middle_token_id"
          EOT_ID = "tokenizer.ggml.eot_token_id"
+         EOM_ID = "tokenizer.ggml.eom_token_id"

      class Adapter:
          TYPE = "adapter.type"
@@ -216,7 +218,10 @@ class MODEL_ARCH(IntEnum):
      CHATGLM = auto()
      BITNET = auto()
      T5 = auto()
+     T5ENCODER = auto()
      JAIS = auto()
+     NEMOTRON = auto()
+     EXAONE = auto()


  class MODEL_TENSOR(IntEnum):
@@ -343,7 +348,10 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
      MODEL_ARCH.CHATGLM: "chatglm",
      MODEL_ARCH.BITNET: "bitnet",
      MODEL_ARCH.T5: "t5",
+     MODEL_ARCH.T5ENCODER: "t5encoder",
      MODEL_ARCH.JAIS: "jais",
+     MODEL_ARCH.NEMOTRON: "nemotron",
+     MODEL_ARCH.EXAONE: "exaone",
  }

  TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -1035,6 +1043,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
          MODEL_TENSOR.ENC_FFN_UP,
          MODEL_TENSOR.ENC_OUTPUT_NORM,
      ],
+     MODEL_ARCH.T5ENCODER: [
+         MODEL_TENSOR.TOKEN_EMBD,
+         MODEL_TENSOR.OUTPUT,
+         MODEL_TENSOR.ENC_ATTN_NORM,
+         MODEL_TENSOR.ENC_ATTN_Q,
+         MODEL_TENSOR.ENC_ATTN_K,
+         MODEL_TENSOR.ENC_ATTN_V,
+         MODEL_TENSOR.ENC_ATTN_OUT,
+         MODEL_TENSOR.ENC_ATTN_REL_B,
+         MODEL_TENSOR.ENC_FFN_NORM,
+         MODEL_TENSOR.ENC_FFN_GATE,
+         MODEL_TENSOR.ENC_FFN_DOWN,
+         MODEL_TENSOR.ENC_FFN_UP,
+         MODEL_TENSOR.ENC_OUTPUT_NORM,
+     ],
      MODEL_ARCH.JAIS: [
          MODEL_TENSOR.TOKEN_EMBD,
          MODEL_TENSOR.OUTPUT_NORM,
@@ -1047,6 +1070,37 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
          MODEL_TENSOR.FFN_GATE,
          MODEL_TENSOR.FFN_UP,
      ],
+     MODEL_ARCH.NEMOTRON: [
+         MODEL_TENSOR.TOKEN_EMBD,
+         MODEL_TENSOR.OUTPUT_NORM,
+         MODEL_TENSOR.OUTPUT,
+         MODEL_TENSOR.ROPE_FREQS,
+         MODEL_TENSOR.ATTN_NORM,
+         MODEL_TENSOR.ATTN_Q,
+         MODEL_TENSOR.ATTN_K,
+         MODEL_TENSOR.ATTN_V,
+         MODEL_TENSOR.ATTN_OUT,
+         MODEL_TENSOR.ATTN_ROT_EMBD,
+         MODEL_TENSOR.FFN_NORM,
+         MODEL_TENSOR.FFN_DOWN,
+         MODEL_TENSOR.FFN_UP,
+     ],
+     MODEL_ARCH.EXAONE: [
+         MODEL_TENSOR.TOKEN_EMBD,
+         MODEL_TENSOR.OUTPUT_NORM,
+         MODEL_TENSOR.OUTPUT,
+         MODEL_TENSOR.ROPE_FREQS,
+         MODEL_TENSOR.ATTN_NORM,
+         MODEL_TENSOR.ATTN_Q,
+         MODEL_TENSOR.ATTN_K,
+         MODEL_TENSOR.ATTN_V,
+         MODEL_TENSOR.ATTN_OUT,
+         MODEL_TENSOR.ATTN_ROT_EMBD,
+         MODEL_TENSOR.FFN_NORM,
+         MODEL_TENSOR.FFN_GATE,
+         MODEL_TENSOR.FFN_DOWN,
+         MODEL_TENSOR.FFN_UP,
+     ],
      # TODO
  }

@@ -1087,6 +1141,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
      MODEL_ARCH.CHATGLM: [
          MODEL_TENSOR.ROPE_FREQS,
      ],
+     MODEL_ARCH.NEMOTRON: [
+         MODEL_TENSOR.ROPE_FREQS,
+         MODEL_TENSOR.ATTN_ROT_EMBD,
+     ],
  }

  #
@@ -1145,6 +1203,9 @@ class GGMLQuantizationType(IntEnum):
      F64 = 28
      IQ1_M = 29
      BF16 = 30
+     Q4_0_4_4 = 31
+     Q4_0_4_8 = 32
+     Q4_0_8_8 = 33


  # TODO: add GGMLFileType from ggml_ftype in ggml.h
@@ -1157,7 +1218,7 @@ class LlamaFileType(IntEnum):
      MOSTLY_F16 = 1 # except 1d tensors
      MOSTLY_Q4_0 = 2 # except 1d tensors
      MOSTLY_Q4_1 = 3 # except 1d tensors
-     MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16
+     # MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16
      # MOSTLY_Q4_2 = 5 # support has been removed
      # MOSTLY_Q4_3 = 6 # support has been removed
      MOSTLY_Q8_0 = 7 # except 1d tensors
@@ -1186,6 +1247,9 @@ class LlamaFileType(IntEnum):
      MOSTLY_IQ4_XS = 30 # except 1d tensors
      MOSTLY_IQ1_M = 31 # except 1d tensors
      MOSTLY_BF16 = 32 # except 1d tensors
+     MOSTLY_Q4_0_4_4 = 33 # except 1d tensors
+     MOSTLY_Q4_0_4_8 = 34 # except 1d tensors
+     MOSTLY_Q4_0_8_8 = 35 # except 1d tensors

      GUESSED = 1024 # not specified in the model file

@@ -1259,6 +1323,9 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
      GGMLQuantizationType.F64: (1, 8),
      GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
      GGMLQuantizationType.BF16: (1, 2),
+     GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16),
+     GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16),
+     GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16),
  }


@@ -1306,6 +1373,7 @@ KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL
  KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE
  KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE
  KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
+ KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS

  # tokenization
  KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
@@ -1326,4 +1394,5 @@ KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
  KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID
  KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
  KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
- KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
+ KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
+ KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID
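Editorial note, not part of the diff: the three new Q4_0_4_x entries in GGML_QUANT_SIZES reuse the Q4_0 block layout, 32 elements packed into 2 + 16 bytes. A minimal sketch of how this (block_size, type_size) table maps an element count to bytes; the helper name is hypothetical, not a gguf-py API:

    from gguf.constants import GGML_QUANT_SIZES, GGMLQuantizationType

    def quant_nbytes(qtype: GGMLQuantizationType, n_elements: int) -> int:
        # Each quant type stores block_size elements in type_size bytes.
        block_size, type_size = GGML_QUANT_SIZES[qtype]
        assert n_elements % block_size == 0
        return (n_elements // block_size) * type_size

    print(quant_nbytes(GGMLQuantizationType.Q4_0_4_4, 4096))  # 128 blocks * 18 bytes = 2304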
bigdl/cpp/gguf-py/gguf/gguf_writer.py
@@ -312,6 +312,8 @@ class GGUFWriter:
          self.add_key_value(key, val, GGUFValueType.STRING)

      def add_array(self, key: str, val: Sequence[Any]) -> None:
+         if len(val) == 0:
+             return
          self.add_key_value(key, val, GGUFValueType.ARRAY)

      @staticmethod
@@ -728,6 +730,9 @@ class GGUFWriter:
      def add_ssm_time_step_rank(self, value: int) -> None:
          self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value)

+     def add_ssm_dt_b_c_rms(self, value: bool) -> None:
+         self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value)
+
      def add_tokenizer_model(self, model: str) -> None:
          self.add_string(Keys.Tokenizer.MODEL, model)

@@ -826,6 +831,9 @@ class GGUFWriter:
      def add_eot_token_id(self, id: int) -> None:
          self.add_uint32(Keys.Tokenizer.EOT_ID, id)

+     def add_eom_token_id(self, id: int) -> None:
+         self.add_uint32(Keys.Tokenizer.EOM_ID, id)
+
      def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
          pack_prefix = ''
          if not skip_pack_prefix:
@@ -845,7 +853,14 @@ class GGUFWriter:
              encoded_val = val.encode("utf-8") if isinstance(val, str) else val
              kv_data += self._pack("Q", len(encoded_val))
              kv_data += encoded_val
-         elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
+         elif vtype == GGUFValueType.ARRAY:
+
+             if not isinstance(val, Sequence):
+                 raise ValueError("Invalid GGUF metadata array, expecting sequence")
+
+             if len(val) == 0:
+                 raise ValueError("Invalid GGUF metadata array. Empty array")
+
              if isinstance(val, bytes):
                  ltype = GGUFValueType.UINT8
              else:
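Editorial usage sketch, not taken from the diff: the new setters slot into the normal GGUFWriter flow. The constructor arguments below follow the usual gguf-py GGUFWriter(path, arch) signature, and the token id and array key are placeholders.

    import gguf

    writer = gguf.GGUFWriter("model.gguf", "llama")
    writer.add_eom_token_id(32007)          # placeholder id; stored under tokenizer.ggml.eom_token_id
    writer.add_ssm_dt_b_c_rms(True)         # only meaningful for SSM/Mamba-style architectures
    writer.add_array("some.array.key", [])  # now a no-op: empty arrays are skipped by add_array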
bigdl/cpp/gguf-py/gguf/lazy.py
@@ -191,6 +191,8 @@ class LazyBase(ABC, metaclass=LazyMeta):
  class LazyNumpyTensor(LazyBase):
      _tensor_type = np.ndarray

+     shape: tuple[int, ...] # Makes the type checker happy in quants.py
+
      @classmethod
      def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
          # The initial idea was to use np.nan as the fill value,
@@ -208,4 +210,5 @@ class LazyNumpyTensor(LazyBase):
          eager = LazyNumpyTensor.to_eager(self)
          return eager.tofile(*args, **kwargs)

-     # TODO: __array_function__
+
+     # TODO: __array_function__