bigdl-core-cpp 2.5.0rc1__py3-none-win_amd64.whl → 2.6.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. bigdl/cpp/{convert-hf-to-gguf.py → convert_hf_to_gguf.py} +1673 -278
  2. bigdl/cpp/convert_hf_to_gguf_update.py +381 -0
  3. bigdl/cpp/convert_llama_ggml_to_gguf.py +450 -0
  4. bigdl/cpp/convert_lora_to_gguf.py +461 -0
  5. bigdl/cpp/gguf-py/gguf/__init__.py +1 -1
  6. bigdl/cpp/gguf-py/gguf/constants.py +698 -171
  7. bigdl/cpp/gguf-py/gguf/gguf.py +1 -1
  8. bigdl/cpp/gguf-py/gguf/gguf_reader.py +5 -6
  9. bigdl/cpp/gguf-py/gguf/gguf_writer.py +108 -17
  10. bigdl/cpp/gguf-py/gguf/lazy.py +3 -1
  11. bigdl/cpp/gguf-py/gguf/metadata.py +195 -76
  12. bigdl/cpp/gguf-py/gguf/quants.py +1210 -64
  13. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +262 -43
  14. bigdl/cpp/gguf-py/gguf/utility.py +2 -2
  15. bigdl/cpp/gguf-py/gguf/vocab.py +325 -3
  16. bigdl/cpp/libs/common.lib +0 -0
  17. bigdl/cpp/libs/ggml-base.dll +0 -0
  18. bigdl/cpp/libs/ggml-cpu.dll +0 -0
  19. bigdl/cpp/libs/ggml-sycl.dll +0 -0
  20. bigdl/cpp/libs/ggml.dll +0 -0
  21. bigdl/cpp/libs/libc++.dll +0 -0
  22. bigdl/cpp/libs/llama-batched.exe +0 -0
  23. bigdl/cpp/libs/llama-bench.exe +0 -0
  24. bigdl/cpp/libs/llama-cli.exe +0 -0
  25. bigdl/cpp/libs/llama-embedding.exe +0 -0
  26. bigdl/cpp/libs/llama-gemma3-cli.exe +0 -0
  27. bigdl/cpp/libs/llama-gguf.exe +0 -0
  28. bigdl/cpp/libs/llama-llava-cli.exe +0 -0
  29. bigdl/cpp/libs/llama-lookup.exe +0 -0
  30. bigdl/cpp/libs/llama-ls-sycl-device.exe +0 -0
  31. bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
  32. bigdl/cpp/libs/llama-perplexity.exe +0 -0
  33. bigdl/cpp/libs/llama-quantize.exe +0 -0
  34. bigdl/cpp/libs/llama-server.exe +0 -0
  35. bigdl/cpp/libs/llama-simple.exe +0 -0
  36. bigdl/cpp/libs/llama-speculative.exe +0 -0
  37. bigdl/cpp/libs/llama-tokenize.exe +0 -0
  38. bigdl/cpp/libs/llama.dll +0 -0
  39. bigdl/cpp/libs/llava_shared.dll +0 -0
  40. bigdl/cpp/libs/ollama-ggml-base.dll +0 -0
  41. bigdl/cpp/libs/ollama-ggml-cpu.dll +0 -0
  42. bigdl/cpp/libs/ollama-ggml-sycl.dll +0 -0
  43. bigdl/cpp/libs/ollama-lib.exe +0 -0
  44. bigdl/cpp/libs/ollama.exe +0 -0
  45. bigdl/cpp/libs/ollama_ggml.dll +0 -0
  46. bigdl/cpp/libs/ollama_llama.dll +0 -0
  47. bigdl/cpp/libs/ollama_llava_shared.dll +0 -0
  48. {bigdl_core_cpp-2.5.0rc1.data → bigdl_core_cpp-2.6.0.data}/scripts/init-llama-cpp.bat +7 -2
  49. bigdl_core_cpp-2.6.0.data/scripts/init-ollama.bat +16 -0
  50. {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0.dist-info}/METADATA +9 -5
  51. bigdl_core_cpp-2.6.0.dist-info/RECORD +57 -0
  52. {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0.dist-info}/WHEEL +1 -1
  53. bigdl/cpp/convert.py +0 -1714
  54. bigdl/cpp/libs/baby-llama.exe +0 -0
  55. bigdl/cpp/libs/batched-bench.exe +0 -0
  56. bigdl/cpp/libs/batched.exe +0 -0
  57. bigdl/cpp/libs/beam-search.exe +0 -0
  58. bigdl/cpp/libs/benchmark.exe +0 -0
  59. bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
  60. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
  61. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
  62. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
  63. bigdl/cpp/libs/embedding.exe +0 -0
  64. bigdl/cpp/libs/export-lora.exe +0 -0
  65. bigdl/cpp/libs/finetune.exe +0 -0
  66. bigdl/cpp/libs/ggml_shared.dll +0 -0
  67. bigdl/cpp/libs/gguf.exe +0 -0
  68. bigdl/cpp/libs/gritlm.exe +0 -0
  69. bigdl/cpp/libs/imatrix.exe +0 -0
  70. bigdl/cpp/libs/infill.exe +0 -0
  71. bigdl/cpp/libs/llava-cli.exe +0 -0
  72. bigdl/cpp/libs/lookahead.exe +0 -0
  73. bigdl/cpp/libs/lookup.exe +0 -0
  74. bigdl/cpp/libs/ls-sycl-device.exe +0 -0
  75. bigdl/cpp/libs/main.exe +0 -0
  76. bigdl/cpp/libs/parallel.exe +0 -0
  77. bigdl/cpp/libs/passkey.exe +0 -0
  78. bigdl/cpp/libs/perplexity.exe +0 -0
  79. bigdl/cpp/libs/q8dot.exe +0 -0
  80. bigdl/cpp/libs/quantize-stats.exe +0 -0
  81. bigdl/cpp/libs/quantize.exe +0 -0
  82. bigdl/cpp/libs/save-load-state.exe +0 -0
  83. bigdl/cpp/libs/server.exe +0 -0
  84. bigdl/cpp/libs/simple.exe +0 -0
  85. bigdl/cpp/libs/speculative.exe +0 -0
  86. bigdl/cpp/libs/tokenize.exe +0 -0
  87. bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
  88. bigdl/cpp/libs/vdot.exe +0 -0
  89. bigdl_core_cpp-2.5.0rc1.data/scripts/init-ollama.bat +0 -13
  90. bigdl_core_cpp-2.5.0rc1.dist-info/RECORD +0 -63
  91. {bigdl_core_cpp-2.5.0rc1.data → bigdl_core_cpp-2.6.0.data}/scripts/init-llama-cpp.ps1 +0 -0
  92. {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,461 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ from __future__ import annotations
5
+
6
+ from dataclasses import dataclass
7
+ import logging
8
+ import argparse
9
+ import os
10
+ import sys
11
+ import json
12
+ from math import prod
13
+ from pathlib import Path
14
+ from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
15
+ from transformers import AutoConfig
16
+
17
+ import torch
18
+
19
+ if TYPE_CHECKING:
20
+ from torch import Tensor
21
+
22
+ if 'NO_LOCAL_GGUF' not in os.environ:
23
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
24
+ import gguf
25
+
26
+ # reuse model definitions from convert_hf_to_gguf.py
27
+ from convert_hf_to_gguf import LazyTorchTensor, Model
28
+
29
+ logger = logging.getLogger("lora-to-gguf")
30
+
31
+
32
+ @dataclass
33
+ class PartialLoraTensor:
34
+ A: Tensor | None = None
35
+ B: Tensor | None = None
36
+
37
+
38
+ # magic to support tensor shape modifications and splitting
39
+ class LoraTorchTensor:
40
+ _lora_A: Tensor # (n_rank, row_size)
41
+ _lora_B: Tensor # (col_size, n_rank)
42
+ _rank: int
43
+
44
+ def __init__(self, A: Tensor, B: Tensor):
45
+ assert len(A.shape) == len(B.shape)
46
+ assert A.shape[-2] == B.shape[-1]
47
+ if A.dtype != B.dtype:
48
+ A = A.to(torch.float32)
49
+ B = B.to(torch.float32)
50
+ self._lora_A = A
51
+ self._lora_B = B
52
+ self._rank = B.shape[-1]
53
+
54
+ def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
55
+ return (self._lora_A, self._lora_B)
56
+
57
+ def __getitem__(
58
+ self,
59
+ indices: (
60
+ SupportsIndex
61
+ | slice
62
+ | tuple[SupportsIndex | slice | Tensor, ...] # TODO: add ellipsis in the type signature
63
+ ),
64
+ ) -> LoraTorchTensor:
65
+ shape = self.shape
66
+ if isinstance(indices, SupportsIndex):
67
+ if len(shape) > 2:
68
+ return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
69
+ else:
70
+ raise NotImplementedError # can't return a vector
71
+ elif isinstance(indices, slice):
72
+ if len(shape) > 2:
73
+ return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
74
+ else:
75
+ return LoraTorchTensor(self._lora_A, self._lora_B[indices])
76
+ elif isinstance(indices, tuple):
77
+ assert len(indices) > 0
78
+ if indices[-1] is Ellipsis:
79
+ return self[indices[:-1]]
80
+ # expand ellipsis
81
+ indices = tuple(
82
+ u
83
+ for v in (
84
+ (
85
+ (slice(None, None) for _ in range(len(indices) - 1))
86
+ if i is Ellipsis
87
+ else (i,)
88
+ )
89
+ for i in indices
90
+ )
91
+ for u in v
92
+ )
93
+
94
+ if len(indices) < len(shape):
95
+ indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape))))
96
+
97
+ # TODO: make sure this is correct
98
+ indices_A = (
99
+ *(
100
+ (
101
+ j.__index__() % self._lora_A.shape[i]
102
+ if isinstance(j, SupportsIndex)
103
+ else slice(None, None)
104
+ )
105
+ for i, j in enumerate(indices[:-2])
106
+ ),
107
+ slice(None, None),
108
+ indices[-1],
109
+ )
110
+ indices_B = indices[:-1]
111
+ return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
112
+ else:
113
+ raise NotImplementedError # unknown indice type
114
+
115
+ @property
116
+ def dtype(self) -> torch.dtype:
117
+ assert self._lora_A.dtype == self._lora_B.dtype
118
+ return self._lora_A.dtype
119
+
120
+ @property
121
+ def shape(self) -> tuple[int, ...]:
122
+ assert len(self._lora_A.shape) == len(self._lora_B.shape)
123
+ return (*self._lora_B.shape[:-1], self._lora_A.shape[-1])
124
+
125
+ def size(self, dim=None):
126
+ assert dim is None
127
+ return self.shape
128
+
129
+ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
130
+ if isinstance(shape[0], tuple):
131
+ new_shape: tuple[int, ...] = shape[0]
132
+ else:
133
+ new_shape = cast(tuple[int, ...], shape)
134
+ orig_shape = self.shape
135
+ if len(new_shape) < 2:
136
+ raise NotImplementedError # can't become a vector
137
+
138
+ # expand -1 in the shape
139
+ if any(dim == -1 for dim in new_shape):
140
+ n_elems = prod(orig_shape)
141
+ n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
142
+ assert n_elems % n_new_elems == 0
143
+ new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),)
144
+
145
+ if new_shape[-1] != orig_shape[-1]:
146
+ raise NotImplementedError # can't reshape the row size trivially
147
+
148
+ shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
149
+ shape_B = (*new_shape[:-1], self._rank)
150
+ return LoraTorchTensor(
151
+ self._lora_A.reshape(shape_A),
152
+ self._lora_B.reshape(shape_B),
153
+ )
154
+
155
+ def reshape_as(self, other: Tensor) -> LoraTorchTensor:
156
+ return self.reshape(*other.shape)
157
+
158
+ def view(self, *size: int) -> LoraTorchTensor:
159
+ return self.reshape(*size)
160
+
161
+ def permute(self, *dims: int) -> LoraTorchTensor:
162
+ shape = self.shape
163
+ dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
164
+ if dims[-1] == -1:
165
+ # TODO: support higher dimensional A shapes bigger than 1
166
+ assert all(dim == 1 for dim in self._lora_A.shape[:-2])
167
+ return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
168
+ if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
169
+ return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims))
170
+ else:
171
+ # TODO: compose the above two
172
+ raise NotImplementedError
173
+
174
+ def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
175
+ shape = self.shape
176
+ dims = [i for i in range(len(shape))]
177
+ dims[dim0], dims[dim1] = dims[dim1], dims[dim0]
178
+ return self.permute(*dims)
179
+
180
+ def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
181
+ return self.transpose(axis0, axis1)
182
+
183
+ def to(self, *args, **kwargs):
184
+ return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))
185
+
186
+ @classmethod
187
+ def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
188
+ del types # unused
189
+
190
+ if kwargs is None:
191
+ kwargs = {}
192
+
193
+ if func is torch.permute:
194
+ return type(args[0]).permute(*args, **kwargs)
195
+ elif func is torch.reshape:
196
+ return type(args[0]).reshape(*args, **kwargs)
197
+ elif func is torch.stack:
198
+ assert isinstance(args[0], Sequence)
199
+ dim = kwargs.get("dim", 0)
200
+ assert dim == 0
201
+ return LoraTorchTensor(
202
+ torch.stack([a._lora_A for a in args[0]], dim),
203
+ torch.stack([b._lora_B for b in args[0]], dim),
204
+ )
205
+ elif func is torch.cat:
206
+ assert isinstance(args[0], Sequence)
207
+ dim = kwargs.get("dim", 0)
208
+ assert dim == 0
209
+ if len(args[0][0].shape) > 2:
210
+ return LoraTorchTensor(
211
+ torch.cat([a._lora_A for a in args[0]], dim),
212
+ torch.cat([b._lora_B for b in args[0]], dim),
213
+ )
214
+ elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]):
215
+ return LoraTorchTensor(
216
+ args[0][0]._lora_A,
217
+ torch.cat([b._lora_B for b in args[0]], dim),
218
+ )
219
+ else:
220
+ raise NotImplementedError
221
+ else:
222
+ raise NotImplementedError
223
+
224
+
225
+ def get_base_tensor_name(lora_tensor_name: str) -> str:
226
+ base_name = lora_tensor_name.replace("base_model.model.", "")
227
+ base_name = base_name.replace(".lora_A.weight", ".weight")
228
+ base_name = base_name.replace(".lora_B.weight", ".weight")
229
+ # models produced by mergekit-extract-lora have token embeddings in the adapter
230
+ base_name = base_name.replace(".lora_embedding_A", ".weight")
231
+ base_name = base_name.replace(".lora_embedding_B", ".weight")
232
+ return base_name
233
+
234
+
235
+ def parse_args() -> argparse.Namespace:
236
+ parser = argparse.ArgumentParser(
237
+ description="Convert a Hugging Face PEFT LoRA adapter to a GGUF file")
238
+ parser.add_argument(
239
+ "--outfile", type=Path,
240
+ help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
241
+ )
242
+ parser.add_argument(
243
+ "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
244
+ help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
245
+ )
246
+ parser.add_argument(
247
+ "--bigendian", action="store_true",
248
+ help="model is executed on big endian machine",
249
+ )
250
+ parser.add_argument(
251
+ "--no-lazy", action="store_true",
252
+ help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
253
+ )
254
+ parser.add_argument(
255
+ "--verbose", action="store_true",
256
+ help="increase output verbosity",
257
+ )
258
+ parser.add_argument(
259
+ "--dry-run", action="store_true",
260
+ help="only print out what will be done, without writing any new files",
261
+ )
262
+ parser.add_argument(
263
+ "--base", type=Path,
264
+ help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
265
+ )
266
+ parser.add_argument(
267
+ "--base-model-id", type=str,
268
+ help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
269
+ )
270
+ parser.add_argument(
271
+ "lora_path", type=Path,
272
+ help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
273
+ )
274
+
275
+ return parser.parse_args()
276
+
277
+
278
+ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
279
+ # normally, adapter does not come with base model config, we need to load it from AutoConfig
280
+ config = AutoConfig.from_pretrained(hf_model_id)
281
+ return config.to_dict()
282
+
283
+
284
+ if __name__ == '__main__':
285
+ args = parse_args()
286
+ logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
287
+
288
+ ftype_map: dict[str, gguf.LlamaFileType] = {
289
+ "f32": gguf.LlamaFileType.ALL_F32,
290
+ "f16": gguf.LlamaFileType.MOSTLY_F16,
291
+ "bf16": gguf.LlamaFileType.MOSTLY_BF16,
292
+ "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
293
+ "auto": gguf.LlamaFileType.GUESSED,
294
+ }
295
+
296
+ ftype = ftype_map[args.outtype]
297
+
298
+ dir_base_model: Path | None = args.base
299
+ dir_lora: Path = args.lora_path
300
+ base_model_id: str | None = args.base_model_id
301
+ lora_config = dir_lora / "adapter_config.json"
302
+ input_model = dir_lora / "adapter_model.safetensors"
303
+
304
+ if args.outfile is not None:
305
+ fname_out = args.outfile
306
+ else:
307
+ # output in the same directory as the model by default
308
+ fname_out = dir_lora
309
+
310
+ if os.path.exists(input_model):
311
+ # lazy import load_file only if lora is in safetensors format.
312
+ from safetensors.torch import load_file
313
+
314
+ lora_model = load_file(input_model, device="cpu")
315
+ else:
316
+ input_model = os.path.join(dir_lora, "adapter_model.bin")
317
+ lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
318
+
319
+ # load LoRA config
320
+ with open(lora_config, "r") as f:
321
+ lparams: dict[str, Any] = json.load(f)
322
+
323
+ # load base model
324
+ if base_model_id is not None:
325
+ logger.info(f"Loading base model from Hugging Face: {base_model_id}")
326
+ hparams = load_hparams_from_hf(base_model_id)
327
+ elif dir_base_model is None:
328
+ if "base_model_name_or_path" in lparams:
329
+ model_id = lparams["base_model_name_or_path"]
330
+ logger.info(f"Loading base model from Hugging Face: {model_id}")
331
+ try:
332
+ hparams = load_hparams_from_hf(model_id)
333
+ except OSError as e:
334
+ logger.error(f"Failed to load base model config: {e}")
335
+ logger.error("Please try downloading the base model and add its path to --base")
336
+ sys.exit(1)
337
+ else:
338
+ logger.error("'base_model_name_or_path' is not found in adapter_config.json")
339
+ logger.error("Base model config is required. Please download the base model and add its path to --base")
340
+ sys.exit(1)
341
+ else:
342
+ logger.info(f"Loading base model: {dir_base_model.name}")
343
+ hparams = Model.load_hparams(dir_base_model)
344
+
345
+ with torch.inference_mode():
346
+ try:
347
+ model_class = Model.from_model_architecture(hparams["architectures"][0])
348
+ except NotImplementedError:
349
+ logger.error(f"Model {hparams['architectures'][0]} is not supported")
350
+ sys.exit(1)
351
+
352
+ class LoraModel(model_class):
353
+ model_arch = model_class.model_arch
354
+
355
+ lora_alpha: float
356
+
357
+ def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
358
+
359
+ super().__init__(*args, **kwargs)
360
+
361
+ self.dir_model_card = dir_lora_model
362
+ self.lora_alpha = float(lora_alpha)
363
+
364
+ def set_vocab(self):
365
+ pass
366
+
367
+ def set_type(self):
368
+ self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
369
+ self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
370
+
371
+ def set_gguf_parameters(self):
372
+ self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
373
+
374
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
375
+ # Never add extra tensors (e.g. rope_freqs) for LoRA adapters
376
+ return ()
377
+
378
+ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
379
+ tensor_map: dict[str, PartialLoraTensor] = {}
380
+
381
+ for name, tensor in lora_model.items():
382
+ if self.lazy:
383
+ tensor = LazyTorchTensor.from_eager(tensor)
384
+ base_name = get_base_tensor_name(name)
385
+ # note: mergekit-extract-lora also adds token embeddings to the adapter
386
+ is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
387
+ is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
388
+ if not is_lora_a and not is_lora_b:
389
+ if ".base_layer.weight" in name:
390
+ continue
391
+ # mergekit-extract-lora add these layernorm to the adapter, we need to keep them
392
+ if "_layernorm" in name or ".norm" in name:
393
+ yield (base_name, tensor)
394
+ continue
395
+ logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
396
+ if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
397
+ logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
398
+ logger.error("Please refer to https://github.com/ggml-org/llama.cpp/pull/9948")
399
+ sys.exit(1)
400
+
401
+ if base_name in tensor_map:
402
+ if is_lora_a:
403
+ tensor_map[base_name].A = tensor
404
+ else:
405
+ tensor_map[base_name].B = tensor
406
+ else:
407
+ if is_lora_a:
408
+ tensor_map[base_name] = PartialLoraTensor(A=tensor)
409
+ else:
410
+ tensor_map[base_name] = PartialLoraTensor(B=tensor)
411
+
412
+ for name, tensor in tensor_map.items():
413
+ assert tensor.A is not None
414
+ assert tensor.B is not None
415
+ yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
416
+
417
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
418
+ dest = list(super().modify_tensors(data_torch, name, bid))
419
+ # some archs may have the same tensor for lm_head and output (tie word embeddings)
420
+ # in this case, adapters targeting lm_head will fail when using llama-export-lora
421
+ # therefore, we ignore them for now
422
+ # see: https://github.com/ggml-org/llama.cpp/issues/9065
423
+ if name == "lm_head.weight" and len(dest) == 0:
424
+ raise ValueError("lm_head is present in adapter, but is ignored in base model")
425
+ for dest_name, dest_data in dest:
426
+ # mergekit-extract-lora add these layernorm to the adapter
427
+ if "_norm" in dest_name:
428
+ assert dest_data.dim() == 1
429
+ yield (dest_name, dest_data)
430
+ continue
431
+
432
+ # otherwise, we must get the lora_A and lora_B tensors
433
+ assert isinstance(dest_data, LoraTorchTensor)
434
+ lora_a, lora_b = dest_data.get_lora_A_B()
435
+
436
+ # note: mergekit-extract-lora flip and transpose A and B
437
+ # here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
438
+ if "token_embd.weight" in dest_name:
439
+ lora_a = lora_a.T
440
+
441
+ yield (dest_name + ".lora_a", lora_a)
442
+ yield (dest_name + ".lora_b", lora_b)
443
+
444
+ alpha: float = lparams["lora_alpha"]
445
+
446
+ model_instance = LoraModel(
447
+ dir_base_model,
448
+ ftype,
449
+ fname_out,
450
+ is_big_endian=args.bigendian,
451
+ use_temp_file=False,
452
+ eager=args.no_lazy,
453
+ dry_run=args.dry_run,
454
+ dir_lora_model=dir_lora,
455
+ lora_alpha=alpha,
456
+ hparams=hparams,
457
+ )
458
+
459
+ logger.info("Exporting model...")
460
+ model_instance.write()
461
+ logger.info(f"Model successfully exported to {model_instance.fname_out}")
@@ -6,4 +6,4 @@ from .quants import *
6
6
  from .tensor_mapping import *
7
7
  from .vocab import *
8
8
  from .utility import *
9
- from .metadata import *
9
+ from .metadata import *