bigdl-core-cpp 2.5.0rc1__py3-none-win_amd64.whl → 2.6.0b1__py3-none-win_amd64.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the versions as they appear in their public registry.
Files changed (85)
  1. bigdl/cpp/{convert-hf-to-gguf.py → convert_hf_to_gguf.py} +413 -67
  2. bigdl/cpp/convert_hf_to_gguf_update.py +354 -0
  3. bigdl/cpp/convert_llama_ggml_to_gguf.py +454 -0
  4. bigdl/cpp/convert_lora_to_gguf.py +393 -0
  5. bigdl/cpp/gguf-py/gguf/__init__.py +1 -1
  6. bigdl/cpp/gguf-py/gguf/constants.py +71 -2
  7. bigdl/cpp/gguf-py/gguf/gguf_writer.py +16 -1
  8. bigdl/cpp/gguf-py/gguf/lazy.py +4 -1
  9. bigdl/cpp/gguf-py/gguf/metadata.py +70 -63
  10. bigdl/cpp/gguf-py/gguf/quants.py +1129 -64
  11. bigdl/cpp/gguf-py/gguf/tensor_mapping.py +23 -15
  12. bigdl/cpp/gguf-py/gguf/utility.py +1 -1
  13. bigdl/cpp/gguf-py/gguf/vocab.py +301 -1
  14. bigdl/cpp/libs/common.lib +0 -0
  15. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/ggml.dll +0 -0
  16. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/llama.dll +0 -0
  17. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu/ollama_llama_server.exe +0 -0
  18. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ggml.dll +0 -0
  19. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/llama.dll +0 -0
  20. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx/ollama_llama_server.exe +0 -0
  21. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ggml.dll +0 -0
  22. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/llama.dll +0 -0
  23. bigdl/cpp/libs/dist/windows-amd64/lib/ollama/runners/cpu_avx2/ollama_llama_server.exe +0 -0
  24. bigdl/cpp/libs/ggml.dll +0 -0
  25. bigdl/cpp/libs/llama-batched.exe +0 -0
  26. bigdl/cpp/libs/llama-bench.exe +0 -0
  27. bigdl/cpp/libs/llama-cli.exe +0 -0
  28. bigdl/cpp/libs/llama-embedding.exe +0 -0
  29. bigdl/cpp/libs/llama-gguf.exe +0 -0
  30. bigdl/cpp/libs/llama-llava-cli.exe +0 -0
  31. bigdl/cpp/libs/llama-lookup.exe +0 -0
  32. bigdl/cpp/libs/{ls-sycl-device.exe → llama-ls-sycl-device.exe} +0 -0
  33. bigdl/cpp/libs/llama-minicpmv-cli.exe +0 -0
  34. bigdl/cpp/libs/llama-perplexity.exe +0 -0
  35. bigdl/cpp/libs/llama-quantize.exe +0 -0
  36. bigdl/cpp/libs/llama-server.exe +0 -0
  37. bigdl/cpp/libs/llama-simple.exe +0 -0
  38. bigdl/cpp/libs/llama-speculative.exe +0 -0
  39. bigdl/cpp/libs/llama-tokenize.exe +0 -0
  40. bigdl/cpp/libs/llama.dll +0 -0
  41. bigdl/cpp/libs/llava_shared.dll +0 -0
  42. bigdl/cpp/libs/ollama.exe +0 -0
  43. {bigdl_core_cpp-2.5.0rc1.data → bigdl_core_cpp-2.6.0b1.data}/scripts/init-llama-cpp.bat +7 -2
  44. {bigdl_core_cpp-2.5.0rc1.data → bigdl_core_cpp-2.6.0b1.data}/scripts/init-ollama.bat +6 -0
  45. {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0b1.dist-info}/METADATA +3 -3
  46. bigdl_core_cpp-2.6.0b1.dist-info/RECORD +54 -0
  47. {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0b1.dist-info}/WHEEL +1 -1
  48. bigdl/cpp/convert.py +0 -1714
  49. bigdl/cpp/libs/baby-llama.exe +0 -0
  50. bigdl/cpp/libs/batched-bench.exe +0 -0
  51. bigdl/cpp/libs/batched.exe +0 -0
  52. bigdl/cpp/libs/beam-search.exe +0 -0
  53. bigdl/cpp/libs/benchmark.exe +0 -0
  54. bigdl/cpp/libs/convert-llama2c-to-ggml.exe +0 -0
  55. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu/ollama_llama_server.exe +0 -0
  56. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx/ollama_llama_server.exe +0 -0
  57. bigdl/cpp/libs/dist/windows-amd64/ollama_runners/cpu_avx2/ollama_llama_server.exe +0 -0
  58. bigdl/cpp/libs/embedding.exe +0 -0
  59. bigdl/cpp/libs/export-lora.exe +0 -0
  60. bigdl/cpp/libs/finetune.exe +0 -0
  61. bigdl/cpp/libs/ggml_shared.dll +0 -0
  62. bigdl/cpp/libs/gguf.exe +0 -0
  63. bigdl/cpp/libs/gritlm.exe +0 -0
  64. bigdl/cpp/libs/imatrix.exe +0 -0
  65. bigdl/cpp/libs/infill.exe +0 -0
  66. bigdl/cpp/libs/llava-cli.exe +0 -0
  67. bigdl/cpp/libs/lookahead.exe +0 -0
  68. bigdl/cpp/libs/lookup.exe +0 -0
  69. bigdl/cpp/libs/main.exe +0 -0
  70. bigdl/cpp/libs/parallel.exe +0 -0
  71. bigdl/cpp/libs/passkey.exe +0 -0
  72. bigdl/cpp/libs/perplexity.exe +0 -0
  73. bigdl/cpp/libs/q8dot.exe +0 -0
  74. bigdl/cpp/libs/quantize-stats.exe +0 -0
  75. bigdl/cpp/libs/quantize.exe +0 -0
  76. bigdl/cpp/libs/save-load-state.exe +0 -0
  77. bigdl/cpp/libs/server.exe +0 -0
  78. bigdl/cpp/libs/simple.exe +0 -0
  79. bigdl/cpp/libs/speculative.exe +0 -0
  80. bigdl/cpp/libs/tokenize.exe +0 -0
  81. bigdl/cpp/libs/train-text-from-scratch.exe +0 -0
  82. bigdl/cpp/libs/vdot.exe +0 -0
  83. bigdl_core_cpp-2.5.0rc1.dist-info/RECORD +0 -63
  84. {bigdl_core_cpp-2.5.0rc1.data → bigdl_core_cpp-2.6.0b1.data}/scripts/init-llama-cpp.ps1 +0 -0
  85. {bigdl_core_cpp-2.5.0rc1.dist-info → bigdl_core_cpp-2.6.0b1.dist-info}/top_level.txt +0 -0
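
Note on the renames above: the 2.6.0b1 layout tracks upstream llama.cpp's naming changes, with conversion scripts moving from dashed to underscored names and the example binaries gaining a llama- prefix, while the old unprefixed tools (main.exe, quantize.exe, server.exe, ...) are removed. A minimal sketch of the corresponding command-line change, with a hypothetical model path:

    REM 2.5.0rc1 layout (removed in 2.6.0b1)
    bigdl\cpp\libs\main.exe -m model.gguf -p "Hello"
    REM 2.6.0b1 layout
    bigdl\cpp\libs\llama-cli.exe -m model.gguf -p "Hello"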
bigdl/cpp/convert_llama_ggml_to_gguf.py (new file)
@@ -0,0 +1,454 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import logging
+import argparse
+import os
+import struct
+import sys
+from enum import IntEnum
+from pathlib import Path
+
+import numpy as np
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+import gguf
+
+logger = logging.getLogger("ggml-to-gguf")
+
+
+class GGMLFormat(IntEnum):
+    GGML = 0
+    GGMF = 1
+    GGJT = 2
+
+
+class GGMLFType(IntEnum):
+    ALL_F32              = 0
+    MOSTLY_F16           = 1
+    MOSTLY_Q4_0          = 2
+    MOSTLY_Q4_1          = 3
+    MOSTLY_Q4_1_SOME_F16 = 4
+    MOSTLY_Q8_0          = 7
+    MOSTLY_Q5_0          = 8
+    MOSTLY_Q5_1          = 9
+    MOSTLY_Q2_K          = 10
+    MOSTLY_Q3_K_S        = 11
+    MOSTLY_Q3_K_M        = 12
+    MOSTLY_Q3_K_L        = 13
+    MOSTLY_Q4_K_S        = 14
+    MOSTLY_Q4_K_M        = 15
+    MOSTLY_Q5_K_S        = 16
+    MOSTLY_Q5_K_M        = 17
+    MOSTLY_Q6_K          = 18
+
+
+class Hyperparameters:
+    def __init__(self):
+        self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
+        self.n_layer = self.n_rot = self.n_ff = 0
+        self.ftype = GGMLFType.ALL_F32
+
+    def set_n_ff(self, model):
+        ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
+        assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
+        ff_tensor = model.tensors[ff_tensor_idx]
+        self.n_ff = ff_tensor.dims[1]
+
+    def load(self, data, offset):
+        (
+            self.n_vocab,
+            self.n_embd,
+            self.n_mult,
+            self.n_head,
+            self.n_layer,
+            self.n_rot,
+            ftype,
+        ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
+        try:
+            self.ftype = GGMLFType(ftype)
+        except ValueError:
+            raise ValueError(f'Invalid ftype {ftype}')
+        return 4 * 7
+
+    def __str__(self):
+        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
+
+
+class Vocab:
+    def __init__(self, load_scores = True):
+        self.items = []
+        self.load_scores = load_scores
+
+    def load(self, data, offset, n_vocab):
+        orig_offset = offset
+        for _ in range(n_vocab):
+            itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
+            assert itemlen < 4096, 'Absurd vocab item length'
+            offset += 4
+            item_text = bytes(data[offset:offset + itemlen])
+            offset += itemlen
+            if self.load_scores:
+                item_score = struct.unpack('<f', data[offset:offset + 4])[0]
+                offset += 4
+            else:
+                item_score = 0.0
+            self.items.append((item_text, item_score))
+        return offset - orig_offset
+
+
+class Tensor:
+    def __init__(self, use_padding = True):
+        self.name = None
+        self.dims: tuple[int, ...] = ()
+        self.dtype = None
+        self.start_offset = 0
+        self.len_bytes = np.int64(0)
+        self.use_padding = use_padding
+
+    def load(self, data, offset):
+        orig_offset = offset
+        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
+        assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
+        assert name_len < 4096, 'Absurd tensor name length'
+        quant = gguf.GGML_QUANT_SIZES.get(dtype)
+        assert quant is not None, 'Unknown tensor type'
+        (blksize, tysize) = quant
+        offset += 12
+        self.dtype = gguf.GGMLQuantizationType(dtype)
+        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
+        offset += 4 * n_dims
+        self.name = bytes(data[offset:offset + name_len])
+        offset += name_len
+        pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
+        offset += pad
+        n_elems = np.prod(self.dims)
+        n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
+        self.start_offset = offset
+        self.len_bytes = n_bytes
+        offset += n_bytes
+        return offset - orig_offset
+
+
+class GGMLModel:
+
+    file_format: GGMLFormat
+    format_version: int
+
+    def __init__(self):
+        self.hyperparameters = None
+        self.vocab = None
+        self.tensor_map = {}
+        self.tensors = []
+
+    def validate_header(self, data, offset):
+        magic = bytes(data[offset:offset + 4])
+        if magic == b'GGUF':
+            raise ValueError('File is already in GGUF format.')
+        if magic == b'lmgg':
+            self.file_format = GGMLFormat.GGML
+            self.format_version = 1
+            return 4
+        version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
+        if magic == b'fmgg':
+            if version != 1:
+                raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
+            self.file_format = GGMLFormat.GGMF
+            self.format_version = version
+            return 8
+        if magic == b'tjgg':
+            if version < 1 or version > 3:
+                raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
+            self.file_format = GGMLFormat.GGJT
+            self.format_version = version
+            return 8
+        raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")
+
+    def validate_conversion(self, ftype):
+        err = ''
+        if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
+            if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
+                err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
+        elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
+            if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
+                         GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
+                err = 'Q4 and Q8 quantizations changed in GGJTv3.'
+        if len(err) > 0:
+            raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
+
+    def load(self, data, offset):
+        offset += self.validate_header(data, offset)
+        hp = Hyperparameters()
+        offset += hp.load(data, offset)
+        logger.info(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
+        self.validate_conversion(hp.ftype)
+        vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
+        offset += vocab.load(data, offset, hp.n_vocab)
+        tensors: list[Tensor] = []
+        tensor_map = {}
+        while offset < len(data):
+            tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
+            offset += tensor.load(data, offset)
+            tensor_map[tensor.name] = len(tensors)
+            tensors.append(tensor)
+        self.hyperparameters = hp
+        self.vocab = vocab
+        self.tensors = tensors
+        self.tensor_map = tensor_map
+        hp.set_n_ff(self)
+        return offset
+
+
+class GGMLToGGUF:
+    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
+        hp = ggml_model.hyperparameters
+        self.model = ggml_model
+        self.data = data
+        self.cfg = cfg
+        self.params_override = params_override
+        self.vocab_override = vocab_override
+        self.special_vocab = special_vocab
+        if params_override is not None:
+            n_kv_head = params_override.n_head_kv
+        else:
+            if cfg.gqa == 1:
+                n_kv_head = hp.n_head
+            else:
+                gqa = float(cfg.gqa)
+                n_kv_head = None
+                for x in range(1, 256):
+                    if float(hp.n_head) / float(x) == gqa:
+                        n_kv_head = x
+                assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
+                logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
+        self.n_kv_head = n_kv_head
+        self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
+
+    def save(self):
+        logger.info('* Preparing to save GGUF file')
+        gguf_writer = gguf.GGUFWriter(
+            self.cfg.output,
+            gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
+            use_temp_file = False)
+        self.add_params(gguf_writer)
+        self.add_vocab(gguf_writer)
+        if self.special_vocab is not None:
+            self.special_vocab.add_to_gguf(gguf_writer)
+        self.add_tensors(gguf_writer)
+        logger.info("    gguf: write header")
+        gguf_writer.write_header_to_file()
+        logger.info("    gguf: write metadata")
+        gguf_writer.write_kv_data_to_file()
+        logger.info("    gguf: write tensors")
+        gguf_writer.write_tensors_to_file()
+        gguf_writer.close()
+
+    def add_params(self, gguf_writer):
+        hp = self.model.hyperparameters
+        cfg = self.cfg
+        if cfg.desc is not None:
+            desc = cfg.desc
+        else:
+            desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
+        try:
+            # Filenames aren't necessarily valid UTF8.
+            name = cfg.name if cfg.name is not None else cfg.input.name
+        except UnicodeDecodeError:
+            name = None
+        logger.info('* Adding model parameters and KV items')
+        if name is not None:
+            gguf_writer.add_name(name)
+        gguf_writer.add_description(desc)
+        gguf_writer.add_file_type(int(hp.ftype))
+        if self.params_override is not None:
+            po = self.params_override
+            assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
+            assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
+            assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
+            gguf_writer.add_context_length      (po.n_ctx)
+            gguf_writer.add_embedding_length    (po.n_embd)
+            gguf_writer.add_block_count         (po.n_layer)
+            gguf_writer.add_feed_forward_length (po.n_ff)
+            gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
+            gguf_writer.add_head_count          (po.n_head)
+            gguf_writer.add_head_count_kv       (po.n_head_kv)
+            gguf_writer.add_layer_norm_rms_eps  (po.f_norm_eps)
+            return
+        gguf_writer.add_context_length(cfg.context_length)
+        gguf_writer.add_embedding_length(hp.n_embd)
+        gguf_writer.add_block_count(hp.n_layer)
+        gguf_writer.add_feed_forward_length(hp.n_ff)
+        gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
+        gguf_writer.add_head_count(hp.n_head)
+        gguf_writer.add_head_count_kv(self.n_kv_head)
+        gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
+
+    def add_vocab(self, gguf_writer):
+        hp = self.model.hyperparameters
+        gguf_writer.add_tokenizer_model('llama')
+        gguf_writer.add_tokenizer_pre('default')
+        tokens = []
+        scores = []
+        toktypes = []
+        if self.vocab_override is not None:
+            vo = self.vocab_override
+            logger.info('* Adding vocab item(s)')
+            for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+                tokens.append(vbytes)
+                scores.append(score)
+                toktypes.append(ttype)
+            assert len(tokens) == hp.n_vocab, \
+                f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
+            gguf_writer.add_token_list(tokens)
+            gguf_writer.add_token_scores(scores)
+            if len(toktypes) > 0:
+                gguf_writer.add_token_types(toktypes)
+            return
+        logger.info(f'* Adding {hp.n_vocab} vocab item(s)')
+        assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
+        for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
+            tt = 1 # Normal
+            # Special handling for UNK, BOS, EOS tokens.
+            if tokid <= 2:
+                if tokid == 0:
+                    vbytes = b'<unk>'
+                    tt = 2
+                elif tokid == 1:
+                    vbytes = b'<s>'
+                    tt = 3
+                else:
+                    vbytes = b'</s>'
+                    tt = 3
+            elif len(vbytes) == 0:
+                tt = 3 # Control
+            elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
+                vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
+                tt = 6 # Byte
+            else:
+                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
+            toktypes.append(tt)
+            tokens.append(vbytes)
+            scores.append(vscore)
+        gguf_writer.add_token_list(tokens)
+        gguf_writer.add_token_scores(scores)
+        gguf_writer.add_token_types(toktypes)
+        gguf_writer.add_unk_token_id(0)
+        gguf_writer.add_bos_token_id(1)
+        gguf_writer.add_eos_token_id(2)
+
+    def add_tensors(self, gguf_writer):
+        tensor_map = self.name_map
+        data = self.data
+        logger.info(f'* Adding {len(self.model.tensors)} tensor(s)')
+        for tensor in self.model.tensors:
+            name = str(tensor.name, 'UTF-8')
+            mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+            assert mapped_name is not None, f'Bad name {name}'
+            tempdims = list(tensor.dims[:])
+            if len(tempdims) > 1:
+                temp = tempdims[1]
+                tempdims[1] = tempdims[0]
+                tempdims[0] = temp
+            gguf_writer.add_tensor(
+                mapped_name,
+                data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
+                raw_shape = tempdims,
+                raw_dtype = tensor.dtype)
+
+
+def handle_metadata(cfg, hp):
+    import examples.convert_legacy_llama as convert
+
+    assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
+    hf_config_path = cfg.model_metadata_dir / "config.json"
+    orig_config_path = cfg.model_metadata_dir / "params.json"
+    # We pass a fake model here. "original" mode will check the shapes of some
+    # tensors if information is missing in the .json file: other than that, the
+    # model data isn't used so this should be safe (at least for now).
+    fakemodel = {
+        'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
+        'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
+    }
+    fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
+    fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
+    if hf_config_path.exists():
+        params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
+    elif orig_config_path.exists():
+        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
+    else:
+        raise ValueError('Unable to load metadata')
+    vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
+    vocab_factory = convert.VocabFactory(vocab_path)
+    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype.split(","), cfg.model_metadata_dir)
+    convert.check_vocab_size(params, vocab)
+    return params, vocab, special_vocab
+
+
+def handle_args():
+    parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
+    parser.add_argument('--input', '-i', type = Path, required = True,
+                        help = 'Input GGMLv3 filename')
+    parser.add_argument('--output', '-o', type = Path, required = True,
+                        help = 'Output GGUF filename')
+    parser.add_argument('--name',
+                        help = 'Set model name')
+    parser.add_argument('--desc',
+                        help = 'Set model description')
+    parser.add_argument('--gqa', type = int, default = 1,
+                        help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
+    parser.add_argument('--eps', default = '5.0e-06',
+                        help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
+    parser.add_argument('--context-length', '-c', type=int, default = 2048,
+                        help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
+    parser.add_argument('--model-metadata-dir', '-m', type = Path,
+                        help = 'Load HuggingFace/.pth vocab and metadata from the specified directory')
+    parser.add_argument("--vocab-dir", type=Path,
+                        help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
+    parser.add_argument("--vocabtype", default="spm,hfft",
+                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
+    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
+    return parser.parse_args()
+
+
+def main():
+    cfg = handle_args()
+    logging.basicConfig(level=logging.DEBUG if cfg.verbose else logging.INFO)
+    logger.info(f'* Using config: {cfg}')
+    logger.warning('=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===')
+    if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
+        logger.info('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
+    data = np.memmap(cfg.input, mode = 'r')
+    model = GGMLModel()
+    logger.info('* Scanning GGML input file')
+    offset = model.load(data, 0)  # noqa
+    logger.info(f'* GGML model hyperparameters: {model.hyperparameters}')
+    vocab_override = None
+    params_override = None
+    special_vocab = None
+    if cfg.model_metadata_dir is not None:
+        (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
+        logger.info('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
+        logger.info(f'* Overriding params: {params_override}')
+        logger.info(f'* Overriding vocab: {vocab_override}')
+        logger.info(f'* Special vocab: {special_vocab}')
+    else:
+        logger.warning('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
+        if model.file_format == GGMLFormat.GGML:
+            logger.info('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
+    converter = GGMLToGGUF(
+        model, data, cfg,
+        params_override = params_override,
+        vocab_override = vocab_override,
+        special_vocab = special_vocab
+    )
+    converter.save()
+    logger.info(f'* Successful completion. Output saved to: {cfg.output}')


+if __name__ == '__main__':
+    main()
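
For reference, a minimal invocation of the newly added converter, pieced together from its argparse options above (the GGML input and output filenames here are hypothetical):

    python bigdl\cpp\convert_llama_ggml_to_gguf.py --input llama-2-7b.ggmlv3.q8_0.bin --output llama-2-7b.q8_0.gguf --eps 1e-5 --context-length 4096

Per the script's own note, LLaMA2-family models need --eps 1e-5, and 70B models additionally need --gqa 8; both options are ignored when --model-metadata-dir is supplied.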