xinference 0.14.1.post1__py3-none-any.whl → 0.14.2__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.

Potentially problematic release: this version of xinference has been flagged as potentially problematic.

Files changed (87)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +15 -34
  3. xinference/client/restful/restful_client.py +2 -2
  4. xinference/core/chat_interface.py +44 -9
  5. xinference/core/model.py +4 -4
  6. xinference/core/scheduler.py +1 -2
  7. xinference/core/worker.py +1 -1
  8. xinference/deploy/cmdline.py +2 -2
  9. xinference/deploy/test/test_cmdline.py +7 -7
  10. xinference/model/llm/__init__.py +20 -27
  11. xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
  12. xinference/model/llm/llm_family.json +448 -1153
  13. xinference/model/llm/llm_family.py +14 -139
  14. xinference/model/llm/llm_family_modelscope.json +230 -313
  15. xinference/model/llm/memory.py +9 -9
  16. xinference/model/llm/sglang/core.py +2 -2
  17. xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
  18. xinference/model/llm/{pytorch → transformers}/core.py +2 -10
  19. xinference/model/llm/transformers/intern_vl.py +457 -0
  20. xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
  21. xinference/model/llm/{pytorch → transformers}/minicpmv26.py +67 -22
  22. xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
  23. xinference/model/llm/utils.py +76 -70
  24. xinference/model/llm/vllm/core.py +110 -11
  25. xinference/model/utils.py +1 -95
  26. xinference/thirdparty/internvl/__init__.py +0 -0
  27. xinference/thirdparty/internvl/conversation.py +393 -0
  28. xinference/thirdparty/omnilmm/model/utils.py +16 -1
  29. xinference/web/ui/build/asset-manifest.json +3 -3
  30. xinference/web/ui/build/index.html +1 -1
  31. xinference/web/ui/build/static/js/main.ffc26121.js +3 -0
  32. xinference/web/ui/build/static/js/main.ffc26121.js.map +1 -0
  33. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
  45. {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/METADATA +5 -8
  46. {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/RECORD +63 -70
  47. xinference/locale/utils.py +0 -39
  48. xinference/locale/zh_CN.json +0 -26
  49. xinference/model/llm/ggml/tools/__init__.py +0 -15
  50. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
  51. xinference/model/llm/ggml/tools/gguf.py +0 -884
  52. xinference/model/llm/pytorch/__init__.py +0 -13
  53. xinference/model/llm/pytorch/baichuan.py +0 -81
  54. xinference/model/llm/pytorch/falcon.py +0 -138
  55. xinference/model/llm/pytorch/intern_vl.py +0 -352
  56. xinference/model/llm/pytorch/vicuna.py +0 -69
  57. xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
  58. xinference/web/ui/build/static/js/main.17ca0398.js.map +0 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
  64. xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
  71. /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
  72. /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
  73. /xinference/model/llm/{pytorch → transformers}/cogvlm2.py +0 -0
  74. /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
  75. /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
  76. /xinference/model/llm/{pytorch → transformers}/glm4v.py +0 -0
  77. /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
  78. /xinference/model/llm/{pytorch → transformers}/minicpmv25.py +0 -0
  79. /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
  80. /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
  81. /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
  82. /xinference/model/llm/{pytorch → transformers}/yi_vl.py +0 -0
  83. /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.ffc26121.js.LICENSE.txt} +0 -0
  84. {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/LICENSE +0 -0
  85. {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/WHEEL +0 -0
  86. {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/entry_points.txt +0 -0
  87. {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/top_level.txt +0 -0
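Two internal backend packages are renamed in the list above: xinference/model/llm/pytorch becomes xinference/model/llm/transformers, and xinference/model/llm/ggml becomes xinference/model/llm/llama_cpp. A small probe, assuming xinference is installed, that reports which layout the installed copy uses; module paths are taken verbatim from the manifest, and only code importing these internal modules directly is affected:

import importlib.util

def probe(mod: str) -> str:
    # find_spec raises ModuleNotFoundError when a parent package is missing,
    # so treat that the same as "absent".
    try:
        return "present" if importlib.util.find_spec(mod) else "absent"
    except ModuleNotFoundError:
        return "absent"

# New 0.14.2 layout vs. the old 0.14.1.post1 layout
for mod in (
    "xinference.model.llm.transformers.core",  # new name (0.14.2)
    "xinference.model.llm.llama_cpp.core",     # new name (0.14.2)
    "xinference.model.llm.pytorch.core",       # old name (0.14.1.post1)
    "xinference.model.llm.ggml.llamacpp",      # old name (0.14.1.post1)
):
    print(f"{mod}: {probe(mod)}")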
xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py (deleted in 0.14.2)
@@ -1,498 +0,0 @@
- #!/usr/bin/env python3
- # Copied from llama.cpp to convert ggml file to gguf
- from __future__ import annotations
-
- import argparse
- import struct
- from enum import IntEnum
- from pathlib import Path
- from typing import Optional
-
- import numpy as np
-
- from . import gguf
-
- # Note: Does not support GGML_QKK_64
- QK_K = 256
- # Items here are (block size, type size)
- GGML_QUANT_SIZES = {
-     gguf.GGMLQuantizationType.F32: (1, 4),
-     gguf.GGMLQuantizationType.F16: (1, 2),
-     gguf.GGMLQuantizationType.Q4_0: (32, 2 + 16),
-     gguf.GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
-     gguf.GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),
-     gguf.GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16),
-     gguf.GGMLQuantizationType.Q8_0: (32, 2 + 32),
-     gguf.GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32),
-     gguf.GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4),
-     gguf.GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12),
-     gguf.GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12),
-     gguf.GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
-     gguf.GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
-     gguf.GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8),
- }
-
-
- class GGMLFormat(IntEnum):
-     GGML = 0
-     GGMF = 1
-     GGJT = 2
-
-
- class GGMLFType(IntEnum):
-     ALL_F32 = 0
-     MOSTLY_F16 = 1
-     MOSTLY_Q4_0 = 2
-     MOSTLY_Q4_1 = 3
-     MOSTLY_Q4_1_SOME_F16 = 4
-     MOSTLY_Q8_0 = 7
-     MOSTLY_Q5_0 = 8
-     MOSTLY_Q5_1 = 9
-     MOSTLY_Q2_K = 10
-     MOSTLY_Q3_K_S = 11
-     MOSTLY_Q3_K_M = 12
-     MOSTLY_Q3_K_L = 13
-     MOSTLY_Q4_K_S = 14
-     MOSTLY_Q4_K_M = 15
-     MOSTLY_Q5_K_S = 16
-     MOSTLY_Q5_K_M = 17
-     MOSTLY_Q6_K = 18
-
-
- class Hyperparameters:
-     def __init__(self):
-         self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
-         self.n_layer = self.n_rot = self.n_ff = 0
-         self.ftype = GGMLFType.ALL_F32
-
-     def set_n_ff(self, model):
-         ff_tensor_idx = model.tensor_map.get(b"layers.0.feed_forward.w1.weight")
-         assert ff_tensor_idx is not None, "Missing layer 0 FF tensor"
-         ff_tensor = model.tensors[ff_tensor_idx]
-         self.n_ff = ff_tensor.dims[1]
-
-     def load(self, data, offset):
-         (
-             self.n_vocab,
-             self.n_embd,
-             self.n_mult,
-             self.n_head,
-             self.n_layer,
-             self.n_rot,
-             ftype,
-         ) = struct.unpack("<7I", data[offset : offset + (4 * 7)])
-         try:
-             self.ftype = GGMLFType(ftype)
-         except ValueError:
-             raise ValueError(f"Invalid ftype {ftype}")
-         return 4 * 7
-
-     def __str__(self):
-         return f"<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>"
-
-
- class Vocab:
-     def __init__(self, load_scores=True):
-         self.items = []
-         self.load_scores = load_scores
-
-     def load(self, data, offset, n_vocab):
-         orig_offset = offset
-         for _ in range(n_vocab):
-             itemlen = struct.unpack("<I", data[offset : offset + 4])[0]
-             assert itemlen < 4096, "Absurd vocab item length"
-             offset += 4
-             item_text = bytes(data[offset : offset + itemlen])
-             offset += itemlen
-             if self.load_scores:
-                 item_score = struct.unpack("<f", data[offset : offset + 4])[0]
-                 offset += 4
-             else:
-                 item_score = 0.0
-             self.items.append((item_text, item_score))
-         return offset - orig_offset
-
-
- class Tensor:
-     def __init__(self, use_padding=True):
-         self.name = None
-         self.dims: tuple[int, ...] = ()  # type: ignore
-         self.dtype = None
-         self.start_offset = 0
-         self.len_bytes = np.int64(0)
-         self.use_padding = use_padding
-
-     def load(self, data, offset):
-         orig_offset = offset
-         (n_dims, name_len, dtype) = struct.unpack("<3I", data[offset : offset + 12])
-         assert n_dims >= 0 and n_dims <= 4, f"Invalid tensor dimensions {n_dims}"
-         assert name_len < 4096, "Absurd tensor name length"
-         quant = GGML_QUANT_SIZES.get(dtype)
-         assert quant is not None, "Unknown tensor type"
-         (blksize, tysize) = quant
-         offset += 12
-         self.dtype = dtype
-         self.dims = struct.unpack(f"<{n_dims}I", data[offset : offset + (4 * n_dims)])
-         offset += 4 * n_dims
-         self.name = bytes(data[offset : offset + name_len])
-         offset += name_len
-         pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
-         offset += pad
-         n_elems = np.prod(self.dims)
-         n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
-         self.start_offset = offset
-         self.len_bytes = n_bytes
-         offset += n_bytes
-         # print(n_dims, name_len, dtype, self.dims, self.name, pad)
-         return offset - orig_offset
-
-
- class GGMLModel:
-     def __init__(self):
-         self.hyperparameters = None
-         self.vocab = None
-         self.tensor_map = {}
-         self.tensors = []
-
-     def validate_header(self, data, offset):
-         magic = bytes(data[offset : offset + 4])
-         if magic == b"GGUF":
-             raise ValueError("File is already in GGUF format.")
-         if magic == b"lmgg":
-             self.file_format = GGMLFormat.GGML
-             self.format_version = 1
-             return 4
-         version = struct.unpack("<I", data[offset + 4 : offset + 8])[0]
-         if magic == b"fmgg":
-             if version != 1:
-                 raise ValueError(
-                     f"Cannot handle unexpected GGMF file version {version}"
-                 )
-             self.file_format = GGMLFormat.GGMF
-             self.format_version = version
-             return 8
-         if magic == b"tjgg":
-             if version < 1 or version > 3:
-                 raise ValueError(
-                     f"Cannot handle unexpected GGJT file version {version}"
-                 )
-             self.file_format = GGMLFormat.GGJT
-             self.format_version = version
-             return 8
-         raise ValueError(
-             f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file."
-         )
-
-     def validate_conversion(self, ftype):
-         err = ""
-         if self.file_format < GGMLFormat.GGJT or self.format_version < 2:
-             if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
-                 err = "Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2."
-         elif self.file_format == GGMLFormat.GGJT and self.format_version == 2:
-             if ftype in (
-                 GGMLFType.MOSTLY_Q4_0,
-                 GGMLFType.MOSTLY_Q4_1,
-                 GGMLFType.MOSTLY_Q4_1_SOME_F16,
-                 GGMLFType.MOSTLY_Q8_0,
-             ):
-                 err = "Q4 and Q8 quantizations changed in GGJTv3."
-         if len(err) > 0:
-             raise ValueError(
-                 f"{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion."
-             )
-
-     def load(self, data, offset):
-         offset += self.validate_header(data, offset)
-         hp = Hyperparameters()
-         offset += hp.load(data, offset)
-         print(
-             f"* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}"
-         )
-         self.validate_conversion(hp.ftype)
-         vocab = Vocab(load_scores=self.file_format > GGMLFormat.GGML)
-         offset += vocab.load(data, offset, hp.n_vocab)
-         tensors: list[Tensor] = []  # type: ignore
-         tensor_map = {}
-         while offset < len(data):
-             tensor = Tensor(use_padding=self.file_format > GGMLFormat.GGMF)
-             offset += tensor.load(data, offset)
-             tensor_map[tensor.name] = len(tensors)
-             tensors.append(tensor)
-         self.hyperparameters = hp
-         self.vocab = vocab
-         self.tensors = tensors
-         self.tensor_map = tensor_map
-         hp.set_n_ff(self)
-         return offset
-
-
- class GGMLToGGUF:
-     def __init__(
-         self,
-         ggml_model,
-         data,
-         cfg,
-         params_override=None,
-         vocab_override=None,
-         special_vocab=None,
-     ):
-         hp = ggml_model.hyperparameters
-         self.model = ggml_model
-         self.data = data
-         self.cfg = cfg
-         self.params_override = params_override
-         self.vocab_override = vocab_override
-         self.special_vocab = special_vocab
-         if params_override is not None:
-             n_kv_head = params_override.n_head_kv
-         else:
-             if cfg.gqa == 1:
-                 n_kv_head = hp.n_head
-             else:
-                 gqa = float(cfg.gqa)
-                 n_kv_head = None
-                 for x in range(1, 256):
-                     if float(hp.n_head) / float(x) == gqa:
-                         n_kv_head = x
-                 assert (
-                     n_kv_head is not None
-                 ), "Couldn't determine n_kv_head from GQA param"
-                 print(f"- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}")
-         self.n_kv_head = n_kv_head
-         self.name_map = gguf.get_tensor_name_map(
-             gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer
-         )
-
-     def save(self):
-         print("* Preparing to save GGUF file")
-         gguf_writer = gguf.GGUFWriter(
-             self.cfg.output,
-             gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
-             use_temp_file=False,
-         )
-         self.add_params(gguf_writer)
-         self.add_vocab(gguf_writer)
-         if self.special_vocab is not None:
-             self.special_vocab.add_to_gguf(gguf_writer)
-         self.add_tensors(gguf_writer)
-         print(" gguf: write header")
-         gguf_writer.write_header_to_file()
-         print(" gguf: write metadata")
-         gguf_writer.write_kv_data_to_file()
-         print(" gguf: write tensors")
-         gguf_writer.write_tensors_to_file()
-         gguf_writer.close()
-
-     def add_params(self, gguf_writer):
-         hp = self.model.hyperparameters
-         cfg = self.cfg
-         if cfg.desc is not None:
-             desc = cfg.desc
-         else:
-             desc = f"converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format"
-         try:
-             # Filenames aren't necessarily valid UTF8.
-             name = cfg.name if cfg.name is not None else cfg.input.name
-         except UnicodeDecodeError:
-             name = None
-         print("* Adding model parameters and KV items")
-         if name is not None:
-             gguf_writer.add_name(name)
-         gguf_writer.add_description(desc)
-         gguf_writer.add_file_type(int(hp.ftype))
-         if self.params_override is not None:
-             po = self.params_override
-             assert po.n_embd == hp.n_embd, "Model hyperparams mismatch"
-             assert po.n_layer == hp.n_layer, "Model hyperparams mismatch"
-             assert po.n_head == hp.n_head, "Model hyperparams mismatch"
-             gguf_writer.add_context_length(po.n_ctx)
-             gguf_writer.add_embedding_length(po.n_embd)
-             gguf_writer.add_block_count(po.n_layer)
-             gguf_writer.add_feed_forward_length(po.n_ff)
-             gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
-             gguf_writer.add_head_count(po.n_head)
-             gguf_writer.add_head_count_kv(po.n_head_kv)
-             gguf_writer.add_layer_norm_rms_eps(po.f_norm_eps)
-             return
-         gguf_writer.add_context_length(cfg.context_length)
-         gguf_writer.add_embedding_length(hp.n_embd)
-         gguf_writer.add_block_count(hp.n_layer)
-         gguf_writer.add_feed_forward_length(hp.n_ff)
-         gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
-         gguf_writer.add_head_count(hp.n_head)
-         gguf_writer.add_head_count_kv(self.n_kv_head)
-         gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
-
-     def add_vocab(self, gguf_writer):
-         hp = self.model.hyperparameters
-         gguf_writer.add_tokenizer_model("llama")
-         tokens = []
-         scores = []
-         toktypes = []
-         if self.vocab_override is not None:
-             vo = self.vocab_override
-             print("* Adding vocab item(s)")
-             for idx, (vbytes, score, ttype) in enumerate(vo.all_tokens()):
-                 tokens.append(vbytes)
-                 scores.append(score)
-                 toktypes.append(ttype)
-             assert (
-                 len(tokens) == hp.n_vocab
-             ), f"Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}"
-             gguf_writer.add_token_list(tokens)
-             gguf_writer.add_token_scores(scores)
-             if len(toktypes) > 0:
-                 gguf_writer.add_token_types(toktypes)
-             return
-         print(f"* Adding {hp.n_vocab} vocab item(s)")
-         assert (
-             len(self.model.vocab.items) >= 3
-         ), "Cannot handle unexpectedly short model vocab"
-         for tokid, (vbytes, vscore) in enumerate(self.model.vocab.items):
-             tt = 1  # Normal
-             # Special handling for UNK, BOS, EOS tokens.
-             if tokid <= 2:
-                 if tokid == 0:
-                     vbytes = b"<unk>"
-                     tt = 2
-                 elif tokid == 1:
-                     vbytes = b"<s>"
-                     tt = 3
-                 else:
-                     vbytes = b"</s>"
-                     tt = 3
-             elif len(vbytes) == 0:
-                 tt = 3  # Control
-             elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
-                 vbytes = bytes(f"<0x{vbytes[0]:02X}>", encoding="UTF-8")
-                 tt = 6  # Byte
-             else:
-                 vbytes = vbytes.replace(b" ", b"\xe2\x96\x81")
-             toktypes.append(tt)
-             tokens.append(vbytes)
-             scores.append(vscore)
-         gguf_writer.add_token_list(tokens)
-         gguf_writer.add_token_scores(scores)
-         gguf_writer.add_token_types(toktypes)
-         gguf_writer.add_unk_token_id(0)
-         gguf_writer.add_bos_token_id(1)
-         gguf_writer.add_eos_token_id(2)
-
-     def add_tensors(self, gguf_writer):
-         tensor_map = self.name_map
-         data = self.data
-         print(f"* Adding {len(self.model.tensors)} tensor(s)")
-         for tensor in self.model.tensors:
-             name = str(tensor.name, "UTF-8")
-             mapped_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-             assert mapped_name is not None, f"Bad name {name}"
-             tempdims = list(tensor.dims[:])
-             if len(tempdims) > 1:
-                 temp = tempdims[1]
-                 tempdims[1] = tempdims[0]
-                 tempdims[0] = temp
-             # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
-             gguf_writer.add_tensor(
-                 mapped_name,
-                 data[tensor.start_offset : tensor.start_offset + tensor.len_bytes],
-                 raw_shape=tempdims,
-                 raw_dtype=tensor.dtype,
-             )
-
-
- def handle_args():
-     parser = argparse.ArgumentParser(description="Convert GGML models to GGUF")
-     parser.add_argument(
-         "--input", "-i", type=Path, required=True, help="Input GGMLv3 filename"
-     )
-     parser.add_argument(
-         "--output", "-o", type=Path, required=True, help="Output GGUF filename"
-     )
-     parser.add_argument("--name", help="Set model name")
-     parser.add_argument("--desc", help="Set model description")
-     parser.add_argument(
-         "--gqa",
-         type=int,
-         default=1,
-         help="grouped-query attention factor (use 8 for LLaMA2 70B)",
-     )
-     parser.add_argument(
-         "--eps",
-         default="5.0e-06",
-         help="RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2",
-     )
-     parser.add_argument(
-         "--context-length",
-         "-c",
-         type=int,
-         default=2048,
-         help="Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096",
-     )
-     return parser.parse_args()
-
-
- from dataclasses import dataclass
-
-
- @dataclass
- class Config:
-     input: Path
-     output: Path
-     name: Optional[str]
-     desc: Optional[str]
-     gqa: int
-     eps: float
-     context_length: int
-
-
- def convert(
-     source_path: str,
-     dest_path: str,
-     model_name: Optional[str] = None,
-     model_desc: Optional[str] = None,
-     gqa: int = 1,
-     eps: float = 5.0e-06,
-     context_length: int = 2048,
- ):
-     cfg = Config(
-         input=Path(source_path),
-         output=Path(dest_path),
-         name=model_name,
-         desc=model_desc,
-         gqa=gqa,
-         eps=eps,
-         context_length=context_length,
-     )
-     print(f"* Using config: {cfg}")
-     print(
-         "\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n"
-     )
-     if cfg.gqa == 1 or cfg.eps == "5.0e-06":
-         print(
-             '- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".'
-         )
-     data = np.memmap(cfg.input, mode="r")
-     model = GGMLModel()
-     print("* Scanning GGML input file")
-     model.load(data, 0)
-     print(f"* GGML model hyperparameters: {model.hyperparameters}")
-     vocab_override = None
-     params_override = None
-     special_vocab = None
-     print(
-         "\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n"
-     )
-     if model.file_format == GGMLFormat.GGML:
-         print(
-             "! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!"
-         )
-     converter = GGMLToGGUF(
-         model,
-         data,
-         cfg,
-         params_override=params_override,
-         vocab_override=vocab_override,
-         special_vocab=special_vocab,
-     )
-     converter.save()
-     print(f"* Successful completion. Output saved to: {cfg.output}")