xinference 0.14.1.post1__py3-none-any.whl → 0.14.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (194)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +15 -34
  3. xinference/client/restful/restful_client.py +2 -2
  4. xinference/core/chat_interface.py +45 -10
  5. xinference/core/image_interface.py +9 -0
  6. xinference/core/model.py +8 -5
  7. xinference/core/scheduler.py +1 -2
  8. xinference/core/worker.py +49 -42
  9. xinference/deploy/cmdline.py +2 -2
  10. xinference/deploy/test/test_cmdline.py +7 -7
  11. xinference/model/audio/chattts.py +24 -9
  12. xinference/model/audio/core.py +8 -2
  13. xinference/model/audio/fish_speech.py +228 -0
  14. xinference/model/audio/model_spec.json +8 -0
  15. xinference/model/embedding/core.py +23 -1
  16. xinference/model/image/model_spec.json +2 -1
  17. xinference/model/image/model_spec_modelscope.json +2 -1
  18. xinference/model/image/stable_diffusion/core.py +49 -1
  19. xinference/model/llm/__init__.py +26 -27
  20. xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
  21. xinference/model/llm/llm_family.json +606 -1266
  22. xinference/model/llm/llm_family.py +16 -139
  23. xinference/model/llm/llm_family_modelscope.json +276 -313
  24. xinference/model/llm/lmdeploy/__init__.py +0 -0
  25. xinference/model/llm/lmdeploy/core.py +557 -0
  26. xinference/model/llm/memory.py +9 -9
  27. xinference/model/llm/sglang/core.py +2 -2
  28. xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
  29. xinference/model/llm/{pytorch → transformers}/cogvlm2.py +4 -45
  30. xinference/model/llm/transformers/cogvlm2_video.py +524 -0
  31. xinference/model/llm/{pytorch → transformers}/core.py +3 -10
  32. xinference/model/llm/{pytorch → transformers}/glm4v.py +2 -23
  33. xinference/model/llm/transformers/intern_vl.py +540 -0
  34. xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
  35. xinference/model/llm/{pytorch → transformers}/minicpmv25.py +2 -23
  36. xinference/model/llm/{pytorch → transformers}/minicpmv26.py +66 -41
  37. xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
  38. xinference/model/llm/{pytorch → transformers}/yi_vl.py +2 -24
  39. xinference/model/llm/utils.py +85 -70
  40. xinference/model/llm/vllm/core.py +110 -11
  41. xinference/model/utils.py +1 -95
  42. xinference/thirdparty/fish_speech/__init__.py +0 -0
  43. xinference/thirdparty/fish_speech/fish_speech/__init__.py +0 -0
  44. xinference/thirdparty/fish_speech/fish_speech/callbacks/__init__.py +3 -0
  45. xinference/thirdparty/fish_speech/fish_speech/callbacks/grad_norm.py +113 -0
  46. xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
  47. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  48. xinference/thirdparty/fish_speech/fish_speech/conversation.py +2 -0
  49. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  50. xinference/thirdparty/fish_speech/fish_speech/datasets/concat_repeat.py +53 -0
  51. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  52. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_pb2.py +33 -0
  53. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_stream.py +36 -0
  54. xinference/thirdparty/fish_speech/fish_speech/datasets/semantic.py +496 -0
  55. xinference/thirdparty/fish_speech/fish_speech/datasets/vqgan.py +147 -0
  56. xinference/thirdparty/fish_speech/fish_speech/i18n/__init__.py +3 -0
  57. xinference/thirdparty/fish_speech/fish_speech/i18n/core.py +40 -0
  58. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  59. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +122 -0
  60. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +122 -0
  61. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +123 -0
  62. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +133 -0
  63. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +122 -0
  64. xinference/thirdparty/fish_speech/fish_speech/i18n/scan.py +122 -0
  65. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  66. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/__init__.py +0 -0
  67. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lit_module.py +202 -0
  68. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +779 -0
  69. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lora.py +92 -0
  70. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +3 -0
  71. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +442 -0
  72. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  73. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +44 -0
  74. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +625 -0
  75. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +139 -0
  76. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +115 -0
  77. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +225 -0
  78. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/utils.py +94 -0
  79. xinference/thirdparty/fish_speech/fish_speech/scheduler.py +40 -0
  80. xinference/thirdparty/fish_speech/fish_speech/text/__init__.py +4 -0
  81. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/__init__.py +0 -0
  82. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_class.py +172 -0
  83. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_constant.py +30 -0
  84. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_util.py +342 -0
  85. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/cardinal.py +32 -0
  86. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/date.py +75 -0
  87. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/digit.py +32 -0
  88. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/fraction.py +35 -0
  89. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/money.py +43 -0
  90. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/percentage.py +33 -0
  91. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/telephone.py +51 -0
  92. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +177 -0
  93. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +69 -0
  94. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +130 -0
  95. xinference/thirdparty/fish_speech/fish_speech/train.py +139 -0
  96. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +23 -0
  97. xinference/thirdparty/fish_speech/fish_speech/utils/braceexpand.py +217 -0
  98. xinference/thirdparty/fish_speech/fish_speech/utils/context.py +13 -0
  99. xinference/thirdparty/fish_speech/fish_speech/utils/file.py +16 -0
  100. xinference/thirdparty/fish_speech/fish_speech/utils/instantiators.py +50 -0
  101. xinference/thirdparty/fish_speech/fish_speech/utils/logger.py +55 -0
  102. xinference/thirdparty/fish_speech/fish_speech/utils/logging_utils.py +48 -0
  103. xinference/thirdparty/fish_speech/fish_speech/utils/rich_utils.py +100 -0
  104. xinference/thirdparty/fish_speech/fish_speech/utils/spectrogram.py +122 -0
  105. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +114 -0
  106. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  107. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +120 -0
  108. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1237 -0
  109. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  110. xinference/thirdparty/fish_speech/tools/api.py +495 -0
  111. xinference/thirdparty/fish_speech/tools/auto_rerank.py +159 -0
  112. xinference/thirdparty/fish_speech/tools/download_models.py +55 -0
  113. xinference/thirdparty/fish_speech/tools/extract_model.py +21 -0
  114. xinference/thirdparty/fish_speech/tools/file.py +108 -0
  115. xinference/thirdparty/fish_speech/tools/gen_ref.py +36 -0
  116. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  117. xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +169 -0
  118. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +171 -0
  119. xinference/thirdparty/fish_speech/tools/llama/generate.py +698 -0
  120. xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +95 -0
  121. xinference/thirdparty/fish_speech/tools/llama/quantize.py +497 -0
  122. xinference/thirdparty/fish_speech/tools/llama/rebuild_tokenizer.py +57 -0
  123. xinference/thirdparty/fish_speech/tools/merge_asr_files.py +55 -0
  124. xinference/thirdparty/fish_speech/tools/post_api.py +164 -0
  125. xinference/thirdparty/fish_speech/tools/sensevoice/__init__.py +0 -0
  126. xinference/thirdparty/fish_speech/tools/sensevoice/auto_model.py +573 -0
  127. xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +332 -0
  128. xinference/thirdparty/fish_speech/tools/sensevoice/vad_utils.py +61 -0
  129. xinference/thirdparty/fish_speech/tools/smart_pad.py +47 -0
  130. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  131. xinference/thirdparty/fish_speech/tools/vqgan/create_train_split.py +83 -0
  132. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +227 -0
  133. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +120 -0
  134. xinference/thirdparty/fish_speech/tools/webui.py +619 -0
  135. xinference/thirdparty/fish_speech/tools/whisper_asr.py +176 -0
  136. xinference/thirdparty/internvl/__init__.py +0 -0
  137. xinference/thirdparty/internvl/conversation.py +393 -0
  138. xinference/thirdparty/omnilmm/model/utils.py +16 -1
  139. xinference/web/ui/build/asset-manifest.json +3 -3
  140. xinference/web/ui/build/index.html +1 -1
  141. xinference/web/ui/build/static/js/main.661c7b0a.js +3 -0
  142. xinference/web/ui/build/static/js/{main.17ca0398.js.map → main.661c7b0a.js.map} +1 -1
  143. xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +1 -0
  144. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
  145. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
  146. xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
  147. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
  148. xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
  149. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
  150. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
  151. xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
  152. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
  153. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
  154. xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
  155. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
  156. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/METADATA +22 -13
  157. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/RECORD +170 -79
  158. xinference/locale/utils.py +0 -39
  159. xinference/locale/zh_CN.json +0 -26
  160. xinference/model/llm/ggml/tools/__init__.py +0 -15
  161. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
  162. xinference/model/llm/ggml/tools/gguf.py +0 -884
  163. xinference/model/llm/pytorch/__init__.py +0 -13
  164. xinference/model/llm/pytorch/baichuan.py +0 -81
  165. xinference/model/llm/pytorch/falcon.py +0 -138
  166. xinference/model/llm/pytorch/intern_vl.py +0 -352
  167. xinference/model/llm/pytorch/vicuna.py +0 -69
  168. xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
  169. xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
  170. xinference/web/ui/node_modules/.cache/babel-loader/2f40209b32e7e46a2eab6b8c8a355eb42c3caa8bc3228dd929f32fd2b3940294.json +0 -1
  171. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
  172. xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
  173. xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
  174. xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
  175. xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
  176. xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
  177. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
  178. xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
  179. xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
  180. xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
  181. xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
  182. /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
  183. /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
  184. /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
  185. /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
  186. /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
  187. /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
  188. /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
  189. /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
  190. /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.661c7b0a.js.LICENSE.txt} +0 -0
  191. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/LICENSE +0 -0
  192. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/WHEEL +0 -0
  193. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/entry_points.txt +0 -0
  194. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/top_level.txt +0 -0

xinference/locale/zh_CN.json
@@ -1,26 +0,0 @@
- {
-     "Please create model first": "请先创建模型",
-     "stop reason": "停止原因",
-     "Show stop reason": "展示停止原因",
-     "Max tokens": "最大 token 数量",
-     "The maximum number of tokens to generate.": "生成 token 数量最大值",
-     "Temperature": "温度参数",
-     "The temperature to use for sampling.": "温度参数用于调整输出的多样性,数值越高多样性越高",
-     "Top P": "Top P",
-     "The top-p value to use for sampling.": "用于控制生成文本的确定性,数值越低确定性越高",
-     "Window size": "窗口大小",
-     "Window size of chat history.": "用于生成回复的聊天历史窗口大小",
-     "show stop reason": "展示停止原因",
-     "Downloading": "下载中",
-     "Download failed, please retry.": "下载失败,请重新下载",
-     "model name": "模型名",
-     "model format": "模型格式",
-     "model size in billions": "模型大小(B)",
-     "quantization": "模型量化方式",
-     "Parameters": "参数调整",
-     "create": "创建",
-     "select model": "选择模型",
-     "Arena": "角斗场",
-     "Chat": "聊天",
-     "Input": "输入"
- }

xinference/model/llm/ggml/tools/__init__.py
@@ -1,15 +0,0 @@
- # Copyright 2022-2023 XProbe Inc.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- from .convert_ggml_to_gguf import convert
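
Note: the line above was this package's only export, convert, the GGML-to-GGUF entry point dropped in 0.14.3. For anyone still pinned to 0.14.1.post1, a minimal usage sketch against the signature shown in the next hunk (the file paths here are hypothetical):

    from xinference.model.llm.ggml.tools import convert  # removed in 0.14.3

    # Convert a legacy GGMLv3 LLaMA-2 file to GGUF (hypothetical paths).
    convert(
        source_path="llama-2-7b.ggmlv3.q4_0.bin",
        dest_path="llama-2-7b.q4_0.gguf",
        eps=1e-5,             # the script's own help text recommends 1e-5 for LLaMA2
        context_length=4096,  # LLaMA2 default, per the --context-length help text
    )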

xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py
@@ -1,498 +0,0 @@
- #!/usr/bin/env python3
- # Copied from llama.cpp to convert ggml file to gguf
- from __future__ import annotations
-
- import argparse
- import struct
- from enum import IntEnum
- from pathlib import Path
- from typing import Optional
-
- import numpy as np
-
- from . import gguf
-
- # Note: Does not support GGML_QKK_64
- QK_K = 256
- # Items here are (block size, type size)
- GGML_QUANT_SIZES = {
-     gguf.GGMLQuantizationType.F32: (1, 4),
-     gguf.GGMLQuantizationType.F16: (1, 2),
-     gguf.GGMLQuantizationType.Q4_0: (32, 2 + 16),
-     gguf.GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
-     gguf.GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),
-     gguf.GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16),
-     gguf.GGMLQuantizationType.Q8_0: (32, 2 + 32),
-     gguf.GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32),
-     gguf.GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4),
-     gguf.GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12),
-     gguf.GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12),
-     gguf.GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
-     gguf.GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
-     gguf.GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8),
- }
-
-
- class GGMLFormat(IntEnum):
-     GGML = 0
-     GGMF = 1
-     GGJT = 2
-
-
- class GGMLFType(IntEnum):
-     ALL_F32 = 0
-     MOSTLY_F16 = 1
-     MOSTLY_Q4_0 = 2
-     MOSTLY_Q4_1 = 3
-     MOSTLY_Q4_1_SOME_F16 = 4
-     MOSTLY_Q8_0 = 7
-     MOSTLY_Q5_0 = 8
-     MOSTLY_Q5_1 = 9
-     MOSTLY_Q2_K = 10
-     MOSTLY_Q3_K_S = 11
-     MOSTLY_Q3_K_M = 12
-     MOSTLY_Q3_K_L = 13
-     MOSTLY_Q4_K_S = 14
-     MOSTLY_Q4_K_M = 15
-     MOSTLY_Q5_K_S = 16
-     MOSTLY_Q5_K_M = 17
-     MOSTLY_Q6_K = 18
-
-
- class Hyperparameters:
-     def __init__(self):
-         self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
-         self.n_layer = self.n_rot = self.n_ff = 0
-         self.ftype = GGMLFType.ALL_F32
-
-     def set_n_ff(self, model):
-         ff_tensor_idx = model.tensor_map.get(b"layers.0.feed_forward.w1.weight")
-         assert ff_tensor_idx is not None, "Missing layer 0 FF tensor"
-         ff_tensor = model.tensors[ff_tensor_idx]
-         self.n_ff = ff_tensor.dims[1]
-
-     def load(self, data, offset):
-         (
-             self.n_vocab,
-             self.n_embd,
-             self.n_mult,
-             self.n_head,
-             self.n_layer,
-             self.n_rot,
-             ftype,
-         ) = struct.unpack("<7I", data[offset : offset + (4 * 7)])
-         try:
-             self.ftype = GGMLFType(ftype)
-         except ValueError:
-             raise ValueError(f"Invalid ftype {ftype}")
-         return 4 * 7
-
-     def __str__(self):
-         return f"<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>"
-
-
- class Vocab:
-     def __init__(self, load_scores=True):
-         self.items = []
-         self.load_scores = load_scores
-
-     def load(self, data, offset, n_vocab):
-         orig_offset = offset
-         for _ in range(n_vocab):
-             itemlen = struct.unpack("<I", data[offset : offset + 4])[0]
-             assert itemlen < 4096, "Absurd vocab item length"
-             offset += 4
-             item_text = bytes(data[offset : offset + itemlen])
-             offset += itemlen
-             if self.load_scores:
-                 item_score = struct.unpack("<f", data[offset : offset + 4])[0]
-                 offset += 4
-             else:
-                 item_score = 0.0
-             self.items.append((item_text, item_score))
-         return offset - orig_offset
-
-
- class Tensor:
-     def __init__(self, use_padding=True):
-         self.name = None
-         self.dims: tuple[int, ...] = ()  # type: ignore
-         self.dtype = None
-         self.start_offset = 0
-         self.len_bytes = np.int64(0)
-         self.use_padding = use_padding
-
-     def load(self, data, offset):
-         orig_offset = offset
-         (n_dims, name_len, dtype) = struct.unpack("<3I", data[offset : offset + 12])
-         assert n_dims >= 0 and n_dims <= 4, f"Invalid tensor dimensions {n_dims}"
-         assert name_len < 4096, "Absurd tensor name length"
-         quant = GGML_QUANT_SIZES.get(dtype)
-         assert quant is not None, "Unknown tensor type"
-         (blksize, tysize) = quant
-         offset += 12
-         self.dtype = dtype
-         self.dims = struct.unpack(f"<{n_dims}I", data[offset : offset + (4 * n_dims)])
-         offset += 4 * n_dims
-         self.name = bytes(data[offset : offset + name_len])
-         offset += name_len
-         pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
-         offset += pad
-         n_elems = np.prod(self.dims)
-         n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
-         self.start_offset = offset
-         self.len_bytes = n_bytes
-         offset += n_bytes
-         # print(n_dims, name_len, dtype, self.dims, self.name, pad)
-         return offset - orig_offset
-
-
- class GGMLModel:
-     def __init__(self):
-         self.hyperparameters = None
-         self.vocab = None
-         self.tensor_map = {}
-         self.tensors = []
-
-     def validate_header(self, data, offset):
-         magic = bytes(data[offset : offset + 4])
-         if magic == b"GGUF":
-             raise ValueError("File is already in GGUF format.")
-         if magic == b"lmgg":
-             self.file_format = GGMLFormat.GGML
-             self.format_version = 1
-             return 4
-         version = struct.unpack("<I", data[offset + 4 : offset + 8])[0]
-         if magic == b"fmgg":
-             if version != 1:
-                 raise ValueError(
-                     f"Cannot handle unexpected GGMF file version {version}"
-                 )
-             self.file_format = GGMLFormat.GGMF
-             self.format_version = version
-             return 8
-         if magic == b"tjgg":
-             if version < 1 or version > 3:
-                 raise ValueError(
-                     f"Cannot handle unexpected GGJT file version {version}"
-                 )
-             self.file_format = GGMLFormat.GGJT
-             self.format_version = version
-             return 8
-         raise ValueError(
-             f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file."
-         )
-
-     def validate_conversion(self, ftype):
-         err = ""
-         if self.file_format < GGMLFormat.GGJT or self.format_version < 2:
-             if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
-                 err = "Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2."
-         elif self.file_format == GGMLFormat.GGJT and self.format_version == 2:
-             if ftype in (
-                 GGMLFType.MOSTLY_Q4_0,
-                 GGMLFType.MOSTLY_Q4_1,
-                 GGMLFType.MOSTLY_Q4_1_SOME_F16,
-                 GGMLFType.MOSTLY_Q8_0,
-             ):
-                 err = "Q4 and Q8 quantizations changed in GGJTv3."
-         if len(err) > 0:
-             raise ValueError(
-                 f"{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion."
-             )
-
-     def load(self, data, offset):
-         offset += self.validate_header(data, offset)
-         hp = Hyperparameters()
-         offset += hp.load(data, offset)
-         print(
-             f"* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}"
-         )
-         self.validate_conversion(hp.ftype)
-         vocab = Vocab(load_scores=self.file_format > GGMLFormat.GGML)
-         offset += vocab.load(data, offset, hp.n_vocab)
-         tensors: list[Tensor] = []  # type: ignore
-         tensor_map = {}
-         while offset < len(data):
-             tensor = Tensor(use_padding=self.file_format > GGMLFormat.GGMF)
-             offset += tensor.load(data, offset)
-             tensor_map[tensor.name] = len(tensors)
-             tensors.append(tensor)
-         self.hyperparameters = hp
-         self.vocab = vocab
-         self.tensors = tensors
-         self.tensor_map = tensor_map
-         hp.set_n_ff(self)
-         return offset
-
-
- class GGMLToGGUF:
-     def __init__(
-         self,
-         ggml_model,
-         data,
-         cfg,
-         params_override=None,
-         vocab_override=None,
-         special_vocab=None,
-     ):
-         hp = ggml_model.hyperparameters
-         self.model = ggml_model
-         self.data = data
-         self.cfg = cfg
-         self.params_override = params_override
-         self.vocab_override = vocab_override
-         self.special_vocab = special_vocab
-         if params_override is not None:
-             n_kv_head = params_override.n_head_kv
-         else:
-             if cfg.gqa == 1:
-                 n_kv_head = hp.n_head
-             else:
-                 gqa = float(cfg.gqa)
-                 n_kv_head = None
-                 for x in range(1, 256):
-                     if float(hp.n_head) / float(x) == gqa:
-                         n_kv_head = x
-                 assert (
-                     n_kv_head is not None
-                 ), "Couldn't determine n_kv_head from GQA param"
-                 print(f"- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}")
-         self.n_kv_head = n_kv_head
-         self.name_map = gguf.get_tensor_name_map(
-             gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer
-         )
-
-     def save(self):
-         print("* Preparing to save GGUF file")
-         gguf_writer = gguf.GGUFWriter(
-             self.cfg.output,
-             gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
-             use_temp_file=False,
-         )
-         self.add_params(gguf_writer)
-         self.add_vocab(gguf_writer)
-         if self.special_vocab is not None:
-             self.special_vocab.add_to_gguf(gguf_writer)
-         self.add_tensors(gguf_writer)
-         print("    gguf: write header")
-         gguf_writer.write_header_to_file()
-         print("    gguf: write metadata")
-         gguf_writer.write_kv_data_to_file()
-         print("    gguf: write tensors")
-         gguf_writer.write_tensors_to_file()
-         gguf_writer.close()
-
-     def add_params(self, gguf_writer):
-         hp = self.model.hyperparameters
-         cfg = self.cfg
-         if cfg.desc is not None:
-             desc = cfg.desc
-         else:
-             desc = f"converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format"
-         try:
-             # Filenames aren't necessarily valid UTF8.
-             name = cfg.name if cfg.name is not None else cfg.input.name
-         except UnicodeDecodeError:
-             name = None
-         print("* Adding model parameters and KV items")
-         if name is not None:
-             gguf_writer.add_name(name)
-         gguf_writer.add_description(desc)
-         gguf_writer.add_file_type(int(hp.ftype))
-         if self.params_override is not None:
-             po = self.params_override
-             assert po.n_embd == hp.n_embd, "Model hyperparams mismatch"
-             assert po.n_layer == hp.n_layer, "Model hyperparams mismatch"
-             assert po.n_head == hp.n_head, "Model hyperparams mismatch"
-             gguf_writer.add_context_length(po.n_ctx)
-             gguf_writer.add_embedding_length(po.n_embd)
-             gguf_writer.add_block_count(po.n_layer)
-             gguf_writer.add_feed_forward_length(po.n_ff)
-             gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
-             gguf_writer.add_head_count(po.n_head)
-             gguf_writer.add_head_count_kv(po.n_head_kv)
-             gguf_writer.add_layer_norm_rms_eps(po.f_norm_eps)
-             return
-         gguf_writer.add_context_length(cfg.context_length)
-         gguf_writer.add_embedding_length(hp.n_embd)
-         gguf_writer.add_block_count(hp.n_layer)
-         gguf_writer.add_feed_forward_length(hp.n_ff)
-         gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
-         gguf_writer.add_head_count(hp.n_head)
-         gguf_writer.add_head_count_kv(self.n_kv_head)
-         gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
-
-     def add_vocab(self, gguf_writer):
-         hp = self.model.hyperparameters
-         gguf_writer.add_tokenizer_model("llama")
-         tokens = []
-         scores = []
-         toktypes = []
-         if self.vocab_override is not None:
-             vo = self.vocab_override
-             print("* Adding vocab item(s)")
-             for idx, (vbytes, score, ttype) in enumerate(vo.all_tokens()):
-                 tokens.append(vbytes)
-                 scores.append(score)
-                 toktypes.append(ttype)
-             assert (
-                 len(tokens) == hp.n_vocab
-             ), f"Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}"
-             gguf_writer.add_token_list(tokens)
-             gguf_writer.add_token_scores(scores)
-             if len(toktypes) > 0:
-                 gguf_writer.add_token_types(toktypes)
-             return
-         print(f"* Adding {hp.n_vocab} vocab item(s)")
-         assert (
-             len(self.model.vocab.items) >= 3
-         ), "Cannot handle unexpectedly short model vocab"
-         for tokid, (vbytes, vscore) in enumerate(self.model.vocab.items):
-             tt = 1  # Normal
-             # Special handling for UNK, BOS, EOS tokens.
-             if tokid <= 2:
-                 if tokid == 0:
-                     vbytes = b"<unk>"
-                     tt = 2
-                 elif tokid == 1:
-                     vbytes = b"<s>"
-                     tt = 3
-                 else:
-                     vbytes = b"</s>"
-                     tt = 3
-             elif len(vbytes) == 0:
-                 tt = 3  # Control
-             elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
-                 vbytes = bytes(f"<0x{vbytes[0]:02X}>", encoding="UTF-8")
-                 tt = 6  # Byte
-             else:
-                 vbytes = vbytes.replace(b" ", b"\xe2\x96\x81")
-             toktypes.append(tt)
-             tokens.append(vbytes)
-             scores.append(vscore)
-         gguf_writer.add_token_list(tokens)
-         gguf_writer.add_token_scores(scores)
-         gguf_writer.add_token_types(toktypes)
-         gguf_writer.add_unk_token_id(0)
-         gguf_writer.add_bos_token_id(1)
-         gguf_writer.add_eos_token_id(2)
-
-     def add_tensors(self, gguf_writer):
-         tensor_map = self.name_map
-         data = self.data
-         print(f"* Adding {len(self.model.tensors)} tensor(s)")
-         for tensor in self.model.tensors:
-             name = str(tensor.name, "UTF-8")
-             mapped_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-             assert mapped_name is not None, f"Bad name {name}"
-             tempdims = list(tensor.dims[:])
-             if len(tempdims) > 1:
-                 temp = tempdims[1]
-                 tempdims[1] = tempdims[0]
-                 tempdims[0] = temp
-             # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
-             gguf_writer.add_tensor(
-                 mapped_name,
-                 data[tensor.start_offset : tensor.start_offset + tensor.len_bytes],
-                 raw_shape=tempdims,
-                 raw_dtype=tensor.dtype,
-             )
-
-
- def handle_args():
-     parser = argparse.ArgumentParser(description="Convert GGML models to GGUF")
-     parser.add_argument(
-         "--input", "-i", type=Path, required=True, help="Input GGMLv3 filename"
-     )
-     parser.add_argument(
-         "--output", "-o", type=Path, required=True, help="Output GGUF filename"
-     )
-     parser.add_argument("--name", help="Set model name")
-     parser.add_argument("--desc", help="Set model description")
-     parser.add_argument(
-         "--gqa",
-         type=int,
-         default=1,
-         help="grouped-query attention factor (use 8 for LLaMA2 70B)",
-     )
-     parser.add_argument(
-         "--eps",
-         default="5.0e-06",
-         help="RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2",
-     )
-     parser.add_argument(
-         "--context-length",
-         "-c",
-         type=int,
-         default=2048,
-         help="Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096",
-     )
-     return parser.parse_args()
-
-
- from dataclasses import dataclass
-
-
- @dataclass
- class Config:
-     input: Path
-     output: Path
-     name: Optional[str]
-     desc: Optional[str]
-     gqa: int
-     eps: float
-     context_length: int
-
-
- def convert(
-     source_path: str,
-     dest_path: str,
-     model_name: Optional[str] = None,
-     model_desc: Optional[str] = None,
-     gqa: int = 1,
-     eps: float = 5.0e-06,
-     context_length: int = 2048,
- ):
-     cfg = Config(
-         input=Path(source_path),
-         output=Path(dest_path),
-         name=model_name,
-         desc=model_desc,
-         gqa=gqa,
-         eps=eps,
-         context_length=context_length,
-     )
-     print(f"* Using config: {cfg}")
-     print(
-         "\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n"
-     )
-     if cfg.gqa == 1 or cfg.eps == "5.0e-06":
-         print(
-             '- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".'
-         )
-     data = np.memmap(cfg.input, mode="r")
-     model = GGMLModel()
-     print("* Scanning GGML input file")
-     model.load(data, 0)
-     print(f"* GGML model hyperparameters: {model.hyperparameters}")
-     vocab_override = None
-     params_override = None
-     special_vocab = None
-     print(
-         "\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n"
-     )
-     if model.file_format == GGMLFormat.GGML:
-         print(
-             "! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!"
-         )
-     converter = GGMLToGGUF(
-         model,
-         data,
-         cfg,
-         params_override=params_override,
-         vocab_override=vocab_override,
-         special_vocab=special_vocab,
-     )
-     converter.save()
-     print(f"* Successful completion. Output saved to: {cfg.output}")