xinference 0.16.3__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic.

Files changed (373)
  1. xinference/_compat.py +24 -2
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +219 -77
  4. xinference/client/restful/restful_client.py +47 -2
  5. xinference/constants.py +1 -0
  6. xinference/core/chat_interface.py +6 -1
  7. xinference/core/model.py +124 -34
  8. xinference/core/supervisor.py +180 -12
  9. xinference/core/utils.py +73 -4
  10. xinference/core/worker.py +102 -4
  11. xinference/deploy/cmdline.py +3 -1
  12. xinference/deploy/test/test_cmdline.py +56 -0
  13. xinference/isolation.py +24 -0
  14. xinference/model/audio/__init__.py +12 -0
  15. xinference/model/audio/core.py +37 -4
  16. xinference/model/audio/cosyvoice.py +39 -6
  17. xinference/model/audio/f5tts.py +200 -0
  18. xinference/model/audio/f5tts_mlx.py +260 -0
  19. xinference/model/audio/fish_speech.py +70 -110
  20. xinference/model/audio/melotts.py +110 -0
  21. xinference/model/audio/model_spec.json +179 -3
  22. xinference/model/audio/model_spec_modelscope.json +27 -0
  23. xinference/model/audio/utils.py +32 -0
  24. xinference/model/audio/whisper.py +35 -10
  25. xinference/model/audio/whisper_mlx.py +208 -0
  26. xinference/model/embedding/core.py +322 -6
  27. xinference/model/embedding/model_spec.json +8 -1
  28. xinference/model/embedding/model_spec_modelscope.json +9 -1
  29. xinference/model/image/core.py +69 -1
  30. xinference/model/image/model_spec.json +145 -4
  31. xinference/model/image/model_spec_modelscope.json +150 -4
  32. xinference/model/image/stable_diffusion/core.py +50 -15
  33. xinference/model/llm/__init__.py +6 -2
  34. xinference/model/llm/llm_family.json +1055 -93
  35. xinference/model/llm/llm_family.py +15 -36
  36. xinference/model/llm/llm_family_modelscope.json +1031 -78
  37. xinference/model/llm/memory.py +1 -1
  38. xinference/model/llm/mlx/core.py +285 -47
  39. xinference/model/llm/sglang/core.py +2 -0
  40. xinference/model/llm/transformers/chatglm.py +9 -5
  41. xinference/model/llm/transformers/cogagent.py +272 -0
  42. xinference/model/llm/transformers/core.py +3 -0
  43. xinference/model/llm/transformers/glm_edge_v.py +230 -0
  44. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  45. xinference/model/llm/transformers/utils.py +16 -8
  46. xinference/model/llm/utils.py +55 -4
  47. xinference/model/llm/vllm/core.py +137 -12
  48. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  49. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  50. xinference/model/llm/vllm/xavier/block.py +111 -0
  51. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  52. xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
  53. xinference/model/llm/vllm/xavier/collective.py +74 -0
  54. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  55. xinference/model/llm/vllm/xavier/engine.py +247 -0
  56. xinference/model/llm/vllm/xavier/executor.py +134 -0
  57. xinference/model/llm/vllm/xavier/scheduler.py +438 -0
  58. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  59. xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
  60. xinference/model/llm/vllm/xavier/transfer.py +319 -0
  61. xinference/model/rerank/core.py +11 -4
  62. xinference/model/video/diffusers.py +14 -0
  63. xinference/model/video/model_spec.json +15 -0
  64. xinference/model/video/model_spec_modelscope.json +16 -0
  65. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  66. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  67. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  68. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  69. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  70. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  71. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  74. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  75. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  76. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  77. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  78. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  79. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  80. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  81. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  84. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  85. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  86. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  87. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  88. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  89. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  90. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  91. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  92. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  93. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  94. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  95. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  96. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  97. xinference/thirdparty/f5_tts/api.py +166 -0
  98. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  99. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  100. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  101. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  102. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  103. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  104. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  105. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  106. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  107. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  108. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  109. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  110. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  111. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  112. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  113. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  114. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  115. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  116. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  117. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  118. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  119. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  120. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  121. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  122. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  123. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  124. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  125. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  126. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  127. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  128. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  129. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  130. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  131. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  132. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  133. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  134. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  135. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  136. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  137. xinference/thirdparty/f5_tts/train/README.md +77 -0
  138. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  139. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  140. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  141. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  142. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  143. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  144. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  145. xinference/thirdparty/f5_tts/train/train.py +75 -0
  146. xinference/thirdparty/fish_speech/fish_speech/conversation.py +266 -1
  147. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
  148. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
  149. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
  150. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
  151. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
  152. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +137 -29
  153. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
  154. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
  155. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +17 -11
  156. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  157. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  158. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  159. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
  160. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
  161. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
  162. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +2 -2
  163. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +34 -18
  164. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  165. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  166. xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
  167. xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
  168. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  169. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  170. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  171. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  172. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  173. xinference/thirdparty/fish_speech/tools/llama/generate.py +484 -72
  174. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  175. xinference/thirdparty/fish_speech/tools/schema.py +170 -0
  176. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  177. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  178. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  179. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  180. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  181. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  182. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  183. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  184. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  185. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  186. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
  187. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
  188. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  189. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  190. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  191. xinference/thirdparty/matcha/utils/utils.py +2 -2
  192. xinference/thirdparty/melo/api.py +135 -0
  193. xinference/thirdparty/melo/app.py +61 -0
  194. xinference/thirdparty/melo/attentions.py +459 -0
  195. xinference/thirdparty/melo/commons.py +160 -0
  196. xinference/thirdparty/melo/configs/config.json +94 -0
  197. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  198. xinference/thirdparty/melo/data_utils.py +413 -0
  199. xinference/thirdparty/melo/download_utils.py +67 -0
  200. xinference/thirdparty/melo/infer.py +25 -0
  201. xinference/thirdparty/melo/init_downloads.py +14 -0
  202. xinference/thirdparty/melo/losses.py +58 -0
  203. xinference/thirdparty/melo/main.py +36 -0
  204. xinference/thirdparty/melo/mel_processing.py +174 -0
  205. xinference/thirdparty/melo/models.py +1030 -0
  206. xinference/thirdparty/melo/modules.py +598 -0
  207. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  208. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  209. xinference/thirdparty/melo/preprocess_text.py +135 -0
  210. xinference/thirdparty/melo/split_utils.py +174 -0
  211. xinference/thirdparty/melo/text/__init__.py +35 -0
  212. xinference/thirdparty/melo/text/chinese.py +199 -0
  213. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  214. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  215. xinference/thirdparty/melo/text/cleaner.py +36 -0
  216. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  217. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  218. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  219. xinference/thirdparty/melo/text/english.py +284 -0
  220. xinference/thirdparty/melo/text/english_bert.py +39 -0
  221. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  222. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  223. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  224. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  225. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  226. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  227. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  228. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  229. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  230. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  231. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  232. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  233. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  234. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  235. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  236. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  237. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  238. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  239. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  240. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  241. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  242. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  243. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  244. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  245. xinference/thirdparty/melo/text/french.py +94 -0
  246. xinference/thirdparty/melo/text/french_bert.py +39 -0
  247. xinference/thirdparty/melo/text/japanese.py +647 -0
  248. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  249. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  250. xinference/thirdparty/melo/text/korean.py +192 -0
  251. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  252. xinference/thirdparty/melo/text/spanish.py +122 -0
  253. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  254. xinference/thirdparty/melo/text/symbols.py +290 -0
  255. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  256. xinference/thirdparty/melo/train.py +635 -0
  257. xinference/thirdparty/melo/train.sh +19 -0
  258. xinference/thirdparty/melo/transforms.py +209 -0
  259. xinference/thirdparty/melo/utils.py +424 -0
  260. xinference/types.py +17 -1
  261. xinference/web/ui/build/asset-manifest.json +6 -6
  262. xinference/web/ui/build/index.html +1 -1
  263. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  264. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  265. xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
  266. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  267. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  268. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  269. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  270. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  271. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  272. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  273. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  274. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  275. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  276. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  277. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  278. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  279. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  280. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  281. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  282. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  283. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  284. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  285. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  286. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  287. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  288. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  289. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  290. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  291. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  292. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  293. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  294. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  295. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  296. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  297. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  298. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  299. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  300. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  301. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  302. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  303. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  304. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  305. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  306. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  307. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  308. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  309. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  310. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  311. xinference/web/ui/node_modules/.package-lock.json +67 -3
  312. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  313. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  314. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  315. xinference/web/ui/node_modules/i18next/package.json +129 -0
  316. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  317. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  318. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  319. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  320. xinference/web/ui/package-lock.json +69 -3
  321. xinference/web/ui/package.json +2 -0
  322. xinference/web/ui/src/locales/en.json +186 -0
  323. xinference/web/ui/src/locales/zh.json +186 -0
  324. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/METADATA +96 -36
  325. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/RECORD +335 -146
  326. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/WHEEL +1 -1
  327. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  328. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  329. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  330. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  331. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  332. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  333. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  334. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  335. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  336. xinference/thirdparty/fish_speech/tools/api.py +0 -440
  337. xinference/thirdparty/fish_speech/tools/commons.py +0 -35
  338. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  339. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -34
  340. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  341. xinference/thirdparty/fish_speech/tools/webui.py +0 -485
  342. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  343. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  344. xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
  345. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  346. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  347. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  348. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  349. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  350. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  351. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  352. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  353. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  354. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  355. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  356. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  357. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  358. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  359. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  360. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  361. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  362. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  363. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  364. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  365. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  366. /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
  367. /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
  368. /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
  369. /xinference/thirdparty/{fish_speech/fish_speech/configs → melo/text/fr_phonemizer}/__init__.py +0 -0
  370. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  371. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
  372. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
  373. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/memory.py

@@ -14,7 +14,7 @@
 
 # NOTE:
 #
-# The algorithum is ported from https://github.com/RahulSChand/gpu_poor
+# The algorithm is ported from https://github.com/RahulSChand/gpu_poor
 #
 # Improvement:
 #
xinference/model/llm/mlx/core.py

@@ -17,7 +17,8 @@ import platform
 import sys
 import time
 import uuid
-from typing import Dict, Iterator, List, Optional, TypedDict, Union
+from dataclasses import dataclass, field
+from typing import Any, Dict, Iterator, List, Optional, Tuple, TypedDict, Union
 
 from ....fields import max_tokens_field
 from ....types import (
@@ -53,6 +54,14 @@ class MLXGenerateConfig(TypedDict, total=False):
     stream: bool
     stream_options: Optional[Union[dict, None]]
     tools: Optional[List[Dict]]
+    lora_name: Optional[str]
+
+
+@dataclass
+class PromptCache:
+    cache: List[Any] = field(default_factory=list)
+    model_key: Tuple[str, Optional[str]] = ("", None)
+    tokens: List[int] = field(default_factory=list)
 
 
 class MLXModel(LLM):
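
The hunk above adds a `lora_name` generate option and a `PromptCache` that pins a reusable KV cache to the exact token prefix it was built from, keyed by (model_path, lora_name). A minimal standalone sketch of the reuse rule this enables (illustrative only; `reusable_suffix` is a hypothetical helper, not part of the diff):

    from dataclasses import dataclass, field
    from typing import Any, List, Optional, Tuple

    @dataclass
    class PromptCache:
        cache: List[Any] = field(default_factory=list)
        model_key: Tuple[str, Optional[str]] = ("", None)
        tokens: List[int] = field(default_factory=list)

    def reusable_suffix(pc: PromptCache, key, prompt: List[int]):
        """Return the uncached tail of `prompt` if the cache applies, else None."""
        n = len(pc.tokens)
        if pc.model_key != key or n >= len(prompt) or pc.tokens != prompt[:n]:
            return None  # different model/LoRA or prefix mismatch: caller rebuilds
        return prompt[n:]

    # A cache built over [1, 2] lets the prompt [1, 2, 3] skip straight to [3].
    assert reusable_suffix(PromptCache(tokens=[1, 2]), ("", None), [1, 2, 3]) == [3]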

@@ -69,6 +78,8 @@ class MLXModel(LLM):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._use_fast_tokenizer = True
         self._model_config: MLXModelConfig = self._sanitize_model_config(model_config)
+        self._max_kv_size = None
+        self._prompt_cache = None
         if peft_model is not None:
             raise ValueError("MLX engine has not supported lora yet")
 
@@ -92,10 +103,10 @@ class MLXModel(LLM):
         # default config is adapted from
         # https://github.com/ml-explore/mlx-examples/blob/f212b770d8b5143e23102eda20400ae43340f844/llms/mlx_lm/utils.py#L129
         generate_config.setdefault("temperature", 0.0)
+        generate_config.setdefault("logit_bias", None)
         generate_config.setdefault("repetition_penalty", None)
         generate_config.setdefault("repetition_context_size", 20)
         generate_config.setdefault("top_p", 1.0)
-        generate_config.setdefault("logit_bias", None)
         return generate_config
 
     def _load_model(self, **kwargs):
@@ -127,6 +138,9 @@ class MLXModel(LLM):
             logger.debug(f"Setting cache limit to {cache_limit_gb} GB")
             mx.metal.set_cache_limit(cache_limit_gb * 1024 * 1024 * 1024)
 
+        self._max_kv_size = kwargs.get("max_kv_size", None)
+        self._prompt_cache = PromptCache()
+
         return load(
             self.model_path,
             tokenizer_config=tokenizer_config,
@@ -154,13 +168,69 @@ class MLXModel(LLM):
             return False
         if "generate" not in llm_family.model_ability:
             return False
+        if "chat" in llm_family.model_ability or "vision" in llm_family.model_ability:
+            # do not process chat or vision
+            return False
         return True
 
-    def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig):
-        import mlx.core as mx
-        from mlx_lm.utils import generate_step
+    def _get_prompt_cache(
+        self, prompt, lora_name: Optional[str] = None, model: Any = None
+    ):
+        from mlx_lm.models.cache import make_prompt_cache
 
-        model = self._model
+        assert self._prompt_cache is not None
+        cache_len = len(self._prompt_cache.tokens)
+        model_key = (self.model_path, lora_name)
+        if (
+            self._prompt_cache.model_key != model_key
+            or cache_len >= len(prompt)
+            or self._prompt_cache.tokens != prompt[:cache_len]
+        ):
+            self._prompt_cache.model_key = model_key
+            self._prompt_cache.cache = make_prompt_cache(
+                model or self._model, self._max_kv_size
+            )
+            self._prompt_cache.tokens = []
+            logger.debug("Making new prompt cache for %s", self.model_uid)
+        else:
+            prompt = prompt[cache_len:]
+            logger.debug("Cache hit for %s", self.model_uid)
+        self._prompt_cache.tokens.extend(prompt)
+        return prompt
+
+    def _generate_stream_inner(self, **kwargs):
+        from mlx_lm.utils import make_logits_processors, make_sampler, stream_generate
+
+        sampler = make_sampler(
+            temp=kwargs.pop("temperature"), top_p=kwargs.pop("top_p")
+        )
+        prompt_token_ids = kwargs.pop("prompt_token_ids")
+        logits_processors = make_logits_processors(
+            logit_bias=kwargs.pop("logits_bias", None),
+            repetition_penalty=kwargs.pop("repetition_penalty"),
+            repetition_context_size=kwargs.pop("repetition_context_size"),
+        )
+        yield from stream_generate(
+            self._model,
+            self._tokenizer,
+            prompt_token_ids,
+            sampler=sampler,
+            logits_processors=logits_processors,
+            **kwargs,
+        )
+
+    def _prepare_inputs(
+        self, prompt: Union[str, Dict[str, Any]], kwargs
+    ) -> Tuple[Any, int]:
+        prompt_token_ids = self._tokenizer.encode(prompt)
+        prompt_token_ids = self._get_prompt_cache(
+            prompt_token_ids, kwargs.get("lora_name")
+        )
+        return prompt_token_ids, len(prompt_token_ids)
+
+    def _generate_stream(
+        self, prompt: Union[str, Dict[str, Any]], kwargs: MLXGenerateConfig
+    ):
         model_uid = self.model_uid
         tokenizer = self._tokenizer
         max_tokens = kwargs["max_tokens"]
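
For orientation, a rough end-to-end sketch of the mlx_lm calling convention the new methods lean on (same imports as the hunk above; the model path is hypothetical, and keyword signatures follow the mlx_lm version this release targets). Note one quirk visible in the hunk: `_generate_stream_inner` pops "logits_bias" while the sanitized config key is "logit_bias", and the caller forwards neither, so the configured logit bias effectively stays None in this release.

    from mlx_lm.models.cache import make_prompt_cache
    from mlx_lm.utils import load, make_logits_processors, make_sampler, stream_generate

    model, tokenizer = load("/path/to/some-mlx-model")  # hypothetical path
    prompt_cache = make_prompt_cache(model, None)  # None = no max_kv_size cap

    sampler = make_sampler(temp=0.0, top_p=1.0)
    logits_processors = make_logits_processors(
        logit_bias=None, repetition_penalty=None, repetition_context_size=20
    )

    # stream_generate yields GenerationResponse objects carrying .text and .token;
    # passing the same prompt_cache across calls reuses the shared prompt prefix.
    for resp in stream_generate(
        model,
        tokenizer,
        tokenizer.encode("Hello"),
        max_tokens=64,
        sampler=sampler,
        logits_processors=logits_processors,
        prompt_cache=prompt_cache,
    ):
        print(resp.text, end="", flush=True)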

@@ -174,35 +244,28 @@ class MLXModel(LLM):
             else False
         )
 
-        prompt_tokens = mx.array(tokenizer.encode(prompt))
-        input_echo_len = len(prompt_tokens)
+        prompt_token_ids, input_echo_len = self._prepare_inputs(prompt, kwargs)
 
         i = 0
         start = time.time()
         output = ""
-        for (token, _), i in zip(
-            generate_step(
-                prompt_tokens,
-                model,
-                temp=kwargs["temperature"],
+        tokens = []
+        for chunk_resp, i in zip(
+            self._generate_stream_inner(
+                prompt_token_ids=prompt_token_ids,
+                max_tokens=max_tokens,
+                temperature=kwargs["temperature"],
+                top_p=kwargs["top_p"],
                 repetition_penalty=kwargs["repetition_penalty"],
                 repetition_context_size=kwargs["repetition_context_size"],
-                top_p=kwargs["top_p"],
-                logit_bias=kwargs["logit_bias"],
+                prompt_cache=self._prompt_cache.cache if self._prompt_cache else None,  # type: ignore
             ),
             range(max_tokens),
         ):
-            if token == tokenizer.eos_token_id or token in stop_token_ids:  # type: ignore
-                break
-
-            # Yield the last segment if streaming
-            out = tokenizer.decode(
-                token,
-                skip_special_tokens=True,
-                spaces_between_special_tokens=False,
-                clean_up_tokenization_spaces=True,
-            )
+            token = chunk_resp.token
+            tokens.append(token)
 
+            out = chunk_resp.text
             if stream:
                 # this special character is mainly for qwen
                 out = out.strip("�")
@@ -226,10 +289,16 @@ class MLXModel(LLM):
                     total_tokens=(input_echo_len + i),
                 ), completion_usage
 
+            if token == tokenizer.eos_token_id or token in stop_token_ids:  # type: ignore
+                break
+
         logger.info(
             f"Average generation speed: {i / (time.time() - start):.2f} tokens/s."
         )
 
+        if self._prompt_cache:
+            self._prompt_cache.tokens.extend(tokens)  # type: ignore
+
         if i == max_tokens - 1:
             finish_reason = "length"
         else:
@@ -272,10 +341,12 @@ class MLXModel(LLM):
         yield completion_chunk, completion_usage
 
     def generate(
-        self, prompt: str, generate_config: Optional[MLXGenerateConfig] = None
+        self,
+        prompt: Union[str, Dict[str, Any]],
+        generate_config: Optional[MLXGenerateConfig] = None,
     ) -> Union[Completion, Iterator[CompletionChunk]]:
         def generator_wrapper(
-            prompt: str, generate_config: MLXGenerateConfig
+            prompt: Union[str, Dict[str, Any]], generate_config: MLXGenerateConfig
         ) -> Iterator[CompletionChunk]:
             for completion_chunk, completion_usage in self._generate_stream(
                 prompt,
@@ -314,26 +385,6 @@ class MLXModel(LLM):
 
 
 class MLXChatModel(MLXModel, ChatModelMixin):
-    def __init__(
-        self,
-        model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
-        model_path: str,
-        model_config: Optional[MLXModelConfig] = None,
-        peft_model: Optional[List[LoRA]] = None,
-    ):
-        super().__init__(
-            model_uid,
-            model_family,
-            model_spec,
-            quantization,
-            model_path,
-            model_config,
-            peft_model,
-        )
-
     def _sanitize_generate_config(
         self,
         generate_config: Optional[MLXGenerateConfig],
@@ -360,6 +411,9 @@ class MLXChatModel(MLXModel, ChatModelMixin):
             return False
         if "chat" not in llm_family.model_ability:
             return False
+        if "vision" in llm_family.model_ability:
+            # do not process vision
+            return False
         return True
 
     def chat(
@@ -390,3 +444,187 @@ class MLXChatModel(MLXModel, ChatModelMixin):
         if tools:
             return self._tool_calls_completion(self.model_family, self.model_uid, c)
         return self._to_chat_completion(c)
+
+
+class MLXVisionModel(MLXModel, ChatModelMixin):
+    @classmethod
+    def match(
+        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if llm_spec.model_format not in ["mlx"]:
+            return False
+        if sys.platform != "darwin" or platform.processor() != "arm":
+            # only work for Mac M chips
+            return False
+        if "vision" not in llm_family.model_ability:
+            return False
+        return True
+
+    def _load_model(self, **kwargs):
+        try:
+            from mlx_vlm import load
+        except ImportError:
+            error_message = "Failed to import module 'mlx_vlm'"
+            installation_guide = [
+                "Please make sure 'mlx_vlm' is installed. ",
+                "You can install it by `pip install mlx_vlm`\n",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        self._prompt_cache = PromptCache()
+
+        return load(self.model_path)
+
+    def load(self):
+        kwargs = {}
+        kwargs["revision"] = self._model_config.get(
+            "revision", self.model_spec.model_revision
+        )
+        kwargs["trust_remote_code"] = self._model_config.get("trust_remote_code")
+        kwargs["cache_limit_gb"] = self._model_config.pop("cache_limit_gb", None)
+
+        self._model, self._processor = self._load_model(**kwargs)
+        self._tokenizer = self._processor.tokenizer
+
+    def _generate_stream_inner(self, **kwargs):
+        import mlx.core as mx
+        from mlx_lm.utils import GenerationResponse
+        from mlx_vlm.utils import generate_step
+
+        inputs = kwargs["prompt_token_ids"]
+
+        max_tokens = kwargs.pop("max_tokens")
+        input_ids, pixel_values, mask, kwargs = inputs
+
+        tokenizer = self._processor.tokenizer
+        detokenizer = self._processor.detokenizer
+
+        detokenizer.reset()
+        tic = time.perf_counter()
+        for (token, logprobs), n in zip(
+            generate_step(input_ids, self._model, pixel_values, mask, **kwargs),
+            range(max_tokens),
+        ):
+            if n == 0:
+                prompt_time = time.perf_counter() - tic
+                prompt_tps = len(input_ids) / prompt_time
+                tic = time.perf_counter()
+            if token == tokenizer.eos_token_id:
+                break
+            detokenizer.add_token(token)
+
+            # Yield the last segment if streaming
+            yield GenerationResponse(
+                text=detokenizer.last_segment,
+                token=token,
+                logprobs=logprobs,
+                prompt_tokens=len(input_ids),
+                prompt_tps=prompt_tps,
+                generation_tokens=n + 1,
+                generation_tps=(n + 1) / (time.perf_counter() - tic),
+                peak_memory=mx.metal.get_peak_memory() / 1e9,
+            )
+
+        detokenizer.finalize()
+        yield GenerationResponse(
+            text=detokenizer.last_segment,
+            token=token,
+            logprobs=logprobs,
+            prompt_tokens=len(input_ids),
+            prompt_tps=prompt_tps,
+            generation_tokens=n + 1,
+            generation_tps=(n + 1) / (time.perf_counter() - tic),
+            peak_memory=mx.metal.get_peak_memory() / 1e9,
+        )
+
+    def _prepare_inputs(
+        self, prompt: Union[str, Dict[str, Any]], kwargs
+    ) -> Tuple[Any, int]:
+        import mlx.core as mx
+        from mlx_vlm import prepare_inputs
+
+        prompt_str = prompt.get("prompt")  # type: ignore
+        images = prompt.get("multi_modal_data", {}).get("image")  # type: ignore
+        if images and not isinstance(images, list):
+            images = [images]
+        resize_shape = kwargs.pop("resize_shape", None)
+        image_token_index = getattr(self._model.config, "image_token_index", None)
+
+        processor = self._processor
+        tokenizer = processor if hasattr(processor, "encode") else processor.tokenizer
+        prompt_tokens = mx.array(tokenizer.encode(prompt_str))
+
+        if not images:
+            input_ids = prompt_tokens[None, :]
+            pixel_values = mask = None
+            kwargs = {}
+            input_token_len = input_ids.size
+        else:
+            inputs = prepare_inputs(
+                processor, images, prompt_str, image_token_index, resize_shape
+            )
+            input_ids = inputs["input_ids"]
+            pixel_values = inputs["pixel_values"]
+            mask = inputs["attention_mask"]
+            kwargs = {
+                k: v
+                for k, v in inputs.items()
+                if k not in ["input_ids", "pixel_values", "attention_mask"]
+            }
+            input_token_len = int(mask.sum())
+        return (input_ids, pixel_values, mask, kwargs), input_token_len
+
+    def chat(
+        self,
+        messages: List[Dict],
+        generate_config: Optional[MLXGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        messages = self._transform_messages(messages)  # type: ignore
+        tools = generate_config.pop("tools", []) if generate_config else None
+
+        model_family = self.model_family.model_family or self.model_family.model_name
+
+        if "internvl2" not in model_family.lower():
+            from qwen_vl_utils import process_vision_info
+
+            full_context_kwargs = {}
+            if tools and model_family in QWEN_TOOL_CALL_FAMILY:
+                full_context_kwargs["tools"] = tools
+            assert self.model_family.chat_template is not None
+            prompt = self.get_full_context(
+                messages, self.model_family.chat_template, **full_context_kwargs
+            )
+            images, video_inputs = process_vision_info(messages)
+            if video_inputs:
+                raise ValueError("Not support video input now.")
+        else:
+            prompt, images = self.get_specific_prompt(model_family, messages)  # type: ignore
+
+        if not images:
+            inputs = {
+                "prompt": prompt,
+            }
+        elif len(images) == 1:
+            inputs = {
+                "prompt": prompt,
+                "multi_modal_data": {"image": images[-1]},  # type: ignore
+            }
+        else:
+            inputs = {
+                "prompt": prompt,
+                "multi_modal_data": {"image": images},  # type: ignore
+            }
+        generate_config = self._sanitize_generate_config(generate_config)
+
+        stream = generate_config.get("stream", False)
+        if stream:
+            it = self.generate(inputs, generate_config)
+            assert isinstance(it, Iterator)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self.generate(inputs, generate_config)
+            assert not isinstance(c, Iterator)
+            if tools:
+                return self._tool_calls_completion(self.model_family, self.model_uid, c)
+            return self._to_chat_completion(c)
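
MLXVisionModel.generate() consumes a dict rather than a plain prompt string. A minimal sketch of the request shape chat() assembles before handing it to _prepare_inputs (values are hypothetical; the image entry is whatever mlx_vlm's prepare_inputs accepts, typically a path, URL, or PIL image):

    # Shape of the dict that chat() passes to generate() in the hunk above; the
    # prompt string has already been rendered through the model's chat template.
    inputs = {
        "prompt": "<rendered chat prompt with image placeholder tokens>",
        "multi_modal_data": {"image": "/path/to/cat.png"},  # or a list of images
    }
    # _prepare_inputs() then unpacks this into (input_ids, pixel_values,
    # attention_mask, extra_kwargs) via mlx_vlm.prepare_inputs; the text-only
    # branch simply encodes the prompt string and skips pixel inputs.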

xinference/model/llm/sglang/core.py

@@ -75,6 +75,7 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "llama-2-chat",
     "llama-3-instruct",
     "llama-3.1-instruct",
+    "llama-3.3-instruct",
     "qwen-chat",
     "qwen1.5-chat",
     "qwen2-instruct",
@@ -89,6 +90,7 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "deepseek-v2-chat-0628",
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
+    "QwQ-32B-Preview",
 ]
 
 
xinference/model/llm/transformers/chatglm.py

@@ -61,7 +61,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
 
     def _load_model(self, **kwargs):
         try:
-            from transformers import AutoModel, AutoTokenizer
+            from transformers import AutoModelForCausalLM, AutoTokenizer
         except ImportError:
             error_message = "Failed to import module 'transformers'"
             installation_guide = [
@@ -77,7 +77,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             encode_special_tokens=True,
             revision=kwargs["revision"],
         )
-        model = AutoModel.from_pretrained(
+        model = AutoModelForCausalLM.from_pretrained(
             self.model_path,
             **kwargs,
         )
@@ -232,9 +232,11 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                 content = {
                     "name": function_name,
                     "arguments": json.dumps(
-                        arguments_json
-                        if isinstance(arguments_json, dict)
-                        else arguments,
+                        (
+                            arguments_json
+                            if isinstance(arguments_json, dict)
+                            else arguments
+                        ),
                         ensure_ascii=False,
                     ),
                 }
@@ -331,6 +333,8 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         max_new_tokens = generate_config.get("max_tokens")
         if max_new_tokens is not None:
             kwargs["max_new_tokens"] = int(max_new_tokens)
+        else:
+            kwargs["max_new_tokens"] = 1024
         do_sample = generate_config.get("do_sample")
         if do_sample is not None:
             kwargs["do_sample"] = bool(do_sample)