xinference 0.16.3__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (373)
  1. xinference/_compat.py +24 -2
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +219 -77
  4. xinference/client/restful/restful_client.py +47 -2
  5. xinference/constants.py +1 -0
  6. xinference/core/chat_interface.py +6 -1
  7. xinference/core/model.py +124 -34
  8. xinference/core/supervisor.py +180 -12
  9. xinference/core/utils.py +73 -4
  10. xinference/core/worker.py +102 -4
  11. xinference/deploy/cmdline.py +3 -1
  12. xinference/deploy/test/test_cmdline.py +56 -0
  13. xinference/isolation.py +24 -0
  14. xinference/model/audio/__init__.py +12 -0
  15. xinference/model/audio/core.py +37 -4
  16. xinference/model/audio/cosyvoice.py +39 -6
  17. xinference/model/audio/f5tts.py +200 -0
  18. xinference/model/audio/f5tts_mlx.py +260 -0
  19. xinference/model/audio/fish_speech.py +70 -110
  20. xinference/model/audio/melotts.py +110 -0
  21. xinference/model/audio/model_spec.json +179 -3
  22. xinference/model/audio/model_spec_modelscope.json +27 -0
  23. xinference/model/audio/utils.py +32 -0
  24. xinference/model/audio/whisper.py +35 -10
  25. xinference/model/audio/whisper_mlx.py +208 -0
  26. xinference/model/embedding/core.py +322 -6
  27. xinference/model/embedding/model_spec.json +8 -1
  28. xinference/model/embedding/model_spec_modelscope.json +9 -1
  29. xinference/model/image/core.py +69 -1
  30. xinference/model/image/model_spec.json +145 -4
  31. xinference/model/image/model_spec_modelscope.json +150 -4
  32. xinference/model/image/stable_diffusion/core.py +50 -15
  33. xinference/model/llm/__init__.py +6 -2
  34. xinference/model/llm/llm_family.json +1055 -93
  35. xinference/model/llm/llm_family.py +15 -36
  36. xinference/model/llm/llm_family_modelscope.json +1031 -78
  37. xinference/model/llm/memory.py +1 -1
  38. xinference/model/llm/mlx/core.py +285 -47
  39. xinference/model/llm/sglang/core.py +2 -0
  40. xinference/model/llm/transformers/chatglm.py +9 -5
  41. xinference/model/llm/transformers/cogagent.py +272 -0
  42. xinference/model/llm/transformers/core.py +3 -0
  43. xinference/model/llm/transformers/glm_edge_v.py +230 -0
  44. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  45. xinference/model/llm/transformers/utils.py +16 -8
  46. xinference/model/llm/utils.py +55 -4
  47. xinference/model/llm/vllm/core.py +137 -12
  48. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  49. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  50. xinference/model/llm/vllm/xavier/block.py +111 -0
  51. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  52. xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
  53. xinference/model/llm/vllm/xavier/collective.py +74 -0
  54. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  55. xinference/model/llm/vllm/xavier/engine.py +247 -0
  56. xinference/model/llm/vllm/xavier/executor.py +134 -0
  57. xinference/model/llm/vllm/xavier/scheduler.py +438 -0
  58. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  59. xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
  60. xinference/model/llm/vllm/xavier/transfer.py +319 -0
  61. xinference/model/rerank/core.py +11 -4
  62. xinference/model/video/diffusers.py +14 -0
  63. xinference/model/video/model_spec.json +15 -0
  64. xinference/model/video/model_spec_modelscope.json +16 -0
  65. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  66. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  67. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  68. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  69. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  70. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  71. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  74. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  75. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  76. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  77. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  78. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  79. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  80. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  81. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  84. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  85. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  86. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  87. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  88. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  89. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  90. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  91. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  92. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  93. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  94. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  95. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  96. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  97. xinference/thirdparty/f5_tts/api.py +166 -0
  98. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  99. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  100. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  101. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  102. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  103. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  104. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  105. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  106. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  107. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  108. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  109. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  110. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  111. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  112. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  113. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  114. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  115. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  116. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  117. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  118. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  119. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  120. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  121. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  122. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  123. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  124. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  125. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  126. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  127. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  128. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  129. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  130. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  131. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  132. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  133. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  134. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  135. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  136. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  137. xinference/thirdparty/f5_tts/train/README.md +77 -0
  138. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  139. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  140. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  141. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  142. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  143. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  144. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  145. xinference/thirdparty/f5_tts/train/train.py +75 -0
  146. xinference/thirdparty/fish_speech/fish_speech/conversation.py +266 -1
  147. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
  148. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
  149. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
  150. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
  151. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
  152. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +137 -29
  153. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
  154. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
  155. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +17 -11
  156. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  157. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  158. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  159. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
  160. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
  161. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
  162. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +2 -2
  163. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +34 -18
  164. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  165. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  166. xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
  167. xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
  168. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  169. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  170. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  171. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  172. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  173. xinference/thirdparty/fish_speech/tools/llama/generate.py +484 -72
  174. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  175. xinference/thirdparty/fish_speech/tools/schema.py +170 -0
  176. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  177. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  178. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  179. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  180. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  181. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  182. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  183. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  184. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  185. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  186. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
  187. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
  188. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  189. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  190. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  191. xinference/thirdparty/matcha/utils/utils.py +2 -2
  192. xinference/thirdparty/melo/api.py +135 -0
  193. xinference/thirdparty/melo/app.py +61 -0
  194. xinference/thirdparty/melo/attentions.py +459 -0
  195. xinference/thirdparty/melo/commons.py +160 -0
  196. xinference/thirdparty/melo/configs/config.json +94 -0
  197. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  198. xinference/thirdparty/melo/data_utils.py +413 -0
  199. xinference/thirdparty/melo/download_utils.py +67 -0
  200. xinference/thirdparty/melo/infer.py +25 -0
  201. xinference/thirdparty/melo/init_downloads.py +14 -0
  202. xinference/thirdparty/melo/losses.py +58 -0
  203. xinference/thirdparty/melo/main.py +36 -0
  204. xinference/thirdparty/melo/mel_processing.py +174 -0
  205. xinference/thirdparty/melo/models.py +1030 -0
  206. xinference/thirdparty/melo/modules.py +598 -0
  207. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  208. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  209. xinference/thirdparty/melo/preprocess_text.py +135 -0
  210. xinference/thirdparty/melo/split_utils.py +174 -0
  211. xinference/thirdparty/melo/text/__init__.py +35 -0
  212. xinference/thirdparty/melo/text/chinese.py +199 -0
  213. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  214. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  215. xinference/thirdparty/melo/text/cleaner.py +36 -0
  216. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  217. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  218. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  219. xinference/thirdparty/melo/text/english.py +284 -0
  220. xinference/thirdparty/melo/text/english_bert.py +39 -0
  221. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  222. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  223. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  224. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  225. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  226. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  227. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  228. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  229. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  230. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  231. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  232. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  233. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  234. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  235. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  236. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  237. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  238. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  239. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  240. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  241. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  242. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  243. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  244. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  245. xinference/thirdparty/melo/text/french.py +94 -0
  246. xinference/thirdparty/melo/text/french_bert.py +39 -0
  247. xinference/thirdparty/melo/text/japanese.py +647 -0
  248. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  249. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  250. xinference/thirdparty/melo/text/korean.py +192 -0
  251. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  252. xinference/thirdparty/melo/text/spanish.py +122 -0
  253. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  254. xinference/thirdparty/melo/text/symbols.py +290 -0
  255. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  256. xinference/thirdparty/melo/train.py +635 -0
  257. xinference/thirdparty/melo/train.sh +19 -0
  258. xinference/thirdparty/melo/transforms.py +209 -0
  259. xinference/thirdparty/melo/utils.py +424 -0
  260. xinference/types.py +17 -1
  261. xinference/web/ui/build/asset-manifest.json +6 -6
  262. xinference/web/ui/build/index.html +1 -1
  263. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  264. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  265. xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
  266. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  267. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  268. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  269. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  270. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  271. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  272. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  273. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  274. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  275. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  276. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  277. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  278. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  279. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  280. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  281. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  282. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  283. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  284. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  285. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  286. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  287. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  288. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  289. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  290. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  291. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  292. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  293. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  294. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  295. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  296. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  297. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  298. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  299. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  300. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  301. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  302. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  303. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  304. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  305. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  306. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  307. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  308. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  309. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  310. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  311. xinference/web/ui/node_modules/.package-lock.json +67 -3
  312. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  313. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  314. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  315. xinference/web/ui/node_modules/i18next/package.json +129 -0
  316. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  317. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  318. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  319. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  320. xinference/web/ui/package-lock.json +69 -3
  321. xinference/web/ui/package.json +2 -0
  322. xinference/web/ui/src/locales/en.json +186 -0
  323. xinference/web/ui/src/locales/zh.json +186 -0
  324. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/METADATA +96 -36
  325. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/RECORD +335 -146
  326. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/WHEEL +1 -1
  327. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  328. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  329. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  330. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  331. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  332. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  333. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  334. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  335. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  336. xinference/thirdparty/fish_speech/tools/api.py +0 -440
  337. xinference/thirdparty/fish_speech/tools/commons.py +0 -35
  338. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  339. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -34
  340. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  341. xinference/thirdparty/fish_speech/tools/webui.py +0 -485
  342. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  343. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  344. xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
  345. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  346. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  347. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  348. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  349. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  350. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  351. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  352. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  353. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  354. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  355. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  356. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  357. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  358. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  359. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  360. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  361. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  362. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  363. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  364. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  365. /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
  366. /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
  367. /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
  368. /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
  369. /xinference/thirdparty/{fish_speech/fish_speech/configs → melo/text/fr_phonemizer}/__init__.py +0 -0
  370. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  371. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
  372. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
  373. {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0
xinference/_compat.py CHANGED
@@ -60,6 +60,10 @@ from openai.types.chat.chat_completion_stream_options_param import (
60
60
  ChatCompletionStreamOptionsParam,
61
61
  )
62
62
  from openai.types.chat.chat_completion_tool_param import ChatCompletionToolParam
63
+ from openai.types.shared_params.response_format_json_object import (
64
+ ResponseFormatJSONObject,
65
+ )
66
+ from openai.types.shared_params.response_format_text import ResponseFormatText
63
67
 
64
68
  OpenAIChatCompletionStreamOptionsParam = create_model_from_typeddict(
65
69
  ChatCompletionStreamOptionsParam
@@ -68,6 +72,24 @@ OpenAIChatCompletionToolParam = create_model_from_typeddict(ChatCompletionToolPa
68
72
  OpenAIChatCompletionNamedToolChoiceParam = create_model_from_typeddict(
69
73
  ChatCompletionNamedToolChoiceParam
70
74
  )
75
+ from openai._types import Body
76
+
77
+
78
+ class JSONSchema(BaseModel):
79
+ name: str
80
+ description: Optional[str] = None
81
+ schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
82
+ strict: Optional[bool] = None
83
+
84
+
85
+ class ResponseFormatJSONSchema(BaseModel):
86
+ json_schema: JSONSchema
87
+ type: Literal["json_schema"]
88
+
89
+
90
+ ResponseFormat = Union[
91
+ ResponseFormatText, ResponseFormatJSONObject, ResponseFormatJSONSchema
92
+ ]
71
93
 
72
94
 
73
95
  class CreateChatCompletionOpenAI(BaseModel):
@@ -84,8 +106,7 @@ class CreateChatCompletionOpenAI(BaseModel):
84
106
  n: Optional[int]
85
107
  parallel_tool_calls: Optional[bool]
86
108
  presence_penalty: Optional[float]
87
- # we do not support this
88
- # response_format: ResponseFormat
109
+ response_format: Optional[ResponseFormat]
89
110
  seed: Optional[int]
90
111
  service_tier: Optional[Literal["auto", "default"]]
91
112
  stop: Union[Optional[str], List[str]]
@@ -100,4 +121,5 @@ class CreateChatCompletionOpenAI(BaseModel):
100
121
  tools: Optional[Iterable[OpenAIChatCompletionToolParam]] # type: ignore
101
122
  top_logprobs: Optional[int]
102
123
  top_p: Optional[float]
124
+ extra_body: Optional[Body]
103
125
  user: Optional[str]
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2024-11-07T16:55:36+0800",
11
+ "date": "2025-01-24T16:52:57+0800",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "85ab86bf1c0967e45fbec995534cd5a0c9a9c439",
15
- "version": "0.16.3"
14
+ "full-revisionid": "a57b99b07b40d1082f69a8fc5b968d56bc3636bc",
15
+ "version": "1.2.1"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -52,10 +52,14 @@ from xoscar.utils import get_next_port
52
52
 
53
53
  from .._compat import BaseModel, Field
54
54
  from .._version import get_versions
55
- from ..constants import XINFERENCE_DEFAULT_ENDPOINT_PORT, XINFERENCE_DISABLE_METRICS
55
+ from ..constants import (
56
+ XINFERENCE_DEFAULT_CANCEL_BLOCK_DURATION,
57
+ XINFERENCE_DEFAULT_ENDPOINT_PORT,
58
+ XINFERENCE_DISABLE_METRICS,
59
+ )
56
60
  from ..core.event import Event, EventCollectorActor, EventType
57
61
  from ..core.supervisor import SupervisorActor
58
- from ..core.utils import json_dumps
62
+ from ..core.utils import CancelMixin, json_dumps
59
63
  from ..types import (
60
64
  ChatCompletion,
61
65
  Completion,
@@ -90,9 +94,9 @@ class CreateCompletionRequest(CreateCompletion):
90
94
 
91
95
  class CreateEmbeddingRequest(BaseModel):
92
96
  model: str
93
- input: Union[str, List[str], List[int], List[List[int]]] = Field(
94
- description="The input to embed."
95
- )
97
+ input: Union[
98
+ str, List[str], List[int], List[List[int]], Dict[str, str], List[Dict[str, str]]
99
+ ] = Field(description="The input to embed.")
96
100
  user: Optional[str] = None
97
101
 
98
102
  class Config:
@@ -111,6 +115,7 @@ class RerankRequest(BaseModel):
111
115
  return_documents: Optional[bool] = False
112
116
  return_len: Optional[bool] = False
113
117
  max_chunks_per_doc: Optional[int] = None
118
+ kwargs: Optional[str] = None
114
119
 
115
120
 
116
121
  class TextToImageRequest(BaseModel):
@@ -206,7 +211,7 @@ class BuildGradioImageInterfaceRequest(BaseModel):
206
211
  model_ability: List[str]
207
212
 
208
213
 
209
- class RESTfulAPI:
214
+ class RESTfulAPI(CancelMixin):
210
215
  def __init__(
211
216
  self,
212
217
  supervisor_address: str,
@@ -484,6 +489,16 @@ class RESTfulAPI:
484
489
  else None
485
490
  ),
486
491
  )
492
+ self._router.add_api_route(
493
+ "/v1/convert_ids_to_tokens",
494
+ self.convert_ids_to_tokens,
495
+ methods=["POST"],
496
+ dependencies=(
497
+ [Security(self._auth_service, scopes=["models:read"])]
498
+ if self.is_authenticated()
499
+ else None
500
+ ),
501
+ )
487
502
  self._router.add_api_route(
488
503
  "/v1/rerank",
489
504
  self.rerank,
@@ -1199,6 +1214,19 @@ class RESTfulAPI:
1199
1214
  async def get_address(self) -> JSONResponse:
1200
1215
  return JSONResponse(content=self._supervisor_address)
1201
1216
 
1217
+ async def _get_model_last_error(self, replica_model_uid: bytes, e: Exception):
1218
+ if not isinstance(e, xo.ServerClosed):
1219
+ return e
1220
+ try:
1221
+ model_status = await (await self._get_supervisor_ref()).get_model_status(
1222
+ replica_model_uid.decode("utf-8")
1223
+ )
1224
+ if model_status is not None and model_status.last_error:
1225
+ return Exception(model_status.last_error)
1226
+ except Exception as ex:
1227
+ return ex
1228
+ return e
1229
+
1202
1230
  async def create_completion(self, request: Request) -> Response:
1203
1231
  raw_body = await request.json()
1204
1232
  body = CreateCompletionRequest.parse_obj(raw_body)
@@ -1214,6 +1242,9 @@ class RESTfulAPI:
1214
1242
  raw_kwargs = {k: v for k, v in raw_body.items() if k not in exclude}
1215
1243
  kwargs = body.dict(exclude_unset=True, exclude=exclude)
1216
1244
 
1245
+ # guided_decoding params
1246
+ kwargs.update(self.extract_guided_params(raw_body=raw_body))
1247
+
1217
1248
  # TODO: Decide if this default value override is necessary #1061
1218
1249
  if body.max_tokens is None:
1219
1250
  kwargs["max_tokens"] = max_tokens_field.default
@@ -1254,11 +1285,14 @@ class RESTfulAPI:
1254
1285
  )
1255
1286
  return
1256
1287
  except Exception as ex:
1288
+ ex = await self._get_model_last_error(model.uid, ex)
1257
1289
  logger.exception("Completion stream got an error: %s", ex)
1258
1290
  await self._report_error_event(model_uid, str(ex))
1259
1291
  # https://github.com/openai/openai-python/blob/e0aafc6c1a45334ac889fe3e54957d309c3af93f/src/openai/_streaming.py#L107
1260
1292
  yield dict(data=json.dumps({"error": str(ex)}))
1261
1293
  return
1294
+ finally:
1295
+ await model.decrease_serve_count()
1262
1296
 
1263
1297
  return EventSourceResponse(stream_results())
1264
1298
  else:
@@ -1266,6 +1300,7 @@ class RESTfulAPI:
1266
1300
  data = await model.generate(body.prompt, kwargs, raw_params=raw_kwargs)
1267
1301
  return Response(data, media_type="application/json")
1268
1302
  except Exception as e:
1303
+ e = await self._get_model_last_error(model.uid, e)
1269
1304
  logger.error(e, exc_info=True)
1270
1305
  await self._report_error_event(model_uid, str(e))
1271
1306
  self.handle_request_limit_error(e)
@@ -1297,25 +1332,49 @@ class RESTfulAPI:
1297
1332
  try:
1298
1333
  embedding = await model.create_embedding(body.input, **kwargs)
1299
1334
  return Response(embedding, media_type="application/json")
1300
- except RuntimeError as re:
1301
- logger.error(re, exc_info=True)
1302
- await self._report_error_event(model_uid, str(re))
1303
- self.handle_request_limit_error(re)
1304
- raise HTTPException(status_code=400, detail=str(re))
1335
+ except Exception as e:
1336
+ e = await self._get_model_last_error(model.uid, e)
1337
+ logger.error(e, exc_info=True)
1338
+ await self._report_error_event(model_uid, str(e))
1339
+ self.handle_request_limit_error(e)
1340
+ raise HTTPException(status_code=500, detail=str(e))
1341
+
1342
+ async def convert_ids_to_tokens(self, request: Request) -> Response:
1343
+ payload = await request.json()
1344
+ body = CreateEmbeddingRequest.parse_obj(payload)
1345
+ model_uid = body.model
1346
+ exclude = {
1347
+ "model",
1348
+ "input",
1349
+ "user",
1350
+ }
1351
+ kwargs = {key: value for key, value in payload.items() if key not in exclude}
1352
+
1353
+ try:
1354
+ model = await (await self._get_supervisor_ref()).get_model(model_uid)
1355
+ except ValueError as ve:
1356
+ logger.error(str(ve), exc_info=True)
1357
+ await self._report_error_event(model_uid, str(ve))
1358
+ raise HTTPException(status_code=400, detail=str(ve))
1305
1359
  except Exception as e:
1306
1360
  logger.error(e, exc_info=True)
1307
1361
  await self._report_error_event(model_uid, str(e))
1308
1362
  raise HTTPException(status_code=500, detail=str(e))
1309
1363
 
1364
+ try:
1365
+ decoded_texts = await model.convert_ids_to_tokens(body.input, **kwargs)
1366
+ return Response(decoded_texts, media_type="application/json")
1367
+ except Exception as e:
1368
+ e = await self._get_model_last_error(model.uid, e)
1369
+ logger.error(e, exc_info=True)
1370
+ await self._report_error_event(model_uid, str(e))
1371
+ self.handle_request_limit_error(e)
1372
+ raise HTTPException(status_code=500, detail=str(e))
1373
+
1310
1374
  async def rerank(self, request: Request) -> Response:
1311
1375
  payload = await request.json()
1312
1376
  body = RerankRequest.parse_obj(payload)
1313
1377
  model_uid = body.model
1314
- kwargs = {
1315
- key: value
1316
- for key, value in payload.items()
1317
- if key not in RerankRequest.__annotations__.keys()
1318
- }
1319
1378
 
1320
1379
  try:
1321
1380
  model = await (await self._get_supervisor_ref()).get_model(model_uid)
@@ -1329,6 +1388,10 @@ class RESTfulAPI:
1329
1388
  raise HTTPException(status_code=500, detail=str(e))
1330
1389
 
1331
1390
  try:
1391
+ if body.kwargs is not None:
1392
+ parsed_kwargs = json.loads(body.kwargs)
1393
+ else:
1394
+ parsed_kwargs = {}
1332
1395
  scores = await model.rerank(
1333
1396
  body.documents,
1334
1397
  body.query,
@@ -1336,17 +1399,14 @@ class RESTfulAPI:
1336
1399
  max_chunks_per_doc=body.max_chunks_per_doc,
1337
1400
  return_documents=body.return_documents,
1338
1401
  return_len=body.return_len,
1339
- **kwargs,
1402
+ **parsed_kwargs,
1340
1403
  )
1341
1404
  return Response(scores, media_type="application/json")
1342
- except RuntimeError as re:
1343
- logger.error(re, exc_info=True)
1344
- await self._report_error_event(model_uid, str(re))
1345
- self.handle_request_limit_error(re)
1346
- raise HTTPException(status_code=400, detail=str(re))
1347
1405
  except Exception as e:
1406
+ e = await self._get_model_last_error(model.uid, e)
1348
1407
  logger.error(e, exc_info=True)
1349
1408
  await self._report_error_event(model_uid, str(e))
1409
+ self.handle_request_limit_error(e)
1350
1410
  raise HTTPException(status_code=500, detail=str(e))
1351
1411
 
1352
1412
  async def create_transcriptions(
@@ -1391,13 +1451,11 @@ class RESTfulAPI:
1391
1451
  **parsed_kwargs,
1392
1452
  )
1393
1453
  return Response(content=transcription, media_type="application/json")
1394
- except RuntimeError as re:
1395
- logger.error(re, exc_info=True)
1396
- await self._report_error_event(model_uid, str(re))
1397
- raise HTTPException(status_code=400, detail=str(re))
1398
1454
  except Exception as e:
1455
+ e = await self._get_model_last_error(model_ref.uid, e)
1399
1456
  logger.error(e, exc_info=True)
1400
1457
  await self._report_error_event(model_uid, str(e))
1458
+ self.handle_request_limit_error(e)
1401
1459
  raise HTTPException(status_code=500, detail=str(e))
1402
1460
 
1403
1461
  async def create_translations(
@@ -1442,13 +1500,11 @@ class RESTfulAPI:
1442
1500
  **parsed_kwargs,
1443
1501
  )
1444
1502
  return Response(content=translation, media_type="application/json")
1445
- except RuntimeError as re:
1446
- logger.error(re, exc_info=True)
1447
- await self._report_error_event(model_uid, str(re))
1448
- raise HTTPException(status_code=400, detail=str(re))
1449
1503
  except Exception as e:
1504
+ e = await self._get_model_last_error(model_ref.uid, e)
1450
1505
  logger.error(e, exc_info=True)
1451
1506
  await self._report_error_event(model_uid, str(e))
1507
+ self.handle_request_limit_error(e)
1452
1508
  raise HTTPException(status_code=500, detail=str(e))
1453
1509
 
1454
1510
  async def create_speech(
@@ -1491,19 +1547,24 @@ class RESTfulAPI:
1491
1547
  **parsed_kwargs,
1492
1548
  )
1493
1549
  if body.stream:
1550
+
1551
+ async def stream_results():
1552
+ try:
1553
+ async for item in out:
1554
+ yield item
1555
+ finally:
1556
+ await model.decrease_serve_count()
1557
+
1494
1558
  return EventSourceResponse(
1495
- media_type="application/octet-stream", content=out
1559
+ media_type="application/octet-stream", content=stream_results()
1496
1560
  )
1497
1561
  else:
1498
1562
  return Response(media_type="application/octet-stream", content=out)
1499
- except RuntimeError as re:
1500
- logger.error(re, exc_info=True)
1501
- await self._report_error_event(model_uid, str(re))
1502
- self.handle_request_limit_error(re)
1503
- raise HTTPException(status_code=400, detail=str(re))
1504
1563
  except Exception as e:
1564
+ e = await self._get_model_last_error(model.uid, e)
1505
1565
  logger.error(e, exc_info=True)
1506
1566
  await self._report_error_event(model_uid, str(e))
1567
+ self.handle_request_limit_error(e)
1507
1568
  raise HTTPException(status_code=500, detail=str(e))
1508
1569
 
1509
1570
  async def get_progress(self, request_id: str) -> JSONResponse:
@@ -1531,8 +1592,11 @@ class RESTfulAPI:
1531
1592
  await self._report_error_event(model_uid, str(e))
1532
1593
  raise HTTPException(status_code=500, detail=str(e))
1533
1594
 
1595
+ request_id = None
1534
1596
  try:
1535
1597
  kwargs = json.loads(body.kwargs) if body.kwargs else {}
1598
+ request_id = kwargs.get("request_id")
1599
+ self._add_running_task(request_id)
1536
1600
  image_list = await model.text_to_image(
1537
1601
  prompt=body.prompt,
1538
1602
  n=body.n,
@@ -1541,14 +1605,16 @@ class RESTfulAPI:
1541
1605
  **kwargs,
1542
1606
  )
1543
1607
  return Response(content=image_list, media_type="application/json")
1544
- except RuntimeError as re:
1545
- logger.error(re, exc_info=True)
1546
- await self._report_error_event(model_uid, str(re))
1547
- self.handle_request_limit_error(re)
1548
- raise HTTPException(status_code=400, detail=str(re))
1608
+ except asyncio.CancelledError:
1609
+ err_str = f"The request has been cancelled: {request_id}"
1610
+ logger.error(err_str)
1611
+ await self._report_error_event(model_uid, err_str)
1612
+ raise HTTPException(status_code=409, detail=err_str)
1549
1613
  except Exception as e:
1614
+ e = await self._get_model_last_error(model.uid, e)
1550
1615
  logger.error(e, exc_info=True)
1551
1616
  await self._report_error_event(model_uid, str(e))
1617
+ self.handle_request_limit_error(e)
1552
1618
  raise HTTPException(status_code=500, detail=str(e))
1553
1619
 
1554
1620
  async def sdapi_options(self, request: Request) -> Response:
@@ -1619,14 +1685,11 @@ class RESTfulAPI:
1619
1685
  **kwargs,
1620
1686
  )
1621
1687
  return Response(content=image_list, media_type="application/json")
1622
- except RuntimeError as re:
1623
- logger.error(re, exc_info=True)
1624
- await self._report_error_event(model_uid, str(re))
1625
- self.handle_request_limit_error(re)
1626
- raise HTTPException(status_code=400, detail=str(re))
1627
1688
  except Exception as e:
1689
+ e = await self._get_model_last_error(model.uid, e)
1628
1690
  logger.error(e, exc_info=True)
1629
1691
  await self._report_error_event(model_uid, str(e))
1692
+ self.handle_request_limit_error(e)
1630
1693
  raise HTTPException(status_code=500, detail=str(e))
1631
1694
 
1632
1695
  async def sdapi_img2img(self, request: Request) -> Response:
@@ -1653,14 +1716,11 @@ class RESTfulAPI:
1653
1716
  **kwargs,
1654
1717
  )
1655
1718
  return Response(content=image_list, media_type="application/json")
1656
- except RuntimeError as re:
1657
- logger.error(re, exc_info=True)
1658
- await self._report_error_event(model_uid, str(re))
1659
- self.handle_request_limit_error(re)
1660
- raise HTTPException(status_code=400, detail=str(re))
1661
1719
  except Exception as e:
1720
+ e = await self._get_model_last_error(model.uid, e)
1662
1721
  logger.error(e, exc_info=True)
1663
1722
  await self._report_error_event(model_uid, str(e))
1723
+ self.handle_request_limit_error(e)
1664
1724
  raise HTTPException(status_code=500, detail=str(e))
1665
1725
 
1666
1726
  async def create_variations(
@@ -1686,11 +1746,14 @@ class RESTfulAPI:
1686
1746
  await self._report_error_event(model_uid, str(e))
1687
1747
  raise HTTPException(status_code=500, detail=str(e))
1688
1748
 
1749
+ request_id = None
1689
1750
  try:
1690
1751
  if kwargs is not None:
1691
1752
  parsed_kwargs = json.loads(kwargs)
1692
1753
  else:
1693
1754
  parsed_kwargs = {}
1755
+ request_id = parsed_kwargs.get("request_id")
1756
+ self._add_running_task(request_id)
1694
1757
  image_list = await model_ref.image_to_image(
1695
1758
  image=Image.open(image.file),
1696
1759
  prompt=prompt,
@@ -1701,13 +1764,16 @@ class RESTfulAPI:
1701
1764
  **parsed_kwargs,
1702
1765
  )
1703
1766
  return Response(content=image_list, media_type="application/json")
1704
- except RuntimeError as re:
1705
- logger.error(re, exc_info=True)
1706
- await self._report_error_event(model_uid, str(re))
1707
- raise HTTPException(status_code=400, detail=str(re))
1767
+ except asyncio.CancelledError:
1768
+ err_str = f"The request has been cancelled: {request_id}"
1769
+ logger.error(err_str)
1770
+ await self._report_error_event(model_uid, err_str)
1771
+ raise HTTPException(status_code=409, detail=err_str)
1708
1772
  except Exception as e:
1773
+ e = await self._get_model_last_error(model_ref.uid, e)
1709
1774
  logger.error(e, exc_info=True)
1710
1775
  await self._report_error_event(model_uid, str(e))
1776
+ self.handle_request_limit_error(e)
1711
1777
  raise HTTPException(status_code=500, detail=str(e))
1712
1778
 
1713
1779
  async def create_inpainting(
@@ -1734,11 +1800,14 @@ class RESTfulAPI:
1734
1800
  await self._report_error_event(model_uid, str(e))
1735
1801
  raise HTTPException(status_code=500, detail=str(e))
1736
1802
 
1803
+ request_id = None
1737
1804
  try:
1738
1805
  if kwargs is not None:
1739
1806
  parsed_kwargs = json.loads(kwargs)
1740
1807
  else:
1741
1808
  parsed_kwargs = {}
1809
+ request_id = parsed_kwargs.get("request_id")
1810
+ self._add_running_task(request_id)
1742
1811
  im = Image.open(image.file)
1743
1812
  mask_im = Image.open(mask_image.file)
1744
1813
  if not size:
@@ -1755,13 +1824,16 @@ class RESTfulAPI:
1755
1824
  **parsed_kwargs,
1756
1825
  )
1757
1826
  return Response(content=image_list, media_type="application/json")
1758
- except RuntimeError as re:
1759
- logger.error(re, exc_info=True)
1760
- await self._report_error_event(model_uid, str(re))
1761
- raise HTTPException(status_code=400, detail=str(re))
1827
+ except asyncio.CancelledError:
1828
+ err_str = f"The request has been cancelled: {request_id}"
1829
+ logger.error(err_str)
1830
+ await self._report_error_event(model_uid, err_str)
1831
+ raise HTTPException(status_code=409, detail=err_str)
1762
1832
  except Exception as e:
1833
+ e = await self._get_model_last_error(model_ref.uid, e)
1763
1834
  logger.error(e, exc_info=True)
1764
1835
  await self._report_error_event(model_uid, str(e))
1836
+ self.handle_request_limit_error(e)
1765
1837
  raise HTTPException(status_code=500, detail=str(e))
1766
1838
 
1767
1839
  async def create_ocr(
@@ -1782,24 +1854,30 @@ class RESTfulAPI:
1782
1854
  await self._report_error_event(model_uid, str(e))
1783
1855
  raise HTTPException(status_code=500, detail=str(e))
1784
1856
 
1857
+ request_id = None
1785
1858
  try:
1786
1859
  if kwargs is not None:
1787
1860
  parsed_kwargs = json.loads(kwargs)
1788
1861
  else:
1789
1862
  parsed_kwargs = {}
1863
+ request_id = parsed_kwargs.get("request_id")
1864
+ self._add_running_task(request_id)
1790
1865
  im = Image.open(image.file)
1791
1866
  text = await model_ref.ocr(
1792
1867
  image=im,
1793
1868
  **parsed_kwargs,
1794
1869
  )
1795
1870
  return Response(content=text, media_type="text/plain")
1796
- except RuntimeError as re:
1797
- logger.error(re, exc_info=True)
1798
- await self._report_error_event(model_uid, str(re))
1799
- raise HTTPException(status_code=400, detail=str(re))
1871
+ except asyncio.CancelledError:
1872
+ err_str = f"The request has been cancelled: {request_id}"
1873
+ logger.error(err_str)
1874
+ await self._report_error_event(model_uid, err_str)
1875
+ raise HTTPException(status_code=409, detail=err_str)
1800
1876
  except Exception as e:
1877
+ e = await self._get_model_last_error(model_ref.uid, e)
1801
1878
  logger.error(e, exc_info=True)
1802
1879
  await self._report_error_event(model_uid, str(e))
1880
+ self.handle_request_limit_error(e)
1803
1881
  raise HTTPException(status_code=500, detail=str(e))
1804
1882
 
1805
1883
  async def create_flexible_infer(self, request: Request) -> Response:
@@ -1826,14 +1904,11 @@ class RESTfulAPI:
1826
1904
  try:
1827
1905
  result = await model.infer(**kwargs)
1828
1906
  return Response(result, media_type="application/json")
1829
- except RuntimeError as re:
1830
- logger.error(re, exc_info=True)
1831
- await self._report_error_event(model_uid, str(re))
1832
- self.handle_request_limit_error(re)
1833
- raise HTTPException(status_code=400, detail=str(re))
1834
1907
  except Exception as e:
1908
+ e = await self._get_model_last_error(model.uid, e)
1835
1909
  logger.error(e, exc_info=True)
1836
1910
  await self._report_error_event(model_uid, str(e))
1911
+ self.handle_request_limit_error(e)
1837
1912
  raise HTTPException(status_code=500, detail=str(e))
1838
1913
 
1839
1914
  async def create_videos(self, request: Request) -> Response:
@@ -1858,14 +1933,11 @@ class RESTfulAPI:
1858
1933
  **kwargs,
1859
1934
  )
1860
1935
  return Response(content=video_list, media_type="application/json")
1861
- except RuntimeError as re:
1862
- logger.error(re, exc_info=True)
1863
- await self._report_error_event(model_uid, str(re))
1864
- self.handle_request_limit_error(re)
1865
- raise HTTPException(status_code=400, detail=str(re))
1866
1936
  except Exception as e:
1937
+ e = await self._get_model_last_error(model.uid, e)
1867
1938
  logger.error(e, exc_info=True)
1868
1939
  await self._report_error_event(model_uid, str(e))
1940
+ self.handle_request_limit_error(e)
1869
1941
  raise HTTPException(status_code=500, detail=str(e))
1870
1942
 
1871
1943
  async def create_chat_completion(self, request: Request) -> Response:
@@ -1880,9 +1952,13 @@ class RESTfulAPI:
1880
1952
  "logit_bias_type",
1881
1953
  "user",
1882
1954
  }
1955
+
1883
1956
  raw_kwargs = {k: v for k, v in raw_body.items() if k not in exclude}
1884
1957
  kwargs = body.dict(exclude_unset=True, exclude=exclude)
1885
1958
 
1959
+ # guided_decoding params
1960
+ kwargs.update(self.extract_guided_params(raw_body=raw_body))
1961
+
1886
1962
  # TODO: Decide if this default value override is necessary #1061
1887
1963
  if body.max_tokens is None:
1888
1964
  kwargs["max_tokens"] = max_tokens_field.default
@@ -1946,7 +2022,6 @@ class RESTfulAPI:
1946
2022
  )
1947
2023
  if body.tools and body.stream:
1948
2024
  is_vllm = await model.is_vllm_backend()
1949
-
1950
2025
  if not (
1951
2026
  (is_vllm and model_family in QWEN_TOOL_CALL_FAMILY)
1952
2027
  or (not is_vllm and model_family in GLM4_TOOL_CALL_FAMILY)
@@ -1956,7 +2031,8 @@ class RESTfulAPI:
1956
2031
  detail="Streaming support for tool calls is available only when using "
1957
2032
  "Qwen models with vLLM backend or GLM4-chat models without vLLM backend.",
1958
2033
  )
1959
-
2034
+ if "skip_special_tokens" in raw_kwargs and await model.is_vllm_backend():
2035
+ kwargs["skip_special_tokens"] = raw_kwargs["skip_special_tokens"]
1960
2036
  if body.stream:
1961
2037
 
1962
2038
  async def stream_results():
@@ -1986,11 +2062,14 @@ class RESTfulAPI:
1986
2062
  # TODO: Cannot yield here. Yield here would leads to error for the next streaming request.
1987
2063
  return
1988
2064
  except Exception as ex:
2065
+ ex = await self._get_model_last_error(model.uid, ex)
1989
2066
  logger.exception("Chat completion stream got an error: %s", ex)
1990
2067
  await self._report_error_event(model_uid, str(ex))
1991
2068
  # https://github.com/openai/openai-python/blob/e0aafc6c1a45334ac889fe3e54957d309c3af93f/src/openai/_streaming.py#L107
1992
2069
  yield dict(data=json.dumps({"error": str(ex)}))
1993
2070
  return
2071
+ finally:
2072
+ await model.decrease_serve_count()
1994
2073
 
1995
2074
  return EventSourceResponse(stream_results())
1996
2075
  else:
@@ -2002,6 +2081,7 @@ class RESTfulAPI:
2002
2081
  )
2003
2082
  return Response(content=data, media_type="application/json")
2004
2083
  except Exception as e:
2084
+ e = await self._get_model_last_error(model.uid, e)
2005
2085
  logger.error(e, exc_info=True)
2006
2086
  await self._report_error_event(model_uid, str(e))
2007
2087
  self.handle_request_limit_error(e)
@@ -2111,10 +2191,25 @@ class RESTfulAPI:
2111
2191
  logger.error(e, exc_info=True)
2112
2192
  raise HTTPException(status_code=500, detail=str(e))
2113
2193
 
2114
- async def abort_request(self, model_uid: str, request_id: str) -> JSONResponse:
2194
+ async def abort_request(
2195
+ self, request: Request, model_uid: str, request_id: str
2196
+ ) -> JSONResponse:
2115
2197
  try:
2198
+ payload = await request.json()
2199
+ block_duration = payload.get(
2200
+ "block_duration", XINFERENCE_DEFAULT_CANCEL_BLOCK_DURATION
2201
+ )
2202
+ logger.info(
2203
+ "Abort request with model uid: %s, request id: %s, block duration: %s",
2204
+ model_uid,
2205
+ request_id,
2206
+ block_duration,
2207
+ )
2116
2208
  supervisor_ref = await self._get_supervisor_ref()
2117
- res = await supervisor_ref.abort_request(model_uid, request_id)
2209
+ res = await supervisor_ref.abort_request(
2210
+ model_uid, request_id, block_duration
2211
+ )
2212
+ self._cancel_running_task(request_id, block_duration)
2118
2213
  return JSONResponse(content=res)
2119
2214
  except Exception as e:
2120
2215
  logger.error(e, exc_info=True)
@@ -2228,6 +2323,53 @@ class RESTfulAPI:
2228
2323
  logger.error(e, exc_info=True)
2229
2324
  raise HTTPException(status_code=500, detail=str(e))
2230
2325
 
2326
+ @staticmethod
2327
+ def extract_guided_params(raw_body: dict) -> dict:
2328
+ kwargs = {}
2329
+ raw_extra_body: dict = raw_body.get("extra_body") # type: ignore
2330
+ if raw_body.get("guided_json"):
2331
+ kwargs["guided_json"] = raw_body.get("guided_json")
2332
+ if raw_body.get("guided_regex") is not None:
2333
+ kwargs["guided_regex"] = raw_body.get("guided_regex")
2334
+ if raw_body.get("guided_choice") is not None:
2335
+ kwargs["guided_choice"] = raw_body.get("guided_choice")
2336
+ if raw_body.get("guided_grammar") is not None:
2337
+ kwargs["guided_grammar"] = raw_body.get("guided_grammar")
2338
+ if raw_body.get("guided_json_object") is not None:
2339
+ kwargs["guided_json_object"] = raw_body.get("guided_json_object")
2340
+ if raw_body.get("guided_decoding_backend") is not None:
2341
+ kwargs["guided_decoding_backend"] = raw_body.get("guided_decoding_backend")
2342
+ if raw_body.get("guided_whitespace_pattern") is not None:
2343
+ kwargs["guided_whitespace_pattern"] = raw_body.get(
2344
+ "guided_whitespace_pattern"
2345
+ )
2346
+ # Parse OpenAI extra_body
2347
+ if raw_extra_body is not None:
2348
+ if raw_extra_body.get("guided_json"):
2349
+ kwargs["guided_json"] = raw_extra_body.get("guided_json")
2350
+ if raw_extra_body.get("guided_regex") is not None:
2351
+ kwargs["guided_regex"] = raw_extra_body.get("guided_regex")
2352
+ if raw_extra_body.get("guided_choice") is not None:
2353
+ kwargs["guided_choice"] = raw_extra_body.get("guided_choice")
2354
+ if raw_extra_body.get("guided_grammar") is not None:
2355
+ kwargs["guided_grammar"] = raw_extra_body.get("guided_grammar")
2356
+ if raw_extra_body.get("guided_json_object") is not None:
2357
+ kwargs["guided_json_object"] = raw_extra_body.get("guided_json_object")
2358
+ if raw_extra_body.get("guided_decoding_backend") is not None:
2359
+ kwargs["guided_decoding_backend"] = raw_extra_body.get(
2360
+ "guided_decoding_backend"
2361
+ )
2362
+ if raw_extra_body.get("guided_whitespace_pattern") is not None:
2363
+ kwargs["guided_whitespace_pattern"] = raw_extra_body.get(
2364
+ "guided_whitespace_pattern"
2365
+ )
2366
+ if raw_extra_body.get("platform") is not None:
2367
+ kwargs["platform"] = raw_extra_body.get("platform")
2368
+ if raw_extra_body.get("format") is not None:
2369
+ kwargs["format"] = raw_extra_body.get("format")
2370
+
2371
+ return kwargs
2372
+
2231
2373
 
2232
2374
  def run(
2233
2375
  supervisor_address: str,