xinference 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (210) hide show
  1. xinference/_compat.py +2 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +72 -66
  4. xinference/core/model.py +78 -25
  5. xinference/core/supervisor.py +81 -10
  6. xinference/core/utils.py +12 -8
  7. xinference/core/worker.py +32 -0
  8. xinference/model/audio/core.py +5 -0
  9. xinference/model/audio/cosyvoice.py +25 -3
  10. xinference/model/audio/f5tts.py +15 -10
  11. xinference/model/audio/f5tts_mlx.py +260 -0
  12. xinference/model/audio/fish_speech.py +35 -111
  13. xinference/model/audio/model_spec.json +19 -3
  14. xinference/model/audio/model_spec_modelscope.json +9 -0
  15. xinference/model/audio/utils.py +32 -0
  16. xinference/model/image/core.py +69 -1
  17. xinference/model/image/model_spec.json +145 -4
  18. xinference/model/image/model_spec_modelscope.json +150 -4
  19. xinference/model/image/stable_diffusion/core.py +45 -13
  20. xinference/model/llm/__init__.py +2 -0
  21. xinference/model/llm/llm_family.json +143 -0
  22. xinference/model/llm/llm_family.py +15 -36
  23. xinference/model/llm/llm_family_modelscope.json +148 -0
  24. xinference/model/llm/mlx/core.py +37 -32
  25. xinference/model/llm/transformers/cogagent.py +272 -0
  26. xinference/model/llm/transformers/core.py +2 -0
  27. xinference/model/llm/transformers/qwen2_vl.py +12 -1
  28. xinference/model/llm/utils.py +28 -3
  29. xinference/model/llm/vllm/core.py +48 -9
  30. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  31. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  32. xinference/model/llm/vllm/xavier/block.py +112 -0
  33. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  34. xinference/model/llm/vllm/xavier/block_tracker.py +116 -0
  35. xinference/model/llm/vllm/xavier/engine.py +247 -0
  36. xinference/model/llm/vllm/xavier/executor.py +132 -0
  37. xinference/model/llm/vllm/xavier/scheduler.py +422 -0
  38. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  39. xinference/model/llm/vllm/xavier/test/test_xavier.py +122 -0
  40. xinference/model/llm/vllm/xavier/transfer.py +298 -0
  41. xinference/model/video/diffusers.py +14 -0
  42. xinference/model/video/model_spec.json +15 -0
  43. xinference/model/video/model_spec_modelscope.json +16 -0
  44. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  45. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  46. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  47. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  48. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  49. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  50. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  51. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  52. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  53. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  54. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  55. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  56. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  57. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  58. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  59. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  60. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  61. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  62. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  63. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  64. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  65. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  66. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  67. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  68. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  69. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  70. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  71. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  72. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  73. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  74. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  75. xinference/thirdparty/fish_speech/fish_speech/conversation.py +94 -83
  76. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +63 -20
  77. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +1 -26
  78. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  79. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  80. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  81. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
  82. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +7 -13
  83. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  84. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  85. xinference/thirdparty/fish_speech/tools/fish_e2e.py +2 -2
  86. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  87. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  88. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  89. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  90. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  91. xinference/thirdparty/fish_speech/tools/llama/generate.py +117 -89
  92. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  93. xinference/thirdparty/fish_speech/tools/schema.py +11 -28
  94. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  95. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  96. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  97. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  98. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  99. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  100. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  101. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  102. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  103. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  104. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  105. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  106. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  107. xinference/thirdparty/matcha/utils/utils.py +2 -2
  108. xinference/types.py +13 -0
  109. xinference/web/ui/build/asset-manifest.json +6 -6
  110. xinference/web/ui/build/index.html +1 -1
  111. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  112. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  113. xinference/web/ui/build/static/js/main.1eb206d1.js +3 -0
  114. xinference/web/ui/build/static/js/main.1eb206d1.js.map +1 -0
  115. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  116. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  117. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  118. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  119. xinference/web/ui/node_modules/.cache/babel-loader/2213d49de260e1f67c888081b18f120f5225462b829ae57c9e05a05cec83689d.json +1 -0
  120. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  121. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  122. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  123. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  124. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  125. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  126. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  127. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  128. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  129. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  130. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  131. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  132. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  133. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  134. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  135. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  136. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  137. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  138. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  139. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  140. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  141. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  142. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  143. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  144. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  145. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  146. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  147. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  148. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  149. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  150. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  151. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  152. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  153. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  154. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  155. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  156. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  157. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  158. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  159. xinference/web/ui/node_modules/.package-lock.json +67 -3
  160. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  161. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  162. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  163. xinference/web/ui/node_modules/i18next/package.json +129 -0
  164. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  165. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  166. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  167. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  168. xinference/web/ui/package-lock.json +69 -3
  169. xinference/web/ui/package.json +2 -0
  170. xinference/web/ui/src/locales/en.json +186 -0
  171. xinference/web/ui/src/locales/zh.json +186 -0
  172. {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/METADATA +19 -11
  173. {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/RECORD +178 -111
  174. xinference/thirdparty/cosyvoice/bin/__init__.py +0 -0
  175. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  176. xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
  177. xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
  178. xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
  179. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  180. xinference/thirdparty/fish_speech/tools/api.py +0 -943
  181. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -95
  182. xinference/thirdparty/fish_speech/tools/webui.py +0 -548
  183. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  184. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  185. xinference/web/ui/build/static/js/main.4eb4ee80.js +0 -3
  186. xinference/web/ui/build/static/js/main.4eb4ee80.js.map +0 -1
  187. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  188. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  189. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  190. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  191. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  192. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  193. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  194. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  195. xinference/web/ui/node_modules/.cache/babel-loader/8c5eeb02f772d02cbe8b89c05428d0dd41a97866f75f7dc1c2164a67f5a1cf98.json +0 -1
  196. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  197. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  198. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  199. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  200. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  201. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  202. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  203. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  204. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  205. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  206. /xinference/web/ui/build/static/js/{main.4eb4ee80.js.LICENSE.txt → main.1eb206d1.js.LICENSE.txt} +0 -0
  207. {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/LICENSE +0 -0
  208. {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/WHEEL +0 -0
  209. {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/entry_points.txt +0 -0
  210. {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/top_level.txt +0 -0
xinference/core/utils.py CHANGED
@@ -62,12 +62,16 @@ def log_async(
62
62
 
63
63
  @wraps(func)
64
64
  async def wrapped(*args, **kwargs):
65
- try:
66
- bound_args = sig.bind_partial(*args, **kwargs)
67
- arguments = bound_args.arguments
68
- except TypeError:
69
- arguments = {}
70
- request_id_str = arguments.get("request_id", "")
65
+ request_id_str = kwargs.get("request_id")
66
+ if not request_id_str:
67
+ # sometimes `request_id` not in kwargs
68
+ # we try to bind the arguments
69
+ try:
70
+ bound_args = sig.bind_partial(*args, **kwargs)
71
+ arguments = bound_args.arguments
72
+ except TypeError:
73
+ arguments = {}
74
+ request_id_str = arguments.get("request_id", "")
71
75
  if not request_id_str:
72
76
  request_id_str = uuid.uuid1()
73
77
  if func_name == "text_to_image":
@@ -272,8 +276,8 @@ def get_nvidia_gpu_info() -> Dict:
272
276
 
273
277
 
274
278
  def assign_replica_gpu(
275
- _replica_model_uid: str, replica: int, gpu_idx: Union[int, List[int]]
276
- ) -> List[int]:
279
+ _replica_model_uid: str, replica: int, gpu_idx: Optional[Union[int, List[int]]]
280
+ ) -> Optional[List[int]]:
277
281
  model_uid, rep_id = parse_replica_model_uid(_replica_model_uid)
278
282
  rep_id, replica = int(rep_id), int(replica)
279
283
  if isinstance(gpu_idx, int):
xinference/core/worker.py CHANGED
@@ -22,6 +22,7 @@ import signal
22
22
  import threading
23
23
  import time
24
24
  from collections import defaultdict
25
+ from dataclasses import dataclass
25
26
  from logging import getLogger
26
27
  from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
27
28
 
@@ -58,6 +59,11 @@ else:
58
59
  MODEL_ACTOR_AUTO_RECOVER_LIMIT = None
59
60
 
60
61
 
62
+ @dataclass
63
+ class ModelStatus:
64
+ last_error: str = ""
65
+
66
+
61
67
  class WorkerActor(xo.StatelessActor):
62
68
  def __init__(
63
69
  self,
@@ -90,6 +96,7 @@ class WorkerActor(xo.StatelessActor):
90
96
  # attributes maintained after model launched:
91
97
  self._model_uid_to_model: Dict[str, xo.ActorRefType["ModelActor"]] = {}
92
98
  self._model_uid_to_model_spec: Dict[str, ModelDescription] = {}
99
+ self._model_uid_to_model_status: Dict[str, ModelStatus] = {}
93
100
  self._gpu_to_model_uid: Dict[int, str] = {}
94
101
  self._gpu_to_embedding_model_uids: Dict[int, Set[str]] = defaultdict(set)
95
102
  # Dict structure: gpu_index: {(replica_model_uid, model_type)}
@@ -866,6 +873,9 @@ class WorkerActor(xo.StatelessActor):
866
873
  )
867
874
 
868
875
  try:
876
+ xavier_config: Optional[Dict] = kwargs.pop("xavier_config", None)
877
+ if xavier_config is not None:
878
+ xavier_config["rank_address"] = subpool_address
869
879
  model, model_description = await asyncio.to_thread(
870
880
  create_model_instance,
871
881
  subpool_address,
@@ -893,6 +903,7 @@ class WorkerActor(xo.StatelessActor):
893
903
  model=model,
894
904
  model_description=model_description,
895
905
  request_limits=request_limits,
906
+ xavier_config=xavier_config,
896
907
  )
897
908
  await model_ref.load()
898
909
  except:
@@ -902,6 +913,7 @@ class WorkerActor(xo.StatelessActor):
902
913
  raise
903
914
  self._model_uid_to_model[model_uid] = model_ref
904
915
  self._model_uid_to_model_spec[model_uid] = model_description
916
+ self._model_uid_to_model_status[model_uid] = ModelStatus()
905
917
  self._model_uid_to_addr[model_uid] = subpool_address
906
918
  self._model_uid_to_recover_count.setdefault(
907
919
  model_uid, MODEL_ACTOR_AUTO_RECOVER_LIMIT
@@ -921,6 +933,7 @@ class WorkerActor(xo.StatelessActor):
921
933
  origin_uid,
922
934
  {"model_ability": abilities, "status": LaunchStatus.READY.name},
923
935
  )
936
+ return subpool_address
924
937
 
925
938
  @log_async(logger=logger, level=logging.INFO)
926
939
  async def terminate_model(self, model_uid: str, is_model_die=False):
@@ -976,6 +989,7 @@ class WorkerActor(xo.StatelessActor):
976
989
  status = LaunchStatus.ERROR.name
977
990
  else:
978
991
  status = LaunchStatus.TERMINATED.name
992
+ self._model_uid_to_model_status.pop(model_uid, None)
979
993
 
980
994
  if self._status_guard_ref is None:
981
995
  _ = await self.get_supervisor_ref()
@@ -1010,6 +1024,9 @@ class WorkerActor(xo.StatelessActor):
1010
1024
 
1011
1025
  @log_sync(logger=logger)
1012
1026
  def get_model(self, model_uid: str) -> xo.ActorRefType["ModelActor"]:
1027
+ model_status = self._model_uid_to_model_status.get(model_uid)
1028
+ if model_status and model_status.last_error:
1029
+ raise Exception(model_status.last_error)
1013
1030
  model_ref = self._model_uid_to_model.get(model_uid, None)
1014
1031
  if model_ref is None:
1015
1032
  raise ValueError(f"Model not found, uid: {model_uid}")
@@ -1138,6 +1155,21 @@ class WorkerActor(xo.StatelessActor):
1138
1155
  }
1139
1156
  return ret
1140
1157
 
1158
+ def update_model_status(self, model_uid: str, **kwargs):
1159
+ model_status = self._model_uid_to_model_status.get(model_uid)
1160
+ if model_status is not None:
1161
+ for k, v in kwargs.items():
1162
+ setattr(model_status, k, v)
1163
+
1164
+ def get_model_status(self, model_uid: str):
1165
+ return self._model_uid_to_model_status.get(model_uid)
1166
+
1141
1167
  @staticmethod
1142
1168
  def record_metrics(name, op, kwargs):
1143
1169
  record_metrics(name, op, kwargs)
1170
+
1171
+ async def start_transfer_for_vllm(
1172
+ self, rep_model_uid: str, rank_addresses: List[str]
1173
+ ):
1174
+ model_ref = self._model_uid_to_model[rep_model_uid]
1175
+ await model_ref.start_transfer_for_vllm(rank_addresses)
@@ -22,6 +22,7 @@ from ..utils import valid_model_revision
22
22
  from .chattts import ChatTTSModel
23
23
  from .cosyvoice import CosyVoiceModel
24
24
  from .f5tts import F5TTSModel
25
+ from .f5tts_mlx import F5TTSMLXModel
25
26
  from .fish_speech import FishSpeechModel
26
27
  from .funasr import FunASRModel
27
28
  from .whisper import WhisperModel
@@ -171,6 +172,7 @@ def create_audio_model_instance(
171
172
  CosyVoiceModel,
172
173
  FishSpeechModel,
173
174
  F5TTSModel,
175
+ F5TTSMLXModel,
174
176
  ],
175
177
  AudioModelDescription,
176
178
  ]:
@@ -185,6 +187,7 @@ def create_audio_model_instance(
185
187
  CosyVoiceModel,
186
188
  FishSpeechModel,
187
189
  F5TTSModel,
190
+ F5TTSMLXModel,
188
191
  ]
189
192
  if model_spec.model_family == "whisper":
190
193
  if not model_spec.engine:
@@ -201,6 +204,8 @@ def create_audio_model_instance(
201
204
  model = FishSpeechModel(model_uid, model_path, model_spec, **kwargs)
202
205
  elif model_spec.model_family == "F5-TTS":
203
206
  model = F5TTSModel(model_uid, model_path, model_spec, **kwargs)
207
+ elif model_spec.model_family == "F5-TTS-MLX":
208
+ model = F5TTSMLXModel(model_uid, model_path, model_spec, **kwargs)
204
209
  else:
205
210
  raise Exception(f"Unsupported audio model family: {model_spec.model_family}")
206
211
  model_description = AudioModelDescription(
@@ -39,6 +39,7 @@ class CosyVoiceModel:
39
39
  self._device = device
40
40
  self._model = None
41
41
  self._kwargs = kwargs
42
+ self._is_cosyvoice2 = False
42
43
 
43
44
  @property
44
45
  def model_ability(self):
@@ -51,7 +52,14 @@ class CosyVoiceModel:
51
52
  # The yaml config loaded from model has hard-coded the import paths. please refer to: load_hyperpyyaml
52
53
  sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../thirdparty"))
53
54
 
54
- from cosyvoice.cli.cosyvoice import CosyVoice
55
+ if "CosyVoice2" in self._model_spec.model_name:
56
+ from cosyvoice.cli.cosyvoice import CosyVoice2 as CosyVoice
57
+
58
+ self._is_cosyvoice2 = True
59
+ else:
60
+ from cosyvoice.cli.cosyvoice import CosyVoice
61
+
62
+ self._is_cosyvoice2 = False
55
63
 
56
64
  self._model = CosyVoice(
57
65
  self._model_path, load_jit=self._kwargs.get("load_jit", False)
@@ -78,12 +86,22 @@ class CosyVoiceModel:
78
86
  output = self._model.inference_zero_shot(
79
87
  input, prompt_text, prompt_speech_16k, stream=stream
80
88
  )
89
+ elif instruct_text:
90
+ assert self._is_cosyvoice2
91
+ logger.info("CosyVoice inference_instruct")
92
+ output = self._model.inference_instruct2(
93
+ input,
94
+ instruct_text=instruct_text,
95
+ prompt_speech_16k=prompt_speech_16k,
96
+ stream=stream,
97
+ )
81
98
  else:
82
99
  logger.info("CosyVoice inference_cross_lingual")
83
100
  output = self._model.inference_cross_lingual(
84
101
  input, prompt_speech_16k, stream=stream
85
102
  )
86
103
  else:
104
+ assert not self._is_cosyvoice2
87
105
  available_speakers = self._model.list_avaliable_spks()
88
106
  if not voice:
89
107
  voice = available_speakers[0]
@@ -106,7 +124,9 @@ class CosyVoiceModel:
106
124
  def _generator_stream():
107
125
  with BytesIO() as out:
108
126
  writer = torchaudio.io.StreamWriter(out, format=response_format)
109
- writer.add_audio_stream(sample_rate=22050, num_channels=1)
127
+ writer.add_audio_stream(
128
+ sample_rate=self._model.sample_rate, num_channels=1
129
+ )
110
130
  i = 0
111
131
  last_pos = 0
112
132
  with writer.open():
@@ -125,7 +145,7 @@ class CosyVoiceModel:
125
145
  chunks = [o["tts_speech"] for o in output]
126
146
  t = torch.cat(chunks, dim=1)
127
147
  with BytesIO() as out:
128
- torchaudio.save(out, t, 22050, format=response_format)
148
+ torchaudio.save(out, t, self._model.sample_rate, format=response_format)
129
149
  return out.getvalue()
130
150
 
131
151
  return _generator_stream() if stream else _generator_block()
@@ -163,6 +183,8 @@ class CosyVoiceModel:
163
183
  assert (
164
184
  prompt_text is None
165
185
  ), "CosyVoice Instruct model does not support prompt_text"
186
+ elif self._is_cosyvoice2:
187
+ assert prompt_speech is not None, "CosyVoice2 requires prompt_speech"
166
188
  else:
167
189
  # inference_zero_shot
168
190
  # inference_cross_lingual
@@ -11,12 +11,12 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
-
14
+ import io
15
15
  import logging
16
16
  import os
17
17
  import re
18
18
  from io import BytesIO
19
- from typing import TYPE_CHECKING, Optional
19
+ from typing import TYPE_CHECKING, Optional, Union
20
20
 
21
21
  if TYPE_CHECKING:
22
22
  from .core import AudioModelFamilyV1
@@ -106,9 +106,9 @@ class F5TTSModel:
106
106
  ) = preprocess_ref_audio_text(
107
107
  voices[voice]["ref_audio"], voices[voice]["ref_text"]
108
108
  )
109
- print("Voice:", voice)
110
- print("Ref_audio:", voices[voice]["ref_audio"])
111
- print("Ref_text:", voices[voice]["ref_text"])
109
+ logger.info("Voice:", voice)
110
+ logger.info("Ref_audio:", voices[voice]["ref_audio"])
111
+ logger.info("Ref_text:", voices[voice]["ref_text"])
112
112
 
113
113
  final_sample_rate = None
114
114
  generated_audio_segments = []
@@ -122,16 +122,16 @@ class F5TTSModel:
122
122
  if match:
123
123
  voice = match[1]
124
124
  else:
125
- print("No voice tag found, using main.")
125
+ logger.info("No voice tag found, using main.")
126
126
  voice = "main"
127
127
  if voice not in voices:
128
- print(f"Voice {voice} not found, using main.")
128
+ logger.info(f"Voice {voice} not found, using main.")
129
129
  voice = "main"
130
130
  text = re.sub(reg2, "", text)
131
131
  gen_text = text.strip()
132
132
  ref_audio = voices[voice]["ref_audio"]
133
133
  ref_text = voices[voice]["ref_text"]
134
- print(f"Voice: {voice}")
134
+ logger.info(f"Voice: {voice}")
135
135
  audio, final_sample_rate, spectragram = infer_process(
136
136
  ref_audio,
137
137
  ref_text,
@@ -167,18 +167,23 @@ class F5TTSModel:
167
167
  prompt_speech: Optional[bytes] = kwargs.pop("prompt_speech", None)
168
168
  prompt_text: Optional[str] = kwargs.pop("prompt_text", None)
169
169
 
170
+ ref_audio: Union[str, io.BytesIO]
170
171
  if prompt_speech is None:
171
172
  base = os.path.dirname(f5_tts.__file__)
172
173
  config = os.path.join(base, "infer/examples/basic/basic.toml")
173
174
  with open(config, "rb") as f:
174
175
  config_dict = tomli.load(f)
175
- prompt_speech = os.path.join(base, config_dict["ref_audio"])
176
+ ref_audio = os.path.join(base, config_dict["ref_audio"])
176
177
  prompt_text = config_dict["ref_text"]
178
+ else:
179
+ ref_audio = io.BytesIO(prompt_speech)
180
+ if prompt_text is None:
181
+ raise ValueError("`prompt_text` cannot be empty")
177
182
 
178
183
  assert self._model is not None
179
184
  vocoder_name = self._kwargs.get("vocoder_name", "vocos")
180
185
  sample_rate, wav = self._infer(
181
- ref_audio=prompt_speech,
186
+ ref_audio=ref_audio,
182
187
  ref_text=prompt_text,
183
188
  text_gen=input,
184
189
  model_obj=self._model,
@@ -0,0 +1,260 @@
1
+ # Copyright 2022-2023 XProbe Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import datetime
16
+ import io
17
+ import logging
18
+ import os
19
+ from io import BytesIO
20
+ from pathlib import Path
21
+ from typing import TYPE_CHECKING, Literal, Optional, Union
22
+
23
+ import numpy as np
24
+ from tqdm import tqdm
25
+
26
+ if TYPE_CHECKING:
27
+ from .core import AudioModelFamilyV1
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
class F5TTSMLXModel:
    """F5-TTS text-to-speech model running on Apple Silicon via `f5_tts_mlx`."""

    def __init__(
        self,
        model_uid: str,
        model_path: str,
        model_spec: "AudioModelFamilyV1",
        device: Optional[str] = None,
        **kwargs,
    ):
        """
        Args:
            model_uid: Unique identifier assigned by xinference.
            model_path: Directory expected to contain ``vocab.txt``,
                ``model.safetensors`` and optionally ``duration_v2.safetensors``.
            model_spec: Spec describing this audio model family.
            device: Device hint; not used by MLX, which selects devices itself.
            **kwargs: Extra launch options, kept verbatim in ``self._kwargs``.
        """
        self._model_uid = model_uid
        self._model_path = model_path
        self._model_spec = model_spec
        self._device = device
        self._kwargs = kwargs
        # Populated by `load()`; stays None until then.
        # (The original assigned this twice — once is enough.)
        self._model = None

    @property
    def model_ability(self):
        """Ability list declared by the model spec."""
        return self._model_spec.model_ability

    def load(self):
        """Build the F5-TTS network, load its weights from ``self._model_path``,
        and wire up the Vocos vocoder.

        Raises:
            ImportError: When the optional `f5_tts_mlx` / `vocos_mlx`
                dependencies are not installed.
            ValueError: When ``vocab.txt`` is empty or unreadable.
        """
        try:
            import mlx.core as mx
            from f5_tts_mlx.cfm import F5TTS
            from f5_tts_mlx.dit import DiT
            from f5_tts_mlx.duration import DurationPredictor, DurationTransformer
            from vocos_mlx import Vocos
        except ImportError:
            error_message = "Failed to import module 'f5_tts_mlx'"
            installation_guide = [
                "Please make sure 'f5_tts_mlx' is installed.\n",
            ]

            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

        path = Path(self._model_path)

        # Vocab: one token per line; the line index becomes the embedding id.
        vocab_path = path / "vocab.txt"
        vocab = {v: i for i, v in enumerate(vocab_path.read_text().split("\n"))}
        if len(vocab) == 0:
            raise ValueError(f"Could not load vocab from {vocab_path}")

        # Duration predictor is optional — only built when the checkpoint
        # ships `duration_v2.safetensors`.
        duration_model_path = path / "duration_v2.safetensors"
        duration_predictor = None
        if duration_model_path.exists():
            duration_predictor = DurationPredictor(
                transformer=DurationTransformer(
                    dim=512,
                    depth=8,
                    heads=8,
                    text_dim=512,
                    ff_mult=2,
                    conv_layers=2,
                    text_num_embeds=len(vocab) - 1,
                ),
                vocab_char_map=vocab,
            )
            weights = mx.load(duration_model_path.as_posix(), format="safetensors")
            duration_predictor.load_weights(list(weights.items()))

        # Vocoder (mel -> waveform).
        vocos = Vocos.from_pretrained("lucasnewman/vocos-mel-24khz")

        # Main model.
        model_path = path / "model.safetensors"
        f5tts = F5TTS(
            transformer=DiT(
                dim=1024,
                depth=22,
                heads=16,
                ff_mult=2,
                text_dim=512,
                conv_layers=4,
                text_num_embeds=len(vocab) - 1,
            ),
            vocab_char_map=vocab,
            vocoder=vocos.decode,
            duration_predictor=duration_predictor,
        )
        weights = mx.load(model_path.as_posix(), format="safetensors")
        f5tts.load_weights(list(weights.items()))
        # Force lazy MLX arrays to materialize now, not at first inference.
        mx.eval(f5tts.parameters())

        self._model = f5tts

    def speech(
        self,
        input: str,
        voice: str,
        response_format: str = "mp3",
        speed: float = 1.0,
        stream: bool = False,
        **kwargs,
    ):
        """Synthesize `input` into encoded audio bytes, voice-cloned from a
        reference clip.

        Args:
            input: Text to synthesize.
            voice: Unused by this backend; kept for interface parity with the
                other audio models.
            response_format: Container format understood by soundfile
                (e.g. "mp3", "wav").
            speed: Speed factor forwarded to the sampler.
            stream: Must be False; F5-TTS does not support streaming.
            **kwargs: Optional generation controls — ``prompt_speech`` (bytes),
                ``prompt_text``, ``duration`` (seconds), ``steps``,
                ``cfg_strength``, ``method``, ``sway_sampling_coef``, ``seed``.

        Returns:
            The generated audio encoded as bytes.

        Raises:
            Exception: When `stream` is True.
            ValueError: When a custom ``prompt_speech`` is supplied without
                ``prompt_text``.
        """
        import mlx.core as mx
        import soundfile as sf
        import tomli
        from f5_tts_mlx.generate import (
            FRAMES_PER_SEC,
            SAMPLE_RATE,
            TARGET_RMS,
            convert_char_to_pinyin,
            split_sentences,
        )

        from .utils import ensure_sample_rate

        if stream:
            raise Exception("F5-TTS does not support stream generation.")

        prompt_speech: Optional[bytes] = kwargs.pop("prompt_speech", None)
        prompt_text: Optional[str] = kwargs.pop("prompt_text", None)
        duration: Optional[float] = kwargs.pop("duration", None)
        steps: int = kwargs.pop("steps", 8)
        cfg_strength: float = kwargs.pop("cfg_strength", 2.0)
        # "rk4" is the actual default; the annotation previously omitted it.
        method: Literal["euler", "midpoint", "rk4"] = kwargs.pop("method", "rk4")
        sway_sampling_coef: float = kwargs.pop("sway_sampling_coef", -1.0)
        seed: Optional[int] = kwargs.pop("seed", None)

        # Fall back to the bundled example reference clip when the caller did
        # not provide one.
        prompt_speech_path: Union[str, io.BytesIO]
        if prompt_speech is None:
            base = os.path.join(os.path.dirname(__file__), "../../thirdparty/f5_tts")
            config = os.path.join(base, "infer/examples/basic/basic.toml")
            with open(config, "rb") as f:
                config_dict = tomli.load(f)
            prompt_speech_path = os.path.join(base, config_dict["ref_audio"])
            prompt_text = config_dict["ref_text"]
        else:
            prompt_speech_path = io.BytesIO(prompt_speech)

        if prompt_text is None:
            raise ValueError("`prompt_text` cannot be empty")

        audio, sr = sf.read(prompt_speech_path)
        audio = ensure_sample_rate(audio, sr, SAMPLE_RATE)

        audio = mx.array(audio)
        ref_audio_duration = audio.shape[0] / SAMPLE_RATE
        logger.debug(
            f"Got reference audio with duration: {ref_audio_duration:.2f} seconds"
        )

        # Normalize quiet reference audio up to the target RMS.
        rms = mx.sqrt(mx.mean(mx.square(audio)))
        if rms < TARGET_RMS:
            audio = audio * TARGET_RMS / rms

        sentences = split_sentences(input)
        # A caller-fixed total duration only makes sense for one generation
        # pass, so a non-None `duration` forces the single-shot path.
        is_single_generation = len(sentences) <= 1 or duration is not None

        start_date = datetime.datetime.now()

        if is_single_generation:
            generation_text = convert_char_to_pinyin([prompt_text + " " + input])  # type: ignore

            if duration is not None:
                # Convert seconds to mel frames once, before sampling.
                duration = int(duration * FRAMES_PER_SEC)

            wave, _ = self._model.sample(  # type: ignore
                mx.expand_dims(audio, axis=0),
                text=generation_text,
                duration=duration,
                steps=steps,
                method=method,
                speed=speed,
                cfg_strength=cfg_strength,
                sway_sampling_coef=sway_sampling_coef,
                seed=seed,
            )

            # Trim the echoed reference audio from the generated waveform.
            wave = wave[audio.shape[0] :]
            mx.eval(wave)
        else:
            output = []

            # Reuse the sentences computed above instead of re-splitting.
            for sentence_text in tqdm(sentences):
                text = convert_char_to_pinyin([prompt_text + " " + sentence_text])  # type: ignore

                # `duration` is always None on this branch (a non-None value
                # takes the single-generation path above), so the previous
                # per-iteration seconds->frames conversion was dead code —
                # and would have compounded `* FRAMES_PER_SEC` every loop
                # iteration had it ever run.
                wave, _ = self._model.sample(  # type: ignore
                    mx.expand_dims(audio, axis=0),
                    text=text,
                    duration=None,
                    steps=steps,
                    method=method,
                    speed=speed,
                    cfg_strength=cfg_strength,
                    sway_sampling_coef=sway_sampling_coef,
                    seed=seed,
                )

                # Trim the echoed reference audio from each chunk.
                wave = wave[audio.shape[0] :]
                mx.eval(wave)

                output.append(wave)

            wave = mx.concatenate(output, axis=0)

        generated_duration = wave.shape[0] / SAMPLE_RATE
        # Use the module logger on both paths (the single-shot path used to
        # `print` to stdout, inconsistent with the rest of the file).
        logger.debug(
            f"Generated {generated_duration:.2f}s of audio in {datetime.datetime.now() - start_date}."
        )

        # Encode the waveform into the requested container format in memory.
        with BytesIO() as out:
            with sf.SoundFile(
                out, "w", SAMPLE_RATE, 1, format=response_format.upper()
            ) as f:
                f.write(np.array(wave))
            return out.getvalue()