xinference 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (258)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +49 -62
  3. xinference/client/restful/restful_client.py +10 -1
  4. xinference/conftest.py +2 -2
  5. xinference/constants.py +10 -0
  6. xinference/core/model.py +33 -10
  7. xinference/core/resource.py +12 -11
  8. xinference/core/supervisor.py +22 -2
  9. xinference/core/worker.py +44 -16
  10. xinference/deploy/cmdline.py +19 -9
  11. xinference/deploy/local.py +9 -1
  12. xinference/deploy/supervisor.py +16 -3
  13. xinference/deploy/utils.py +1 -0
  14. xinference/deploy/worker.py +1 -1
  15. xinference/model/embedding/__init__.py +10 -0
  16. xinference/model/embedding/core.py +3 -0
  17. xinference/model/embedding/custom.py +5 -4
  18. xinference/model/embedding/model_spec.json +16 -0
  19. xinference/model/embedding/model_spec_modelscope.json +16 -0
  20. xinference/model/llm/__init__.py +22 -2
  21. xinference/model/llm/core.py +2 -2
  22. xinference/model/llm/ggml/chatglm.py +79 -15
  23. xinference/model/llm/ggml/llamacpp.py +2 -2
  24. xinference/model/llm/llm_family.json +99 -4
  25. xinference/model/llm/llm_family.py +54 -8
  26. xinference/model/llm/llm_family_modelscope.json +81 -2
  27. xinference/model/llm/pytorch/chatglm.py +95 -2
  28. xinference/model/llm/utils.py +12 -8
  29. xinference/model/llm/vllm/core.py +26 -5
  30. xinference/model/utils.py +25 -0
  31. xinference/types.py +64 -5
  32. xinference/utils.py +20 -0
  33. xinference/web/ui/build/asset-manifest.json +3 -3
  34. xinference/web/ui/build/index.html +1 -1
  35. xinference/web/ui/build/static/js/main.8126d441.js +3 -0
  36. xinference/web/ui/build/static/js/main.8126d441.js.map +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/06eb9846159adb398d44df0b0debc256a9fd9e8171a7d68f5c4ee4d655acfa45.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/0cccfbe5d963b8e31eb679f9d9677392839cedd04aa2956ac6b33cf19599d597.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/2f651cf60b1bde50c0601c7110f77dd44819fb6e2501ff748a631724d91445d4.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/3bda436576ecb05f81f7b6ec475d1cfaf03e2b3066e3a75902fe6e8c4773b43b.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/47887a9524ffeecdc2a7839dace146b24f97a5564fc3d431d6179ad2b153cf1f.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/48878f5178bad1a47757e011af41c974a7946efa29485506c4d19f25bf5d522d.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/59574eb63cfe9ed2e58d2f5a420e1ae54354e243a602e9bc73deae3147ed4f98.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/6a60ae66b29c2f3634fd081d369b9e63b4522fe18eb9e43e9979d1ff264b68ad.json +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/75a5abcbc92da335fdde530f5689194ec79a4b2345b8cba594f8904d3b88e3c6.json +1 -0
  46. xinference/web/ui/node_modules/.cache/babel-loader/84bfe7afede38da1f8ad569d891276fe4d66cfb87bf5c9ff7a113788ba62bb88.json +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/940ed05006583b955894e2b8f65a4a5ebf34f8149d747f59fae5131f17d65482.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/9c5f03db9aa88582a9b69b25c7f1acc78ba7fc61f743c9ed7399abb292d5dbde.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/a5e2e9f707eb7039bea096ca117d996b8f9cbc2a5613fd8e0c5b0094444ce23c.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/c02e70e9b9efcf3bd056606308104308d6a6ac559f2bc0b4454c11fb5874457c.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/e26750d9556e9741912333349e4da454c53dbfddbfc6002ab49518dcf02af745.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/e610aefd7000a3f8542a25cb66c64671cc8da18350de4e5b577102ba4bb78d65.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/ef42ec014d7bc373b874b2a1ff0dcd785490f125e913698bc049b0bd778e4d66.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/fe3eb4d76c79ca98833f686d642224eeeb94cc83ad14300d281623796d087f0a.json +1 -0
  56. xinference/web/ui/node_modules/.package-lock.json +1077 -405
  57. xinference/web/ui/node_modules/@eslint/eslintrc/node_modules/globals/globals.json +163 -3
  58. xinference/web/ui/node_modules/@eslint/eslintrc/node_modules/globals/package.json +1 -1
  59. xinference/web/ui/node_modules/@eslint/eslintrc/node_modules/ignore/package.json +64 -0
  60. xinference/web/ui/node_modules/@eslint/eslintrc/package.json +18 -37
  61. xinference/web/ui/node_modules/@eslint/js/package.json +1 -1
  62. xinference/web/ui/node_modules/@eslint-community/regexpp/package.json +9 -4
  63. xinference/web/ui/node_modules/@humanwhocodes/config-array/package.json +14 -14
  64. xinference/web/ui/node_modules/@rushstack/eslint-patch/package.json +6 -4
  65. xinference/web/ui/node_modules/@types/semver/package.json +15 -15
  66. xinference/web/ui/node_modules/@ungap/structured-clone/cjs/package.json +1 -0
  67. xinference/web/ui/node_modules/@ungap/structured-clone/package.json +53 -0
  68. xinference/web/ui/node_modules/ansi-colors/package.json +129 -0
  69. xinference/web/ui/node_modules/array-includes/package.json +8 -8
  70. xinference/web/ui/node_modules/array.prototype.findlastindex/package.json +120 -0
  71. xinference/web/ui/node_modules/array.prototype.flat/package.json +8 -8
  72. xinference/web/ui/node_modules/array.prototype.flatmap/package.json +8 -8
  73. xinference/web/ui/node_modules/arraybuffer.prototype.slice/package.json +103 -0
  74. xinference/web/ui/node_modules/ast-types-flow/package.json +2 -2
  75. xinference/web/ui/node_modules/astral-regex/package.json +33 -0
  76. xinference/web/ui/node_modules/asynciterator.prototype/package.json +72 -0
  77. xinference/web/ui/node_modules/axe-core/locales/_template.json +0 -12
  78. xinference/web/ui/node_modules/axe-core/package.json +1 -2
  79. xinference/web/ui/node_modules/axe-core/sri-history.json +0 -8
  80. xinference/web/ui/node_modules/call-bind/package.json +33 -23
  81. xinference/web/ui/node_modules/define-data-property/package.json +113 -0
  82. xinference/web/ui/node_modules/define-data-property/tsconfig.json +59 -0
  83. xinference/web/ui/node_modules/define-properties/package.json +5 -4
  84. xinference/web/ui/node_modules/enquirer/package.json +112 -0
  85. xinference/web/ui/node_modules/es-abstract/helpers/caseFolding.json +1430 -0
  86. xinference/web/ui/node_modules/es-abstract/package.json +29 -23
  87. xinference/web/ui/node_modules/es-iterator-helpers/index.json +17 -0
  88. xinference/web/ui/node_modules/es-iterator-helpers/package.json +185 -0
  89. xinference/web/ui/node_modules/eslint/conf/{rule-type-list.json → category-list.json} +9 -6
  90. xinference/web/ui/node_modules/eslint/node_modules/@babel/code-frame/package.json +25 -0
  91. xinference/web/ui/node_modules/eslint/node_modules/eslint-visitor-keys/lib/visitor-keys.json +289 -0
  92. xinference/web/ui/node_modules/eslint/node_modules/eslint-visitor-keys/package.json +39 -0
  93. xinference/web/ui/node_modules/eslint/node_modules/glob-parent/package.json +48 -0
  94. xinference/web/ui/node_modules/eslint/node_modules/ignore/package.json +64 -0
  95. xinference/web/ui/node_modules/eslint/package.json +53 -82
  96. xinference/web/ui/node_modules/eslint-config-prettier/package.json +13 -0
  97. xinference/web/ui/node_modules/eslint-import-resolver-node/package.json +3 -3
  98. xinference/web/ui/node_modules/eslint-plugin-import/package.json +22 -17
  99. xinference/web/ui/node_modules/eslint-plugin-jsx-a11y/package.json +25 -24
  100. xinference/web/ui/node_modules/eslint-plugin-simple-import-sort/package.json +23 -0
  101. xinference/web/ui/node_modules/eslint-plugin-testing-library/package.json +1 -1
  102. xinference/web/ui/node_modules/eslint-scope/package.json +19 -34
  103. xinference/web/ui/node_modules/eslint-utils/node_modules/eslint-visitor-keys/lib/visitor-keys.json +284 -0
  104. xinference/web/ui/node_modules/eslint-utils/node_modules/eslint-visitor-keys/package.json +40 -0
  105. xinference/web/ui/node_modules/eslint-utils/package.json +65 -0
  106. xinference/web/ui/node_modules/eslint-visitor-keys/package.json +15 -15
  107. xinference/web/ui/node_modules/espree/node_modules/acorn/package.json +35 -0
  108. xinference/web/ui/node_modules/espree/node_modules/eslint-visitor-keys/lib/visitor-keys.json +284 -0
  109. xinference/web/ui/node_modules/espree/node_modules/eslint-visitor-keys/package.json +40 -0
  110. xinference/web/ui/node_modules/espree/package.json +27 -51
  111. xinference/web/ui/node_modules/function-bind/package.json +38 -14
  112. xinference/web/ui/node_modules/function.prototype.name/package.json +32 -13
  113. xinference/web/ui/node_modules/functional-red-black-tree/package.json +40 -0
  114. xinference/web/ui/node_modules/get-intrinsic/package.json +11 -11
  115. xinference/web/ui/node_modules/hasown/package.json +91 -0
  116. xinference/web/ui/node_modules/hasown/tsconfig.json +49 -0
  117. xinference/web/ui/node_modules/is-async-function/package.json +86 -0
  118. xinference/web/ui/node_modules/is-core-module/core.json +3 -3
  119. xinference/web/ui/node_modules/is-core-module/package.json +7 -7
  120. xinference/web/ui/node_modules/is-finalizationregistry/package.json +67 -0
  121. xinference/web/ui/node_modules/is-generator-function/package.json +87 -0
  122. xinference/web/ui/node_modules/is-typed-array/package.json +8 -10
  123. xinference/web/ui/node_modules/iterator.prototype/package.json +73 -0
  124. xinference/web/ui/node_modules/jsx-ast-utils/package.json +5 -5
  125. xinference/web/ui/node_modules/language-tags/package.json +48 -8
  126. xinference/web/ui/node_modules/lodash.truncate/package.json +17 -0
  127. xinference/web/ui/node_modules/object-inspect/package.json +8 -6
  128. xinference/web/ui/node_modules/object.entries/package.json +7 -7
  129. xinference/web/ui/node_modules/object.fromentries/package.json +7 -7
  130. xinference/web/ui/node_modules/object.groupby/package.json +83 -0
  131. xinference/web/ui/node_modules/object.values/package.json +7 -7
  132. xinference/web/ui/node_modules/prettier/package.json +21 -0
  133. xinference/web/ui/node_modules/progress/package.json +26 -0
  134. xinference/web/ui/node_modules/react-scripts/node_modules/@eslint/eslintrc/package.json +82 -0
  135. xinference/web/ui/node_modules/react-scripts/node_modules/@humanwhocodes/config-array/package.json +61 -0
  136. xinference/web/ui/node_modules/react-scripts/node_modules/@humanwhocodes/object-schema/package.json +33 -0
  137. xinference/web/ui/node_modules/react-scripts/node_modules/ansi-styles/package.json +56 -0
  138. xinference/web/ui/node_modules/react-scripts/node_modules/chalk/package.json +68 -0
  139. xinference/web/ui/node_modules/react-scripts/node_modules/color-convert/package.json +48 -0
  140. xinference/web/ui/node_modules/react-scripts/node_modules/color-name/package.json +28 -0
  141. xinference/web/ui/node_modules/react-scripts/node_modules/escape-string-regexp/package.json +38 -0
  142. xinference/web/ui/node_modules/react-scripts/node_modules/eslint/conf/replacements.json +22 -0
  143. xinference/web/ui/node_modules/react-scripts/node_modules/eslint/conf/rule-type-list.json +28 -0
  144. xinference/web/ui/node_modules/react-scripts/node_modules/eslint/package.json +179 -0
  145. xinference/web/ui/node_modules/react-scripts/node_modules/eslint-scope/package.json +63 -0
  146. xinference/web/ui/node_modules/react-scripts/node_modules/espree/package.json +88 -0
  147. xinference/web/ui/node_modules/react-scripts/node_modules/globals/globals.json +1974 -0
  148. xinference/web/ui/node_modules/react-scripts/node_modules/globals/package.json +55 -0
  149. xinference/web/ui/node_modules/react-scripts/node_modules/has-flag/package.json +46 -0
  150. xinference/web/ui/node_modules/react-scripts/node_modules/supports-color/package.json +53 -0
  151. xinference/web/ui/node_modules/react-scripts/node_modules/type-fest/package.json +58 -0
  152. xinference/web/ui/node_modules/reflect.getprototypeof/package.json +99 -0
  153. xinference/web/ui/node_modules/regexp.prototype.flags/package.json +8 -7
  154. xinference/web/ui/node_modules/regexpp/package.json +91 -0
  155. xinference/web/ui/node_modules/resolve/lib/core.json +4 -1
  156. xinference/web/ui/node_modules/resolve/package.json +9 -8
  157. xinference/web/ui/node_modules/resolve/test/resolver/multirepo/package.json +1 -1
  158. xinference/web/ui/node_modules/safe-array-concat/package.json +5 -5
  159. xinference/web/ui/node_modules/set-function-length/package.json +84 -0
  160. xinference/web/ui/node_modules/set-function-name/package.json +80 -0
  161. xinference/web/ui/node_modules/slice-ansi/node_modules/ansi-styles/package.json +56 -0
  162. xinference/web/ui/node_modules/slice-ansi/node_modules/color-convert/package.json +48 -0
  163. xinference/web/ui/node_modules/slice-ansi/node_modules/color-name/package.json +28 -0
  164. xinference/web/ui/node_modules/slice-ansi/package.json +52 -0
  165. xinference/web/ui/node_modules/string.prototype.trim/package.json +7 -7
  166. xinference/web/ui/node_modules/string.prototype.trimend/package.json +7 -7
  167. xinference/web/ui/node_modules/string.prototype.trimstart/package.json +7 -7
  168. xinference/web/ui/node_modules/table/dist/src/schemas/config.json +95 -0
  169. xinference/web/ui/node_modules/table/dist/src/schemas/shared.json +139 -0
  170. xinference/web/ui/node_modules/table/dist/src/schemas/streamConfig.json +25 -0
  171. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/data.json +13 -0
  172. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2019-09/meta/applicator.json +53 -0
  173. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2019-09/meta/content.json +17 -0
  174. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2019-09/meta/core.json +57 -0
  175. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2019-09/meta/format.json +14 -0
  176. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2019-09/meta/meta-data.json +37 -0
  177. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2019-09/meta/validation.json +90 -0
  178. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2019-09/schema.json +39 -0
  179. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2020-12/meta/applicator.json +48 -0
  180. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2020-12/meta/content.json +17 -0
  181. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2020-12/meta/core.json +51 -0
  182. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2020-12/meta/format-annotation.json +14 -0
  183. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2020-12/meta/meta-data.json +37 -0
  184. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2020-12/meta/unevaluated.json +15 -0
  185. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2020-12/meta/validation.json +90 -0
  186. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2020-12/schema.json +55 -0
  187. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-draft-06.json +137 -0
  188. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-draft-07.json +151 -0
  189. xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-secure.json +88 -0
  190. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/data.json +13 -0
  191. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2019-09/meta/applicator.json +53 -0
  192. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2019-09/meta/content.json +17 -0
  193. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2019-09/meta/core.json +57 -0
  194. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2019-09/meta/format.json +14 -0
  195. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2019-09/meta/meta-data.json +37 -0
  196. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2019-09/meta/validation.json +90 -0
  197. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2019-09/schema.json +39 -0
  198. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2020-12/meta/applicator.json +48 -0
  199. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2020-12/meta/content.json +17 -0
  200. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2020-12/meta/core.json +51 -0
  201. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2020-12/meta/format-annotation.json +14 -0
  202. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2020-12/meta/meta-data.json +37 -0
  203. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2020-12/meta/unevaluated.json +15 -0
  204. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2020-12/meta/validation.json +90 -0
  205. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2020-12/schema.json +55 -0
  206. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-draft-06.json +137 -0
  207. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-draft-07.json +151 -0
  208. xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-secure.json +88 -0
  209. xinference/web/ui/node_modules/table/node_modules/ajv/package.json +126 -0
  210. xinference/web/ui/node_modules/table/node_modules/json-schema-traverse/package.json +43 -0
  211. xinference/web/ui/node_modules/table/package.json +77 -0
  212. xinference/web/ui/node_modules/typed-array-buffer/package.json +73 -0
  213. xinference/web/ui/node_modules/typed-array-byte-length/package.json +98 -0
  214. xinference/web/ui/node_modules/v8-compile-cache/package.json +34 -0
  215. xinference/web/ui/node_modules/which-builtin-type/package.json +93 -0
  216. xinference/web/ui/node_modules/which-typed-array/package.json +4 -5
  217. xinference/web/ui/package-lock.json +1085 -406
  218. xinference/web/ui/package.json +10 -2
  219. {xinference-0.6.4.dist-info → xinference-0.7.0.dist-info}/METADATA +53 -36
  220. {xinference-0.6.4.dist-info → xinference-0.7.0.dist-info}/RECORD +232 -124
  221. {xinference-0.6.4.dist-info → xinference-0.7.0.dist-info}/WHEEL +1 -1
  222. xinference/web/ui/build/static/js/main.8ae3b6d9.js +0 -3
  223. xinference/web/ui/build/static/js/main.8ae3b6d9.js.map +0 -1
  224. xinference/web/ui/node_modules/.cache/babel-loader/06363becf51869c421a8b3e34b4e3f50aa0aac3d590446044f9412e379f4ebbe.json +0 -1
  225. xinference/web/ui/node_modules/.cache/babel-loader/2849edddeb99a8ecdda577e810eead74b8f8a291cdfbd987839d604666ed79d0.json +0 -1
  226. xinference/web/ui/node_modules/.cache/babel-loader/2c774712d327cdf0b192aaa22785ec380e9427c587350c33289828d99e9c4abc.json +0 -1
  227. xinference/web/ui/node_modules/.cache/babel-loader/34c578e50d3040519ca8dc28bf0f7fec8674c2d6c0fcc3e98401c0a3f9f013cf.json +0 -1
  228. xinference/web/ui/node_modules/.cache/babel-loader/5933910e7c33febbabc0297ef7ba80f5e53ed96aa125b6a44ff2910aec29ced1.json +0 -1
  229. xinference/web/ui/node_modules/.cache/babel-loader/5e18a8354ea03d22a967fd8cb2171aa798edcb3da5d66ab1fd3b9663affd0abe.json +0 -1
  230. xinference/web/ui/node_modules/.cache/babel-loader/717cd7c186ace4812d1e602bdd299d8dc507f072670cc43974d53aac2574df5d.json +0 -1
  231. xinference/web/ui/node_modules/.cache/babel-loader/82dd896a6674286c48c1ab9f9147dd6e542dccd99848d5b3133a38efba8bd7ee.json +0 -1
  232. xinference/web/ui/node_modules/.cache/babel-loader/a178cfde289ffd15fd54b1c80fd9d231ae0f9644db33acb02084e69b32bfee37.json +0 -1
  233. xinference/web/ui/node_modules/.cache/babel-loader/adaec65f73accce3171b51b0fbcbfd8d0cd83f81a2e1b28eb34148644875499a.json +0 -1
  234. xinference/web/ui/node_modules/.cache/babel-loader/ae8f44c77c2e6f79680fe32fb00174183cd867093ebbda967b8985c33cc10fa2.json +0 -1
  235. xinference/web/ui/node_modules/.cache/babel-loader/b10bd04b4d6e28bfcaaaab37b0a4c1986e87a5b7e62e5ce4d56019880ef26990.json +0 -1
  236. xinference/web/ui/node_modules/.cache/babel-loader/cfc5da1cedee985a556e04865affccb72d0f624cbfb73da348bbe8693e8a4983.json +0 -1
  237. xinference/web/ui/node_modules/.cache/babel-loader/eebd0123c4b4396737e56b9181406a9fd76b107dd32971d23b0de99f51dd38d6.json +0 -1
  238. xinference/web/ui/node_modules/@nicolo-ribaudo/eslint-scope-5-internals/node_modules/eslint-scope/package.json +0 -48
  239. xinference/web/ui/node_modules/@typescript-eslint/utils/node_modules/eslint-scope/package.json +0 -48
  240. xinference/web/ui/node_modules/@typescript-eslint/utils/node_modules/estraverse/package.json +0 -40
  241. xinference/web/ui/node_modules/eslint/node_modules/argparse/package.json +0 -31
  242. xinference/web/ui/node_modules/eslint/node_modules/js-yaml/package.json +0 -66
  243. xinference/web/ui/node_modules/eslint-plugin-jsx-a11y/node_modules/semver/package.json +0 -38
  244. xinference/web/ui/node_modules/function-bind/.jscs.json +0 -176
  245. xinference/web/ui/node_modules/resolve/test/resolver/malformed_package_json/package.json +0 -1
  246. xinference/web/ui/node_modules/webpack/node_modules/eslint-scope/package.json +0 -48
  247. xinference/web/ui/node_modules/webpack/node_modules/estraverse/package.json +0 -40
  248. /xinference/web/ui/build/static/js/{main.8ae3b6d9.js.LICENSE.txt → main.8126d441.js.LICENSE.txt} +0 -0
  249. /xinference/web/ui/node_modules/{@nicolo-ribaudo/eslint-scope-5-internals → eslint-scope}/node_modules/estraverse/package.json +0 -0
  250. /xinference/web/ui/node_modules/{@eslint/eslintrc → react-scripts}/node_modules/argparse/package.json +0 -0
  251. /xinference/web/ui/node_modules/{eslint → react-scripts/node_modules/eslint}/lib/cli-engine/formatters/formatters-meta.json +0 -0
  252. /xinference/web/ui/node_modules/{eslint-config-react-app → react-scripts/node_modules/eslint-config-react-app}/package.json +0 -0
  253. /xinference/web/ui/node_modules/{eslint-plugin-flowtype → react-scripts/node_modules/eslint-plugin-flowtype}/dist/configs/recommended.json +0 -0
  254. /xinference/web/ui/node_modules/{eslint-plugin-flowtype → react-scripts/node_modules/eslint-plugin-flowtype}/package.json +0 -0
  255. /xinference/web/ui/node_modules/{@eslint/eslintrc → react-scripts}/node_modules/js-yaml/package.json +0 -0
  256. {xinference-0.6.4.dist-info → xinference-0.7.0.dist-info}/LICENSE +0 -0
  257. {xinference-0.6.4.dist-info → xinference-0.7.0.dist-info}/entry_points.txt +0 -0
  258. {xinference-0.6.4.dist-info → xinference-0.7.0.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2023-11-24T12:37:09+0800",
+ "date": "2023-12-08T13:45:18+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "8fd2e3bf97418bc99f65087c34c932e0318969bc",
- "version": "0.6.4"
+ "full-revisionid": "b5a5f0a270f85e451591eba34fe615a0fc8ce4bf",
+ "version": "0.7.0"
 }
 ''' # END VERSION_JSON
 
xinference/api/restful_api.py CHANGED
@@ -21,7 +21,7 @@ import os
 import pprint
 import sys
 import warnings
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, List, Optional, Union
 
 import gradio as gr
 import xoscar as xo
@@ -43,28 +43,19 @@ from pydantic import BaseModel, Field
 from sse_starlette.sse import EventSourceResponse
 from starlette.responses import JSONResponse as StarletteJSONResponse
 from starlette.responses import RedirectResponse
-from typing_extensions import NotRequired, TypedDict
 from uvicorn import Config, Server
 from xoscar.utils import get_next_port
 
 from ..constants import XINFERENCE_DEFAULT_ENDPOINT_PORT
 from ..core.supervisor import SupervisorActor
 from ..core.utils import json_dumps
-from ..fields import (
-    frequency_penalty_field,
-    max_tokens_field,
-    mirostat_eta_field,
-    mirostat_mode_field,
-    mirostat_tau_field,
-    presence_penalty_field,
-    repeat_penalty_field,
-    stop_field,
-    stream_field,
-    temperature_field,
-    top_k_field,
-    top_p_field,
+from ..types import (
+    ChatCompletion,
+    Completion,
+    CreateChatCompletion,
+    CreateCompletion,
+    ImageList,
 )
-from ..types import ChatCompletion, Completion, CreateCompletion, ImageList
 
 logger = logging.getLogger(__name__)
 
@@ -115,50 +106,6 @@ class TextToImageRequest(BaseModel):
     user: Optional[str] = None
 
 
-class ChatCompletionRequestMessage(TypedDict):
-    role: Literal["assistant", "user", "system"]
-    content: str
-    user: NotRequired[str]
-
-
-class CreateChatCompletionRequest(BaseModel):
-    messages: List[ChatCompletionRequestMessage] = Field(
-        default=[], description="A list of messages to generate completions for."
-    )
-    max_tokens: int = max_tokens_field
-    temperature: float = temperature_field
-    top_p: float = top_p_field
-    mirostat_mode: int = mirostat_mode_field
-    mirostat_tau: float = mirostat_tau_field
-    mirostat_eta: float = mirostat_eta_field
-    stop: Optional[Union[str, List[str]]] = stop_field
-    stream: bool = stream_field
-    presence_penalty: Optional[float] = presence_penalty_field
-    frequency_penalty: Optional[float] = frequency_penalty_field
-    logit_bias: Optional[Dict[str, float]] = Field(None)
-
-    model: str
-    n: Optional[int] = 1
-    user: Optional[str] = Field(None)
-
-    # llama.cpp specific parameters
-    top_k: int = top_k_field
-    repeat_penalty: Optional[float] = repeat_penalty_field
-    logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
-    grammar: Optional[str] = Field(None)
-
-    class Config:
-        schema_extra = {
-            "example": {
-                "messages": [
-                    {"role": "system", "content": "you are a helpful AI assistant"},
-                    {"role": "user", "content": "Hello!"},
-                    {"role": "assistant", "content": "Hi what can I help you?"},
-                ]
-            }
-        }
-
-
 class RegisterModelRequest(BaseModel):
     model: str
     persist: bool
@@ -208,6 +155,12 @@ class RESTfulAPI:
         )
         self._router.add_api_route("/status", self.get_status, methods=["GET"])
         self._router.add_api_route("/v1/models", self.list_models, methods=["GET"])
+        self._router.add_api_route(
+            "/v1/models/prompts", self._get_builtin_prompts, methods=["GET"]
+        )
+        self._router.add_api_route(
+            "/v1/cluster/devices", self._get_devices_count, methods=["GET"]
+        )
         self._router.add_api_route(
             "/v1/models/{model_uid}", self.describe_model, methods=["GET"]
         )
@@ -305,6 +258,9 @@ class RESTfulAPI:
                 f"{pprint.pformat(invalid_routes)}"
             )
 
+        for tp in [CreateChatCompletion, CreateCompletion]:
+            logger.debug("Dump request model fields:\n%s", tp.__fields__)
+
         class SPAStaticFiles(StaticFiles):
             async def get_response(self, path: str, scope):
                 response = await super().get_response(path, scope)
@@ -346,6 +302,28 @@ class RESTfulAPI:
         server = Server(config)
         server.run()
 
+    async def _get_builtin_prompts(self) -> JSONResponse:
+        """
+        For internal usage
+        """
+        try:
+            data = await (await self._get_supervisor_ref()).get_builtin_prompts()
+            return JSONResponse(content=data)
+        except Exception as e:
+            logger.error(e, exc_info=True)
+            raise HTTPException(status_code=500, detail=str(e))
+
+    async def _get_devices_count(self) -> JSONResponse:
+        """
+        For internal usage
+        """
+        try:
+            data = await (await self._get_supervisor_ref()).get_devices_count()
+            return JSONResponse(content=data)
+        except Exception as e:
+            logger.error(e, exc_info=True)
+            raise HTTPException(status_code=500, detail=str(e))
+
     async def get_status(self) -> JSONResponse:
         try:
             data = await (await self._get_supervisor_ref()).get_status()
@@ -725,7 +703,7 @@ class RESTfulAPI:
     async def create_chat_completion(
         self,
        request: Request,
-        body: CreateChatCompletionRequest,
+        body: CreateChatCompletion,
     ) -> Response:
         exclude = {
            "prompt",
@@ -736,7 +714,7 @@ class RESTfulAPI:
            "logit_bias_type",
            "user",
         }
-        kwargs = body.dict(exclude=exclude)
+        kwargs = body.dict(exclude_unset=True, exclude=exclude)
 
         if body.logit_bias is not None:
             raise HTTPException(status_code=501, detail="Not implemented")
@@ -795,6 +773,7 @@ class RESTfulAPI:
         is_chatglm_ggml = desc.get(
             "model_format"
         ) == "ggmlv3" and "chatglm" in desc.get("model_name", "")
+        is_chatglm3 = "chatglm3" == desc.get("model_name", "")
 
         is_qwen = desc.get("model_format") == "ggmlv3" and "qwen" in desc.get(
             "model_name", ""
@@ -804,6 +783,14 @@ class RESTfulAPI:
             raise HTTPException(
                 status_code=400, detail="ChatGLM ggml does not have system prompt"
             )
+        if is_chatglm3 and body.tools and body.stream:
+            raise HTTPException(
+                status_code=400, detail="ChatGLM3 tool calls does not support stream"
+            )
+        if body.tools and not is_chatglm3:
+            raise HTTPException(
+                status_code=400, detail="Only ChatGLM3 support tool calls"
+            )
 
         if body.stream:
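
The two internal GET routes registered above, /v1/models/prompts and /v1/cluster/devices, can be exercised directly over HTTP. A minimal sketch, not part of this package, assuming a locally running Xinference endpoint on the default port 9997 and the requests library:

import requests

base_url = "http://127.0.0.1:9997"  # placeholder endpoint, adjust to your deployment

# Builtin prompt styles keyed by model name (served by supervisor.get_builtin_prompts()).
prompts = requests.get(f"{base_url}/v1/models/prompts").json()

# CUDA device count of the cluster (served by supervisor.get_devices_count()).
device_count = requests.get(f"{base_url}/v1/cluster/devices").json()

print(sorted(prompts)[:5], device_count)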
 
xinference/client/restful/restful_client.py CHANGED
@@ -320,6 +320,7 @@ class RESTfulChatModelHandle(RESTfulGenerateModelHandle):
         prompt: str,
         system_prompt: Optional[str] = None,
         chat_history: Optional[List["ChatCompletionMessage"]] = None,
+        tools: Optional[List[Dict]] = None,
         generate_config: Optional[
             Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"]
         ] = None,
@@ -335,6 +336,8 @@ class RESTfulChatModelHandle(RESTfulGenerateModelHandle):
            The system context provide to Model prior to any chats.
         chat_history: Optional[List["ChatCompletionMessage"]]
            A list of messages comprising the conversation so far.
+        tools: Optional[List[Dict]]
+           A tool list.
         generate_config: Optional[Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"]]
            Additional configuration for the chat generation.
            "LlamaCppGenerateConfig" -> configuration for ggml model
@@ -373,6 +376,8 @@ class RESTfulChatModelHandle(RESTfulGenerateModelHandle):
             "model": self._model_uid,
             "messages": chat_history,
         }
+        if tools is not None:
+            request_body["tools"] = tools
         if generate_config is not None:
             for key, value in generate_config.items():
                 request_body[key] = value
@@ -397,6 +402,7 @@ class RESTfulChatglmCppChatModelHandle(RESTfulEmbeddingModelHandle):
         self,
         prompt: str,
         chat_history: Optional[List["ChatCompletionMessage"]] = None,
+        tools: Optional[List[Dict]] = None,
         generate_config: Optional["ChatglmCppGenerateConfig"] = None,
     ) -> Union["ChatCompletion", Iterator["ChatCompletionChunk"]]:
         """
@@ -408,6 +414,8 @@ class RESTfulChatglmCppChatModelHandle(RESTfulEmbeddingModelHandle):
            The user's input.
         chat_history: Optional[List["ChatCompletionMessage"]]
            A list of messages comprising the conversation so far.
+        tools: Optional[List[Dict]]
+           A tool list.
         generate_config: Optional["ChatglmCppGenerateConfig"]
            Additional configuration for ChatGLM chat generation.
 
@@ -436,7 +444,8 @@ class RESTfulChatglmCppChatModelHandle(RESTfulEmbeddingModelHandle):
             "model": self._model_uid,
             "messages": chat_history,
         }
-
+        if tools is not None:
+            request_body["tools"] = tools
         if generate_config is not None:
             for key, value in generate_config.items():
                 request_body[key] = value
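
The tools argument added above is forwarded verbatim as request_body["tools"]. A hedged sketch of client-side usage; the get_model() call and the OpenAI-style tool schema are illustrative assumptions rather than part of this diff, and per the restful_api.py changes above tool calls are only accepted for chatglm3 and cannot be combined with stream=True:

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")   # placeholder endpoint
model = client.get_model("my-chatglm3-uid")       # hypothetical model uid

# Illustrative function-style tool definition (assumed schema, not defined in this diff).
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Look up the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

# tools ends up in request_body["tools"]; keep streaming off when passing tools.
completion = model.chat("What is the weather in Paris?", tools=tools)
print(completion["choices"][0]["message"])
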
xinference/conftest.py CHANGED
@@ -184,7 +184,7 @@ def setup():
     local_cluster_proc = run_test_cluster_in_subprocess(
         supervisor_addr, TEST_LOGGING_CONF
     )
-    if not cluster_health_check(supervisor_addr, max_attempts=3, sleep_interval=3):
+    if not cluster_health_check(supervisor_addr, max_attempts=10, sleep_interval=3):
         raise RuntimeError("Cluster is not available after multiple attempts")
 
     port = xo.utils.get_next_port()
@@ -195,7 +195,7 @@ def setup():
         logging_conf=TEST_LOGGING_CONF,
     )
     endpoint = f"http://localhost:{port}"
-    if not api_health_check(endpoint, max_attempts=3, sleep_interval=5):
+    if not api_health_check(endpoint, max_attempts=10, sleep_interval=5):
         raise RuntimeError("Endpoint is not available after multiple attempts")
 
     yield f"http://localhost:{port}", supervisor_addr
xinference/constants.py CHANGED
@@ -18,6 +18,9 @@ from pathlib import Path
 XINFERENCE_ENV_ENDPOINT = "XINFERENCE_ENDPOINT"
 XINFERENCE_ENV_MODEL_SRC = "XINFERENCE_MODEL_SRC"
 XINFERENCE_ENV_HOME_PATH = "XINFERENCE_HOME"
+XINFERENCE_ENV_HEALTH_CHECK_ATTEMPTS = "XINFERENCE_HEALTH_CHECK_ATTEMPTS"
+XINFERENCE_ENV_HEALTH_CHECK_INTERVAL = "XINFERENCE_HEALTH_CHECK_INTERVAL"
+XINFERENCE_ENV_DISABLE_VLLM = "XINFERENCE_DISABLE_VLLM"
 
 
 def get_xinference_home():
@@ -36,3 +39,10 @@ XINFERENCE_DEFAULT_ENDPOINT_PORT = 9997
 XINFERENCE_DEFAULT_LOG_FILE_NAME = "xinference.log"
 XINFERENCE_LOG_MAX_BYTES = 100 * 1024 * 1024
 XINFERENCE_LOG_BACKUP_COUNT = 30
+XINFERENCE_HEALTH_CHECK_ATTEMPTS = int(
+    os.environ.get(XINFERENCE_ENV_HEALTH_CHECK_ATTEMPTS, 3)
+)
+XINFERENCE_HEALTH_CHECK_INTERVAL = int(
+    os.environ.get(XINFERENCE_ENV_HEALTH_CHECK_INTERVAL, 3)
+)
+XINFERENCE_DISABLE_VLLM = bool(int(os.environ.get(XINFERENCE_ENV_DISABLE_VLLM, 0)))
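
The three new constants are resolved from the environment once, when xinference.constants is imported, so they must be set before the first import. A small usage sketch with arbitrary values:

import os

# Set before the first import of xinference.constants; the module reads the
# environment at import time.
os.environ["XINFERENCE_HEALTH_CHECK_ATTEMPTS"] = "10"  # default 3
os.environ["XINFERENCE_HEALTH_CHECK_INTERVAL"] = "5"   # default 3, used as sleep_interval
os.environ["XINFERENCE_DISABLE_VLLM"] = "1"            # "1" -> True; per its name, opts out of vLLM

from xinference.constants import (
    XINFERENCE_DISABLE_VLLM,
    XINFERENCE_HEALTH_CHECK_ATTEMPTS,
    XINFERENCE_HEALTH_CHECK_INTERVAL,
)

assert XINFERENCE_HEALTH_CHECK_ATTEMPTS == 10
assert XINFERENCE_HEALTH_CHECK_INTERVAL == 5
assert XINFERENCE_DISABLE_VLLM is True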
xinference/core/model.py CHANGED
@@ -14,6 +14,7 @@
 
 import asyncio
 import inspect
+import os
 import uuid
 from typing import (
     TYPE_CHECKING,
@@ -44,6 +45,15 @@ from .utils import json_dumps, log_async
 
 T = TypeVar("T")
 
+try:
+    from torch.cuda import OutOfMemoryError
+except ImportError:
+
+    class _OutOfMemoryError(Exception):
+        pass
+
+    OutOfMemoryError = _OutOfMemoryError
+
 
 def request_limit(fn):
     """
@@ -192,18 +202,30 @@ class ModelActor(xo.StatelessActor):
         return ret
 
     async def _call_wrapper(self, _wrapper: Callable):
-        assert not (
-            inspect.iscoroutinefunction(_wrapper)
-            or inspect.isasyncgenfunction(_wrapper)
-        )
-        if self._lock is None:
-            return await asyncio.to_thread(_wrapper)
-        else:
-            async with self._lock:
+        try:
+            assert not (
+                inspect.iscoroutinefunction(_wrapper)
+                or inspect.isasyncgenfunction(_wrapper)
+            )
+            if self._lock is None:
                 return await asyncio.to_thread(_wrapper)
+            else:
+                async with self._lock:
+                    return await asyncio.to_thread(_wrapper)
+        except OutOfMemoryError:
+            logger.exception(
+                "Model actor is out of memory, model id: %s", self.model_uid()
+            )
+            os._exit(1)
 
     async def _call_async_wrapper(self, _wrapper: Callable):
-        return await asyncio.create_task(_wrapper())
+        try:
+            return await asyncio.create_task(_wrapper())
+        except OutOfMemoryError:
+            logger.exception(
+                "Model actor is out of memory, model id: %s", self.model_uid()
+            )
+            os._exit(1)
 
     @log_async(logger=logger)
     @request_limit
@@ -365,7 +387,8 @@ class ModelActor(xo.StatelessActor):
 
         async def _async_wrapper():
             try:
-                return await anext(gen)  # noqa: F821
+                # anext is only available for Python >= 3.10
+                return await gen.__anext__()  # noqa: F821
             except StopAsyncIteration:
                 return stop
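
The anext() builtin only exists on Python 3.10 and later, which is why the change above calls the async generator's __anext__() method directly. A tiny standalone illustration of the equivalence:

import asyncio

async def numbers():
    yield 1
    yield 2

async def main():
    gen = numbers()
    # Portable spelling of `await anext(gen)` that also works on Python < 3.10.
    print(await gen.__anext__())  # -> 1
    print(await gen.__anext__())  # -> 2

asyncio.run(main())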
 
xinference/core/resource.py CHANGED
@@ -15,7 +15,7 @@
 from dataclasses import dataclass
 from typing import Dict
 
-from xorbits._mars import resource
+import psutil
 
 
 @dataclass
@@ -28,19 +28,20 @@ class ResourceStatus:
 
 def gather_node_info() -> Dict[str, ResourceStatus]:
     node_resource = dict()
-    mem_info = resource.virtual_memory()
+    mem_info = psutil.virtual_memory()
     node_resource["cpu"] = ResourceStatus(
-        available=resource.cpu_percent() / 100.0,
-        total=resource.cpu_count(),
+        available=psutil.cpu_percent() / 100.0,
+        total=psutil.cpu_count(),
         memory_available=mem_info.available,
         memory_total=mem_info.total,
     )
-    for idx, gpu_card_stat in enumerate(resource.cuda_card_stats()):
-        node_resource[f"gpu-{idx}"] = ResourceStatus(
-            available=gpu_card_stat.gpu_usage / 100.0,
-            total=1,
-            memory_available=gpu_card_stat.fb_mem_info.available,
-            memory_total=gpu_card_stat.fb_mem_info.total,
-        )
+    # TODO: record GPU stats
+    # for idx, gpu_card_stat in enumerate(resource.cuda_card_stats()):
+    #     node_resource[f"gpu-{idx}"] = ResourceStatus(
+    #         available=gpu_card_stat.gpu_usage / 100.0,
+    #         total=1,
+    #         memory_available=gpu_card_stat.fb_mem_info.available,
+    #         memory_total=gpu_card_stat.fb_mem_info.total,
+    #     )
 
     return node_resource
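
With the switch from xorbits._mars.resource to psutil, the CPU entry reported by gather_node_info() is assembled from the calls below (GPU stats are left as a TODO in this release). A minimal sketch of the same psutil calls, shown as a plain dict instead of the ResourceStatus dataclass:

import psutil

mem_info = psutil.virtual_memory()
cpu_entry = {
    "available": psutil.cpu_percent() / 100.0,  # same value gather_node_info() stores
    "total": psutil.cpu_count(),
    "memory_available": mem_info.available,     # bytes
    "memory_total": mem_info.total,             # bytes
}
print(cpu_entry)
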
xinference/core/supervisor.py CHANGED
@@ -85,10 +85,11 @@
             register_embedding,
             unregister_embedding,
         )
-        from ..model.llm import LLMFamilyV1, register_llm, unregister_llm
+        from ..model.llm import register_llm, unregister_llm
+        from ..model.llm.llm_family import CustomLLMFamilyV1
 
         self._custom_register_type_to_cls: Dict[str, Tuple] = {
-            "LLM": (LLMFamilyV1, register_llm, unregister_llm),
+            "LLM": (CustomLLMFamilyV1, register_llm, unregister_llm),
             "embedding": (
                 CustomEmbeddingModelSpec,
                 register_embedding,
@@ -96,6 +97,25 @@
             ),
         }
 
+    @staticmethod
+    async def get_builtin_prompts() -> Dict[str, Any]:
+        from ..model.llm.llm_family import BUILTIN_LLM_PROMPT_STYLE
+
+        data = {}
+        for k, v in BUILTIN_LLM_PROMPT_STYLE.items():
+            data[k] = v.dict()
+        return data
+
+    async def get_devices_count(self) -> int:
+        from ..utils import cuda_count
+
+        if self.is_local_deployment():
+            return cuda_count()
+        # distributed deployment, choose a worker and return its cuda_count.
+        # Assume that each worker has the same count of cards.
+        worker_ref = await self._choose_worker()
+        return await worker_ref.get_devices_count()
+
     async def _choose_worker(self) -> xo.ActorRefType["WorkerActor"]:
         # TODO: better allocation strategy.
         min_running_model_count = None
xinference/core/worker.py CHANGED
@@ -20,12 +20,12 @@ from logging import getLogger
 from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 import xoscar as xo
-from xorbits._mars.resource import cuda_count
 from xoscar import MainActorPoolType
 
 from ..constants import XINFERENCE_CACHE_DIR
 from ..core import ModelActor
 from ..model.core import ModelDescription, create_model_instance
+from ..utils import cuda_count
 from .resource import gather_node_info
 from .utils import log_async, log_sync, parse_replica_model_uid, purge_dir
 
@@ -48,6 +48,7 @@ class WorkerActor(xo.StatelessActor):
         self._supervisor_address = supervisor_address
         self._supervisor_ref = None
         self._main_pool = main_pool
+        self._main_pool.recover_sub_pool = self.recover_sub_pool
 
         # internal states.
         self._model_uid_to_model: Dict[str, xo.ActorRefType["ModelActor"]] = {}
@@ -55,9 +56,22 @@ class WorkerActor(xo.StatelessActor):
         self._gpu_to_model_uid: Dict[int, str] = {}
         self._gpu_to_embedding_model_uids: Dict[int, Set[str]] = defaultdict(set)
         self._model_uid_to_addr: Dict[str, str] = {}
+        self._model_uid_to_launch_args: Dict[str, Dict] = {}
 
         self._lock = asyncio.Lock()
 
+    async def recover_sub_pool(self, address):
+        logger.warning("Process %s is down, create model.", address)
+        for model_uid, addr in self._model_uid_to_addr.items():
+            if addr == address:
+                launch_args = self._model_uid_to_launch_args.get(model_uid)
+                try:
+                    await self.terminate_model(model_uid)
+                except Exception:
+                    pass
+                await self.launch_builtin_model(**launch_args)
+                break
+
     @classmethod
     def uid(cls) -> str:
         return "worker"
@@ -79,10 +93,11 @@ class WorkerActor(xo.StatelessActor):
             register_embedding,
             unregister_embedding,
         )
-        from ..model.llm import LLMFamilyV1, register_llm, unregister_llm
+        from ..model.llm import register_llm, unregister_llm
+        from ..model.llm.llm_family import CustomLLMFamilyV1
 
         self._custom_register_type_to_cls: Dict[str, Tuple] = {
-            "LLM": (LLMFamilyV1, register_llm, unregister_llm),
+            "LLM": (CustomLLMFamilyV1, register_llm, unregister_llm),
             "embedding": (
                 CustomEmbeddingModelSpec,
                 register_embedding,
@@ -93,6 +108,12 @@ class WorkerActor(xo.StatelessActor):
     async def __pre_destroy__(self):
         self._upload_task.cancel()
 
+    @staticmethod
+    def get_devices_count():
+        from ..utils import cuda_count
+
+        return cuda_count()
+
     @log_sync(logger=logger)
     def get_model_count(self) -> int:
         return len(self._model_uid_to_model)
@@ -174,7 +195,7 @@ class WorkerActor(xo.StatelessActor):
         gpu_cnt = n_gpu if isinstance(n_gpu, int) else 1
         devices = (
             [await self.allocate_devices_for_embedding(model_uid)]
-            if model_type == "embedding"
+            if model_type in ["embedding", "rerank"]
             else self.allocate_devices(model_uid=model_uid, n_gpu=gpu_cnt)
         )
         env["CUDA_VISIBLE_DEVICES"] = ",".join([str(dev) for dev in devices])
@@ -277,7 +298,6 @@ class WorkerActor(xo.StatelessActor):
         for dev in devices:
             self._gpu_to_model_uid[int(dev)] = model_uid
         self._model_uid_to_addr[model_uid] = subpool_address
-        return model_ref
 
     @log_async(logger=logger)
     async def launch_builtin_model(
@@ -291,7 +311,9 @@ class WorkerActor(xo.StatelessActor):
         n_gpu: Optional[Union[int, str]] = "auto",
         request_limits: Optional[int] = None,
         **kwargs,
-    ) -> xo.ActorRefType["ModelActor"]:
+    ):
+        launch_args = locals()
+        launch_args.pop("self")
         if n_gpu is not None:
             if isinstance(n_gpu, int) and (n_gpu <= 0 or n_gpu > cuda_count()):
                 raise ValueError(
@@ -342,7 +364,7 @@ class WorkerActor(xo.StatelessActor):
         self._model_uid_to_model[model_uid] = model_ref
         self._model_uid_to_model_spec[model_uid] = model_description
         self._model_uid_to_addr[model_uid] = subpool_address
-        return model_ref
+        self._model_uid_to_launch_args[model_uid] = launch_args
 
     @log_async(logger=logger)
     async def terminate_model(self, model_uid: str):
@@ -350,15 +372,21 @@ class WorkerActor(xo.StatelessActor):
         if model_ref is None:
             raise ValueError(f"Model not found in the model list, uid: {model_uid}")
 
-        await xo.destroy_actor(model_ref)
-        del self._model_uid_to_model[model_uid]
-        del self._model_uid_to_model_spec[model_uid]
-
-        self.release_devices(model_uid)
-
-        subpool_address = self._model_uid_to_addr[model_uid]
-        await self._main_pool.remove_sub_pool(subpool_address)
-        del self._model_uid_to_addr[model_uid]
+        try:
+            await xo.destroy_actor(model_ref)
+        except Exception as e:
+            logger.debug(
+                "Destroy model actor failed, model uid: %s, error: %s", model_uid, e
+            )
+        try:
+            subpool_address = self._model_uid_to_addr[model_uid]
+            await self._main_pool.remove_sub_pool(subpool_address)
+        finally:
+            del self._model_uid_to_model[model_uid]
+            del self._model_uid_to_model_spec[model_uid]
+            self.release_devices(model_uid)
+            del self._model_uid_to_addr[model_uid]
+            del self._model_uid_to_launch_args[model_uid]
 
     @log_async(logger=logger)
     async def list_models(self) -> Dict[str, Dict[str, Any]]:
xinference/deploy/cmdline.py CHANGED
@@ -189,13 +189,14 @@ def local(
     "-p",
     default=XINFERENCE_DEFAULT_ENDPOINT_PORT,
     type=int,
-    help="Specify the port number for the supervisor.",
+    help="Specify the port number for the Xinference web ui and service.",
 )
-def supervisor(
-    log_level: str,
-    host: str,
-    port: int,
-):
+@click.option(
+    "--supervisor-port",
+    type=int,
+    help="Specify the port number for the Xinference supervisor.",
+)
+def supervisor(log_level: str, host: str, port: int, supervisor_port: Optional[int]):
     from ..deploy.supervisor import main
 
     dict_config = get_config_dict(
@@ -206,7 +207,9 @@ def supervisor(
     )
     logging.config.dictConfig(dict_config)  # type: ignore
 
-    main(host=host, port=port, logging_conf=dict_config)
+    main(
+        host=host, port=port, supervisor_port=supervisor_port, logging_conf=dict_config
+    )
 
 
 @click.command(
@@ -227,7 +230,14 @@ def supervisor(
     type=str,
     help="Specify the host address for the worker.",
 )
-def worker(log_level: str, endpoint: Optional[str], host: str):
+@click.option(
+    "--worker-port",
+    type=int,
+    help="Specify the port number for the Xinference worker.",
+)
+def worker(
+    log_level: str, endpoint: Optional[str], host: str, worker_port: Optional[int]
+):
     from ..deploy.worker import main
 
     dict_config = get_config_dict(
@@ -243,7 +253,7 @@ def worker(log_level: str, endpoint: Optional[str], host: str):
     client = RESTfulClient(base_url=endpoint)
     supervisor_internal_addr = client._get_supervisor_internal_address()
 
-    address = f"{host}:{get_next_port()}"
+    address = f"{host}:{worker_port or get_next_port()}"
     main(
         address=address,
         supervisor_address=supervisor_internal_addr,
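
The new --worker-port option pins the worker's xoscar port instead of taking a random one from get_next_port(), which helps behind strict firewalls. A hedged sketch of launching a worker with a fixed port from Python; the xinference-worker console script name and the --endpoint/--host flag spellings are assumptions based on the package's entry points, and the addresses are placeholders:

import subprocess

subprocess.run(
    [
        "xinference-worker",                          # assumed console script name
        "--endpoint", "http://supervisor-host:9997",  # placeholder supervisor endpoint
        "--host", "10.0.0.5",                         # placeholder worker host
        "--worker-port", "30001",                     # fixed port instead of get_next_port()
    ],
    check=True,
)
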
xinference/deploy/local.py CHANGED
@@ -22,6 +22,10 @@ from typing import Dict, Optional
 import xoscar as xo
 from xoscar.utils import get_next_port
 
+from ..constants import (
+    XINFERENCE_HEALTH_CHECK_ATTEMPTS,
+    XINFERENCE_HEALTH_CHECK_INTERVAL,
+)
 from ..core.supervisor import SupervisorActor
 from .utils import health_check
 from .worker import start_worker_components
@@ -79,7 +83,11 @@ def main(host: str, port: int, logging_conf: Optional[Dict] = None):
     supervisor_address = f"{host}:{get_next_port()}"
     local_cluster = run_in_subprocess(supervisor_address, logging_conf)
 
-    if not health_check(address=supervisor_address, max_attempts=3, sleep_interval=3):
+    if not health_check(
+        address=supervisor_address,
+        max_attempts=XINFERENCE_HEALTH_CHECK_ATTEMPTS,
+        sleep_interval=XINFERENCE_HEALTH_CHECK_INTERVAL,
+    ):
         raise RuntimeError("Cluster is not available after multiple attempts")
 
     try: