xinference 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +49 -62
- xinference/client/restful/restful_client.py +10 -1
- xinference/conftest.py +2 -2
- xinference/constants.py +10 -0
- xinference/core/model.py +33 -10
- xinference/core/resource.py +12 -11
- xinference/core/supervisor.py +22 -2
- xinference/core/worker.py +44 -16
- xinference/deploy/cmdline.py +19 -9
- xinference/deploy/local.py +9 -1
- xinference/deploy/supervisor.py +16 -3
- xinference/deploy/utils.py +1 -0
- xinference/deploy/worker.py +1 -1
- xinference/model/embedding/__init__.py +10 -0
- xinference/model/embedding/core.py +3 -0
- xinference/model/embedding/custom.py +5 -4
- xinference/model/embedding/model_spec.json +16 -0
- xinference/model/embedding/model_spec_modelscope.json +16 -0
- xinference/model/llm/__init__.py +22 -2
- xinference/model/llm/core.py +2 -2
- xinference/model/llm/ggml/chatglm.py +79 -15
- xinference/model/llm/ggml/llamacpp.py +2 -2
- xinference/model/llm/llm_family.json +99 -4
- xinference/model/llm/llm_family.py +54 -8
- xinference/model/llm/llm_family_modelscope.json +81 -2
- xinference/model/llm/pytorch/chatglm.py +95 -2
- xinference/model/llm/utils.py +12 -8
- xinference/model/llm/vllm/core.py +26 -5
- xinference/model/utils.py +25 -0
- xinference/types.py +64 -5
- xinference/utils.py +20 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.8126d441.js +3 -0
- xinference/web/ui/build/static/js/main.8126d441.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/06eb9846159adb398d44df0b0debc256a9fd9e8171a7d68f5c4ee4d655acfa45.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0cccfbe5d963b8e31eb679f9d9677392839cedd04aa2956ac6b33cf19599d597.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2f651cf60b1bde50c0601c7110f77dd44819fb6e2501ff748a631724d91445d4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3bda436576ecb05f81f7b6ec475d1cfaf03e2b3066e3a75902fe6e8c4773b43b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/47887a9524ffeecdc2a7839dace146b24f97a5564fc3d431d6179ad2b153cf1f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/48878f5178bad1a47757e011af41c974a7946efa29485506c4d19f25bf5d522d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/59574eb63cfe9ed2e58d2f5a420e1ae54354e243a602e9bc73deae3147ed4f98.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6a60ae66b29c2f3634fd081d369b9e63b4522fe18eb9e43e9979d1ff264b68ad.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/75a5abcbc92da335fdde530f5689194ec79a4b2345b8cba594f8904d3b88e3c6.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/84bfe7afede38da1f8ad569d891276fe4d66cfb87bf5c9ff7a113788ba62bb88.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/940ed05006583b955894e2b8f65a4a5ebf34f8149d747f59fae5131f17d65482.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9c5f03db9aa88582a9b69b25c7f1acc78ba7fc61f743c9ed7399abb292d5dbde.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a5e2e9f707eb7039bea096ca117d996b8f9cbc2a5613fd8e0c5b0094444ce23c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c02e70e9b9efcf3bd056606308104308d6a6ac559f2bc0b4454c11fb5874457c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e26750d9556e9741912333349e4da454c53dbfddbfc6002ab49518dcf02af745.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e610aefd7000a3f8542a25cb66c64671cc8da18350de4e5b577102ba4bb78d65.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ef42ec014d7bc373b874b2a1ff0dcd785490f125e913698bc049b0bd778e4d66.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/fe3eb4d76c79ca98833f686d642224eeeb94cc83ad14300d281623796d087f0a.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +1077 -405
- xinference/web/ui/node_modules/@eslint/eslintrc/node_modules/globals/globals.json +163 -3
- xinference/web/ui/node_modules/@eslint/eslintrc/node_modules/globals/package.json +1 -1
- xinference/web/ui/node_modules/@eslint/eslintrc/node_modules/ignore/package.json +64 -0
- xinference/web/ui/node_modules/@eslint/eslintrc/package.json +18 -37
- xinference/web/ui/node_modules/@eslint/js/package.json +1 -1
- xinference/web/ui/node_modules/@eslint-community/regexpp/package.json +9 -4
- xinference/web/ui/node_modules/@humanwhocodes/config-array/package.json +14 -14
- xinference/web/ui/node_modules/@rushstack/eslint-patch/package.json +6 -4
- xinference/web/ui/node_modules/@types/semver/package.json +15 -15
- xinference/web/ui/node_modules/@ungap/structured-clone/cjs/package.json +1 -0
- xinference/web/ui/node_modules/@ungap/structured-clone/package.json +53 -0
- xinference/web/ui/node_modules/ansi-colors/package.json +129 -0
- xinference/web/ui/node_modules/array-includes/package.json +8 -8
- xinference/web/ui/node_modules/array.prototype.findlastindex/package.json +120 -0
- xinference/web/ui/node_modules/array.prototype.flat/package.json +8 -8
- xinference/web/ui/node_modules/array.prototype.flatmap/package.json +8 -8
- xinference/web/ui/node_modules/arraybuffer.prototype.slice/package.json +103 -0
- xinference/web/ui/node_modules/ast-types-flow/package.json +2 -2
- xinference/web/ui/node_modules/astral-regex/package.json +33 -0
- xinference/web/ui/node_modules/asynciterator.prototype/package.json +72 -0
- xinference/web/ui/node_modules/axe-core/locales/_template.json +0 -12
- xinference/web/ui/node_modules/axe-core/package.json +1 -2
- xinference/web/ui/node_modules/axe-core/sri-history.json +0 -8
- xinference/web/ui/node_modules/call-bind/package.json +33 -23
- xinference/web/ui/node_modules/define-data-property/package.json +113 -0
- xinference/web/ui/node_modules/define-data-property/tsconfig.json +59 -0
- xinference/web/ui/node_modules/define-properties/package.json +5 -4
- xinference/web/ui/node_modules/enquirer/package.json +112 -0
- xinference/web/ui/node_modules/es-abstract/helpers/caseFolding.json +1430 -0
- xinference/web/ui/node_modules/es-abstract/package.json +29 -23
- xinference/web/ui/node_modules/es-iterator-helpers/index.json +17 -0
- xinference/web/ui/node_modules/es-iterator-helpers/package.json +185 -0
- xinference/web/ui/node_modules/eslint/conf/{rule-type-list.json → category-list.json} +9 -6
- xinference/web/ui/node_modules/eslint/node_modules/@babel/code-frame/package.json +25 -0
- xinference/web/ui/node_modules/eslint/node_modules/eslint-visitor-keys/lib/visitor-keys.json +289 -0
- xinference/web/ui/node_modules/eslint/node_modules/eslint-visitor-keys/package.json +39 -0
- xinference/web/ui/node_modules/eslint/node_modules/glob-parent/package.json +48 -0
- xinference/web/ui/node_modules/eslint/node_modules/ignore/package.json +64 -0
- xinference/web/ui/node_modules/eslint/package.json +53 -82
- xinference/web/ui/node_modules/eslint-config-prettier/package.json +13 -0
- xinference/web/ui/node_modules/eslint-import-resolver-node/package.json +3 -3
- xinference/web/ui/node_modules/eslint-plugin-import/package.json +22 -17
- xinference/web/ui/node_modules/eslint-plugin-jsx-a11y/package.json +25 -24
- xinference/web/ui/node_modules/eslint-plugin-simple-import-sort/package.json +23 -0
- xinference/web/ui/node_modules/eslint-plugin-testing-library/package.json +1 -1
- xinference/web/ui/node_modules/eslint-scope/package.json +19 -34
- xinference/web/ui/node_modules/eslint-utils/node_modules/eslint-visitor-keys/lib/visitor-keys.json +284 -0
- xinference/web/ui/node_modules/eslint-utils/node_modules/eslint-visitor-keys/package.json +40 -0
- xinference/web/ui/node_modules/eslint-utils/package.json +65 -0
- xinference/web/ui/node_modules/eslint-visitor-keys/package.json +15 -15
- xinference/web/ui/node_modules/espree/node_modules/acorn/package.json +35 -0
- xinference/web/ui/node_modules/espree/node_modules/eslint-visitor-keys/lib/visitor-keys.json +284 -0
- xinference/web/ui/node_modules/espree/node_modules/eslint-visitor-keys/package.json +40 -0
- xinference/web/ui/node_modules/espree/package.json +27 -51
- xinference/web/ui/node_modules/function-bind/package.json +38 -14
- xinference/web/ui/node_modules/function.prototype.name/package.json +32 -13
- xinference/web/ui/node_modules/functional-red-black-tree/package.json +40 -0
- xinference/web/ui/node_modules/get-intrinsic/package.json +11 -11
- xinference/web/ui/node_modules/hasown/package.json +91 -0
- xinference/web/ui/node_modules/hasown/tsconfig.json +49 -0
- xinference/web/ui/node_modules/is-async-function/package.json +86 -0
- xinference/web/ui/node_modules/is-core-module/core.json +3 -3
- xinference/web/ui/node_modules/is-core-module/package.json +7 -7
- xinference/web/ui/node_modules/is-finalizationregistry/package.json +67 -0
- xinference/web/ui/node_modules/is-generator-function/package.json +87 -0
- xinference/web/ui/node_modules/is-typed-array/package.json +8 -10
- xinference/web/ui/node_modules/iterator.prototype/package.json +73 -0
- xinference/web/ui/node_modules/jsx-ast-utils/package.json +5 -5
- xinference/web/ui/node_modules/language-tags/package.json +48 -8
- xinference/web/ui/node_modules/lodash.truncate/package.json +17 -0
- xinference/web/ui/node_modules/object-inspect/package.json +8 -6
- xinference/web/ui/node_modules/object.entries/package.json +7 -7
- xinference/web/ui/node_modules/object.fromentries/package.json +7 -7
- xinference/web/ui/node_modules/object.groupby/package.json +83 -0
- xinference/web/ui/node_modules/object.values/package.json +7 -7
- xinference/web/ui/node_modules/prettier/package.json +21 -0
- xinference/web/ui/node_modules/progress/package.json +26 -0
- xinference/web/ui/node_modules/react-scripts/node_modules/@eslint/eslintrc/package.json +82 -0
- xinference/web/ui/node_modules/react-scripts/node_modules/@humanwhocodes/config-array/package.json +61 -0
- xinference/web/ui/node_modules/react-scripts/node_modules/@humanwhocodes/object-schema/package.json +33 -0
- xinference/web/ui/node_modules/react-scripts/node_modules/ansi-styles/package.json +56 -0
- xinference/web/ui/node_modules/react-scripts/node_modules/chalk/package.json +68 -0
- xinference/web/ui/node_modules/react-scripts/node_modules/color-convert/package.json +48 -0
- xinference/web/ui/node_modules/react-scripts/node_modules/color-name/package.json +28 -0
- xinference/web/ui/node_modules/react-scripts/node_modules/escape-string-regexp/package.json +38 -0
- xinference/web/ui/node_modules/react-scripts/node_modules/eslint/conf/replacements.json +22 -0
- xinference/web/ui/node_modules/react-scripts/node_modules/eslint/conf/rule-type-list.json +28 -0
- xinference/web/ui/node_modules/react-scripts/node_modules/eslint/package.json +179 -0
- xinference/web/ui/node_modules/react-scripts/node_modules/eslint-scope/package.json +63 -0
- xinference/web/ui/node_modules/react-scripts/node_modules/espree/package.json +88 -0
- xinference/web/ui/node_modules/react-scripts/node_modules/globals/globals.json +1974 -0
- xinference/web/ui/node_modules/react-scripts/node_modules/globals/package.json +55 -0
- xinference/web/ui/node_modules/react-scripts/node_modules/has-flag/package.json +46 -0
- xinference/web/ui/node_modules/react-scripts/node_modules/supports-color/package.json +53 -0
- xinference/web/ui/node_modules/react-scripts/node_modules/type-fest/package.json +58 -0
- xinference/web/ui/node_modules/reflect.getprototypeof/package.json +99 -0
- xinference/web/ui/node_modules/regexp.prototype.flags/package.json +8 -7
- xinference/web/ui/node_modules/regexpp/package.json +91 -0
- xinference/web/ui/node_modules/resolve/lib/core.json +4 -1
- xinference/web/ui/node_modules/resolve/package.json +9 -8
- xinference/web/ui/node_modules/resolve/test/resolver/multirepo/package.json +1 -1
- xinference/web/ui/node_modules/safe-array-concat/package.json +5 -5
- xinference/web/ui/node_modules/set-function-length/package.json +84 -0
- xinference/web/ui/node_modules/set-function-name/package.json +80 -0
- xinference/web/ui/node_modules/slice-ansi/node_modules/ansi-styles/package.json +56 -0
- xinference/web/ui/node_modules/slice-ansi/node_modules/color-convert/package.json +48 -0
- xinference/web/ui/node_modules/slice-ansi/node_modules/color-name/package.json +28 -0
- xinference/web/ui/node_modules/slice-ansi/package.json +52 -0
- xinference/web/ui/node_modules/string.prototype.trim/package.json +7 -7
- xinference/web/ui/node_modules/string.prototype.trimend/package.json +7 -7
- xinference/web/ui/node_modules/string.prototype.trimstart/package.json +7 -7
- xinference/web/ui/node_modules/table/dist/src/schemas/config.json +95 -0
- xinference/web/ui/node_modules/table/dist/src/schemas/shared.json +139 -0
- xinference/web/ui/node_modules/table/dist/src/schemas/streamConfig.json +25 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/data.json +13 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2019-09/meta/applicator.json +53 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2019-09/meta/content.json +17 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2019-09/meta/core.json +57 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2019-09/meta/format.json +14 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2019-09/meta/meta-data.json +37 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2019-09/meta/validation.json +90 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2019-09/schema.json +39 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2020-12/meta/applicator.json +48 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2020-12/meta/content.json +17 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2020-12/meta/core.json +51 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2020-12/meta/format-annotation.json +14 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2020-12/meta/meta-data.json +37 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2020-12/meta/unevaluated.json +15 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2020-12/meta/validation.json +90 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-2020-12/schema.json +55 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-draft-06.json +137 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-draft-07.json +151 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/dist/refs/json-schema-secure.json +88 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/data.json +13 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2019-09/meta/applicator.json +53 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2019-09/meta/content.json +17 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2019-09/meta/core.json +57 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2019-09/meta/format.json +14 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2019-09/meta/meta-data.json +37 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2019-09/meta/validation.json +90 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2019-09/schema.json +39 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2020-12/meta/applicator.json +48 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2020-12/meta/content.json +17 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2020-12/meta/core.json +51 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2020-12/meta/format-annotation.json +14 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2020-12/meta/meta-data.json +37 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2020-12/meta/unevaluated.json +15 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2020-12/meta/validation.json +90 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-2020-12/schema.json +55 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-draft-06.json +137 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-draft-07.json +151 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/lib/refs/json-schema-secure.json +88 -0
- xinference/web/ui/node_modules/table/node_modules/ajv/package.json +126 -0
- xinference/web/ui/node_modules/table/node_modules/json-schema-traverse/package.json +43 -0
- xinference/web/ui/node_modules/table/package.json +77 -0
- xinference/web/ui/node_modules/typed-array-buffer/package.json +73 -0
- xinference/web/ui/node_modules/typed-array-byte-length/package.json +98 -0
- xinference/web/ui/node_modules/v8-compile-cache/package.json +34 -0
- xinference/web/ui/node_modules/which-builtin-type/package.json +93 -0
- xinference/web/ui/node_modules/which-typed-array/package.json +4 -5
- xinference/web/ui/package-lock.json +1085 -406
- xinference/web/ui/package.json +10 -2
- {xinference-0.6.4.dist-info → xinference-0.7.0.dist-info}/METADATA +53 -36
- {xinference-0.6.4.dist-info → xinference-0.7.0.dist-info}/RECORD +232 -124
- {xinference-0.6.4.dist-info → xinference-0.7.0.dist-info}/WHEEL +1 -1
- xinference/web/ui/build/static/js/main.8ae3b6d9.js +0 -3
- xinference/web/ui/build/static/js/main.8ae3b6d9.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/06363becf51869c421a8b3e34b4e3f50aa0aac3d590446044f9412e379f4ebbe.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2849edddeb99a8ecdda577e810eead74b8f8a291cdfbd987839d604666ed79d0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2c774712d327cdf0b192aaa22785ec380e9427c587350c33289828d99e9c4abc.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/34c578e50d3040519ca8dc28bf0f7fec8674c2d6c0fcc3e98401c0a3f9f013cf.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5933910e7c33febbabc0297ef7ba80f5e53ed96aa125b6a44ff2910aec29ced1.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5e18a8354ea03d22a967fd8cb2171aa798edcb3da5d66ab1fd3b9663affd0abe.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/717cd7c186ace4812d1e602bdd299d8dc507f072670cc43974d53aac2574df5d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/82dd896a6674286c48c1ab9f9147dd6e542dccd99848d5b3133a38efba8bd7ee.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a178cfde289ffd15fd54b1c80fd9d231ae0f9644db33acb02084e69b32bfee37.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/adaec65f73accce3171b51b0fbcbfd8d0cd83f81a2e1b28eb34148644875499a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ae8f44c77c2e6f79680fe32fb00174183cd867093ebbda967b8985c33cc10fa2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b10bd04b4d6e28bfcaaaab37b0a4c1986e87a5b7e62e5ce4d56019880ef26990.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/cfc5da1cedee985a556e04865affccb72d0f624cbfb73da348bbe8693e8a4983.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/eebd0123c4b4396737e56b9181406a9fd76b107dd32971d23b0de99f51dd38d6.json +0 -1
- xinference/web/ui/node_modules/@nicolo-ribaudo/eslint-scope-5-internals/node_modules/eslint-scope/package.json +0 -48
- xinference/web/ui/node_modules/@typescript-eslint/utils/node_modules/eslint-scope/package.json +0 -48
- xinference/web/ui/node_modules/@typescript-eslint/utils/node_modules/estraverse/package.json +0 -40
- xinference/web/ui/node_modules/eslint/node_modules/argparse/package.json +0 -31
- xinference/web/ui/node_modules/eslint/node_modules/js-yaml/package.json +0 -66
- xinference/web/ui/node_modules/eslint-plugin-jsx-a11y/node_modules/semver/package.json +0 -38
- xinference/web/ui/node_modules/function-bind/.jscs.json +0 -176
- xinference/web/ui/node_modules/resolve/test/resolver/malformed_package_json/package.json +0 -1
- xinference/web/ui/node_modules/webpack/node_modules/eslint-scope/package.json +0 -48
- xinference/web/ui/node_modules/webpack/node_modules/estraverse/package.json +0 -40
- /xinference/web/ui/build/static/js/{main.8ae3b6d9.js.LICENSE.txt → main.8126d441.js.LICENSE.txt} +0 -0
- /xinference/web/ui/node_modules/{@nicolo-ribaudo/eslint-scope-5-internals → eslint-scope}/node_modules/estraverse/package.json +0 -0
- /xinference/web/ui/node_modules/{@eslint/eslintrc → react-scripts}/node_modules/argparse/package.json +0 -0
- /xinference/web/ui/node_modules/{eslint → react-scripts/node_modules/eslint}/lib/cli-engine/formatters/formatters-meta.json +0 -0
- /xinference/web/ui/node_modules/{eslint-config-react-app → react-scripts/node_modules/eslint-config-react-app}/package.json +0 -0
- /xinference/web/ui/node_modules/{eslint-plugin-flowtype → react-scripts/node_modules/eslint-plugin-flowtype}/dist/configs/recommended.json +0 -0
- /xinference/web/ui/node_modules/{eslint-plugin-flowtype → react-scripts/node_modules/eslint-plugin-flowtype}/package.json +0 -0
- /xinference/web/ui/node_modules/{@eslint/eslintrc → react-scripts}/node_modules/js-yaml/package.json +0 -0
- {xinference-0.6.4.dist-info → xinference-0.7.0.dist-info}/LICENSE +0 -0
- {xinference-0.6.4.dist-info → xinference-0.7.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.6.4.dist-info → xinference-0.7.0.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2023-
+ "date": "2023-12-08T13:45:18+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "
- "version": "0.
+ "full-revisionid": "b5a5f0a270f85e451591eba34fe615a0fc8ce4bf",
+ "version": "0.7.0"
 }
 ''' # END VERSION_JSON
 
xinference/api/restful_api.py
CHANGED
@@ -21,7 +21,7 @@ import os
 import pprint
 import sys
 import warnings
-from typing import Any,
+from typing import Any, List, Optional, Union
 
 import gradio as gr
 import xoscar as xo
@@ -43,28 +43,19 @@ from pydantic import BaseModel, Field
 from sse_starlette.sse import EventSourceResponse
 from starlette.responses import JSONResponse as StarletteJSONResponse
 from starlette.responses import RedirectResponse
-from typing_extensions import NotRequired, TypedDict
 from uvicorn import Config, Server
 from xoscar.utils import get_next_port
 
 from ..constants import XINFERENCE_DEFAULT_ENDPOINT_PORT
 from ..core.supervisor import SupervisorActor
 from ..core.utils import json_dumps
-from ..
-
-
-
-
-
-    presence_penalty_field,
-    repeat_penalty_field,
-    stop_field,
-    stream_field,
-    temperature_field,
-    top_k_field,
-    top_p_field,
+from ..types import (
+    ChatCompletion,
+    Completion,
+    CreateChatCompletion,
+    CreateCompletion,
+    ImageList,
 )
-from ..types import ChatCompletion, Completion, CreateCompletion, ImageList
 
 logger = logging.getLogger(__name__)
 
@@ -115,50 +106,6 @@ class TextToImageRequest(BaseModel):
     user: Optional[str] = None
 
 
-class ChatCompletionRequestMessage(TypedDict):
-    role: Literal["assistant", "user", "system"]
-    content: str
-    user: NotRequired[str]
-
-
-class CreateChatCompletionRequest(BaseModel):
-    messages: List[ChatCompletionRequestMessage] = Field(
-        default=[], description="A list of messages to generate completions for."
-    )
-    max_tokens: int = max_tokens_field
-    temperature: float = temperature_field
-    top_p: float = top_p_field
-    mirostat_mode: int = mirostat_mode_field
-    mirostat_tau: float = mirostat_tau_field
-    mirostat_eta: float = mirostat_eta_field
-    stop: Optional[Union[str, List[str]]] = stop_field
-    stream: bool = stream_field
-    presence_penalty: Optional[float] = presence_penalty_field
-    frequency_penalty: Optional[float] = frequency_penalty_field
-    logit_bias: Optional[Dict[str, float]] = Field(None)
-
-    model: str
-    n: Optional[int] = 1
-    user: Optional[str] = Field(None)
-
-    # llama.cpp specific parameters
-    top_k: int = top_k_field
-    repeat_penalty: Optional[float] = repeat_penalty_field
-    logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
-    grammar: Optional[str] = Field(None)
-
-    class Config:
-        schema_extra = {
-            "example": {
-                "messages": [
-                    {"role": "system", "content": "you are a helpful AI assistant"},
-                    {"role": "user", "content": "Hello!"},
-                    {"role": "assistant", "content": "Hi what can I help you?"},
-                ]
-            }
-        }
-
-
 class RegisterModelRequest(BaseModel):
     model: str
     persist: bool
@@ -208,6 +155,12 @@ class RESTfulAPI:
         )
         self._router.add_api_route("/status", self.get_status, methods=["GET"])
         self._router.add_api_route("/v1/models", self.list_models, methods=["GET"])
+        self._router.add_api_route(
+            "/v1/models/prompts", self._get_builtin_prompts, methods=["GET"]
+        )
+        self._router.add_api_route(
+            "/v1/cluster/devices", self._get_devices_count, methods=["GET"]
+        )
         self._router.add_api_route(
             "/v1/models/{model_uid}", self.describe_model, methods=["GET"]
         )
@@ -305,6 +258,9 @@ class RESTfulAPI:
             f"{pprint.pformat(invalid_routes)}"
        )
 
+        for tp in [CreateChatCompletion, CreateCompletion]:
+            logger.debug("Dump request model fields:\n%s", tp.__fields__)
+
 class SPAStaticFiles(StaticFiles):
     async def get_response(self, path: str, scope):
         response = await super().get_response(path, scope)
@@ -346,6 +302,28 @@ class RESTfulAPI:
         server = Server(config)
         server.run()
 
+    async def _get_builtin_prompts(self) -> JSONResponse:
+        """
+        For internal usage
+        """
+        try:
+            data = await (await self._get_supervisor_ref()).get_builtin_prompts()
+            return JSONResponse(content=data)
+        except Exception as e:
+            logger.error(e, exc_info=True)
+            raise HTTPException(status_code=500, detail=str(e))
+
+    async def _get_devices_count(self) -> JSONResponse:
+        """
+        For internal usage
+        """
+        try:
+            data = await (await self._get_supervisor_ref()).get_devices_count()
+            return JSONResponse(content=data)
+        except Exception as e:
+            logger.error(e, exc_info=True)
+            raise HTTPException(status_code=500, detail=str(e))
+
     async def get_status(self) -> JSONResponse:
         try:
             data = await (await self._get_supervisor_ref()).get_status()
@@ -725,7 +703,7 @@ class RESTfulAPI:
     async def create_chat_completion(
         self,
         request: Request,
-        body:
+        body: CreateChatCompletion,
     ) -> Response:
         exclude = {
             "prompt",
@@ -736,7 +714,7 @@ class RESTfulAPI:
             "logit_bias_type",
             "user",
         }
-        kwargs = body.dict(exclude=exclude)
+        kwargs = body.dict(exclude_unset=True, exclude=exclude)
 
         if body.logit_bias is not None:
             raise HTTPException(status_code=501, detail="Not implemented")
@@ -795,6 +773,7 @@ class RESTfulAPI:
         is_chatglm_ggml = desc.get(
             "model_format"
         ) == "ggmlv3" and "chatglm" in desc.get("model_name", "")
+        is_chatglm3 = "chatglm3" == desc.get("model_name", "")
 
         is_qwen = desc.get("model_format") == "ggmlv3" and "qwen" in desc.get(
             "model_name", ""
@@ -804,6 +783,14 @@ class RESTfulAPI:
             raise HTTPException(
                 status_code=400, detail="ChatGLM ggml does not have system prompt"
             )
+        if is_chatglm3 and body.tools and body.stream:
+            raise HTTPException(
+                status_code=400, detail="ChatGLM3 tool calls does not support stream"
+            )
+        if body.tools and not is_chatglm3:
+            raise HTTPException(
+                status_code=400, detail="Only ChatGLM3 support tool calls"
+            )
 
         if body.stream:
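The two routes added above are plain GET endpoints, so they can be exercised with any HTTP client once a server is running. A minimal sketch, assuming a local Xinference instance on the default port 9997; per the handlers above, the responses are a dict of prompt styles and an integer device count:

import requests

base_url = "http://127.0.0.1:9997"  # XINFERENCE_DEFAULT_ENDPOINT_PORT

# New in 0.7.0: built-in prompt styles known to the supervisor.
prompts = requests.get(f"{base_url}/v1/models/prompts").json()
print(sorted(prompts)[:5])  # a few prompt style names

# New in 0.7.0: number of CUDA devices visible to the cluster.
device_count = requests.get(f"{base_url}/v1/cluster/devices").json()
print(device_count)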
xinference/client/restful/restful_client.py
CHANGED
@@ -320,6 +320,7 @@ class RESTfulChatModelHandle(RESTfulGenerateModelHandle):
         prompt: str,
         system_prompt: Optional[str] = None,
         chat_history: Optional[List["ChatCompletionMessage"]] = None,
+        tools: Optional[List[Dict]] = None,
         generate_config: Optional[
             Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"]
         ] = None,
@@ -335,6 +336,8 @@ class RESTfulChatModelHandle(RESTfulGenerateModelHandle):
             The system context provide to Model prior to any chats.
         chat_history: Optional[List["ChatCompletionMessage"]]
             A list of messages comprising the conversation so far.
+        tools: Optional[List[Dict]]
+            A tool list.
         generate_config: Optional[Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"]]
             Additional configuration for the chat generation.
             "LlamaCppGenerateConfig" -> configuration for ggml model
@@ -373,6 +376,8 @@ class RESTfulChatModelHandle(RESTfulGenerateModelHandle):
             "model": self._model_uid,
             "messages": chat_history,
         }
+        if tools is not None:
+            request_body["tools"] = tools
         if generate_config is not None:
             for key, value in generate_config.items():
                 request_body[key] = value
@@ -397,6 +402,7 @@ class RESTfulChatglmCppChatModelHandle(RESTfulEmbeddingModelHandle):
         self,
         prompt: str,
         chat_history: Optional[List["ChatCompletionMessage"]] = None,
+        tools: Optional[List[Dict]] = None,
         generate_config: Optional["ChatglmCppGenerateConfig"] = None,
     ) -> Union["ChatCompletion", Iterator["ChatCompletionChunk"]]:
         """
@@ -408,6 +414,8 @@ class RESTfulChatglmCppChatModelHandle(RESTfulEmbeddingModelHandle):
             The user's input.
         chat_history: Optional[List["ChatCompletionMessage"]]
             A list of messages comprising the conversation so far.
+        tools: Optional[List[Dict]]
+            A tool list.
         generate_config: Optional["ChatglmCppGenerateConfig"]
             Additional configuration for ChatGLM chat generation.
 
@@ -436,7 +444,8 @@ class RESTfulChatglmCppChatModelHandle(RESTfulEmbeddingModelHandle):
             "model": self._model_uid,
             "messages": chat_history,
         }
-
+        if tools is not None:
+            request_body["tools"] = tools
         if generate_config is not None:
             for key, value in generate_config.items():
                 request_body[key] = value
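The new tools argument is simply forwarded in the request body, so client code can pass an OpenAI-style tool list; per the server-side checks above, only non-streaming chatglm3 requests accept it. A hedged sketch (endpoint, model UID, and tool schema are illustrative, not taken from this diff):

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
model = client.get_model("my-chatglm3-uid")  # a previously launched chatglm3 model

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Query the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

# tools is only honored for chatglm3 and must not be combined with stream=True.
completion = model.chat(prompt="What is the weather in Beijing?", tools=tools)
print(completion["choices"][0]["message"])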
xinference/conftest.py
CHANGED
@@ -184,7 +184,7 @@ def setup():
     local_cluster_proc = run_test_cluster_in_subprocess(
         supervisor_addr, TEST_LOGGING_CONF
     )
-    if not cluster_health_check(supervisor_addr, max_attempts=
+    if not cluster_health_check(supervisor_addr, max_attempts=10, sleep_interval=3):
         raise RuntimeError("Cluster is not available after multiple attempts")
 
     port = xo.utils.get_next_port()
@@ -195,7 +195,7 @@ def setup():
         logging_conf=TEST_LOGGING_CONF,
     )
     endpoint = f"http://localhost:{port}"
-    if not api_health_check(endpoint, max_attempts=
+    if not api_health_check(endpoint, max_attempts=10, sleep_interval=5):
         raise RuntimeError("Endpoint is not available after multiple attempts")
 
     yield f"http://localhost:{port}", supervisor_addr
xinference/constants.py
CHANGED
@@ -18,6 +18,9 @@ from pathlib import Path
 XINFERENCE_ENV_ENDPOINT = "XINFERENCE_ENDPOINT"
 XINFERENCE_ENV_MODEL_SRC = "XINFERENCE_MODEL_SRC"
 XINFERENCE_ENV_HOME_PATH = "XINFERENCE_HOME"
+XINFERENCE_ENV_HEALTH_CHECK_ATTEMPTS = "XINFERENCE_HEALTH_CHECK_ATTEMPTS"
+XINFERENCE_ENV_HEALTH_CHECK_INTERVAL = "XINFERENCE_HEALTH_CHECK_INTERVAL"
+XINFERENCE_ENV_DISABLE_VLLM = "XINFERENCE_DISABLE_VLLM"
 
 
 def get_xinference_home():
@@ -36,3 +39,10 @@ XINFERENCE_DEFAULT_ENDPOINT_PORT = 9997
 XINFERENCE_DEFAULT_LOG_FILE_NAME = "xinference.log"
 XINFERENCE_LOG_MAX_BYTES = 100 * 1024 * 1024
 XINFERENCE_LOG_BACKUP_COUNT = 30
+XINFERENCE_HEALTH_CHECK_ATTEMPTS = int(
+    os.environ.get(XINFERENCE_ENV_HEALTH_CHECK_ATTEMPTS, 3)
+)
+XINFERENCE_HEALTH_CHECK_INTERVAL = int(
+    os.environ.get(XINFERENCE_ENV_HEALTH_CHECK_INTERVAL, 3)
+)
+XINFERENCE_DISABLE_VLLM = bool(int(os.environ.get(XINFERENCE_ENV_DISABLE_VLLM, 0)))
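The new constants are resolved from environment variables at import time, so they have to be set before the xinference process starts. A minimal sketch; the values shown are illustrative overrides, not the shipped defaults of 3 attempts and a 3-second interval:

import os

# Must be set before xinference is imported or launched; constants.py reads them once.
os.environ["XINFERENCE_HEALTH_CHECK_ATTEMPTS"] = "10"  # retry the startup health check 10 times
os.environ["XINFERENCE_HEALTH_CHECK_INTERVAL"] = "5"   # wait 5 seconds between attempts
os.environ["XINFERENCE_DISABLE_VLLM"] = "1"            # opt out of the vLLM backend

from xinference import constants

print(constants.XINFERENCE_HEALTH_CHECK_ATTEMPTS)  # -> 10
print(constants.XINFERENCE_DISABLE_VLLM)           # -> True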
xinference/core/model.py
CHANGED
@@ -14,6 +14,7 @@
 
 import asyncio
 import inspect
+import os
 import uuid
 from typing import (
     TYPE_CHECKING,
@@ -44,6 +45,15 @@ from .utils import json_dumps, log_async
 
 T = TypeVar("T")
 
+try:
+    from torch.cuda import OutOfMemoryError
+except ImportError:
+
+    class _OutOfMemoryError(Exception):
+        pass
+
+    OutOfMemoryError = _OutOfMemoryError
+
 
 def request_limit(fn):
     """
@@ -192,18 +202,30 @@ class ModelActor(xo.StatelessActor):
         return ret
 
     async def _call_wrapper(self, _wrapper: Callable):
-
-
-
-
-
-
-        else:
-            async with self._lock:
+        try:
+            assert not (
+                inspect.iscoroutinefunction(_wrapper)
+                or inspect.isasyncgenfunction(_wrapper)
+            )
+            if self._lock is None:
                 return await asyncio.to_thread(_wrapper)
+            else:
+                async with self._lock:
+                    return await asyncio.to_thread(_wrapper)
+        except OutOfMemoryError:
+            logger.exception(
+                "Model actor is out of memory, model id: %s", self.model_uid()
+            )
+            os._exit(1)
 
     async def _call_async_wrapper(self, _wrapper: Callable):
-
+        try:
+            return await asyncio.create_task(_wrapper())
+        except OutOfMemoryError:
+            logger.exception(
+                "Model actor is out of memory, model id: %s", self.model_uid()
+            )
+            os._exit(1)
 
     @log_async(logger=logger)
     @request_limit
@@ -365,7 +387,8 @@ class ModelActor(xo.StatelessActor):
 
         async def _async_wrapper():
             try:
-
+                # anext is only available for Python >= 3.10
+                return await gen.__anext__()  # noqa: F821
             except StopAsyncIteration:
                 return stop
 
xinference/core/resource.py
CHANGED
@@ -15,7 +15,7 @@
 from dataclasses import dataclass
 from typing import Dict
 
-
+import psutil
 
 
 @dataclass
@@ -28,19 +28,20 @@ class ResourceStatus:
 
 def gather_node_info() -> Dict[str, ResourceStatus]:
     node_resource = dict()
-    mem_info =
+    mem_info = psutil.virtual_memory()
     node_resource["cpu"] = ResourceStatus(
-        available=
-        total=
+        available=psutil.cpu_percent() / 100.0,
+        total=psutil.cpu_count(),
         memory_available=mem_info.available,
         memory_total=mem_info.total,
     )
-
-
-
-
-
-
-
+    # TODO: record GPU stats
+    # for idx, gpu_card_stat in enumerate(resource.cuda_card_stats()):
+    #     node_resource[f"gpu-{idx}"] = ResourceStatus(
+    #         available=gpu_card_stat.gpu_usage / 100.0,
+    #         total=1,
+    #         memory_available=gpu_card_stat.fb_mem_info.available,
+    #         memory_total=gpu_card_stat.fb_mem_info.total,
+    #     )
 
     return node_resource
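The rewrite above gathers CPU and memory figures directly from psutil. A quick sketch of what those calls return (values are machine-dependent), matching how gather_node_info uses them:

import psutil

mem_info = psutil.virtual_memory()   # named tuple; .total and .available are bytes
cpu_busy = psutil.cpu_percent()      # CPU utilisation percentage since the last call
cpu_total = psutil.cpu_count()       # logical CPU count

print(cpu_busy / 100.0, cpu_total, mem_info.available, mem_info.total)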
xinference/core/supervisor.py
CHANGED
@@ -85,10 +85,11 @@ class SupervisorActor(xo.StatelessActor):
             register_embedding,
             unregister_embedding,
         )
-        from ..model.llm import
+        from ..model.llm import register_llm, unregister_llm
+        from ..model.llm.llm_family import CustomLLMFamilyV1
 
         self._custom_register_type_to_cls: Dict[str, Tuple] = {
-            "LLM": (
+            "LLM": (CustomLLMFamilyV1, register_llm, unregister_llm),
             "embedding": (
                 CustomEmbeddingModelSpec,
                 register_embedding,
@@ -96,6 +97,25 @@ class SupervisorActor(xo.StatelessActor):
             ),
         }
 
+    @staticmethod
+    async def get_builtin_prompts() -> Dict[str, Any]:
+        from ..model.llm.llm_family import BUILTIN_LLM_PROMPT_STYLE
+
+        data = {}
+        for k, v in BUILTIN_LLM_PROMPT_STYLE.items():
+            data[k] = v.dict()
+        return data
+
+    async def get_devices_count(self) -> int:
+        from ..utils import cuda_count
+
+        if self.is_local_deployment():
+            return cuda_count()
+        # distributed deployment, choose a worker and return its cuda_count.
+        # Assume that each worker has the same count of cards.
+        worker_ref = await self._choose_worker()
+        return await worker_ref.get_devices_count()
+
     async def _choose_worker(self) -> xo.ActorRefType["WorkerActor"]:
         # TODO: better allocation strategy.
         min_running_model_count = None
xinference/core/worker.py
CHANGED
@@ -20,12 +20,12 @@ from logging import getLogger
 from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 import xoscar as xo
-from xorbits._mars.resource import cuda_count
 from xoscar import MainActorPoolType
 
 from ..constants import XINFERENCE_CACHE_DIR
 from ..core import ModelActor
 from ..model.core import ModelDescription, create_model_instance
+from ..utils import cuda_count
 from .resource import gather_node_info
 from .utils import log_async, log_sync, parse_replica_model_uid, purge_dir
 
@@ -48,6 +48,7 @@ class WorkerActor(xo.StatelessActor):
         self._supervisor_address = supervisor_address
         self._supervisor_ref = None
         self._main_pool = main_pool
+        self._main_pool.recover_sub_pool = self.recover_sub_pool
 
         # internal states.
         self._model_uid_to_model: Dict[str, xo.ActorRefType["ModelActor"]] = {}
@@ -55,9 +56,22 @@ class WorkerActor(xo.StatelessActor):
         self._gpu_to_model_uid: Dict[int, str] = {}
         self._gpu_to_embedding_model_uids: Dict[int, Set[str]] = defaultdict(set)
         self._model_uid_to_addr: Dict[str, str] = {}
+        self._model_uid_to_launch_args: Dict[str, Dict] = {}
 
         self._lock = asyncio.Lock()
 
+    async def recover_sub_pool(self, address):
+        logger.warning("Process %s is down, create model.", address)
+        for model_uid, addr in self._model_uid_to_addr.items():
+            if addr == address:
+                launch_args = self._model_uid_to_launch_args.get(model_uid)
+                try:
+                    await self.terminate_model(model_uid)
+                except Exception:
+                    pass
+                await self.launch_builtin_model(**launch_args)
+                break
+
     @classmethod
     def uid(cls) -> str:
         return "worker"
@@ -79,10 +93,11 @@ class WorkerActor(xo.StatelessActor):
             register_embedding,
             unregister_embedding,
         )
-        from ..model.llm import
+        from ..model.llm import register_llm, unregister_llm
+        from ..model.llm.llm_family import CustomLLMFamilyV1
 
         self._custom_register_type_to_cls: Dict[str, Tuple] = {
-            "LLM": (
+            "LLM": (CustomLLMFamilyV1, register_llm, unregister_llm),
             "embedding": (
                 CustomEmbeddingModelSpec,
                 register_embedding,
@@ -93,6 +108,12 @@ class WorkerActor(xo.StatelessActor):
     async def __pre_destroy__(self):
         self._upload_task.cancel()
 
+    @staticmethod
+    def get_devices_count():
+        from ..utils import cuda_count
+
+        return cuda_count()
+
     @log_sync(logger=logger)
     def get_model_count(self) -> int:
         return len(self._model_uid_to_model)
@@ -174,7 +195,7 @@ class WorkerActor(xo.StatelessActor):
         gpu_cnt = n_gpu if isinstance(n_gpu, int) else 1
         devices = (
             [await self.allocate_devices_for_embedding(model_uid)]
-            if model_type
+            if model_type in ["embedding", "rerank"]
             else self.allocate_devices(model_uid=model_uid, n_gpu=gpu_cnt)
         )
         env["CUDA_VISIBLE_DEVICES"] = ",".join([str(dev) for dev in devices])
@@ -277,7 +298,6 @@ class WorkerActor(xo.StatelessActor):
         for dev in devices:
             self._gpu_to_model_uid[int(dev)] = model_uid
         self._model_uid_to_addr[model_uid] = subpool_address
-        return model_ref
 
     @log_async(logger=logger)
     async def launch_builtin_model(
@@ -291,7 +311,9 @@ class WorkerActor(xo.StatelessActor):
         n_gpu: Optional[Union[int, str]] = "auto",
         request_limits: Optional[int] = None,
         **kwargs,
-    )
+    ):
+        launch_args = locals()
+        launch_args.pop("self")
         if n_gpu is not None:
             if isinstance(n_gpu, int) and (n_gpu <= 0 or n_gpu > cuda_count()):
                 raise ValueError(
@@ -342,7 +364,7 @@ class WorkerActor(xo.StatelessActor):
         self._model_uid_to_model[model_uid] = model_ref
         self._model_uid_to_model_spec[model_uid] = model_description
         self._model_uid_to_addr[model_uid] = subpool_address
-
+        self._model_uid_to_launch_args[model_uid] = launch_args
 
     @log_async(logger=logger)
     async def terminate_model(self, model_uid: str):
@@ -350,15 +372,21 @@ class WorkerActor(xo.StatelessActor):
         if model_ref is None:
             raise ValueError(f"Model not found in the model list, uid: {model_uid}")
 
-
-
-
-
-
-
-
-
-
+        try:
+            await xo.destroy_actor(model_ref)
+        except Exception as e:
+            logger.debug(
+                "Destroy model actor failed, model uid: %s, error: %s", model_uid, e
+            )
+        try:
+            subpool_address = self._model_uid_to_addr[model_uid]
+            await self._main_pool.remove_sub_pool(subpool_address)
+        finally:
+            del self._model_uid_to_model[model_uid]
+            del self._model_uid_to_model_spec[model_uid]
+            self.release_devices(model_uid)
+            del self._model_uid_to_addr[model_uid]
+            del self._model_uid_to_launch_args[model_uid]
 
     @log_async(logger=logger)
     async def list_models(self) -> Dict[str, Dict[str, Any]]:
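The worker now snapshots every launch_builtin_model call via locals() so that recover_sub_pool can replay it when a model's sub-pool process dies. The generic pattern, shown outside the actor code with illustrative function and argument names, looks like this:

def launch(model_uid: str, model_name: str, n_gpu="auto", **kwargs):
    # locals() at this point contains exactly the call's arguments;
    # flatten **kwargs so the dict can later be replayed as launch(**launch_args).
    launch_args = locals()
    launch_args.update(launch_args.pop("kwargs"))
    return launch_args

args = launch("uid-1", "some-chat-model", n_gpu=1, quantization="q4_0")
# args == {'model_uid': 'uid-1', 'model_name': 'some-chat-model', 'n_gpu': 1, 'quantization': 'q4_0'}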
xinference/deploy/cmdline.py
CHANGED
@@ -189,13 +189,14 @@ def local(
     "-p",
     default=XINFERENCE_DEFAULT_ENDPOINT_PORT,
     type=int,
-    help="Specify the port number for the
+    help="Specify the port number for the Xinference web ui and service.",
 )
-
-
-
-    port
-)
+@click.option(
+    "--supervisor-port",
+    type=int,
+    help="Specify the port number for the Xinference supervisor.",
+)
+def supervisor(log_level: str, host: str, port: int, supervisor_port: Optional[int]):
     from ..deploy.supervisor import main
 
     dict_config = get_config_dict(
@@ -206,7 +207,9 @@ def supervisor(
     )
     logging.config.dictConfig(dict_config)  # type: ignore
 
-    main(
+    main(
+        host=host, port=port, supervisor_port=supervisor_port, logging_conf=dict_config
+    )
 
 
 @click.command(
@@ -227,7 +230,14 @@ def supervisor(
     type=str,
     help="Specify the host address for the worker.",
 )
-
+@click.option(
+    "--worker-port",
+    type=int,
+    help="Specify the port number for the Xinference worker.",
+)
+def worker(
+    log_level: str, endpoint: Optional[str], host: str, worker_port: Optional[int]
+):
     from ..deploy.worker import main
 
     dict_config = get_config_dict(
@@ -243,7 +253,7 @@ def worker(log_level: str, endpoint: Optional[str], host: str):
     client = RESTfulClient(base_url=endpoint)
     supervisor_internal_addr = client._get_supervisor_internal_address()
 
-    address = f"{host}:{get_next_port()}"
+    address = f"{host}:{worker_port or get_next_port()}"
     main(
         address=address,
         supervisor_address=supervisor_internal_addr,
xinference/deploy/local.py
CHANGED
@@ -22,6 +22,10 @@ from typing import Dict, Optional
 import xoscar as xo
 from xoscar.utils import get_next_port
 
+from ..constants import (
+    XINFERENCE_HEALTH_CHECK_ATTEMPTS,
+    XINFERENCE_HEALTH_CHECK_INTERVAL,
+)
 from ..core.supervisor import SupervisorActor
 from .utils import health_check
 from .worker import start_worker_components
@@ -79,7 +83,11 @@ def main(host: str, port: int, logging_conf: Optional[Dict] = None):
     supervisor_address = f"{host}:{get_next_port()}"
     local_cluster = run_in_subprocess(supervisor_address, logging_conf)
 
-    if not health_check(
+    if not health_check(
+        address=supervisor_address,
+        max_attempts=XINFERENCE_HEALTH_CHECK_ATTEMPTS,
+        sleep_interval=XINFERENCE_HEALTH_CHECK_INTERVAL,
+    ):
         raise RuntimeError("Cluster is not available after multiple attempts")
 
     try: