xinference 0.12.3__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.

Potentially problematic release: this version of xinference has been flagged for review.

Files changed (71)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +6 -6
  3. xinference/client/restful/restful_client.py +0 -2
  4. xinference/core/model.py +21 -4
  5. xinference/core/scheduler.py +2 -0
  6. xinference/core/worker.py +74 -45
  7. xinference/deploy/utils.py +33 -2
  8. xinference/model/llm/__init__.py +5 -0
  9. xinference/model/llm/llm_family.json +240 -1
  10. xinference/model/llm/llm_family.py +32 -8
  11. xinference/model/llm/llm_family_modelscope.json +192 -0
  12. xinference/model/llm/mlx/__init__.py +13 -0
  13. xinference/model/llm/mlx/core.py +408 -0
  14. xinference/model/llm/pytorch/chatglm.py +2 -9
  15. xinference/model/llm/pytorch/cogvlm2.py +206 -21
  16. xinference/model/llm/pytorch/core.py +213 -40
  17. xinference/model/llm/pytorch/glm4v.py +171 -15
  18. xinference/model/llm/pytorch/qwen_vl.py +168 -7
  19. xinference/model/llm/pytorch/utils.py +53 -62
  20. xinference/model/llm/utils.py +24 -5
  21. xinference/model/rerank/core.py +5 -0
  22. xinference/thirdparty/deepseek_vl/serve/__init__.py +13 -0
  23. xinference/thirdparty/deepseek_vl/serve/app_deepseek.py +510 -0
  24. xinference/thirdparty/deepseek_vl/serve/app_modules/__init__.py +13 -0
  25. xinference/thirdparty/deepseek_vl/serve/app_modules/gradio_utils.py +94 -0
  26. xinference/thirdparty/deepseek_vl/serve/app_modules/overwrites.py +81 -0
  27. xinference/thirdparty/deepseek_vl/serve/app_modules/presets.py +96 -0
  28. xinference/thirdparty/deepseek_vl/serve/app_modules/utils.py +229 -0
  29. xinference/thirdparty/deepseek_vl/serve/inference.py +170 -0
  30. xinference/web/ui/build/asset-manifest.json +3 -3
  31. xinference/web/ui/build/index.html +1 -1
  32. xinference/web/ui/build/static/js/main.0fb6f3ab.js +3 -0
  33. xinference/web/ui/build/static/js/main.0fb6f3ab.js.map +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/0f6b391abec76271137faad13a3793fe7acc1024e8cd2269c147b653ecd3a73b.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/2c63090c842376cdd368c3ded88a333ef40d94785747651343040a6f7872a223.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/30a0c79d8025d6441eb75b2df5bc2750a14f30119c869ef02570d294dff65c2f.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/40486e655c3c5801f087e2cf206c0b5511aaa0dfdba78046b7181bf9c17e54c5.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/b5507cd57f16a3a230aa0128e39fe103e928de139ea29e2679e4c64dcbba3b3a.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/d779b915f83f9c7b5a72515b6932fdd114f1822cef90ae01cc0d12bca59abc2d.json +1 -0
  46. xinference/web/ui/node_modules/.cache/babel-loader/d87824cb266194447a9c0c69ebab2d507bfc3e3148976173760d18c035e9dd26.json +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +1 -0
  49. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/METADATA +4 -1
  50. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/RECORD +55 -44
  51. xinference/web/ui/build/static/js/main.77dd47c3.js +0 -3
  52. xinference/web/ui/build/static/js/main.77dd47c3.js.map +0 -1
  53. xinference/web/ui/node_modules/.cache/babel-loader/0cd591866aa345566e0b63fb51ff2043e163a770af6fdc2f3bad395d046353e2.json +0 -1
  54. xinference/web/ui/node_modules/.cache/babel-loader/37c1476717199863bbba1530e3513a9368f8f73001b75b4a85c2075956308027.json +0 -1
  55. xinference/web/ui/node_modules/.cache/babel-loader/3da7d55e87882a4af923e187b1351160e34ca102f589086439c15131a227fb6e.json +0 -1
  56. xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +0 -1
  57. xinference/web/ui/node_modules/.cache/babel-loader/46edc1fe657dfedb2e673148332bb442c6eb98f09f2592c389209e376510afa5.json +0 -1
  58. xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +0 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/72bcecc71c5267250edeb89608859d449b586f13ff9923a5e70e7172976ec403.json +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/82db357f3fd5b32215d747ee593f69ff06c95ad6cde37f71a96c8290aaab64c0.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/935efd2867664c58230378fdf2ff1ea85e58d853b7214014e20dfbca8dab7b05.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/bc6da27195ec4607bb472bf61f97c928ad4966fa64e4c2247661bedb7400abba.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/c2abe75f04ad82fba68f35ed9cbe2e287762c876684fddccccfa73f739489b65.json +0 -1
  64. xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/f118f99c22b713c678c1209c4e1dd43fe86e3f6e801a4c0c35d3bbf41fd05fe6.json +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/f51bf63ddaa7afd125ef2254a105789333eecc1c94fdf5157a9b88ef7ad0a5bd.json +0 -1
  67. /xinference/web/ui/build/static/js/{main.77dd47c3.js.LICENSE.txt → main.0fb6f3ab.js.LICENSE.txt} +0 -0
  68. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/LICENSE +0 -0
  69. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/WHEEL +0 -0
  70. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/entry_points.txt +0 -0
  71. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.py:

```diff
@@ -107,6 +107,28 @@ class PytorchLLMSpecV1(BaseModel):
         return v
 
 
+class MLXLLMSpecV1(BaseModel):
+    model_format: Literal["mlx"]
+    # Must in order that `str` first, then `int`
+    model_size_in_billions: Union[str, int]
+    quantizations: List[str]
+    model_id: Optional[str]
+    model_hub: str = "huggingface"
+    model_uri: Optional[str]
+    model_revision: Optional[str]
+
+    @validator("model_size_in_billions", pre=False)
+    def validate_model_size_with_radix(cls, v: object) -> object:
+        if isinstance(v, str):
+            if (
+                "_" in v
+            ):  # for example, "1_8" just returns "1_8", otherwise int("1_8") returns 18
+                return v
+            else:
+                return int(v)
+        return v
+
+
 class PromptStyleV1(BaseModel):
     style_name: str
     system_prompt: str = ""
```
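A quick standalone sketch of what that validator does with the underscore "radix" convention (the helper name is ours; the real logic lives in the pydantic validator above):

```python
from typing import Union

def normalize_model_size(v: Union[str, int]) -> Union[str, int]:
    # "1_8" means 1.8 billion and must stay a string, because int("1_8") == 18
    # in Python (underscores are legal digit separators in int parsing).
    if isinstance(v, str):
        return v if "_" in v else int(v)
    return v

assert normalize_model_size("0_5") == "0_5"  # 0.5B keeps its radix form
assert normalize_model_size("7") == 7        # plain digit strings are coerced
assert normalize_model_size(72) == 72        # ints pass through unchanged
```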
```diff
@@ -226,7 +248,7 @@ class CustomLLMFamilyV1(LLMFamilyV1):
 
 
 LLMSpecV1 = Annotated[
-    Union[GgmlLLMSpecV1, PytorchLLMSpecV1],
+    Union[GgmlLLMSpecV1, PytorchLLMSpecV1, MLXLLMSpecV1],
     Field(discriminator="model_format"),
 ]
 
```
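`Field(discriminator="model_format")` is pydantic's tagged-union mechanism: when a spec dict is parsed, the `model_format` literal selects which spec class to instantiate, so adding `MLXLLMSpecV1` to the `Union` is all the registration the parser needs. A minimal sketch with trimmed stand-in classes (pydantic v1 style, matching the `@validator` usage above):

```python
from typing import Literal, Union

from pydantic import BaseModel, Field, parse_obj_as
from typing_extensions import Annotated

class PytorchSpec(BaseModel):   # stand-in for PytorchLLMSpecV1
    model_format: Literal["pytorch"]

class MLXSpec(BaseModel):       # stand-in for MLXLLMSpecV1
    model_format: Literal["mlx"]

Spec = Annotated[Union[PytorchSpec, MLXSpec], Field(discriminator="model_format")]

spec = parse_obj_as(Spec, {"model_format": "mlx"})
assert isinstance(spec, MLXSpec)  # dispatched purely on the discriminator value
```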
```diff
@@ -249,6 +271,8 @@ UD_LLM_FAMILIES_LOCK = Lock()
 
 VLLM_CLASSES: List[Type[LLM]] = []
 
+MLX_CLASSES: List[Type[LLM]] = []
+
 LLM_ENGINES: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
 SUPPORTED_ENGINES: Dict[str, List[Type[LLM]]] = {}
 
```
```diff
@@ -549,7 +573,7 @@ def _get_meta_path(
             return os.path.join(cache_dir, "__valid_download")
         else:
             return os.path.join(cache_dir, f"__valid_download_{model_hub}")
-    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
+    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
         assert quantization is not None
         if model_hub == "huggingface":
             return os.path.join(cache_dir, f"__valid_download_{quantization}")
```
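In effect, quantized formats (now including mlx) get a per-quantization completeness marker in the cache directory, while plain pytorch gets a single marker. A condensed sketch covering just the huggingface branches visible in this hunk:

```python
import os
from typing import Optional

def meta_path(cache_dir: str, model_format: str,
              quantization: Optional[str] = None) -> str:
    # pytorch: one marker per cache dir; quantized formats: one per quantization
    if model_format == "pytorch":
        return os.path.join(cache_dir, "__valid_download")
    assert quantization is not None
    return os.path.join(cache_dir, f"__valid_download_{quantization}")

print(meta_path("/cache/qwen2-mlx", "mlx", "4-bit"))
# /cache/qwen2-mlx/__valid_download_4-bit
```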
```diff
@@ -588,7 +612,7 @@ def _skip_download(
             logger.warning(f"Cache {cache_dir} exists, but it was from {hub}")
             return True
         return False
-    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
+    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
         assert quantization is not None
         return os.path.exists(
             _get_meta_path(cache_dir, model_format, model_hub, quantization)
```
```diff
@@ -683,7 +707,7 @@ def cache_from_csghub(
     ):
         return cache_dir
 
-    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
         download_dir = retry_download(
             snapshot_download,
             llm_family.model_name,
```
```diff
@@ -751,7 +775,7 @@ def cache_from_modelscope(
     ):
         return cache_dir
 
-    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
         download_dir = retry_download(
             snapshot_download,
             llm_family.model_name,
```
```diff
@@ -820,8 +844,8 @@ def cache_from_huggingface(
     if not IS_NEW_HUGGINGFACE_HUB:
         use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}
 
-    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
-        assert isinstance(llm_spec, PytorchLLMSpecV1)
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
+        assert isinstance(llm_spec, (PytorchLLMSpecV1, MLXLLMSpecV1))
         download_dir = retry_download(
             huggingface_hub.snapshot_download,
             llm_family.model_name,
```
```diff
@@ -910,7 +934,7 @@ def get_cache_status(
         ]
         return any(revisions)
     # just check meta file for ggml and gptq model
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
+    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
         ret = []
         for q in llm_spec.quantizations:
             assert q is not None
```
xinference/model/llm/llm_family_modelscope.json:

```diff
@@ -2921,6 +2921,33 @@
             "model_id": "qwen/Qwen2-72B-Instruct-AWQ",
             "model_hub": "modelscope"
         },
+        {
+            "model_format": "mlx",
+            "model_size_in_billions": "0_5",
+            "quantizations": [
+                "4-bit"
+            ],
+            "model_id": "qwen/Qwen2-0.5B-Instruct-MLX",
+            "model_hub": "modelscope"
+        },
+        {
+            "model_format": "mlx",
+            "model_size_in_billions": "1_5",
+            "quantizations": [
+                "4-bit"
+            ],
+            "model_id": "qwen/Qwen2-1.5B-Instruct-MLX",
+            "model_hub": "modelscope"
+        },
+        {
+            "model_format": "mlx",
+            "model_size_in_billions": 7,
+            "quantizations": [
+                "4-bit"
+            ],
+            "model_id": "qwen/Qwen2-7B-Instruct-MLX",
+            "model_hub": "modelscope"
+        },
         {
             "model_format": "ggufv2",
             "model_size_in_billions": "0_5",
```
```diff
@@ -2938,6 +2965,85 @@
             "model_id": "qwen/Qwen2-0.5B-Instruct-GGUF",
             "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf",
             "model_hub": "modelscope"
+        },
+        {
+            "model_format": "ggufv2",
+            "model_size_in_billions": "1_5",
+            "quantizations": [
+                "q2_k",
+                "q3_k_m",
+                "q4_0",
+                "q4_k_m",
+                "q5_0",
+                "q5_k_m",
+                "q6_k",
+                "q8_0",
+                "fp16"
+            ],
+            "model_id": "qwen/Qwen2-1.5B-Instruct-GGUF",
+            "model_file_name_template": "qwen2-1_5b-instruct-{quantization}.gguf",
+            "model_hub": "modelscope"
+        },
+        {
+            "model_format": "ggufv2",
+            "model_size_in_billions": 7,
+            "quantizations": [
+                "q2_k",
+                "q3_k_m",
+                "q4_0",
+                "q4_k_m",
+                "q5_0",
+                "q5_k_m",
+                "q6_k",
+                "q8_0",
+                "fp16"
+            ],
+            "model_id": "qwen/Qwen2-7B-Instruct-GGUF",
+            "model_file_name_template": "qwen2-7b-instruct-{quantization}.gguf",
+            "model_hub": "modelscope"
+        },
+        {
+            "model_format": "ggufv2",
+            "model_size_in_billions": 72,
+            "quantizations": [
+                "q2_k",
+                "q3_k_m",
+                "q4_0",
+                "q4_k_m",
+                "q5_0",
+                "q5_k_m",
+                "q6_k",
+                "q8_0",
+                "fp16"
+            ],
+            "model_id": "qwen/Qwen2-72B-Instruct-GGUF",
+            "model_hub": "modelscope",
+            "model_file_name_template": "qwen2-72b-instruct-{quantization}.gguf",
+            "model_file_name_split_template": "qwen2-72b-instruct-{quantization}-{part}.gguf",
+            "quantization_parts": {
+                "q5_0": [
+                    "00001-of-00002",
+                    "00002-of-00002"
+                ],
+                "q5_k_m": [
+                    "00001-of-00002",
+                    "00002-of-00002"
+                ],
+                "q6_k": [
+                    "00001-of-00002",
+                    "00002-of-00002"
+                ],
+                "q8_0": [
+                    "00001-of-00002",
+                    "00002-of-00002"
+                ],
+                "fp16": [
+                    "00001-of-00004",
+                    "00002-of-00004",
+                    "00003-of-00004",
+                    "00004-of-00004"
+                ]
+            }
         }
     ],
     "prompt_style": {
```
```diff
@@ -2993,6 +3099,35 @@
             ],
             "model_id": "qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4",
             "model_hub": "modelscope"
+        },
+        {
+            "model_format": "ggufv2",
+            "model_size_in_billions": 14,
+            "quantizations": [
+                "q3_k_m",
+                "q4_0",
+                "q4_k_m",
+                "q5_0",
+                "q5_k_m",
+                "q6_k",
+                "q8_0",
+                "fp16"
+            ],
+            "model_id": "qwen/Qwen2-57B-A14B-Instruct-GGUF",
+            "model_hub": "modelscope",
+            "model_file_name_template": "qwen2-57b-a14b-instruct-{quantization}.gguf",
+            "model_file_name_split_template": "qwen2-57b-a14b-instruct-{quantization}-{part}.gguf",
+            "quantization_parts": {
+                "q8_0": [
+                    "00001-of-00002",
+                    "00002-of-00002"
+                ],
+                "fp16": [
+                    "00001-of-00003",
+                    "00002-of-00003",
+                    "00003-of-00003"
+                ]
+            }
         }
     ],
     "prompt_style": {
```
```diff
@@ -3402,6 +3537,16 @@
         "roles": [
             "user",
             "assistant"
+        ],
+        "stop_token_ids": [
+            151643,
+            151644,
+            151645
+        ],
+        "stop": [
+            "<|endoftext|>",
+            "<|im_start|>",
+            "<|im_end|>"
         ]
     }
 },
```
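The ids are Qwen2's ChatML control tokens (151643 `<|endoftext|>`, 151644 `<|im_start|>`, 151645 `<|im_end|>`), so generation halts at a turn boundary even if the backend does not strip them itself. A sketch of overriding the same fields per request, reusing the `model` handle from the launch sketch above (treat the exact config keys as assumptions):

```python
generate_config = {
    "stop": ["<|endoftext|>", "<|im_start|>", "<|im_end|>"],
    "stop_token_ids": [151643, 151644, 151645],
    "max_tokens": 128,
}
print(model.generate("Ping?", generate_config))
```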
```diff
@@ -3593,6 +3738,53 @@
         ]
     }
 },
+{
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "gemma-2-it",
+    "model_lang": [
+        "en"
+    ],
+    "model_ability": [
+        "chat"
+    ],
+    "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
+    "model_specs": [
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 9,
+            "quantizations": [
+                "none",
+                "4-bit",
+                "8-bit"
+            ],
+            "model_id": "AI-ModelScope/gemma-2-9b-it",
+            "model_hub": "modelscope"
+        },
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 27,
+            "quantizations": [
+                "none",
+                "4-bit",
+                "8-bit"
+            ],
+            "model_id": "AI-ModelScope/gemma-2-27b-it",
+            "model_hub": "modelscope"
+        }
+    ],
+    "prompt_style": {
+        "style_name": "gemma",
+        "roles": [
+            "user",
+            "model"
+        ],
+        "stop": [
+            "<end_of_turn>",
+            "<start_of_turn>"
+        ]
+    }
+},
 {
     "version":1,
     "context_length":2048,
```
One of the new 13-line `__init__.py` files (e.g. `xinference/model/llm/mlx/__init__.py`; the extraction does not say which), containing only the Apache-2.0 license header:

```diff
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
```