xinference 0.10.1__py3-none-any.whl → 0.10.2.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (55)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +9 -9
  3. xinference/client/restful/restful_client.py +29 -16
  4. xinference/core/supervisor.py +32 -9
  5. xinference/core/worker.py +13 -8
  6. xinference/deploy/cmdline.py +22 -9
  7. xinference/model/audio/__init__.py +40 -1
  8. xinference/model/audio/core.py +25 -45
  9. xinference/model/audio/custom.py +148 -0
  10. xinference/model/core.py +6 -9
  11. xinference/model/embedding/model_spec.json +24 -0
  12. xinference/model/embedding/model_spec_modelscope.json +24 -0
  13. xinference/model/image/core.py +12 -4
  14. xinference/model/image/stable_diffusion/core.py +8 -7
  15. xinference/model/llm/core.py +9 -14
  16. xinference/model/llm/llm_family.json +263 -0
  17. xinference/model/llm/llm_family.py +26 -4
  18. xinference/model/llm/llm_family_modelscope.json +160 -0
  19. xinference/model/llm/pytorch/baichuan.py +4 -3
  20. xinference/model/llm/pytorch/chatglm.py +3 -2
  21. xinference/model/llm/pytorch/core.py +15 -13
  22. xinference/model/llm/pytorch/falcon.py +6 -5
  23. xinference/model/llm/pytorch/internlm2.py +3 -2
  24. xinference/model/llm/pytorch/llama_2.py +6 -5
  25. xinference/model/llm/pytorch/vicuna.py +4 -3
  26. xinference/model/llm/vllm/core.py +3 -0
  27. xinference/model/rerank/core.py +23 -12
  28. xinference/model/rerank/model_spec.json +24 -0
  29. xinference/model/rerank/model_spec_modelscope.json +25 -1
  30. xinference/model/utils.py +12 -1
  31. xinference/types.py +55 -0
  32. xinference/utils.py +1 -0
  33. xinference/web/ui/build/asset-manifest.json +3 -3
  34. xinference/web/ui/build/index.html +1 -1
  35. xinference/web/ui/build/static/js/main.26fdbfbe.js +3 -0
  36. xinference/web/ui/build/static/js/main.26fdbfbe.js.map +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/f9290c0738db50065492ceedc6a4af25083fe18399b7c44d942273349ad9e643.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +1 -0
  43. {xinference-0.10.1.dist-info → xinference-0.10.2.post1.dist-info}/METADATA +4 -1
  44. {xinference-0.10.1.dist-info → xinference-0.10.2.post1.dist-info}/RECORD +49 -46
  45. xinference/web/ui/build/static/js/main.76ef2b17.js +0 -3
  46. xinference/web/ui/build/static/js/main.76ef2b17.js.map +0 -1
  47. xinference/web/ui/node_modules/.cache/babel-loader/35d0e4a317e5582cbb79d901302e9d706520ac53f8a734c2fd8bfde6eb5a4f02.json +0 -1
  48. xinference/web/ui/node_modules/.cache/babel-loader/d076fd56cf3b15ed2433e3744b98c6b4e4410a19903d1db4de5bba0e1a1b3347.json +0 -1
  49. xinference/web/ui/node_modules/.cache/babel-loader/daad8131d91134f6d7aef895a0c9c32e1cb928277cb5aa66c01028126d215be0.json +0 -1
  50. xinference/web/ui/node_modules/.cache/babel-loader/f16aec63602a77bd561d0e67fa00b76469ac54b8033754bba114ec5eb3257964.json +0 -1
  51. /xinference/web/ui/build/static/js/{main.76ef2b17.js.LICENSE.txt → main.26fdbfbe.js.LICENSE.txt} +0 -0
  52. {xinference-0.10.1.dist-info → xinference-0.10.2.post1.dist-info}/LICENSE +0 -0
  53. {xinference-0.10.1.dist-info → xinference-0.10.2.post1.dist-info}/WHEEL +0 -0
  54. {xinference-0.10.1.dist-info → xinference-0.10.2.post1.dist-info}/entry_points.txt +0 -0
  55. {xinference-0.10.1.dist-info → xinference-0.10.2.post1.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.py
@@ -33,6 +33,7 @@ from ..._compat import (
     validator,
 )
 from ...constants import XINFERENCE_CACHE_DIR, XINFERENCE_MODEL_DIR
+from ...types import LoRA
 from ..utils import (
     download_from_modelscope,
     is_valid_model_uri,
@@ -797,10 +798,29 @@ def get_user_defined_llm_families():
     return UD_LLM_FAMILIES.copy()
 
 
+def match_model_size(
+    model_size: Union[int, str], spec_model_size: Union[int, str]
+) -> bool:
+    if isinstance(model_size, str):
+        model_size = model_size.replace("_", ".")
+    if isinstance(spec_model_size, str):
+        spec_model_size = spec_model_size.replace("_", ".")
+
+    if model_size == spec_model_size:
+        return True
+
+    try:
+        ms = int(model_size)
+        ss = int(spec_model_size)
+        return ms == ss
+    except ValueError:
+        return False
+
+
 def match_llm(
     model_name: str,
     model_format: Optional[str] = None,
-    model_size_in_billions: Optional[int] = None,
+    model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
     is_local_deployment: bool = False,
 ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
@@ -844,7 +864,9 @@ def match_llm(
                 model_format
                 and model_format != spec.model_format
                 or model_size_in_billions
-                and model_size_in_billions != spec.model_size_in_billions
+                and not match_model_size(
+                    model_size_in_billions, spec.model_size_in_billions
+                )
                 or quantization
                 and matched_quantization is None
             ):
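The new match_model_size helper normalizes underscore-encoded sizes before comparing, so a filesystem-friendly spelling such as "1_8" matches a spec declared as "1.8", while integer sizes still match their string forms. A minimal sketch of the expected behaviour (the calls below are illustrative and not taken from the package's tests):

    from xinference.model.llm.llm_family import match_model_size

    # "1_8" is normalized to "1.8" before comparison.
    assert match_model_size("1_8", "1.8")

    # Plain integers and their string forms compare equal via int().
    assert match_model_size("7", 7)

    # Sizes that are neither equal nor both integral fall through to False.
    assert not match_model_size("1.8", 7)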
@@ -954,12 +976,12 @@ def match_llm_cls(
     family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
     quantization: str,
-    peft_model_path: Optional[str] = None,
+    peft_model: Optional[List[LoRA]] = None,
 ) -> Optional[Type[LLM]]:
     """
     Find an LLM implementation for given LLM family and spec.
     """
-    if peft_model_path is not None:
+    if peft_model is not None:
         for cls in PEFT_SUPPORTED_CLASSES:
             if cls.match(family, llm_spec, quantization):
                 return cls
xinference/model/llm/llm_family_modelscope.json
@@ -2175,6 +2175,77 @@
             ]
         }
     },
+    {
+        "version": 1,
+        "context_length": 65536,
+        "model_name": "codeqwen1.5-chat",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "CodeQwen1.5 is the Code-Specific version of Qwen1.5. It is a transformer-based decoder-only language model pretrained on a large amount of data of codes.",
+        "model_specs": [
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "q2_k",
+                    "q3_k_m",
+                    "q4_0",
+                    "q4_k_m",
+                    "q5_0",
+                    "q5_k_m",
+                    "q6_k",
+                    "q8_0"
+                ],
+                "model_id": "qwen/CodeQwen1.5-7B-Chat-GGUF",
+                "model_hub": "modelscope",
+                "model_file_name_template": "codeqwen-1_5-7b-chat-{quantization}.gguf"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/CodeQwen1.5-7B-Chat",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/CodeQwen1.5-7B-Chat-AWQ",
+                "model_hub": "modelscope"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "QWEN",
+            "system_prompt": "You are a helpful assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "\n",
+            "stop_token_ids": [
+                151643,
+                151644,
+                151645
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|im_start|>",
+                "<|im_end|>"
+            ]
+        }
+    },
     {
         "version": 1,
         "context_length": 4096,
@@ -3045,5 +3116,94 @@
                 "</s>"
             ]
         }
+    },
+    {
+        "version": 1,
+        "context_length": 131072,
+        "model_name": "c4ai-command-r-v01",
+        "model_lang": [
+            "en",
+            "fr",
+            "de",
+            "es",
+            "it",
+            "pt",
+            "ja",
+            "ko",
+            "zh",
+            "ar"
+        ],
+        "model_ability": [
+            "generate"
+        ],
+        "model_description": "C4AI Command-R is a research release of a 35 billion parameter highly performant generative model.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 35,
+                "quantizations": [
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "AI-ModelScope/c4ai-command-r-v01",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 35,
+                "quantizations": [
+                    "Q2_K",
+                    "Q4_K_M",
+                    "Q5_K_M"
+                ],
+                "model_id": "mirror013/C4AI-Command-R-v01-GGUF",
+                "model_file_name_template": "c4ai-command-r-v01.{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 104,
+                "quantizations": [
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "AI-ModelScope/c4ai-command-r-plus",
+                "model_revision": "master"
+            }
+        ]
+    },
+    {
+        "version": 1,
+        "context_length": 131072,
+        "model_name": "c4ai-command-r-v01-4bit",
+        "model_lang": [
+            "en",
+            "fr",
+            "de",
+            "es",
+            "it",
+            "pt",
+            "ja",
+            "ko",
+            "zh",
+            "ar"
+        ],
+        "model_ability": [
+            "generate"
+        ],
+        "model_description": "This model is 4bit quantized version of C4AI Command-R using bitsandbytes.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 35,
+                "quantizations": [
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "mirror013/c4ai-command-r-v01-4bit",
+                "model_revision": "master"
+            }
+        ]
     }
 ]
xinference/model/llm/pytorch/baichuan.py
@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import List, Optional
 
+from ....types import LoRA
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModelConfig
 
@@ -27,7 +28,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -36,7 +37,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
         self._use_fast_tokenizer = False
 
xinference/model/llm/pytorch/chatglm.py
@@ -24,6 +24,7 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    LoRA,
     PytorchGenerateConfig,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -39,7 +40,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -48,7 +49,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
 
     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/core.py
@@ -32,6 +32,7 @@ from ....types import (
     Embedding,
     EmbeddingData,
     EmbeddingUsage,
+    LoRA,
     PytorchGenerateConfig,
     PytorchModelConfig,
 )
@@ -71,14 +72,14 @@ class PytorchModel(LLM):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._use_fast_tokenizer = True
         self._pytorch_model_config: PytorchModelConfig = self._sanitize_model_config(
             pytorch_model_config
         )
-        self._peft_model_path = peft_model_path
+        self._peft_model = peft_model
 
     def _sanitize_model_config(
         self, pytorch_model_config: Optional[PytorchModelConfig]
@@ -134,7 +135,7 @@ class PytorchModel(LLM):
         return model, tokenizer
 
     def _apply_lora(self):
-        if self._peft_model_path is not None:
+        if self._peft_model is not None:
             try:
                 from peft import PeftModel
             except ImportError:
@@ -142,14 +143,15 @@ class PytorchModel(LLM):
                     f"Failed to import 'PeftModel' from 'peft'. Please make sure 'peft' is installed.\n\n"
                 )
 
-            # Apply LoRA
-            self._model = PeftModel.from_pretrained(
-                self._model,
-                self._peft_model_path,
-            )
-            logger.info(
-                f"Successfully loaded the PEFT adaptor for model {self.model_uid}."
-            )
+            for peft_model in self._peft_model:
+                # Apply LoRA
+                self._model = PeftModel.from_pretrained(
+                    self._model,
+                    peft_model.local_path,
+                )
+                logger.info(
+                    f"PEFT adaptor '{peft_model.lora_name}' successfully loaded for model '{self.model_uid}'."
+                )
 
     def load(self):
         try:
@@ -421,7 +423,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -430,7 +432,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             quantization,
             model_path,
             pytorch_model_config,
-            peft_model_path,
+            peft_model,
         )
 
     def _sanitize_generate_config(
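With these changes the PyTorch backends accept a list of LoRA adapters instead of a single peft_model_path, and _apply_lora stacks each adapter onto the base model with PeftModel.from_pretrained. A rough sketch of what a caller might pass, assuming the LoRA type added in xinference/types.py can be constructed from the lora_name and local_path fields referenced above (both the construction style and the paths are illustrative, not taken from the package):

    from xinference.types import LoRA

    # Hypothetical adapter list; the local paths are placeholders.
    adapters = [
        LoRA(lora_name="sql-adapter", local_path="/data/lora/sql"),
        LoRA(lora_name="chat-style", local_path="/data/lora/chat_style"),
    ]

    # PytorchModel subclasses now receive peft_model=adapters; during load(),
    # _apply_lora() applies each adapter in order via
    # PeftModel.from_pretrained(model, adapter.local_path) and logs its lora_name.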
xinference/model/llm/pytorch/falcon.py
@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import List, Optional
 
+from ....types import LoRA
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModel, PytorchModelConfig
 
@@ -27,7 +28,7 @@ class FalconPytorchModel(PytorchModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -36,7 +37,7 @@ class FalconPytorchModel(PytorchModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
 
     def _load_model(self, **kwargs):
@@ -86,7 +87,7 @@ class FalconPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -95,7 +96,7 @@ class FalconPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
 
     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/internlm2.py
@@ -23,6 +23,7 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    LoRA,
     PytorchGenerateConfig,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -38,7 +39,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
    ):
         super().__init__(
             model_uid,
@@ -47,7 +48,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
 
     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/llama_2.py
@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import List, Optional
 
+from ....types import LoRA
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModel, PytorchModelConfig
 
@@ -27,7 +28,7 @@ class LlamaPytorchModel(PytorchModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -36,7 +37,7 @@ class LlamaPytorchModel(PytorchModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
 
     def _load_model(self, **kwargs):
@@ -69,8 +70,8 @@ class LlamaPytorchChatModel(PytorchChatModel):
         model_spec: "LLMSpecV1",
         quantization: str,
         model_path: str,
-        peft_model_path: Optional[str] = None,
         pytorch_model_config: Optional["PytorchModelConfig"] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -78,7 +79,7 @@ class LlamaPytorchChatModel(PytorchChatModel):
             model_spec,
             quantization,
             model_path,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
             pytorch_model_config=pytorch_model_config,
         )
         self._use_fast_tokenizer = False
xinference/model/llm/pytorch/vicuna.py
@@ -26,8 +26,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import List, Optional
 
+from ....types import LoRA
 from .. import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModelConfig
 
@@ -41,7 +42,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional["PytorchModelConfig"] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -50,7 +51,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
         self._use_fast_tokenizer = False
 
xinference/model/llm/vllm/core.py
@@ -116,6 +116,7 @@ VLLM_SUPPORTED_CHAT_MODELS = [
 ]
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat")
+    VLLM_SUPPORTED_CHAT_MODELS.append("codeqwen1.5-chat")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
@@ -126,6 +127,8 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.4.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-moe-chat")
+    VLLM_SUPPORTED_MODELS.append("c4ai-command-r-v01")
+    VLLM_SUPPORTED_MODELS.append("c4ai-command-r-v01-4bit")
 
 
 class VLLMModel(LLM):
xinference/model/rerank/core.py
@@ -42,8 +42,9 @@ def get_rerank_model_descriptions():
 class RerankModelSpec(CacheableModelSpec):
     model_name: str
     language: List[str]
+    type: Optional[str] = "normal"
     model_id: str
-    model_revision: str
+    model_revision: Optional[str]
     model_hub: str = "huggingface"
 
 
@@ -63,6 +64,7 @@ class RerankModelDescription(ModelDescription):
             "model_type": "rerank",
             "address": self.address,
             "accelerators": self.devices,
+            "type": self._model_spec.type,
             "model_name": self._model_spec.model_name,
             "language": self._model_spec.language,
             "model_revision": self._model_spec.model_revision,
@@ -97,12 +99,14 @@ def generate_rerank_description(model_spec: RerankModelSpec) -> Dict[str, List[D
 class RerankModel:
     def __init__(
         self,
+        model_spec: RerankModelSpec,
         model_uid: str,
         model_path: str,
         device: Optional[str] = None,
         use_fp16: bool = False,
         model_config: Optional[Dict] = None,
     ):
+        self._model_spec = model_spec
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device
@@ -112,20 +116,25 @@ class RerankModel:
 
     def load(self):
         try:
-            from sentence_transformers.cross_encoder import CrossEncoder
+            if self._model_spec.type == "normal":
+                from FlagEmbedding import FlagReranker
+            elif self._model_spec.type == "LLM-based":
+                from FlagEmbedding import FlagLLMReranker as FlagReranker
+            elif self._model_spec.type == "LLM-based layerwise":
+                from FlagEmbedding import LayerWiseFlagLLMReranker as FlagReranker
+            else:
+                raise RuntimeError(
+                    f"Unsupported Rank model type: {self._model_spec.type}"
+                )
         except ImportError:
-            error_message = "Failed to import module 'SentenceTransformer'"
+            error_message = "Failed to import module 'FlagEmbedding'"
             installation_guide = [
-                "Please make sure 'sentence-transformers' is installed. ",
-                "You can install it by `pip install sentence-transformers`\n",
+                "Please make sure 'FlagEmbedding' is installed. ",
+                "You can install it by `pip install FlagEmbedding`\n",
             ]
 
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-        self._model = CrossEncoder(
-            self._model_path, device=self._device, **self._model_config
-        )
-        if self._use_fp16:
-            self._model.model.half()
+        self._model = FlagReranker(self._model_path, use_fp16=True)
 
     def rerank(
         self,
@@ -142,7 +151,7 @@ class RerankModel:
         if max_chunks_per_doc is not None:
             raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
         sentence_combinations = [[query, doc] for doc in documents]
-        similarity_scores = self._model.predict(sentence_combinations)
+        similarity_scores = self._model.compute_score(sentence_combinations)
         sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))
         if top_n is not None:
             sim_scores_argsort = sim_scores_argsort[:top_n]
@@ -224,7 +233,9 @@ def create_rerank_model_instance(
 
     model_path = cache(model_spec)
     use_fp16 = kwargs.pop("use_fp16", False)
-    model = RerankModel(model_uid, model_path, use_fp16=use_fp16, model_config=kwargs)
+    model = RerankModel(
+        model_spec, model_uid, model_path, use_fp16=use_fp16, model_config=kwargs
+    )
     model_description = RerankModelDescription(
         subpool_addr, devices, model_spec, model_path=model_path
     )
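RerankModel now delegates to FlagEmbedding instead of sentence-transformers: "normal" specs load FlagReranker, while the new "LLM-based" and "LLM-based layerwise" specs load FlagLLMReranker and LayerWiseFlagLLMReranker under the same interface, and rerank() calls compute_score on [query, document] pairs. A small usage sketch of the underlying call, assuming FlagEmbedding is installed (pip install FlagEmbedding); the model path below is a placeholder:

    from FlagEmbedding import FlagReranker

    reranker = FlagReranker("/path/to/bge-reranker-v2-m3", use_fp16=True)

    query = "What is a panda?"
    documents = [
        "The giant panda is a bear species endemic to China.",
        "Paris is the capital of France.",
    ]

    # compute_score returns one relevance score per [query, doc] pair;
    # RerankModel.rerank() sorts the documents by these scores and keeps top_n.
    scores = reranker.compute_score([[query, doc] for doc in documents])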
xinference/model/rerank/model_spec.json
@@ -1,20 +1,44 @@
 [
   {
     "model_name": "bge-reranker-large",
+    "type": "normal",
     "language": ["en", "zh"],
     "model_id": "BAAI/bge-reranker-large",
     "model_revision": "27c9168d479987529781de8474dff94d69beca11"
   },
   {
     "model_name": "bge-reranker-base",
+    "type": "normal",
     "language": ["en", "zh"],
     "model_id": "BAAI/bge-reranker-base",
     "model_revision": "465b4b7ddf2be0a020c8ad6e525b9bb1dbb708ae"
   },
   {
     "model_name": "bce-reranker-base_v1",
+    "type": "normal",
     "language": ["en", "zh"],
     "model_id": "maidalun1020/bce-reranker-base_v1",
     "model_revision": "eaa31a577a0574e87a08959bd229ca14ce1b5496"
+  },
+  {
+    "model_name": "bge-reranker-v2-m3",
+    "type": "normal",
+    "language": ["en", "zh", "multilingual"],
+    "model_id": "BAAI/bge-reranker-v2-m3",
+    "model_revision": "12e974610ba9083ed95f3edf08d7e899581f4de4"
+  },
+  {
+    "model_name": "bge-reranker-v2-gemma",
+    "type": "LLM-based",
+    "language": ["en", "zh", "multilingual"],
+    "model_id": "BAAI/bge-reranker-v2-gemma",
+    "model_revision": "1787044f8b6fb740a9de4557c3a12377f84d9e17"
+  },
+  {
+    "model_name": "bge-reranker-v2-minicpm-layerwise",
+    "type": "LLM-based layerwise",
+    "language": ["en", "zh", "multilingual"],
+    "model_id": "BAAI/bge-reranker-v2-minicpm-layerwise",
+    "model_revision": "47b5332b296c4d8cb6ee2c60502cc62a0d708881"
   }
 ]