xinference 0.10.2__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2024-04-19T11:39:12+0800",
+ "date": "2024-04-24T10:45:37+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "f19e85be09bce966e0c0b3e01bc5690eb6016398",
- "version": "0.10.2"
+ "full-revisionid": "2ba72b0ed55c2dbff12491485ffacee7996d3490",
+ "version": "0.10.3"
 }
 ''' # END VERSION_JSON
 
xinference/client/restful/restful_client.py CHANGED
@@ -18,8 +18,6 @@ from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union
 
 import requests
 
-from ...model.utils import convert_float_to_int_or_str
-from ...types import LoRA, PeftModelConfig
 from ..common import streaming_response_iterator
 
 if TYPE_CHECKING:
@@ -37,6 +35,17 @@ if TYPE_CHECKING:
     )
 
 
+def convert_float_to_int_or_str(model_size: float) -> Union[int, str]:
+    """Convert a float to an int or a str.
+
+    If the float can be represented as an int, convert it to an int;
+    otherwise convert it to a str.
+    """
+    if int(model_size) == model_size:
+        return int(model_size)
+    else:
+        return str(model_size)
+
+
 def _get_error_string(response: requests.Response) -> str:
     try:
         if response.content:
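
For context, a quick usage sketch of this helper (plain Python, not part of the diff): non-integral model sizes cannot be sent as JSON floats, so they are converted to strings instead.

# Model sizes are expressed in billions of parameters; the RESTful API
# does not accept floats, so 1.8 becomes the string "1.8" while 7.0 becomes 7.
assert convert_float_to_int_or_str(7.0) == 7      # integral float -> int
assert convert_float_to_int_or_str(1.8) == "1.8"  # non-integral float -> str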
@@ -856,18 +865,6 @@ class Client:
 
         url = f"{self.base_url}/v1/models"
 
-        if peft_model_config is not None:
-            lora_list = [
-                LoRA.from_dict(model) for model in peft_model_config["lora_list"]
-            ]
-            peft_model = PeftModelConfig(
-                lora_list,
-                peft_model_config["image_lora_load_kwargs"],
-                peft_model_config["image_lora_fuse_kwargs"],
-            )
-        else:
-            peft_model = None
-
         # convert float to int or string since the RESTful API does not accept float.
         if isinstance(model_size_in_billions, float):
             model_size_in_billions = convert_float_to_int_or_str(model_size_in_billions)
@@ -875,7 +872,7 @@ class Client:
         payload = {
             "model_uid": model_uid,
             "model_name": model_name,
-            "peft_model_config": peft_model.to_dict() if peft_model else None,
+            "peft_model_config": peft_model_config,
             "model_type": model_type,
             "model_size_in_billions": model_size_in_billions,
             "model_format": model_format,
xinference/core/worker.py CHANGED
@@ -612,6 +612,14 @@ class WorkerActor(xo.StatelessActor):
         gpu_idx: Optional[Union[int, List[int]]] = None,
         **kwargs,
     ):
+        # !!! Note that the following code must be placed at the very beginning of this
+        # function, or there will be problems with auto-recovery: `locals()` collects all
+        # the local parameters of this function, and the result is passed to this
+        # function again on recovery.
+        launch_args = locals()
+        launch_args.pop("self")
+        launch_args.pop("kwargs")
+        launch_args.update(kwargs)
+
         event_model_uid, _, __ = parse_replica_model_uid(model_uid)
         await self._event_collector_ref.report_event(
             event_model_uid,
@@ -621,10 +629,6 @@ class WorkerActor(xo.StatelessActor):
                 event_content="Launch model",
             ),
         )
-        launch_args = locals()
-        launch_args.pop("self")
-        launch_args.pop("kwargs")
-        launch_args.update(kwargs)
 
         if gpu_idx is not None:
             logger.info(
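
The pitfall this move avoids can be shown in isolation (a minimal standalone sketch, independent of xinference): locals() snapshots every name bound so far, so calling it after intermediate work would record stray helper variables as launch arguments and break replay on auto-recovery.

def launch(model_uid, n_gpu=1, **kwargs):
    # Snapshot taken at function entry: exactly the declared parameters.
    args = dict(locals())
    args.pop("kwargs")
    args.update(kwargs)

    parts = model_uid.split("-")  # any later local would leak into locals()
    late = dict(locals())         # late snapshot, for comparison only
    return args, late

args, late = launch("my-model-0", replica=2)
assert "parts" not in args  # clean: could be replayed via launch(**args)
assert "parts" in late      # polluted: would not replay cleanly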
xinference/deploy/cmdline.py CHANGED
@@ -736,11 +736,15 @@ def model_launch(
         else []
     )
 
-    peft_model_config = {
-        "image_lora_load_kwargs": image_lora_load_params,
-        "image_lora_fuse_kwargs": image_lora_fuse_params,
-        "lora_list": lora_list,
-    }
+    peft_model_config = (
+        {
+            "image_lora_load_kwargs": image_lora_load_params,
+            "image_lora_fuse_kwargs": image_lora_fuse_params,
+            "lora_list": lora_list,
+        }
+        if lora_list or image_lora_load_params or image_lora_fuse_params
+        else None
+    )
 
     _gpu_idx: Optional[List[int]] = (
         None if gpu_idx is None else [int(idx) for idx in gpu_idx.split(",")]
xinference/model/audio/model_spec.json CHANGED
@@ -75,5 +75,12 @@
     "model_id": "BELLE-2/Belle-whisper-large-v2-zh",
     "model_revision": "ec5bd5d78598545b7585814edde86dac2002b5b9",
     "multilingual": false
+  },
+  {
+    "model_name": "Belle-whisper-large-v3-zh",
+    "model_family": "whisper",
+    "model_id": "BELLE-2/Belle-whisper-large-v3-zh",
+    "model_revision": "3bebc7247696b39f5ab9ed22db426943ac33f600",
+    "multilingual": false
   }
-]
+]
xinference/model/embedding/core.py CHANGED
@@ -12,12 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import gc
 import logging
+import os
 from collections import defaultdict
 from typing import Dict, List, Optional, Tuple, Union, no_type_check
 
 import numpy as np
 
+from ...device_utils import empty_cache
 from ...types import Embedding, EmbeddingData, EmbeddingUsage
 from ..core import CacheableModelSpec, ModelDescription
 from ..utils import get_cache_dir, is_model_cached
@@ -28,6 +31,10 @@ logger = logging.getLogger(__name__)
 # Init when registering all the builtin models.
 MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list)
 EMBEDDING_MODEL_DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list)
+EMBEDDING_EMPTY_CACHE_COUNT = int(
+    os.getenv("XINFERENCE_EMBEDDING_EMPTY_CACHE_COUNT", "10")
+)
+assert EMBEDDING_EMPTY_CACHE_COUNT > 0
 
 
 def get_embedding_model_descriptions():
@@ -116,6 +123,7 @@ class EmbeddingModel:
         self._model_path = model_path
         self._device = device
         self._model = None
+        self._counter = 0
 
     def load(self):
         try:
@@ -134,6 +142,11 @@ class EmbeddingModel:
         self._model = SentenceTransformer(self._model_path, device=self._device)
 
     def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
+        self._counter += 1
+        if self._counter % EMBEDDING_EMPTY_CACHE_COUNT == 0:
+            logger.debug("Empty embedding cache.")
+            gc.collect()
+            empty_cache()
         from sentence_transformers import SentenceTransformer
 
         kwargs.setdefault("normalize_embeddings", True)
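
The pattern introduced here, releasing accelerator memory every N calls with N read from an environment variable, can be sketched standalone (a minimal version assuming a CUDA device; xinference's own empty_cache helper is device-agnostic):

import gc
import os

import torch

EMPTY_CACHE_COUNT = int(os.getenv("XINFERENCE_EMBEDDING_EMPTY_CACHE_COUNT", "10"))

class PeriodicCachePurge:
    """Run gc plus a device cache purge every EMPTY_CACHE_COUNT invocations."""

    def __init__(self):
        self._counter = 0

    def tick(self):
        self._counter += 1
        if self._counter % EMPTY_CACHE_COUNT == 0:
            gc.collect()  # drop unreachable Python objects first
            if torch.cuda.is_available():
                torch.cuda.empty_cache()  # return cached blocks to the allocator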
xinference/model/llm/llm_family.json CHANGED
@@ -1220,6 +1220,148 @@
       }
     ]
   },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "llama-3",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "Llama 3 is an auto-regressive language model that uses an optimized transformer architecture",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "meta-llama/Meta-Llama-3-8B"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_id": "QuantFactory/Meta-Llama-3-8B-GGUF",
+        "model_file_name_template": "Meta-Llama-3-8B.{quantization}.gguf"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "meta-llama/Meta-Llama-3-70B"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "Q4_K_M",
+          "Q5_K_M"
+        ],
+        "model_id": "NousResearch/Meta-Llama-3-70B-GGUF",
+        "model_file_name_template": "Meta-Llama-3-70B-{quantization}.gguf"
+      }
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "llama-3-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Llama 3 instruction tuned models are optimized for dialogue use cases and outperform many of the available open source chat models on common industry benchmarks.",
+    "model_specs": [
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "IQ3_M",
+          "Q4_K_M",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_id": "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
+        "model_file_name_template": "Meta-Llama-3-8B-Instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "meta-llama/Meta-Llama-3-8B-Instruct"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "IQ1_M",
+          "IQ2_XS",
+          "Q4_K_M"
+        ],
+        "model_id": "lmstudio-community/Meta-Llama-3-70B-Instruct-GGUF",
+        "model_file_name_template": "Meta-Llama-3-8B-Instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "meta-llama/Meta-Llama-3-70B-Instruct"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "LLAMA3",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n\n",
+      "inter_message_sep": "<|eot_id|>",
+      "stop_token_ids": [
+        128001,
+        128009
+      ],
+      "stop": [
+        "<|end_of_text|>",
+        "<|eot_id|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -1932,7 +2074,7 @@
   },
   {
     "version": 1,
-    "context_length": 65536,
+    "context_length": 32768,
     "model_name": "codeqwen1.5-chat",
     "model_lang": [
       "en",
xinference/model/llm/llm_family_modelscope.json CHANGED
@@ -84,6 +84,96 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "llama-3",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "Llama 3 is an auto-regressive language model that uses an optimized transformer architecture",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "LLM-Research/Meta-Llama-3-8B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "LLM-Research/Meta-Llama-3-70B",
+        "model_hub": "modelscope"
+      }
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "llama-3-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Llama 3 instruction tuned models are optimized for dialogue use cases and outperform many of the available open source chat models on common industry benchmarks.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "LLM-Research/Meta-Llama-3-8B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "LLM-Research/Meta-Llama-3-70B-Instruct",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "LLAMA3",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n\n",
+      "inter_message_sep": "<|eot_id|>",
+      "stop_token_ids": [
+        128001,
+        128009
+      ],
+      "stop": [
+        "<|end_of_text|>",
+        "<|eot_id|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -2177,7 +2267,7 @@
   },
   {
     "version": 1,
-    "context_length": 65536,
+    "context_length": 32768,
     "model_name": "codeqwen1.5-chat",
     "model_lang": [
      "en",
xinference/model/llm/utils.py CHANGED
@@ -114,6 +114,22 @@ class ChatModelMixin:
                 else:
                     ret += role
             return ret
+        elif prompt_style.style_name == "LLAMA3":
+            ret = (
+                f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
+                f"{prompt_style.intra_message_sep}{prompt_style.system_prompt}{prompt_style.inter_message_sep}"
+            )
+            for i, message in enumerate(chat_history):
+                role = get_role(message["role"])
+                content = message["content"]
+                if content:
+                    ret += (
+                        f"<|start_header_id|>{role}<|end_header_id|>"
+                        f"{prompt_style.intra_message_sep}{content}{prompt_style.inter_message_sep}"
+                    )
+                else:
+                    ret += f"<|start_header_id|>{role}<|end_header_id|>{prompt_style.intra_message_sep}"
+            return ret
         elif prompt_style.style_name == "FALCON":
             ret = prompt_style.system_prompt
             for message in chat_history:
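
For illustration, with the prompt_style defined in llm_family.json above (intra_message_sep "\n\n", inter_message_sep "<|eot_id|>"), a one-turn history renders roughly as follows. This is a hand-traced sketch of the template, not captured output:

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

Hi there!<|eot_id|><|start_header_id|>assistant<|end_header_id|>

(The prompt ends with the assistant header and a blank line, cueing the model to generate the reply.)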
xinference/model/llm/vllm/core.py CHANGED
@@ -85,6 +85,7 @@ except ImportError:
 
 VLLM_SUPPORTED_MODELS = [
     "llama-2",
+    "llama-3",
     "baichuan",
     "internlm-16k",
     "mistral-v0.1",
@@ -94,6 +95,7 @@ VLLM_SUPPORTED_MODELS = [
 ]
 VLLM_SUPPORTED_CHAT_MODELS = [
     "llama-2-chat",
+    "llama-3-instruct",
     "vicuna-v1.3",
     "vicuna-v1.5",
     "baichuan-chat",
xinference/model/rerank/core.py CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import gc
 import logging
 import os
 import uuid
@@ -21,6 +22,7 @@ from typing import Dict, List, Optional, Tuple
 import numpy as np
 
 from ...constants import XINFERENCE_CACHE_DIR
+from ...device_utils import empty_cache
 from ...types import Document, DocumentObj, Rerank
 from ..core import CacheableModelSpec, ModelDescription
 from ..utils import is_model_cached
@@ -31,6 +33,8 @@ logger = logging.getLogger(__name__)
 # Init when registering all the builtin models.
 MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list)
 RERANK_MODEL_DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list)
+RERANK_EMPTY_CACHE_COUNT = int(os.getenv("XINFERENCE_RERANK_EMPTY_CACHE_COUNT", "10"))
+assert RERANK_EMPTY_CACHE_COUNT > 0
 
 
 def get_rerank_model_descriptions():
@@ -113,28 +117,44 @@ class RerankModel:
         self._model_config = model_config or dict()
         self._use_fp16 = use_fp16
         self._model = None
+        self._counter = 0
 
     def load(self):
-        try:
-            if self._model_spec.type == "normal":
-                from FlagEmbedding import FlagReranker
-            elif self._model_spec.type == "LLM-based":
-                from FlagEmbedding import FlagLLMReranker as FlagReranker
-            elif self._model_spec.type == "LLM-based layerwise":
-                from FlagEmbedding import LayerWiseFlagLLMReranker as FlagReranker
-            else:
-                raise RuntimeError(
-                    f"Unsupported Rank model type: {self._model_spec.type}"
-                )
-        except ImportError:
-            error_message = "Failed to import module 'FlagEmbedding'"
-            installation_guide = [
-                "Please make sure 'FlagEmbedding' is installed. ",
-                "You can install it by `pip install FlagEmbedding`\n",
-            ]
+        if self._model_spec.type == "normal":
+            try:
+                from sentence_transformers.cross_encoder import CrossEncoder
+            except ImportError:
+                error_message = "Failed to import module 'sentence-transformers'"
+                installation_guide = [
+                    "Please make sure 'sentence-transformers' is installed. ",
+                    "You can install it by `pip install sentence-transformers`\n",
+                ]
+
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+            self._model = CrossEncoder(
+                self._model_path, device=self._device, **self._model_config
+            )
+            if self._use_fp16:
+                self._model.model.half()
+        else:
+            try:
+                if self._model_spec.type == "LLM-based":
+                    from FlagEmbedding import FlagLLMReranker as FlagReranker
+                elif self._model_spec.type == "LLM-based layerwise":
+                    from FlagEmbedding import LayerWiseFlagLLMReranker as FlagReranker
+                else:
+                    raise RuntimeError(
+                        f"Unsupported Rank model type: {self._model_spec.type}"
+                    )
+            except ImportError:
+                error_message = "Failed to import module 'FlagEmbedding'"
+                installation_guide = [
+                    "Please make sure 'FlagEmbedding' is installed. ",
+                    "You can install it by `pip install FlagEmbedding`\n",
+                ]
 
-            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-        self._model = FlagReranker(self._model_path, use_fp16=True)
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+            self._model = FlagReranker(self._model_path, use_fp16=self._use_fp16)
 
     def rerank(
         self,
@@ -145,13 +165,21 @@ class RerankModel:
         return_documents: Optional[bool],
         **kwargs,
     ) -> Rerank:
+        self._counter += 1
+        if self._counter % RERANK_EMPTY_CACHE_COUNT == 0:
+            logger.debug("Empty rerank cache.")
+            gc.collect()
+            empty_cache()
         assert self._model is not None
         if kwargs:
             raise ValueError("rerank hasn't support extra parameter.")
         if max_chunks_per_doc is not None:
             raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
         sentence_combinations = [[query, doc] for doc in documents]
-        similarity_scores = self._model.compute_score(sentence_combinations)
+        if self._model_spec.type == "normal":
+            similarity_scores = self._model.predict(sentence_combinations)
+        else:
+            similarity_scores = self._model.compute_score(sentence_combinations)
         sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))
         if top_n is not None:
             sim_scores_argsort = sim_scores_argsort[:top_n]
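
For "normal" rerank models the backend thus moves from FlagEmbedding's FlagReranker.compute_score to sentence-transformers' CrossEncoder.predict, which consumes the same [query, document] pairs. A minimal standalone sketch (the model name is illustrative):

import numpy as np
from sentence_transformers.cross_encoder import CrossEncoder

# Any cross-encoder rerank checkpoint works the same way; this one is illustrative.
model = CrossEncoder("BAAI/bge-reranker-base", device="cpu")

query = "What is the capital of France?"
documents = ["Paris is the capital of France.", "Berlin is in Germany."]

scores = model.predict([[query, doc] for doc in documents])  # one score per pair
ranking = list(reversed(np.argsort(scores)))                 # highest score first
print([documents[i] for i in ranking])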
xinference-0.10.2.dist-info/METADATA → xinference-0.10.3.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: xinference
-Version: 0.10.2
+Version: 0.10.3
 Summary: Model Serving Made Easy
 Home-page: https://github.com/xorbitsai/inference
 Author: Qin Xuye
@@ -176,13 +176,14 @@ potential of cutting-edge AI models.
 - Docker image: [#855](https://github.com/xorbitsai/inference/pull/855)
 - Support multimodal: [#829](https://github.com/xorbitsai/inference/pull/829)
 ### New Models
+- Built-in support for [Llama 3](https://github.com/meta-llama/llama3): [#1332](https://github.com/xorbitsai/inference/pull/1332)
+- Built-in support for [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01): [#1310](https://github.com/xorbitsai/inference/pull/1310)
 - Built-in support for [Qwen1.5 MOE](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat): [#1263](https://github.com/xorbitsai/inference/pull/1263)
 - Built-in support for [Qwen1.5 32B](https://huggingface.co/Qwen/Qwen1.5-32B-Chat): [#1249](https://github.com/xorbitsai/inference/pull/1249)
 - Built-in support for [OmniLMM](https://github.com/OpenBMB/OmniLMM): [#1171](https://github.com/xorbitsai/inference/pull/1171)
 - Built-in support for [Gemma](https://github.com/google-deepmind/gemma): [#1024](https://github.com/xorbitsai/inference/pull/1024)
-- Built-in support for [Qwen1.5](https://github.com/QwenLM/Qwen1.5): [#994](https://github.com/xorbitsai/inference/pull/994)
-- Built-in support for [Yi-VL](https://github.com/01-ai/Yi): [#946](https://github.com/xorbitsai/inference/pull/946)
 ### Integrations
+- [FastGPT](https://github.com/labring/FastGPT): a knowledge-based platform built on LLMs that offers out-of-the-box data processing and model invocation capabilities, and allows for workflow orchestration through Flow visualization.
 - [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): an LLMOps platform that enables developers (and even non-developers) to quickly build useful applications based on large language models, ensuring they are visual, operable, and improvable.
 - [Chatbox](https://chatboxai.app/): a desktop client for multiple cutting-edge LLM models, available on Windows, Mac and Linux.
 
xinference-0.10.2.dist-info/RECORD → xinference-0.10.3.dist-info/RECORD RENAMED
@@ -1,6 +1,6 @@
 xinference/__init__.py,sha256=0LgIveLP6CXxoIaSrxhlFyOh0lOqPgJBVcBe0tkWJjc,987
 xinference/_compat.py,sha256=SQAjZMGxtBIce45qtW7ob7RWzA0zhv2yB3AxT0rb0uU,1778
-xinference/_version.py,sha256=ssSjGx-iKXJNU5J5mSKaxTe1YccRpRUOLSoZ8QSvejo,498
+xinference/_version.py,sha256=AQ6rrRceWHquLfKWGWzSVXI8bGhcAlO5_Q3_EWaZt1Q,498
 xinference/conftest.py,sha256=RffV9htxwo6iDEGZwmcj0A_O_XBQM2RRUea4q6XTeGQ,9742
 xinference/constants.py,sha256=Bu_fOJUGAvvqF_6FY5OzOHl7fQ1Nomek3LY17xr9oz4,2882
 xinference/device_utils.py,sha256=WNKDD4Eni3Io3AehiyonsuoJaukT77Bc76Es7vNGvjc,2615
@@ -20,7 +20,7 @@ xinference/client/handlers.py,sha256=3gd9C7u4URbcVdR6Eyv8cpEZ175Ll4q_jGL07CnEIpg
 xinference/client/oscar/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
 xinference/client/oscar/actor_client.py,sha256=CAI8_UGsCIX94aKv9H1QoVinIxQ2Zm7gcMWXEaUQShw,21593
 xinference/client/restful/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
-xinference/client/restful/restful_client.py,sha256=COwWrZNZgvHwFRTdHG5MiIAR8l7AJlmpsemT-TNXXJ8,42575
+xinference/client/restful/restful_client.py,sha256=ukox-JX1PRsR1BhIJrUPDlC0-c1Yf9RH8z2lQyygNqQ,42354
 xinference/core/__init__.py,sha256=Fe5tYCHDbYJ7PhxJhQ68VbfgKgOsAuslNPr4wPhFMJM,612
 xinference/core/cache_tracker.py,sha256=rBF8MXWK3rP5Q69LuhpWb2ZeF_bqbC3zCTCKs8FlfZE,4261
 xinference/core/chat_interface.py,sha256=CNqILarZfdMnZebuOaemK4FomouLqKAcd7lt24JF09Q,17073
@@ -32,9 +32,9 @@ xinference/core/resource.py,sha256=FQ0aRt3T4ZQo0P6CZZf5QUKHiCsr5llBvKb1f7wfnxg,1
 xinference/core/status_guard.py,sha256=ScmTFb3NPTp-RzufdHFpBh5TZHPc2bu907JA8l0gywE,2804
 xinference/core/supervisor.py,sha256=salJ3vIjkQblexxLYl7Mi46iiWIKhpsY9W8DRXxoHrA,41212
 xinference/core/utils.py,sha256=tUpUJUQv1zkE9i7fw1pAFfFdcB3PC6DvKJn4Bmmq75E,6008
-xinference/core/worker.py,sha256=kh7laY7FvNvimgxYh5eCodAwaoUVtRp-018Z2X0utxA,33512
+xinference/core/worker.py,sha256=zfbxO3EJl3zJ7JKhXLEQ7EK3sd9yXSW8iUsn1dq5e00,33784
 xinference/deploy/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
-xinference/deploy/cmdline.py,sha256=_cWl6rUL5ZnxiWLRL5QoVA3xJOVguW2tqtU-rljeHpc,35524
+xinference/deploy/cmdline.py,sha256=bpc6g8V6FwVFx-DOGU7n8XRPSZFrXqFRmRH6atD98DE,35647
 xinference/deploy/local.py,sha256=vlAvhcl8utP1DjW4MJpBgD4JLHQV-1Xebmdd8j9M8IM,3946
 xinference/deploy/supervisor.py,sha256=fMHeEGigQ72PD9JEFmZ5Xudn25Uj4DhD2OVIlAu_YpA,2978
 xinference/deploy/utils.py,sha256=_g4U6GJVzHnEHzF-KSMm-tffba2mtLNnxoEwnC8jmj8,5361
@@ -50,11 +50,11 @@ xinference/model/utils.py,sha256=qqCaje-dJvSarVzeGgmwKnq85e82JCLPVq2yCfAFZlo,145
 xinference/model/audio/__init__.py,sha256=0EVzX6b4pcOO63NAcNpYWTVYVa7w7yG5cPpGxOY9MXw,2347
 xinference/model/audio/core.py,sha256=ypbIvbueTFKeulYt7aJX7FfU4y3Hn3DzxkhhjKO6Dxw,4373
 xinference/model/audio/custom.py,sha256=Li6VpTmpZ17YXk_bwN2-tUKRAJwNcW-O4OwrJefzC2o,4966
-xinference/model/audio/model_spec.json,sha256=gXsXm33FdDr1SfuNfydmt96jjZac9uVPP0Pxe50HA0k,2362
+xinference/model/audio/model_spec.json,sha256=dQUgG7HT9Ge4-0TBie7GcyXbPHz4lH_2HttVTm560Dg,2595
 xinference/model/audio/utils.py,sha256=pwo5cHh8nvhyBa9f-17QaVpXMSjmbpGbPYKwBBtEhGM,717
 xinference/model/audio/whisper.py,sha256=vWUn5huqER_g8ttxzHFNz6UNyDn2CnF7OzS_4PQjjKE,4599
 xinference/model/embedding/__init__.py,sha256=0FLzOZyOuMctxFvhobkLXRUepwHck6RPbtjCct1eMI8,2854
-xinference/model/embedding/core.py,sha256=UmLiclNhgJ83fg69pBDr3FK4emgnt5yDM_k-uDNew2Y,12609
+xinference/model/embedding/core.py,sha256=VJ1b7zUwkm5VtmtQx3-bYpJuETiKb4345dYP6P4oRM4,13023
 xinference/model/embedding/custom.py,sha256=iE3-iWVzxarXdeTdw5e6rxv6HQRXVbPHp65wwhT2IL8,3919
 xinference/model/embedding/model_spec.json,sha256=hpM2_FhH6gSqmrgu2MMu4u94XMEw6r9A6aKUQObsCK0,6652
 xinference/model/embedding/model_spec_modelscope.json,sha256=No71OUu5OoALs6amJ0UiRU6JH9DkYRQvdvSgCf3IIHs,5814
@@ -68,10 +68,10 @@ xinference/model/image/stable_diffusion/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17k
 xinference/model/image/stable_diffusion/core.py,sha256=ib_ZeSg7hzynmRqSnhjtrVuhoLOgZPrR1ZH2LjBmH2E,6063
 xinference/model/llm/__init__.py,sha256=op1aUvEPtQ5KeWYvbP-skptyMC8osQphWKs7EbgNJ1c,6555
 xinference/model/llm/core.py,sha256=FeZv1UiA7zPdmDcAQpmFL9Bslj6grqOSRvqsqkVtBHg,9572
-xinference/model/llm/llm_family.json,sha256=nDTLZsol-aUoknQ8rNXyEco4AnfaExQOqfgP4Qr8REg,123909
+xinference/model/llm/llm_family.json,sha256=Few9frWihmqwN_c_Q0B5S1esZ8DPhGdnNRvxAGEQIOE,127493
 xinference/model/llm/llm_family.py,sha256=pryVjq7WZ84x9kwzXQgXFgE5UxIqBn_LTudeXnDX5RE,34615
-xinference/model/llm/llm_family_modelscope.json,sha256=g50wfGhW0gIUgOA6FjbLLGDrZpO0MviMOKyvLEK8MQo,77600
-xinference/model/llm/utils.py,sha256=jjtPltmsoymFD6p8PK-3DLDUzmO4Veg7fBddFZn-0VI,28882
+xinference/model/llm/llm_family_modelscope.json,sha256=PJlaTLjcYdaqR95U8GYaSJsbxxCZ_Q-8k6Di4ciGZ_k,79795
+xinference/model/llm/utils.py,sha256=gNuRa1VIk5Dv0rrkuCCNQJCFQ7iqwEKIjiej4Cfo8eY,29706
 xinference/model/llm/ggml/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
 xinference/model/llm/ggml/chatglm.py,sha256=Zrzw8K2EroI5v2JlwOAJ08tNFs871n86zRtBxuK97Z8,13044
 xinference/model/llm/ggml/llamacpp.py,sha256=HLjcMOOrMoriaTx39jDOufyfY5lXdO84cCWZORjCc8U,11426
@@ -97,9 +97,9 @@ xinference/model/llm/pytorch/yi_vl.py,sha256=aZkMQPlIb522Ue1K62DAMclq1n9HVw4OQNu
 xinference/model/llm/sglang/__init__.py,sha256=-sjSIQ4K6w-TEzx49kVaWeWC443fnZqODU91GCQ_JNo,581
 xinference/model/llm/sglang/core.py,sha256=eqAczZfGJInC_jihXVeKiWQ79Llk3reHDBkdShQlH-0,12915
 xinference/model/llm/vllm/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
-xinference/model/llm/vllm/core.py,sha256=H1uLAhVzLEMMMpMpTSUHwOJtDpKV6sr7cJM9OlJcSM4,18039
+xinference/model/llm/vllm/core.py,sha256=sV67VKfViYzX_IziSYKlwzO1rw7OUyZEJSCOnxRQSKY,18078
 xinference/model/rerank/__init__.py,sha256=BXIL1uu3ZpZHX9bODhW9lxKUXudZE7-OkXFmmM5rpMU,2817
-xinference/model/rerank/core.py,sha256=WSrZ7679av_9HRYd6pKD84Z0ZUJpN6-X8bO4OH7ixiY,8395
+xinference/model/rerank/core.py,sha256=UVfue73hHE9UL5c-X7OajZfTR_mLTv673RLFWZAfWV4,9665
 xinference/model/rerank/custom.py,sha256=NKk7jA7p4xkuwS5WoOs2SY2wdnoAVpyCjBTvv317bBw,3917
 xinference/model/rerank/model_spec.json,sha256=LCiiCdNz4NYt9vKVnHffk3ZpwvgzzHxe4zsaxOqxL18,1367
 xinference/model/rerank/model_spec_modelscope.json,sha256=vSSC0aWy_DHnNDzzBcMWr2pqdISDmPS95FtD_YfMmn4,1275
@@ -15400,9 +15400,9 @@ xinference/web/ui/node_modules/yargs-parser/package.json,sha256=BSwbOzgetKXMK4u0
 xinference/web/ui/node_modules/yocto-queue/package.json,sha256=6U1XHQPGXJTqsiFvT953ORihUtXTblZy4fXBWP9qxC0,725
 xinference/web/ui/node_modules/yup/package.json,sha256=xRFSROB9NKxqSWHEVFvSTsPs9Ll074uo8OS1zEw0qhA,1206
 xinference/web/ui/node_modules/yup/node_modules/type-fest/package.json,sha256=JTv2zTTVgxQ2H82m1-6qEpdMv08lHjFx4Puf_MsbB_Q,1134
-xinference-0.10.2.dist-info/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
-xinference-0.10.2.dist-info/METADATA,sha256=GBKBO2j-z_gJExhHZh42Z2wCEM_t0qSR3qXI707AavE,14990
-xinference-0.10.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-xinference-0.10.2.dist-info/entry_points.txt,sha256=-lDyyzqWMFQF0Rgm7VxBNz0V-bMBMQLRR3pvQ-Y8XTY,226
-xinference-0.10.2.dist-info/top_level.txt,sha256=L1rQt7pl6m8tmKXpWVHzP-GtmzAxp663rXxGE7qnK00,11
-xinference-0.10.2.dist-info/RECORD,,
+xinference-0.10.3.dist-info/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
+xinference-0.10.3.dist-info/METADATA,sha256=CjQ70PUW3asgEheRVmH7_P6AZMUglcWaUkIo1VHvcz8,15256
+xinference-0.10.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+xinference-0.10.3.dist-info/entry_points.txt,sha256=-lDyyzqWMFQF0Rgm7VxBNz0V-bMBMQLRR3pvQ-Y8XTY,226
+xinference-0.10.3.dist-info/top_level.txt,sha256=L1rQt7pl6m8tmKXpWVHzP-GtmzAxp663rXxGE7qnK00,11
+xinference-0.10.3.dist-info/RECORD,,