xinference 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic.

Files changed (97)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +34 -15
  3. xinference/client/oscar/actor_client.py +4 -3
  4. xinference/client/restful/restful_client.py +40 -18
  5. xinference/core/supervisor.py +48 -9
  6. xinference/core/worker.py +13 -8
  7. xinference/deploy/cmdline.py +22 -9
  8. xinference/model/audio/__init__.py +40 -1
  9. xinference/model/audio/core.py +25 -45
  10. xinference/model/audio/custom.py +148 -0
  11. xinference/model/core.py +6 -9
  12. xinference/model/embedding/core.py +1 -2
  13. xinference/model/embedding/model_spec.json +24 -0
  14. xinference/model/embedding/model_spec_modelscope.json +24 -0
  15. xinference/model/image/core.py +12 -4
  16. xinference/model/image/stable_diffusion/core.py +8 -7
  17. xinference/model/llm/__init__.py +0 -6
  18. xinference/model/llm/core.py +9 -14
  19. xinference/model/llm/ggml/llamacpp.py +2 -10
  20. xinference/model/llm/llm_family.json +507 -7
  21. xinference/model/llm/llm_family.py +41 -4
  22. xinference/model/llm/llm_family_modelscope.json +260 -0
  23. xinference/model/llm/pytorch/baichuan.py +4 -3
  24. xinference/model/llm/pytorch/chatglm.py +5 -2
  25. xinference/model/llm/pytorch/core.py +37 -41
  26. xinference/model/llm/pytorch/falcon.py +6 -5
  27. xinference/model/llm/pytorch/internlm2.py +5 -2
  28. xinference/model/llm/pytorch/llama_2.py +6 -5
  29. xinference/model/llm/pytorch/qwen_vl.py +2 -0
  30. xinference/model/llm/pytorch/vicuna.py +4 -3
  31. xinference/model/llm/pytorch/yi_vl.py +4 -2
  32. xinference/model/llm/utils.py +42 -4
  33. xinference/model/llm/vllm/core.py +54 -6
  34. xinference/model/rerank/core.py +26 -12
  35. xinference/model/rerank/model_spec.json +24 -0
  36. xinference/model/rerank/model_spec_modelscope.json +25 -1
  37. xinference/model/utils.py +12 -1
  38. xinference/thirdparty/omnilmm/chat.py +1 -1
  39. xinference/types.py +70 -19
  40. xinference/utils.py +1 -0
  41. xinference/web/ui/build/asset-manifest.json +3 -3
  42. xinference/web/ui/build/index.html +1 -1
  43. xinference/web/ui/build/static/js/main.26fdbfbe.js +3 -0
  44. xinference/web/ui/build/static/js/main.26fdbfbe.js.map +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
  46. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/f9290c0738db50065492ceedc6a4af25083fe18399b7c44d942273349ad9e643.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +1 -0
  65. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/METADATA +13 -10
  66. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/RECORD +71 -74
  67. xinference/model/llm/ggml/ctransformers.py +0 -281
  68. xinference/model/llm/ggml/ctransformers_util.py +0 -161
  69. xinference/web/ui/build/static/js/main.98516614.js +0 -3
  70. xinference/web/ui/build/static/js/main.98516614.js.map +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
  72. xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
  73. xinference/web/ui/node_modules/.cache/babel-loader/139969fd25258eb7decc9505f30b779089bba50c402bb5c663008477c7bff73b.json +0 -1
  74. xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
  75. xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
  76. xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
  77. xinference/web/ui/node_modules/.cache/babel-loader/3f357ab57b8e7fade54c667f0e0ebf2787566f72bfdca0fea14e395b5c203753.json +0 -1
  78. xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
  79. xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
  80. xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
  81. xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/9d7c49815d97539207e5aab2fb967591b5fed7791218a0762539efc9491f36af.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
  86. xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
  87. xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/d0d0b591d9adaf42b83ad6633f8b7c118541a4b80ea957c303d3bf9b86fbad0a.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
  93. /xinference/web/ui/build/static/js/{main.98516614.js.LICENSE.txt → main.26fdbfbe.js.LICENSE.txt} +0 -0
  94. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/LICENSE +0 -0
  95. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/WHEEL +0 -0
  96. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/entry_points.txt +0 -0
  97. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/yi_vl.py CHANGED
@@ -59,6 +59,8 @@ class YiVLChatModel(PytorchChatModel):
 
  self._device = self._pytorch_model_config.get("device", "auto")
  self._device = select_device(self._device)
+ # for multiple GPU, set back to auto to make multiple devices work
+ self._device = "auto" if self._device == "cuda" else self._device
 
  key_info["model_path"] = self.model_path
  # Default device_map is auto, it can loads model to multiple cards.
@@ -190,7 +192,7 @@ class YiVLChatModel(PytorchChatModel):
  prompt, self._tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
  )
  .unsqueeze(0)
- .to(self._device)
+ .to(self._model.device)
  )
 
  images = state.get_images(return_pil=True)
@@ -215,7 +217,7 @@ class YiVLChatModel(PytorchChatModel):
  "input_ids": input_ids,
  "images": image_tensor.unsqueeze(0)
  .to(dtype=torch.bfloat16)
- .to(self._device),
+ .to(self._model.device),
  "streamer": streamer,
  "do_sample": True,
  "top_p": float(top_p),
xinference/model/llm/utils.py CHANGED
@@ -163,7 +163,7 @@ class ChatModelMixin:
 
  for i, message in enumerate(chat_history):
  role = get_role(message["role"])
- content = message["content"]
+ content = message.get("content")
  tool_calls = message.get("tool_calls")
  if tool_calls:
  content = tool_calls[0]["function"]
@@ -248,7 +248,7 @@ Begin!"""
  ret = f"<|im_start|>system\n{prompt_style.system_prompt}<|im_end|>"
  for message in chat_history:
  role = get_role(message["role"])
- content = message["content"]
+ content = message.get("content")
 
  ret += prompt_style.intra_message_sep
  if tools:
@@ -446,6 +446,11 @@ Begin!"""
  "index": i,
  "delta": {
  "content": choice["text"],
+ **(
+ {"tool_calls": choice["tool_calls"]}
+ if "tool_calls" in choice
+ else {}
+ ),
  },
  "finish_reason": choice["finish_reason"],
  }
@@ -592,8 +597,7 @@ Begin!"""
  return text, None, None
 
  @classmethod
- def _tool_calls_completion(cls, model_family, model_uid, c, tools):
- _id = str(uuid.uuid4())
+ def _eval_tool_arguments(cls, model_family, c, tools):
  family = model_family.model_family or model_family.model_name
  if family in ["gorilla-openfunctions-v1", "gorilla-openfunctions-v2"]:
  content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools)
@@ -606,7 +610,41 @@ Begin!"""
  f"Model {model_family.model_name} is not support tool calls."
  )
  logger.debug("Tool call content: %s, func: %s, args: %s", content, func, args)
+ return content, func, args
+
+ @classmethod
+ def _tools_token_filter(cls, model_family):
+ """
+ Generates a filter function for Qwen series models to retain outputs after "\nFinal Answer:".
 
+ Returns:
+ A function that takes tokens (string output by the model so far) as input
+ returns True if current token is after "\nFinal Answer:", else False.
+ """
+ family = model_family.model_family or model_family.model_name
+ if family in ["qwen-chat", "qwen1.5-chat"]:
+ # Encapsulating function to reset 'found' after each call
+ found = False
+
+ def process_token(tokens: str):
+ nonlocal found
+ # Once "Final Answer:" is found, future tokens are allowed.
+ if found:
+ return True
+ # Check if the token ends with "\nFinal Answer:" and update `found`.
+ if tokens.endswith("\nFinal Answer:"):
+ found = True
+ return False
+
+ return process_token
+ else:
+ # For other families, allow all tokens.
+ return lambda tokens: True
+
+ @classmethod
+ def _tool_calls_completion(cls, model_family, model_uid, c, tools):
+ _id = str(uuid.uuid4())
+ content, func, args = cls._eval_tool_arguments(model_family, c, tools)
  if func:
  m = {
  "role": "assistant",
xinference/model/llm/vllm/core.py CHANGED
@@ -12,6 +12,7 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
 
+ import json
  import logging
  import multiprocessing
  import time
@@ -36,6 +37,8 @@ from ....types import (
  CompletionChoice,
  CompletionChunk,
  CompletionUsage,
+ ToolCallFunction,
+ ToolCalls,
  )
  from .. import LLM, LLMFamilyV1, LLMSpecV1
  from ..llm_family import CustomLLMFamilyV1
@@ -80,7 +83,15 @@ try:
  except ImportError:
  VLLM_INSTALLED = False
 
- VLLM_SUPPORTED_MODELS = ["llama-2", "baichuan", "internlm-16k", "mistral-v0.1"]
+ VLLM_SUPPORTED_MODELS = [
+ "llama-2",
+ "baichuan",
+ "internlm-16k",
+ "mistral-v0.1",
+ "Yi",
+ "code-llama",
+ "code-llama-python",
+ ]
  VLLM_SUPPORTED_CHAT_MODELS = [
  "llama-2-chat",
  "vicuna-v1.3",
@@ -90,21 +101,22 @@ VLLM_SUPPORTED_CHAT_MODELS = [
  "internlm-chat-7b",
  "internlm-chat-8k",
  "internlm-chat-20b",
+ "internlm2-chat",
  "qwen-chat",
- "Yi",
  "Yi-chat",
- "code-llama",
- "code-llama-python",
  "code-llama-instruct",
  "mistral-instruct-v0.1",
  "mistral-instruct-v0.2",
  "mixtral-instruct-v0.1",
  "chatglm3",
+ "chatglm3-32k",
+ "chatglm3-128k",
  "deepseek-chat",
  "deepseek-coder-instruct",
  ]
  if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
  VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat")
+ VLLM_SUPPORTED_CHAT_MODELS.append("codeqwen1.5-chat")
 
  if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
  VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
@@ -113,6 +125,11 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
  VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat")
  VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat-rag")
 
+ if VLLM_INSTALLED and vllm.__version__ >= "0.4.0":
+ VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-moe-chat")
+ VLLM_SUPPORTED_MODELS.append("c4ai-command-r-v01")
+ VLLM_SUPPORTED_MODELS.append("c4ai-command-r-v01-4bit")
+
 
  class VLLMModel(LLM):
  def __init__(
@@ -293,6 +310,7 @@ class VLLMModel(LLM):
  self,
  prompt: str,
  generate_config: Optional[Dict] = None,
+ tools: object = False,
  ) -> Union[Completion, AsyncGenerator[CompletionChunk, None]]:
  try:
  from vllm.sampling_params import SamplingParams
@@ -319,16 +337,46 @@ class VLLMModel(LLM):
 
  async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
  previous_texts = [""] * sanitized_generate_config["n"]
+ tools_token_filter = ChatModelMixin._tools_token_filter(self.model_family)
  async for _request_output in results_generator:
  chunk = self._convert_request_output_to_completion_chunk(
  request_id=request_id,
  model=self.model_uid,
  request_output=_request_output,
  )
+
  for i, choice in enumerate(chunk["choices"]):
  delta = choice["text"][len(previous_texts[i]) :]
  previous_texts[i] = choice["text"]
  choice["text"] = delta
+
+ if tools:
+ # only handle the first choice
+ choice = chunk["choices"][0]
+ if choice["finish_reason"] is not None:
+ # use previous text for evaluation temporarily
+ choice_delta = choice["text"]
+ choice["text"] = previous_texts[0]
+ _content, func, args = ChatModelMixin._eval_tool_arguments(
+ self.model_family, chunk, tools
+ )
+ choice["text"] = choice_delta
+ if func is not None:
+ choice["text"] = None
+ choice["finish_reason"] = "tool_calls"
+ choice["tool_calls"] = [
+ ToolCalls(
+ id=str(uuid.uuid4()),
+ type="function",
+ function=ToolCallFunction(
+ name=func,
+ arguments=json.dumps(args, ensure_ascii=False),
+ ),
+ )
+ ]
+ # use a filter function to skip Qwen's react thought process
+ elif not tools_token_filter(previous_texts[0]):
+ continue
  prompt_tokens = len(_request_output.prompt_token_ids)
  completion_tokens = sum(
  len(output.token_ids) for output in _request_output.outputs
@@ -416,7 +464,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
  generate_config = self._sanitize_chat_config(generate_config)
  # TODO(codingl2k1): qwen hacky to set stop for function call.
  model_family = self.model_family.model_family or self.model_family.model_name
- if tools and "qwen-chat" == model_family:
+ if tools and model_family in ["qwen-chat", "qwen1.5-chat"]:
  stop = generate_config.get("stop")
  if isinstance(stop, str):
  generate_config["stop"] = [stop, "Observation:"]
@@ -429,7 +477,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
  stream = generate_config.get("stream", None)
 
  if stream:
- agen = await self.async_generate(full_prompt, generate_config)
+ agen = await self.async_generate(full_prompt, generate_config, tools)
  assert isinstance(agen, AsyncGenerator)
  return self._async_to_chat_completion_chunks(agen)
  else:
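
With tools passed through to the vLLM streaming path, the final chunk of a function-calling response carries finish_reason "tool_calls" on the choice and a tool_calls list in its delta, where the arguments are a JSON-encoded string produced by json.dumps above. A hedged sketch of how a consumer of such chat-completion chunks might separate plain text from a tool call; the loop itself is illustrative, only the chunk shape comes from this diff:

    import json

    def handle_stream(chunks):
        """Accumulate streamed chat-completion chunks; return text or a parsed tool call."""
        text_parts = []
        for chunk in chunks:
            choice = chunk["choices"][0]
            delta = choice.get("delta", {})
            if choice.get("finish_reason") == "tool_calls":
                call = delta["tool_calls"][0]["function"]
                # `arguments` is a JSON string, so decode it before use.
                return {"name": call["name"], "arguments": json.loads(call["arguments"])}
            if delta.get("content"):
                text_parts.append(delta["content"])
        return "".join(text_parts)
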
xinference/model/rerank/core.py CHANGED
@@ -42,8 +42,9 @@ def get_rerank_model_descriptions():
  class RerankModelSpec(CacheableModelSpec):
  model_name: str
  language: List[str]
+ type: Optional[str] = "normal"
  model_id: str
- model_revision: str
+ model_revision: Optional[str]
  model_hub: str = "huggingface"
 
 
@@ -63,6 +64,7 @@ class RerankModelDescription(ModelDescription):
  "model_type": "rerank",
  "address": self.address,
  "accelerators": self.devices,
+ "type": self._model_spec.type,
  "model_name": self._model_spec.model_name,
  "language": self._model_spec.language,
  "model_revision": self._model_spec.model_revision,
@@ -97,12 +99,14 @@ def generate_rerank_description(model_spec: RerankModelSpec) -> Dict[str, List[D
  class RerankModel:
  def __init__(
  self,
+ model_spec: RerankModelSpec,
  model_uid: str,
  model_path: str,
  device: Optional[str] = None,
  use_fp16: bool = False,
  model_config: Optional[Dict] = None,
  ):
+ self._model_spec = model_spec
  self._model_uid = model_uid
  self._model_path = model_path
  self._device = device
@@ -112,20 +116,25 @@ class RerankModel:
 
  def load(self):
  try:
- from sentence_transformers.cross_encoder import CrossEncoder
+ if self._model_spec.type == "normal":
+ from FlagEmbedding import FlagReranker
+ elif self._model_spec.type == "LLM-based":
+ from FlagEmbedding import FlagLLMReranker as FlagReranker
+ elif self._model_spec.type == "LLM-based layerwise":
+ from FlagEmbedding import LayerWiseFlagLLMReranker as FlagReranker
+ else:
+ raise RuntimeError(
+ f"Unsupported Rank model type: {self._model_spec.type}"
+ )
  except ImportError:
- error_message = "Failed to import module 'SentenceTransformer'"
+ error_message = "Failed to import module 'FlagEmbedding'"
  installation_guide = [
- "Please make sure 'sentence-transformers' is installed. ",
- "You can install it by `pip install sentence-transformers`\n",
+ "Please make sure 'FlagEmbedding' is installed. ",
+ "You can install it by `pip install FlagEmbedding`\n",
  ]
 
  raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
- self._model = CrossEncoder(
- self._model_path, device=self._device, **self._model_config
- )
- if self._use_fp16:
- self._model.model.half()
+ self._model = FlagReranker(self._model_path, use_fp16=True)
 
  def rerank(
  self,
@@ -134,12 +143,15 @@ class RerankModel:
  top_n: Optional[int],
  max_chunks_per_doc: Optional[int],
  return_documents: Optional[bool],
+ **kwargs,
  ) -> Rerank:
  assert self._model is not None
+ if kwargs:
+ raise ValueError("rerank hasn't support extra parameter.")
  if max_chunks_per_doc is not None:
  raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
  sentence_combinations = [[query, doc] for doc in documents]
- similarity_scores = self._model.predict(sentence_combinations)
+ similarity_scores = self._model.compute_score(sentence_combinations)
  sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))
  if top_n is not None:
  sim_scores_argsort = sim_scores_argsort[:top_n]
@@ -221,7 +233,9 @@ def create_rerank_model_instance(
 
  model_path = cache(model_spec)
  use_fp16 = kwargs.pop("use_fp16", False)
- model = RerankModel(model_uid, model_path, use_fp16=use_fp16, model_config=kwargs)
+ model = RerankModel(
+ model_spec, model_uid, model_path, use_fp16=use_fp16, model_config=kwargs
+ )
  model_description = RerankModelDescription(
  subpool_addr, devices, model_spec, model_path=model_path
  )
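
RerankModel now dispatches on the spec's type field to the matching FlagEmbedding reranker and scores query/document pairs with compute_score instead of a sentence-transformers CrossEncoder. A minimal sketch of that underlying call, assuming FlagEmbedding is installed; the local model path is illustrative:

    import numpy as np
    from FlagEmbedding import FlagReranker

    reranker = FlagReranker("/path/to/bge-reranker-v2-m3", use_fp16=True)

    query = "What is the capital of France?"
    documents = ["Paris is the capital of France.", "Berlin is in Germany."]

    # compute_score takes [query, document] pairs and returns one score per pair.
    scores = reranker.compute_score([[query, doc] for doc in documents])
    # Highest score first, mirroring the argsort in RerankModel.rerank.
    order = list(reversed(np.argsort(scores)))
    print([documents[i] for i in order])
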
xinference/model/rerank/model_spec.json CHANGED
@@ -1,20 +1,44 @@
  [
  {
  "model_name": "bge-reranker-large",
+ "type": "normal",
  "language": ["en", "zh"],
  "model_id": "BAAI/bge-reranker-large",
  "model_revision": "27c9168d479987529781de8474dff94d69beca11"
  },
  {
  "model_name": "bge-reranker-base",
+ "type": "normal",
  "language": ["en", "zh"],
  "model_id": "BAAI/bge-reranker-base",
  "model_revision": "465b4b7ddf2be0a020c8ad6e525b9bb1dbb708ae"
  },
  {
  "model_name": "bce-reranker-base_v1",
+ "type": "normal",
  "language": ["en", "zh"],
  "model_id": "maidalun1020/bce-reranker-base_v1",
  "model_revision": "eaa31a577a0574e87a08959bd229ca14ce1b5496"
+ },
+ {
+ "model_name": "bge-reranker-v2-m3",
+ "type": "normal",
+ "language": ["en", "zh", "multilingual"],
+ "model_id": "BAAI/bge-reranker-v2-m3",
+ "model_revision": "12e974610ba9083ed95f3edf08d7e899581f4de4"
+ },
+ {
+ "model_name": "bge-reranker-v2-gemma",
+ "type": "LLM-based",
+ "language": ["en", "zh", "multilingual"],
+ "model_id": "BAAI/bge-reranker-v2-gemma",
+ "model_revision": "1787044f8b6fb740a9de4557c3a12377f84d9e17"
+ },
+ {
+ "model_name": "bge-reranker-v2-minicpm-layerwise",
+ "type": "LLM-based layerwise",
+ "language": ["en", "zh", "multilingual"],
+ "model_id": "BAAI/bge-reranker-v2-minicpm-layerwise",
+ "model_revision": "47b5332b296c4d8cb6ee2c60502cc62a0d708881"
  }
  ]
xinference/model/rerank/model_spec_modelscope.json CHANGED
@@ -1,6 +1,7 @@
  [
  {
  "model_name": "bge-reranker-base",
+ "type": "normal",
  "language": ["en", "zh"],
  "model_id": "Xorbits/bge-reranker-base",
  "model_revision": "v0.0.1",
@@ -8,16 +9,39 @@
  },
  {
  "model_name": "bge-reranker-large",
+ "type": "normal",
  "language": ["en", "zh"],
  "model_id": "Xorbits/bge-reranker-large",
  "model_revision": "v0.0.1",
  "model_hub": "modelscope"
  },
- {
+ {
  "model_name": "bce-reranker-base_v1",
+ "type": "normal",
  "language": ["en", "zh"],
  "model_id": "maidalun/bce-reranker-base_v1",
  "model_revision": "v0.0.1",
  "model_hub": "modelscope"
+ },
+ {
+ "model_name": "bge-reranker-v2-m3",
+ "type": "normal",
+ "language": ["en", "zh", "multilingual"],
+ "model_id": "AI-ModelScope/bge-reranker-v2-m3",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_name": "bge-reranker-v2-gemma",
+ "type": "LLM-based",
+ "language": ["en", "zh", "multilingual"],
+ "model_id": "AI-ModelScope/bge-reranker-v2-gemma",
+ "model_hub": "modelscope"
+ },
+ {
+ "model_name": "bge-reranker-v2-minicpm-layerwise",
+ "type": "LLM-based layerwise",
+ "language": ["en", "zh", "multilingual"],
+ "model_id": "zfffff/bge-reranker-v2-minicpm-layerwise",
+ "model_hub": "modelscope"
  }
  ]
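
With these spec entries in place, the new BGE v2 rerankers can be launched and queried like any other built-in rerank model. A hedged sketch using the documented xinference client workflow; the endpoint URL and documents are illustrative:

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")
    model_uid = client.launch_model(model_name="bge-reranker-v2-m3", model_type="rerank")
    model = client.get_model(model_uid)

    corpus = ["Paris is the capital of France.", "Berlin is in Germany."]
    print(model.rerank(corpus, "What is the capital of France?"))
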
xinference/model/utils.py CHANGED
@@ -17,7 +17,7 @@ import os
  import shutil
  from json import JSONDecodeError
  from pathlib import Path
- from typing import Any, Callable, Dict, Optional, Tuple
+ from typing import Any, Callable, Dict, Optional, Tuple, Union
 
  from fsspec import AbstractFileSystem
 
@@ -415,3 +415,14 @@ def select_device(device):
  raise ValueError(f"{device} is unavailable in your environment")
 
  return device
+
+
+ def convert_float_to_int_or_str(model_size: float) -> Union[int, str]:
+ """convert float to int or string
+
+ if float can be presented as int, convert it to int, otherwise convert it to string
+ """
+ if int(model_size) == model_size:
+ return int(model_size)
+ else:
+ return str(model_size)
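
The new helper keeps whole-number model sizes as integers and turns fractional ones into strings, e.g.:

    convert_float_to_int_or_str(7.0)   # -> 7
    convert_float_to_int_or_str(1.8)   # -> "1.8"
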
xinference/thirdparty/omnilmm/chat.py CHANGED
@@ -207,7 +207,7 @@ class OmniLMM3B:
 
  class OmniLMMChat:
  def __init__(self, model_path, device_map) -> None:
- if "12B" in model_path:
+ if "12b" in model_path:
  self.model = OmniLMM12B(model_path, device_map)
  else:
  self.model = OmniLMM3B(model_path, device_map)
xinference/types.py CHANGED
@@ -91,11 +91,23 @@ class CompletionLogprobs(TypedDict):
  top_logprobs: List[Optional[Dict[str, float]]]
 
 
+ class ToolCallFunction(TypedDict):
+ name: str
+ arguments: str
+
+
+ class ToolCalls(TypedDict):
+ id: str
+ type: Literal["function"]
+ function: ToolCallFunction
+
+
  class CompletionChoice(TypedDict):
  text: str
  index: int
  logprobs: Optional[CompletionLogprobs]
  finish_reason: Optional[str]
+ tool_calls: NotRequired[List[ToolCalls]]
 
 
  class CompletionUsage(TypedDict):
@@ -147,6 +159,7 @@ class ChatCompletion(TypedDict):
  class ChatCompletionChunkDelta(TypedDict):
  role: NotRequired[str]
  content: NotRequired[str]
+ tool_calls: NotRequired[List[ToolCalls]]
 
 
  class ChatCompletionChunkChoice(TypedDict):
@@ -232,6 +245,8 @@ class LlamaCppModelConfig(TypedDict, total=False):
  n_ctx: int
  n_parts: int
  n_gpu_layers: int
+ split_mode: int
+ main_gpu: int
  seed: int
  f16_kv: bool
  logits_all: bool
@@ -355,21 +370,6 @@ try:
  except ImportError:
  CreateCompletionLlamaCpp = create_model("CreateCompletionLlamaCpp")
 
- CreateCompletionCTransformers: BaseModel
- try:
- from ctransformers.llm import LLM
-
- CreateCompletionCTransformers = get_pydantic_model_from_method(
- LLM.generate,
- exclude_fields=["tokens"],
- include_fields={
- "max_tokens": (Optional[int], max_tokens_field),
- "stream": (Optional[bool], stream_field),
- },
- )
- except ImportError:
- CreateCompletionCTransformers = create_model("CreateCompletionCTransformers")
-
 
  # This type is for openai API compatibility
  CreateCompletionOpenAI: BaseModel
@@ -415,7 +415,6 @@ class CreateCompletion(
  ModelAndPrompt,
  CreateCompletionTorch,
  CreateCompletionLlamaCpp,
- CreateCompletionCTransformers,
  CreateCompletionOpenAI,
  ):
  pass
@@ -428,8 +427,6 @@ class CreateChatModel(BaseModel):
  # Currently, chat calls generates, so the params share the same one.
  CreateChatCompletionTorch = CreateCompletionTorch
  CreateChatCompletionLlamaCpp: BaseModel = CreateCompletionLlamaCpp
- CreateChatCompletionCTransformers: BaseModel = CreateCompletionCTransformers
-
 
  # This type is for openai API compatibility
  CreateChatCompletionOpenAI: BaseModel
@@ -450,7 +447,61 @@ class CreateChatCompletion(
  CreateChatModel,
  CreateChatCompletionTorch,
  CreateChatCompletionLlamaCpp,
- CreateChatCompletionCTransformers,
  CreateChatCompletionOpenAI,
  ):
  pass
+
+
+ class LoRA:
+ def __init__(self, lora_name: str, local_path: str):
+ self.lora_name = lora_name
+ self.local_path = local_path
+
+ def to_dict(self):
+ return {
+ "lora_name": self.lora_name,
+ "local_path": self.local_path,
+ }
+
+ @classmethod
+ def from_dict(cls, data: Dict):
+ return cls(
+ lora_name=data["lora_name"],
+ local_path=data["local_path"],
+ )
+
+
+ class PeftModelConfig:
+ def __init__(
+ self,
+ peft_model: Optional[List[LoRA]] = None,
+ image_lora_load_kwargs: Optional[Dict] = None,
+ image_lora_fuse_kwargs: Optional[Dict] = None,
+ ):
+ self.peft_model = peft_model
+ self.image_lora_load_kwargs = image_lora_load_kwargs
+ self.image_lora_fuse_kwargs = image_lora_fuse_kwargs
+
+ def to_dict(self):
+ return {
+ "lora_list": [lora.to_dict() for lora in self.peft_model]
+ if self.peft_model
+ else None,
+ "image_lora_load_kwargs": self.image_lora_load_kwargs,
+ "image_lora_fuse_kwargs": self.image_lora_fuse_kwargs,
+ }
+
+ @classmethod
+ def from_dict(cls, data: Dict):
+ peft_model_list = data.get("lora_list", None)
+ peft_model = (
+ [LoRA.from_dict(lora_dict) for lora_dict in peft_model_list]
+ if peft_model_list is not None
+ else None
+ )
+
+ return cls(
+ peft_model=peft_model,
+ image_lora_load_kwargs=data.get("image_lora_load_kwargs"),
+ image_lora_fuse_kwargs=data.get("image_lora_fuse_kwargs"),
+ )
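
The new LoRA and PeftModelConfig helpers are plain containers whose to_dict/from_dict round-trip cleanly. A short sketch based only on the methods shown above; the adapter name, path, and kwargs are illustrative:

    lora = LoRA(lora_name="my-adapter", local_path="/path/to/adapter")
    config = PeftModelConfig(
        peft_model=[lora],
        image_lora_load_kwargs={"low_cpu_mem_usage": True},
        image_lora_fuse_kwargs={"lora_scale": 0.8},
    )

    payload = config.to_dict()
    # {"lora_list": [{"lora_name": "my-adapter", "local_path": "/path/to/adapter"}],
    #  "image_lora_load_kwargs": {"low_cpu_mem_usage": True},
    #  "image_lora_fuse_kwargs": {"lora_scale": 0.8}}

    restored = PeftModelConfig.from_dict(payload)
    assert restored.peft_model[0].lora_name == "my-adapter"
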
xinference/utils.py CHANGED
@@ -12,6 +12,7 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
 
+
  import torch
 
 
xinference/web/ui/build/asset-manifest.json CHANGED
@@ -1,11 +1,11 @@
  {
  "files": {
- "main.js": "./static/js/main.98516614.js",
+ "main.js": "./static/js/main.26fdbfbe.js",
  "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
  "index.html": "./index.html",
- "main.98516614.js.map": "./static/js/main.98516614.js.map"
+ "main.26fdbfbe.js.map": "./static/js/main.26fdbfbe.js.map"
  },
  "entrypoints": [
- "static/js/main.98516614.js"
+ "static/js/main.26fdbfbe.js"
  ]
  }
xinference/web/ui/build/index.html CHANGED
@@ -1 +1 @@
- <!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.98516614.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+ <!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.26fdbfbe.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>